1
1
# -*- coding: utf-8 -*-
2
2
# pylint: disable-msg=E1101,W0612
3
3
4
- from datetime import datetime , timedelta
4
+ from datetime import datetime , date , timedelta , time
5
5
import pytest
6
6
import re
7
+ from decimal import Decimal
7
8
8
9
from numpy import nan as NA
9
10
import numpy as np
10
11
from numpy .random import randint
11
12
12
- from pandas .compat import range , u
13
+ from pandas .compat import range , u , PY3
13
14
import pandas .compat as compat
14
- from pandas import Index , Series , DataFrame , isna , MultiIndex , notna , concat
15
+ from pandas import (Index , Series , DataFrame , isna , MultiIndex , notna , concat ,
16
+ Timestamp , Period , NaT , Interval )
17
+ import pandas ._libs .lib as lib
15
18
16
19
from pandas .util .testing import assert_series_equal , assert_index_equal
17
20
import pandas .util .testing as tm
@@ -26,6 +29,157 @@ def assert_series_or_index_equal(left, right):
26
29
assert_index_equal (left , right )
27
30
28
31
32
# Method names of `StringMethods`, each paired with a minimal set of
# positional arguments with which the method can be called.
_all_string_methods = [
    ('get', [0]),
    ('join', [',']),
    ('contains', ['some_pattern']),
    ('match', ['some_pattern']),
    ('count', ['some_pattern']),
    ('startswith', ['some_pattern']),
    ('endswith', ['some_pattern']),
    ('findall', ['some_pattern']),
    ('find', ['some_pattern']),
    ('rfind', ['some_pattern']),
    # because "index"/"rindex" fail (intentionally) if the string is not found
    # (and we're testing on generic data), search only for empty string
    ('index', ['']),
    ('rindex', ['']),
    ('extract', [r'(some_pattern)']),
    ('extractall', [r'(some_pattern)']),
    ('replace', ['some_pattern', 'other_pattern']),
    ('repeat', [10]),
    ('pad', [10]),
    ('center', [10]),
    ('ljust', [10]),
    ('rjust', [10]),
    ('zfill', [10]),
    ('wrap', [10]),
    ('encode', ['utf8']),
    ('decode', ['utf8']),
    ('translate', [{97: 100}]),  # translating 'a' to 'd'
    ('normalize', ['NFC']),
] + [
    # Methods without positional arguments: pair each name with an empty
    # tuple. The pair is built per name, so the result always has exactly one
    # entry per listed method (zipping against a fixed-length filler list
    # would silently truncate once the name list outgrew the filler).
    (method_name, tuple())
    for method_name in [
        'cat', 'len', 'split', 'rsplit',
        'partition', 'rpartition', 'get_dummies',
        'slice', 'slice_replace',
        'strip', 'lstrip', 'rstrip',
        'lower', 'upper', 'capitalize',
        'title', 'swapcase',
        'isalpha', 'isnumeric', 'isalnum',
        'isdigit', 'isdecimal', 'isspace',
        'islower', 'isupper', 'istitle',
    ]
]
ids, _ = zip(*_all_string_methods)  # use method name as fixture-id
75
+
76
+
77
@pytest.fixture(params=_all_string_methods, ids=ids)
def all_string_methods(request):
    """
    Fixture parametrized over every entry of `_all_string_methods`.

    Each parameter is a pair consisting of the name of a `StringMethods`
    method and a sequence of sample values for the positional arguments
    needed to call it. The sequence is a list for methods that take
    arguments and an empty tuple for methods that take none.
    """
    method_and_args = request.param
    return method_and_args
86
+
87
+
88
# Pairs of (expected inferred dtype, sample values). The expected label of
# the unicode and bytes samples depends on the Python major version.
_all_allowed_skipna_inferred_dtypes = [
    ('string', ['a', np.nan, 'c']),
    ('string' if PY3 else 'unicode', [u('a'), np.nan, u('c')]),
    ('string' if not PY3 else 'bytes', [b'a', np.nan, b'c']),
    ('empty', [np.nan] * 3),
    ('empty', list()),
    ('mixed-integer', ['a', np.nan, 2]),
    ('mixed', ['a', np.nan, 2.0]),
]
ids, _ = zip(*_all_allowed_skipna_inferred_dtypes)  # use inferred type as id
97
+
98
+
99
@pytest.fixture(params=_all_allowed_skipna_inferred_dtypes, ids=ids)
def all_allowed_skipna_inferred_dtypes(request):
    """
    Fixture for all (inferred) dtypes allowed in StringMethods.__init__

    Returns a tuple ``(inferred_dtype, values)``, where ``values`` is an
    object-dtype np.ndarray that is inferred to have ``inferred_dtype`` (when
    skipping missing values).

    The allowed (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed'
    * 'mixed-integer'
    """
    expected_dtype, samples = request.param

    # object dtype to avoid casting
    arr = np.array(samples, dtype=object)

    # make sure the inferred dtype of the fixture is as requested
    actual_dtype = lib.infer_dtype(arr, skipna=True)
    assert expected_dtype == actual_dtype

    return expected_dtype, arr
122
+
123
+
124
# categoricals are handled separately
#
# Start from a copy of the allowed cases and append the remaining
# (expected inferred dtype, sample values) pairs.
_all_skipna_inferred_dtypes = list(_all_allowed_skipna_inferred_dtypes)
_all_skipna_inferred_dtypes.extend([
    ('floating', [1.0, np.nan, 2.0]),
    ('integer', [1, np.nan, 2]),
    ('mixed-integer-float', [1, np.nan, 2.0]),
    ('decimal', [Decimal(1), np.nan, Decimal(2)]),
    ('boolean', [True, np.nan, False]),
    ('datetime64', [np.datetime64('2013-01-01'), np.nan,
                    np.datetime64('2018-01-01')]),
    ('datetime', [Timestamp('20130101'), np.nan, Timestamp('20180101')]),
    ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
    ('time', [time(1), np.nan, time(2)]),
    ('period', [Period(2013), NaT, Period(2018)]),
    ('interval', [Interval(0, 1), np.nan, Interval(0, 2)]),
])
ids, _ = zip(*_all_skipna_inferred_dtypes)  # use inferred type as fixture-id
144
+
145
+
146
@pytest.fixture(params=_all_skipna_inferred_dtypes, ids=ids)
def all_skipna_inferred_dtypes(request):
    """
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    Returns a tuple ``(inferred_dtype, values)``, where ``values`` is an
    object-dtype np.ndarray that is inferred to have ``inferred_dtype`` (when
    skipping missing values).

    The covered (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed'
    * 'mixed-integer'
    * 'mixed-integer-float'
    * 'floating'
    * 'integer'
    * 'decimal'
    * 'boolean'
    * 'datetime64'
    * 'datetime'
    * 'date'
    * 'timedelta'
    * 'time'
    * 'period'
    * 'interval'
    """
    expected_dtype, samples = request.param

    # object dtype to avoid casting
    arr = np.array(samples, dtype=object)

    # make sure the inferred dtype of the fixture is as requested
    actual_dtype = lib.infer_dtype(arr, skipna=True)
    assert expected_dtype == actual_dtype

    return expected_dtype, arr
181
+
182
+
29
183
class TestStringMethods (object ):
30
184
31
185
def test_api (self ):
@@ -34,12 +188,115 @@ def test_api(self):
34
188
assert Series .str is strings .StringMethods
35
189
assert isinstance (Series (['' ]).str , strings .StringMethods )
36
190
37
- # GH 9184
38
- invalid = Series ([1 ])
39
- with tm .assert_raises_regex (AttributeError ,
40
- "only use .str accessor" ):
41
- invalid .str
42
- assert not hasattr (invalid , 'str' )
191
@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_dtype(self, box, dtype, all_skipna_inferred_dtypes):
    """
    Check availability of the ``.str`` accessor per inferred dtype.

    For inferred dtypes in ``types_passing_constructor`` the accessor must be
    a ``StringMethods`` instance; for all others, accessing ``.str`` must
    raise an AttributeError and ``hasattr(t, 'str')`` must be False.
    """
    # one instance of parametrized fixture
    inferred_dtype, values = all_skipna_inferred_dtypes

    types_passing_constructor = ['string', 'unicode', 'empty',
                                 'bytes', 'mixed', 'mixed-integer']

    t = box(values, dtype=dtype)  # explicit dtype to avoid casting

    is_index = box == Index
    is_series = box == Series
    is_object = dtype == object
    is_category = dtype == 'category'

    # TODO: get rid of these xfails
    if is_category and inferred_dtype in ['period', 'interval']:
        pytest.xfail(reason='Conversion to numpy array fails because '
                            'the ._values-attribute is not a numpy array for '
                            'PeriodArray/IntervalArray; see GH 23553')
    if is_index and inferred_dtype in ['empty', 'bytes']:
        pytest.xfail(reason='Raising too restrictively; '
                            'solved by GH 23167')
    if (is_index and is_object
            and inferred_dtype in ['boolean', 'date', 'time']):
        pytest.xfail(reason='Inferring incorrectly because of NaNs; '
                            'solved by GH 23167')

    # The two alternatives below are joined by a plain ``or``: the Series
    # restriction applies only to the object-dtype case, while the category
    # case is checked for both Series and Index.
    object_series_case = (is_series and is_object
                          and inferred_dtype not in types_passing_constructor)
    category_case = (is_category
                     and inferred_dtype in ['decimal', 'boolean', 'time'])
    if object_series_case or category_case:
        pytest.xfail(reason='Not raising correctly; solved by GH 23167')

    if inferred_dtype in types_passing_constructor:
        # GH 6106
        assert isinstance(t.str, strings.StringMethods)
        return

    # GH 9184, GH 23011, GH 23163
    expected_msg = 'Can only use .str accessor with string values.*'
    with tm.assert_raises_regex(AttributeError, expected_msg):
        t.str
    assert not hasattr(t, 'str')
230
+
231
@pytest.mark.xfail(reason='not correctly raising on master; '
                          'solved by GH 23167')
def test_api_mi_raises(self):
    """
    Accessing ``.str`` on a MultiIndex should raise an AttributeError, and
    ``hasattr`` should report the accessor as missing.
    """
    expected_msg = 'Can only use .str accessor with Index, not MultiIndex'
    mi = MultiIndex.from_arrays([['a', 'b', 'c']])

    with tm.assert_raises_regex(AttributeError, expected_msg):
        mi.str
    assert not hasattr(mi, 'str')
239
+
240
@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_method(self, box, dtype,
                        all_allowed_skipna_inferred_dtypes,
                        all_string_methods):
    """
    Call every string method on every allowed inferred dtype.

    This test does not check correctness of the different methods, just that
    the methods work on the specified (inferred) dtypes, and raise a
    TypeError on all others.
    """
    # one instance of each parametrized fixture
    inferred_dtype, values = all_allowed_skipna_inferred_dtypes
    method_name, minimal_args = all_string_methods

    # computed once and shared by the xfail check and the allowed-types list
    bytes_allowed = method_name in ['encode', 'decode', 'len']
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    mixed_allowed = method_name not in ['cat']

    # TODO: get rid of these xfails
    if not bytes_allowed and inferred_dtype == 'bytes':
        pytest.xfail(reason='Not raising for "bytes", see GH 23011; '
                            'Also: malformed method names, see GH 23551; '
                            'solved by GH 23167')
    if (method_name == 'cat'
            and inferred_dtype in ['mixed', 'mixed-integer']):
        pytest.xfail(reason='Bad error message; should raise better; '
                            'solved by GH 23167')
    if box == Index and inferred_dtype in ['empty', 'bytes']:
        pytest.xfail(reason='Raising too restrictively; '
                            'solved by GH 23167')
    if (box == Index and dtype == object
            and inferred_dtype in ['boolean', 'date', 'time']):
        pytest.xfail(reason='Inferring incorrectly because of NaNs; '
                            'solved by GH 23167')
    if box == Index and dtype == 'category':
        pytest.xfail(reason='Broken methods on CategoricalIndex; '
                            'see GH 23556')
    if (method_name in ['partition', 'rpartition'] and box == Index
            and inferred_dtype != 'bytes'):
        pytest.xfail(reason='Method not nan-safe on Index; see GH 23558')

    # constructed only after the xfail checks, as in the original ordering
    t = box(values, dtype=dtype)  # explicit dtype to avoid casting
    method = getattr(t.str, method_name)

    allowed_types = ['string', 'unicode', 'empty']
    if bytes_allowed:
        allowed_types.append('bytes')
    if mixed_allowed:
        allowed_types.extend(['mixed', 'mixed-integer'])

    if inferred_dtype in allowed_types:
        method(*minimal_args)  # works!
    else:
        # GH 23011, GH 23163
        msg = ('Cannot use .str.{name} with values of inferred dtype '
               '{inferred_dtype!r}.'.format(name=method_name,
                                            inferred_dtype=inferred_dtype))
        with tm.assert_raises_regex(TypeError, msg):
            method(*minimal_args)
43
300
44
301
def test_iter (self ):
45
302
# GH3638
0 commit comments