Skip to content

Commit 9800447

Browse files
committed
TST: add method/dtype coverage to str-accessor; precursor to pandas-dev#23167
1 parent ce62a5c commit 9800447

File tree

1 file changed

+266
-9
lines changed

1 file changed

+266
-9
lines changed

pandas/tests/test_strings.py

Lines changed: 266 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
# -*- coding: utf-8 -*-
22
# pylint: disable-msg=E1101,W0612
33

4-
from datetime import datetime, timedelta
4+
from datetime import datetime, date, timedelta, time
55
import pytest
66
import re
7+
from decimal import Decimal
78

89
from numpy import nan as NA
910
import numpy as np
1011
from numpy.random import randint
1112

12-
from pandas.compat import range, u
13+
from pandas.compat import range, u, PY3
1314
import pandas.compat as compat
14-
from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat
15+
from pandas import (Index, Series, DataFrame, isna, MultiIndex, notna, concat,
16+
Timestamp, Period, NaT, Interval)
17+
import pandas._libs.lib as lib
1518

1619
from pandas.util.testing import assert_series_equal, assert_index_equal
1720
import pandas.util.testing as tm
@@ -26,6 +29,157 @@ def assert_series_or_index_equal(left, right):
2629
assert_index_equal(left, right)
2730

2831

32+
# Each entry pairs the name of a `.str` method with the minimal set of
# positional arguments needed to call it.
_all_string_methods = [
    ('get', [0]),
    ('join', [',']),
    ('contains', ['some_pattern']),
    ('match', ['some_pattern']),
    ('count', ['some_pattern']),
    ('startswith', ['some_pattern']),
    ('endswith', ['some_pattern']),
    ('findall', ['some_pattern']),
    ('find', ['some_pattern']),
    ('rfind', ['some_pattern']),
    # because "index"/"rindex" fail (intentionally) if the string is not found
    # (and we're testing on generic data), search only for empty string
    ('index', ['']),
    ('rindex', ['']),
    ('extract', [r'(some_pattern)']),
    ('extractall', [r'(some_pattern)']),
    ('replace', ['some_pattern', 'other_pattern']),
    ('repeat', [10]),
    ('pad', [10]),
    ('center', [10]),
    ('ljust', [10]),
    ('rjust', [10]),
    ('zfill', [10]),
    ('wrap', [10]),
    ('encode', ['utf8']),
    ('decode', ['utf8']),
    ('translate', [{97: 100}]),  # translating 'a' to 'd'
    ('normalize', ['NFC']),
] + [
    # methods without positional arguments: pair each name with an empty
    # tuple. Building the pair per name (instead of zipping against a
    # fixed-length list of tuples) means no name can be silently dropped
    # when this list grows.
    (method_name, tuple()) for method_name in [
        'cat', 'len', 'split', 'rsplit',
        'partition', 'rpartition', 'get_dummies',
        'slice', 'slice_replace',
        'strip', 'lstrip', 'rstrip',
        'lower', 'upper', 'capitalize',
        'title', 'swapcase',
        'isalpha', 'isnumeric', 'isalnum',
        'isdigit', 'isdecimal', 'isspace',
        'islower', 'isupper', 'istitle',
    ]
]
ids, _ = zip(*_all_string_methods)  # use method name as fixture-id
75+
76+
77+
@pytest.fixture(params=_all_string_methods, ids=ids)
def all_string_methods(request):
    """
    Fixture for all public methods of `StringMethods`

    Each parametrization hands out one entry of `_all_string_methods`: a
    tuple of the method name and the sample values for the positional
    arguments required by that method (empty if the method needs none).
    """
    method_and_args = request.param
    return method_and_args
86+
87+
88+
# Sample values for the `.str` accessor, each paired with the dtype that
# lib.infer_dtype is expected to report for them when skipping missing values.
_all_allowed_skipna_inferred_dtypes = [
    ('string', ['a', np.nan, 'c']),
    # the label for text literals depends on the Python version
    ('string' if PY3 else 'unicode', [u('a'), np.nan, u('c')]),
    # the label for byte literals depends on the Python version
    ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
    ('empty', [np.nan, np.nan, np.nan]),
    ('empty', []),
    ('mixed-integer', ['a', np.nan, 2]),
    ('mixed', ['a', np.nan, 2.0]),
]
ids, _ = zip(*_all_allowed_skipna_inferred_dtypes)  # use inferred type as id
97+
98+
99+
@pytest.fixture(params=_all_allowed_skipna_inferred_dtypes, ids=ids)
def all_allowed_skipna_inferred_dtypes(request):
    """
    Fixture for all (inferred) dtypes allowed in StringMethods.__init__

    Returns a tuple ``(inferred_dtype, values)``, where ``values`` is an
    object-dtype np.ndarray that is inferred to have the dtype named by
    ``inferred_dtype`` (when skipping missing values).

    The allowed (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed'
    * 'mixed-integer'
    """
    expected_dtype, raw_values = request.param
    # object dtype to avoid casting
    boxed = np.array(raw_values, dtype=object)

    # guard against a fixture entry whose label does not match its data
    assert lib.infer_dtype(boxed, skipna=True) == expected_dtype

    return expected_dtype, boxed
122+
123+
124+
# categoricals are handled separately
# Start from a copy of the dtypes allowed by the `.str` accessor and add
# samples for the remaining inferred dtypes.
_all_skipna_inferred_dtypes = list(_all_allowed_skipna_inferred_dtypes)
_all_skipna_inferred_dtypes.extend([
    ('floating', [1.0, np.nan, 2.0]),
    ('integer', [1, np.nan, 2]),
    ('mixed-integer-float', [1, np.nan, 2.0]),
    ('decimal', [Decimal(1), np.nan, Decimal(2)]),
    ('boolean', [True, np.nan, False]),
    ('datetime64', [np.datetime64('2013-01-01'),
                    np.nan,
                    np.datetime64('2018-01-01')]),
    ('datetime', [Timestamp('20130101'), np.nan, Timestamp('20180101')]),
    ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
    ('time', [time(1), np.nan, time(2)]),
    ('period', [Period(2013), NaT, Period(2018)]),
    ('interval', [Interval(0, 1), np.nan, Interval(0, 2)]),
])
ids, _ = zip(*_all_skipna_inferred_dtypes)  # use inferred type as fixture-id
144+
145+
146+
@pytest.fixture(params=_all_skipna_inferred_dtypes, ids=ids)
def all_skipna_inferred_dtypes(request):
    """
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    Returns a tuple ``(inferred_dtype, values)``, where ``values`` is an
    object-dtype np.ndarray that is inferred to have the dtype named by
    ``inferred_dtype`` (when skipping missing values).

    The covered (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed'
    * 'mixed-integer'
    * 'mixed-integer-float'
    * 'floating'
    * 'integer'
    * 'decimal'
    * 'boolean'
    * 'datetime64'
    * 'datetime'
    * 'date'
    * 'timedelta'
    * 'time'
    * 'period'
    * 'interval'
    """
    expected_dtype, raw_values = request.param
    # object dtype to avoid casting
    boxed = np.array(raw_values, dtype=object)

    # guard against a fixture entry whose label does not match its data
    assert lib.infer_dtype(boxed, skipna=True) == expected_dtype

    return expected_dtype, boxed
181+
182+
29183
class TestStringMethods(object):
30184

31185
def test_api(self):
@@ -34,12 +188,115 @@ def test_api(self):
34188
assert Series.str is strings.StringMethods
35189
assert isinstance(Series(['']).str, strings.StringMethods)
36190

37-
# GH 9184
38-
invalid = Series([1])
39-
with tm.assert_raises_regex(AttributeError,
40-
"only use .str accessor"):
41-
invalid.str
42-
assert not hasattr(invalid, 'str')
191+
@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_dtype(self, box, dtype, all_skipna_inferred_dtypes):
    # Check, per container and (inferred) dtype, whether the `.str`
    # accessor can be constructed or raises.

    # one instance of parametrized fixture
    inferred_dtype, values = all_skipna_inferred_dtypes

    t = box(values, dtype=dtype)  # explicit dtype to avoid casting

    # inferred dtypes for which the accessor must be constructible
    types_passing_constructor = ['string', 'unicode', 'empty',
                                 'bytes', 'mixed', 'mixed-integer']

    is_index = box == Index
    is_series = box == Series
    is_object = dtype == object
    is_category = dtype == 'category'

    # TODO: get rid of these xfails
    if is_category and inferred_dtype in ['period', 'interval']:
        pytest.xfail(reason='Conversion to numpy array fails because '
                            'the ._values-attribute is not a numpy array for '
                            'PeriodArray/IntervalArray; see GH 23553')
    if is_index and inferred_dtype in ['empty', 'bytes']:
        pytest.xfail(reason='Raising too restrictively; '
                            'solved by GH 23167')
    if (is_index and is_object
            and inferred_dtype in ['boolean', 'date', 'time']):
        pytest.xfail(reason='Inferring incorrectly because of NaNs; '
                            'solved by GH 23167')
    # NOTE(review): `and` binds tighter than `or`, so the 'category' clause
    # below applies to both Series and Index. The grouping is spelled out
    # here exactly as it is evaluated; confirm that this is intended.
    series_object_should_raise = (
        is_series and is_object
        and inferred_dtype not in types_passing_constructor)
    category_should_raise = (
        is_category and inferred_dtype in ['decimal', 'boolean', 'time'])
    if series_object_should_raise or category_should_raise:
        pytest.xfail(reason='Not raising correctly; solved by GH 23167')

    if inferred_dtype not in types_passing_constructor:
        # GH 9184, GH 23011, GH 23163
        with tm.assert_raises_regex(AttributeError, 'Can only use .str '
                                    'accessor with string values.*'):
            t.str
        assert not hasattr(t, 'str')
        return

    # GH 6106
    assert isinstance(t.str, strings.StringMethods)
230+
231+
@pytest.mark.xfail(reason='not correctly raising on master; '
                          'solved by GH 23167')
def test_api_mi_raises(self):
    # the `.str` accessor must be rejected on a MultiIndex, even when its
    # only level holds strings
    multi = MultiIndex.from_arrays([['a', 'b', 'c']])
    msg = 'Can only use .str accessor with Index, not MultiIndex'
    with tm.assert_raises_regex(AttributeError, msg):
        multi.str
    assert not hasattr(multi, 'str')
239+
240+
@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_method(self, box, dtype,
                        all_allowed_skipna_inferred_dtypes,
                        all_string_methods):
    # this test does not check correctness of the different methods,
    # just that the methods work on the specified (inferred) dtypes,
    # and raise on all others

    # one instance of each parametrized fixture
    inferred_dtype, values = all_allowed_skipna_inferred_dtypes
    method_name, minimal_args = all_string_methods

    # only these methods are expected to work on bytes
    bytes_allowed = method_name in ['encode', 'decode', 'len']
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    mixed_allowed = method_name not in ['cat']

    # TODO: get rid of these xfails
    if not bytes_allowed and inferred_dtype == 'bytes':
        pytest.xfail(reason='Not raising for "bytes", see GH 23011; '
                            'Also: malformed method names, see GH 23551; '
                            'solved by GH 23167')
    if (method_name == 'cat'
            and inferred_dtype in ['mixed', 'mixed-integer']):
        pytest.xfail(reason='Bad error message; should raise better; '
                            'solved by GH 23167')
    if box == Index and inferred_dtype in ['empty', 'bytes']:
        pytest.xfail(reason='Raising too restrictively; '
                            'solved by GH 23167')
    if (box == Index and dtype == object
            and inferred_dtype in ['boolean', 'date', 'time']):
        pytest.xfail(reason='Inferring incorrectly because of NaNs; '
                            'solved by GH 23167')
    if box == Index and dtype == 'category':
        pytest.xfail(reason='Broken methods on CategoricalIndex; '
                            'see GH 23556')
    if (method_name in ['partition', 'rpartition'] and box == Index
            and inferred_dtype != 'bytes'):
        pytest.xfail(reason='Method not nan-safe on Index; see GH 23558')

    t = box(values, dtype=dtype)  # explicit dtype to avoid casting
    method = getattr(t.str, method_name)

    # inferred dtypes on which this particular method must not raise
    allowed_types = ['string', 'unicode', 'empty']
    if bytes_allowed:
        allowed_types.append('bytes')
    if mixed_allowed:
        allowed_types.extend(['mixed', 'mixed-integer'])

    if inferred_dtype in allowed_types:
        method(*minimal_args)  # works!
    else:
        # GH 23011, GH 23163
        msg = ('Cannot use .str.{name} with values of inferred dtype '
               '{inferred_dtype!r}.'.format(name=method_name,
                                            inferred_dtype=inferred_dtype))
        with tm.assert_raises_regex(TypeError, msg):
            method(*minimal_args)
43300

44301
def test_iter(self):
45302
# GH3638

0 commit comments

Comments
 (0)