Skip to content

Commit d6c9941

Browse files
authored
BUG: Integer values at the top end of the supported range incorrectly interpreted as missing for format versions 111 and prior (#59310)
* BUG: Integer values at the top end of the supported range incorrectly interpreted as missing for format versions 111 and prior
* StataMissingValue expects the value passed in to be of float type, so cast it to float
* Add type hint to StataParser.MISSING_VALUES to avoid a mypy error when constructing StataMissingValue from the value
1 parent ecea7c3 commit d6c9941

26 files changed

+112
-9
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ I/O
583583
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
584584
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
585585
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
586+
- Bug in :meth:`read_stata` where integer values at the top end of the supported range were incorrectly interpreted as missing values for format versions 111 and prior (:issue:`58130`)
586587

587588
Period
588589
^^^^^^

pandas/io/stata.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,19 @@ def __init__(self) -> None:
983983
np.float64(struct.unpack("<d", float64_max)[0]),
984984
),
985985
}
986+
self.OLD_VALID_RANGE = {
987+
"b": (-128, 126),
988+
"h": (-32768, 32766),
989+
"l": (-2147483648, 2147483646),
990+
"f": (
991+
np.float32(struct.unpack("<f", float32_min)[0]),
992+
np.float32(struct.unpack("<f", float32_max)[0]),
993+
),
994+
"d": (
995+
np.float64(struct.unpack("<d", float64_min)[0]),
996+
np.float64(struct.unpack("<d", float64_max)[0]),
997+
),
998+
}
986999

9871000
self.OLD_TYPE_MAPPING = {
9881001
98: 251, # byte
@@ -994,7 +1007,7 @@ def __init__(self) -> None:
9941007

9951008
# These missing values are the generic '.' in Stata, and are used
9961009
# to replace nans
997-
self.MISSING_VALUES = {
1010+
self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = {
9981011
"b": 101,
9991012
"h": 32741,
10001013
"l": 2147483621,
@@ -1808,11 +1821,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18081821
replacements = {}
18091822
for i in range(len(data.columns)):
18101823
fmt = self._typlist[i]
1811-
if fmt not in self.VALID_RANGE:
1812-
continue
1824+
if self._format_version <= 111:
1825+
if fmt not in self.OLD_VALID_RANGE:
1826+
continue
18131827

1814-
fmt = cast(str, fmt) # only strs in VALID_RANGE
1815-
nmin, nmax = self.VALID_RANGE[fmt]
1828+
fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE
1829+
nmin, nmax = self.OLD_VALID_RANGE[fmt]
1830+
else:
1831+
if fmt not in self.VALID_RANGE:
1832+
continue
1833+
1834+
fmt = cast(str, fmt) # only strs in VALID_RANGE
1835+
nmin, nmax = self.VALID_RANGE[fmt]
18161836
series = data.iloc[:, i]
18171837

18181838
# appreciably faster to do this with ndarray instead of Series
@@ -1827,7 +1847,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18271847
umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
18281848
replacement = Series(series, dtype=object)
18291849
for j, um in enumerate(umissing):
1830-
missing_value = StataMissingValue(um)
1850+
if self._format_version <= 111:
1851+
missing_value = StataMissingValue(
1852+
float(self.MISSING_VALUES[fmt])
1853+
)
1854+
else:
1855+
missing_value = StataMissingValue(um)
18311856

18321857
loc = missing_loc[umissing_loc == j]
18331858
replacement.iloc[loc] = missing_value
703 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
1.1 KB
Binary file not shown.
3.69 KB
Binary file not shown.
3.7 KB
Binary file not shown.
703 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)