From df21418a15ca086ad61bc81c4de51badc6d35db0 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 13 Oct 2020 14:22:36 +0100 Subject: [PATCH 01/12] Allow loading of zipfiles that contain __MACOSX and .DS_STORE hidden folders --- pandas/io/common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c147ae9fd0aa8..b6fb901a90512 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -564,7 +564,12 @@ def get_handle( if zf.mode == "w": f = zf elif zf.mode == "r": - zip_names = zf.namelist() + # Ignore hidden folders added by OS X/macOS on .zip creation + zip_names = [ + _ + for _ in zf.namelist() + if not (_.startswith("__MACOSX/") or _.startswith(".DS_STORE")) + ] if len(zip_names) == 1: f = zf.open(zip_names.pop()) elif len(zip_names) == 0: From 7096a0e3ca34bdb7a50265afe71b4c233d25ee23 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 13 Oct 2020 15:56:40 +0100 Subject: [PATCH 02/12] Added test for depickling from zipfiles with hidden folders --- pandas/tests/io/test_pickle.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2241fe7013568..08da3f0dbdaf3 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -376,6 +376,40 @@ def test_read_infer(self, ext, get_random_path): tm.assert_frame_equal(df, df2) + @pytest.mark.parametrize("cruft", ["__MACOSX/", ".DS_STORE"]) + def test_load_zip_with_hidden_folders(self, cruft, get_random_path): + """Test loading .zip files that have extraneous metadata in hidden folders. """ + base = get_random_path + path1 = base + ".raw" + path2 = base + ".zip" + dummy = base + ".dummy" + compression = "zip" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean( + path2 + ) as p2, tm.ensure_clean(dummy) as d: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress dataframe normally + self.compress_file(p1, p2, compression=compression) + + # add dummy file `{cruft}{dummy}` to the archive + with zipfile.ZipFile(p2, "a", compression=zipfile.ZIP_DEFLATED) as f: + f.write(d, f"{cruft}{dummy}") + + # check the file was definitely added to the archive + with zipfile.ZipFile(p2, "r") as f: + assert f"{cruft}{dummy}" in f.namelist() + + # compressed file by inferred compression method, + # dummy file should have been ignored + df2 = pd.read_pickle(p2) + + tm.assert_frame_equal(df, df2) + # --------------------- # test pickle compression From 73ac7caecde2c784f4e19e56c8d31504d4910d0f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 13 Oct 2020 15:56:48 +0100 Subject: [PATCH 03/12] Added whatsnew --- doc/source/whatsnew/v1.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index aa2c77da4ee6f..ed94e057f119c 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -35,7 +35,7 @@ Bug fixes Other ~~~~~ -- +- Added ability to load pickles from ``.zip`` files created by OS X and macOS containing ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). .. --------------------------------------------------------------------------- From 51dc013e1ed0cffabd775de803d444e04f44e4af Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 14 Oct 2020 13:15:04 +0100 Subject: [PATCH 04/12] Added test for read_csv, which will fail for the C parser --- pandas/tests/io/parser/test_compression.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..4a83b42ab8b43 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -60,6 +60,20 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) +@pytest.mark.parametrize("compression", ["zip", "infer"]) +def test_zip_no_error_hidden_files(parser_and_data, compression, python_parser_only): + _, data, expected = parser_and_data + + with tm.ensure_clean("combined_zip.zip") as path: + inner_file_names = ["test_file", "__MACOSX/dummy", ".DS_STORE"] + + with zipfile.ZipFile(path, mode="w") as tmp: + for file_name in inner_file_names: + tmp.writestr(file_name, data) + + python_parser_only.read_csv(path, compression=compression) + + def test_zip_error_no_files(parser_and_data): parser, _, _ = parser_and_data From de6316151c769a350eb36ad7524791e02cecff87 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 14 Oct 2020 13:16:36 +0100 Subject: [PATCH 05/12] Use f-strings as suggested by code review --- pandas/tests/io/test_pickle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 08da3f0dbdaf3..a694b63ea5e6f 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -380,9 +380,9 @@ def test_read_infer(self, ext, get_random_path): def test_load_zip_with_hidden_folders(self, cruft, get_random_path): """Test loading .zip files that have extraneous metadata in hidden folders. """ base = get_random_path - path1 = base + ".raw" - path2 = base + ".zip" - dummy = base + ".dummy" + path1 = f"{base}.raw" + path2 = f"{base}.zip" + dummy = f"{base}.dummy" compression = "zip" with tm.ensure_clean(path1) as p1, tm.ensure_clean( From e0a723d1c267951bba980e2efe632e46a06b284a Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 25 Oct 2020 12:05:05 +0000 Subject: [PATCH 06/12] Tweaked variable names and comments --- pandas/tests/io/test_pickle.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index a694b63ea5e6f..0534c0a4bd7ac 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -378,7 +378,7 @@ def test_read_infer(self, ext, get_random_path): @pytest.mark.parametrize("cruft", ["__MACOSX/", ".DS_STORE"]) def test_load_zip_with_hidden_folders(self, cruft, get_random_path): - """Test loading .zip files that have extraneous metadata in hidden folders. """ + # Test loading .zip files that have extraneous metadata in hidden folders (issue #37098) base = get_random_path path1 = f"{base}.raw" path2 = f"{base}.zip" @@ -387,27 +387,20 @@ def test_load_zip_with_hidden_folders(self, cruft, get_random_path): with tm.ensure_clean(path1) as p1, tm.ensure_clean( path2 - ) as p2, tm.ensure_clean(dummy) as d: - df = tm.makeDataFrame() + ) as p2, tm.ensure_clean(dummy) as dummy_path: - # write to uncompressed file + df = tm.makeDataFrame() df.to_pickle(p1, compression=None) - - # compress dataframe normally self.compress_file(p1, p2, compression=compression) # add dummy file `{cruft}{dummy}` to the archive with zipfile.ZipFile(p2, "a", compression=zipfile.ZIP_DEFLATED) as f: - f.write(d, f"{cruft}{dummy}") - - # check the file was definitely added to the archive + f.write(dummy_path, f"{cruft}{dummy}") with zipfile.ZipFile(p2, "r") as f: assert f"{cruft}{dummy}" in f.namelist() - # compressed file by inferred compression method, - # dummy file should have been ignored + # dummy file should be ignored on reading, otherwise read_pickle will fail df2 = pd.read_pickle(p2) - tm.assert_frame_equal(df, df2) From 2f2187872d01f4d64719bb053ff348351e80e9a3 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 25 Oct 2020 12:06:24 +0000 Subject: [PATCH 07/12] Moved whatsnew entry to v1.2.0 --- doc/source/whatsnew/v1.1.4.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ed94e057f119c..aa2c77da4ee6f 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -35,7 +35,7 @@ Bug fixes Other ~~~~~ -- Added ability to load pickles from ``.zip`` files created by OS X and macOS containing ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 74534bc371094..ed89e36451763 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -193,6 +193,8 @@ Other enhancements - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) +- Added ability to load pickles from ``.zip`` files created by OS X and macOS containing ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). + .. _whatsnew_120.api_breaking.python: From cf34ce09902b2f13293e1aa9b419a6c1a7c30a3c Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Wed, 4 Nov 2020 21:19:23 +0000 Subject: [PATCH 08/12] flake8 --- pandas/tests/io/test_pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index b680ca9dfd570..00f03961e4eb8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -395,7 +395,7 @@ def test_read_infer(self, ext, get_random_path): @pytest.mark.parametrize("cruft", ["__MACOSX/", ".DS_STORE"]) def test_load_zip_with_hidden_folders(self, cruft, get_random_path): - # Test loading .zip files that have extraneous metadata in hidden folders (issue #37098) + # Test loading .zip files with platform-specific hidden folders (issue #37098) base = get_random_path path1 = f"{base}.raw" path2 = f"{base}.zip" From c1c0d240a6358284bb903e4de6aca8a1c2c73eb8 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 4 Nov 2020 21:34:07 +0000 Subject: [PATCH 09/12] black --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c1ee73a2b502d..d0ddf671f2022 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -645,7 +645,7 @@ def get_handle( if f.mode == "r": handles.append(f) zip_names = f.namelist() - + # Ignore hidden folders added by OS X/macOS on .zip creation zip_names = [ _ From 59a67d7eedd095f17f8a76d829535a2b5b264d23 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 10 Nov 2020 15:12:45 +0000 Subject: [PATCH 10/12] Fix formatting bug introduced by GH editor --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index a40ad9df5c385..29cb981e31dea 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -645,14 +645,14 @@ def get_handle( handle = _BytesZipFile(handle, mode, **compression_args) if handle.mode == "r": handles.append(handle) - + # Ignore hidden folders added by OS X/macOS on .zip creation zip_names = [ _ for _ in handle.namelist() if not (_.startswith("__MACOSX/") or _.startswith(".DS_STORE")) ] - + if len(zip_names) == 1: handle = handle.open(zip_names.pop()) elif len(zip_names) == 0: From 9aa59244be91077f234f7326084416ff143a5fdf Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Thu, 19 Nov 2020 14:54:33 +0000 Subject: [PATCH 11/12] Update doc/source/whatsnew/v1.2.0.rst --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3abe8bbf0c165..e901e6514f27f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -253,7 +253,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) -- :func:`pandas.load_pickle` can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). +- :func:`read_pickle` (and other `read_*` functions that handle compressed inputs) can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). .. --------------------------------------------------------------------------- From bd9e06af6bf2ddf49d15c920e84158f77427f63e Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Thu, 19 Nov 2020 15:01:53 +0000 Subject: [PATCH 12/12] rst formatting --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e901e6514f27f..e8680d7aae042 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -253,7 +253,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) -- :func:`read_pickle` (and other `read_*` functions that handle compressed inputs) can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). +- :func:`read_pickle` (and other ``read_*`` functions that handle compressed inputs) can now load from ``.zip`` files created by OS X/macOS that contain ``__MACOSX/`` or ``.DS_STORE`` hidden folders/files (:issue:`37098`). .. ---------------------------------------------------------------------------