@@ -90,12 +90,39 @@ def _get_path_or_handle(
 ]:
     """File handling for PyArrow."""
     path_or_handle = stringify_path(path)
+    if fs is not None:
+        pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
+        fsspec = import_optional_dependency("fsspec", errors="ignore")
+        if pa_fs is None and fsspec is None:
+            raise ValueError(
+                f"filesystem must be a pyarrow or fsspec FileSystem, "
+                f"not a {type(fs).__name__}"
+            )
+        elif (pa_fs is not None and not isinstance(fs, pa_fs.FileSystem)) and (
+            fsspec is not None and not isinstance(fs, fsspec.spec.AbstractFileSystem)
+        ):
+            raise ValueError(
+                f"filesystem must be a pyarrow or fsspec FileSystem, "
+                f"not a {type(fs).__name__}"
+            )
+        elif pa_fs is not None and isinstance(fs, pa_fs.FileSystem) and storage_options:
+            raise NotImplementedError(
+                "storage_options not supported with a pyarrow FileSystem."
+            )
     if is_fsspec_url(path_or_handle) and fs is None:
-        fsspec = import_optional_dependency("fsspec")
+        if storage_options is None:
+            pa = import_optional_dependency("pyarrow")
+            pa_fs = import_optional_dependency("pyarrow.fs")
 
-        fs, path_or_handle = fsspec.core.url_to_fs(
-            path_or_handle, **(storage_options or {})
-        )
+            try:
+                fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
+            except (TypeError, pa.ArrowInvalid):
+                pass
+        if fs is None:
+            fsspec = import_optional_dependency("fsspec")
+            fs, path_or_handle = fsspec.core.url_to_fs(
+                path_or_handle, **(storage_options or {})
+            )
     elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
         # can't write to a remote url
         # without making use of fsspec at the moment
@@ -173,6 +200,7 @@ def write(
         index: bool | None = None,
         storage_options: StorageOptions = None,
         partition_cols: list[str] | None = None,
+        filesystem=None,
         **kwargs,
     ) -> None:
         self.validate_dataframe(df)
@@ -183,9 +211,9 @@ def write(
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
 
-        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+        path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
-            kwargs.pop("filesystem", None),
+            filesystem,
             storage_options=storage_options,
             mode="wb",
             is_dir=partition_cols is not None,
@@ -207,12 +235,17 @@ def write(
                 path_or_handle,
                 compression=compression,
                 partition_cols=partition_cols,
+                filesystem=filesystem,
                 **kwargs,
             )
         else:
             # write to single output file
             self.api.parquet.write_table(
-                table, path_or_handle, compression=compression, **kwargs
+                table,
+                path_or_handle,
+                compression=compression,
+                filesystem=filesystem,
+                **kwargs,
             )
         finally:
             if handles is not None:
@@ -225,6 +258,7 @@ def read(
         use_nullable_dtypes: bool = False,
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
         storage_options: StorageOptions = None,
+        filesystem=None,
         **kwargs,
     ) -> DataFrame:
         kwargs["use_pandas_metadata"] = True
@@ -242,15 +276,15 @@ def read(
         if manager == "array":
             to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]
 
-        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+        path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
-            kwargs.pop("filesystem", None),
+            filesystem,
             storage_options=storage_options,
             mode="rb",
         )
         try:
             pa_table = self.api.parquet.read_table(
-                path_or_handle, columns=columns, **kwargs
+                path_or_handle, columns=columns, filesystem=filesystem, **kwargs
             )
             result = pa_table.to_pandas(**to_pandas_kwargs)
 
@@ -279,6 +313,7 @@ def write(
         index=None,
         partition_cols=None,
         storage_options: StorageOptions = None,
+        filesystem=None,
         **kwargs,
     ) -> None:
         self.validate_dataframe(df)
@@ -294,6 +329,11 @@ def write(
         if partition_cols is not None:
             kwargs["file_scheme"] = "hive"
 
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
+
         # cannot use get_handle as write() does not accept file buffers
         path = stringify_path(path)
         if is_fsspec_url(path):
@@ -319,7 +359,12 @@ def write(
             )
 
     def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        storage_options: StorageOptions = None,
+        filesystem=None,
+        **kwargs,
     ) -> DataFrame:
         parquet_kwargs: dict[str, Any] = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
@@ -337,6 +382,10 @@ def read(
                 "The 'dtype_backend' argument is not supported for the "
                 "fastparquet engine"
             )
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
         path = stringify_path(path)
         handles = None
         if is_fsspec_url(path):
@@ -376,6 +425,7 @@ def to_parquet(
     index: bool | None = None,
     storage_options: StorageOptions = None,
     partition_cols: list[str] | None = None,
+    filesystem: Any = None,
     **kwargs,
 ) -> bytes | None:
     """
@@ -398,6 +448,12 @@ def to_parquet(
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
         behavior is to try 'pyarrow', falling back to 'fastparquet' if
         'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
     compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
         default 'snappy'. Name of the compression to use. Use ``None``
         for no compression. The supported compression methods actually
@@ -420,6 +476,12 @@ def to_parquet(
 
         .. versionadded:: 1.2.0
 
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when writing the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
     kwargs
         Additional keyword arguments passed to the engine
 
@@ -440,6 +502,7 @@ def to_parquet(
             index=index,
             partition_cols=partition_cols,
             storage_options=storage_options,
+            filesystem=filesystem,
             **kwargs,
         )
 
@@ -458,6 +521,7 @@ def read_parquet(
     storage_options: StorageOptions = None,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    filesystem: Any = None,
     **kwargs,
 ) -> DataFrame:
     """
@@ -480,6 +544,12 @@ def read_parquet(
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
         behavior is to try 'pyarrow', falling back to 'fastparquet' if
         'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
     columns : list, default=None
         If not None, only these columns will be read from the file.
 
@@ -508,6 +578,12 @@ def read_parquet(
 
         .. versionadded:: 2.0
 
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
     **kwargs
         Any additional kwargs are passed to the engine.
 
@@ -537,5 +613,6 @@ def read_parquet(
         storage_options=storage_options,
         use_nullable_dtypes=use_nullable_dtypes,
         dtype_backend=dtype_backend,
+        filesystem=filesystem,
         **kwargs,
     )
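
Usage sketch (illustrative, not part of the diff): assumes pandas built with this change and pyarrow installed; the output path and S3 bucket name below are hypothetical.

import pandas as pd
import pyarrow.fs

df = pd.DataFrame({"a": [1, 2, 3]})

# Pass an instantiated pyarrow filesystem explicitly; combining it with
# storage_options now raises NotImplementedError.
local_fs = pyarrow.fs.LocalFileSystem()
df.to_parquet("out.parquet", engine="pyarrow", filesystem=local_fs)
roundtripped = pd.read_parquet("out.parquet", engine="pyarrow", filesystem=local_fs)

# With filesystem=None and no storage_options, a URL such as
# "s3://my-bucket/data.parquet" is first resolved through
# pyarrow.fs.FileSystem.from_uri, falling back to fsspec.core.url_to_fs;
# pass an instantiated fsspec filesystem to force the fsspec implementation.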