PyArrow Functionality#
pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:
More extensive data types compared to NumPy
Missing data support (NA) for all data types
Performant IO reader integration
Facilitate interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)
To use this functionality, please ensure you have installed the minimum supported PyArrow version.
Data Structure Integration#
A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray,
which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by
[pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.
In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[1], line 1
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
493 index = ensure_index(index)
495 if dtype is not None:
--> 496 dtype = self._validate_dtype(dtype)
498 if data is None:
499 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
517 """validate the passed dtype"""
518 if dtype is not None:
--> 519 dtype = pandas_dtype(dtype)
521 # a compound dtype
522 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
2249 base_type = string[:-9] # get rid of "[pyarrow]"
2250 try:
-> 2251 pa_dtype = pa.type_for_alias(base_type)
2252 except ValueError as err:
2253 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [2]: ser
Out[2]:
0 NaN
1 NaN
2 5.0
3 NaN
4 NaN
5 NaN
6 13.0
7 NaN
8 NaN
dtype: float64
In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:488, in Index.__new__(cls, data, dtype, copy, name, tupleize_cols)
485 name = maybe_extract_name(name, data, cls)
487 if dtype is not None:
--> 488 dtype = pandas_dtype(dtype)
490 data_dtype = getattr(data, "dtype", None)
492 refs = None
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
2249 base_type = string[:-9] # get rid of "[pyarrow]"
2250 try:
-> 2251 pa_dtype = pa.type_for_alias(base_type)
2252 except ValueError as err:
2253 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [4]: idx
Out[4]: [0.0, 1.0, 10.0]
In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/frame.py:708, in DataFrame.__init__(self, data, index, columns, dtype, copy)
706 allow_mgr = False
707 if dtype is not None:
--> 708 dtype = self._validate_dtype(dtype)
710 if isinstance(data, DataFrame):
711 data = data._mgr
File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
517 """validate the passed dtype"""
518 if dtype is not None:
--> 519 dtype = pandas_dtype(dtype)
521 # a compound dtype
522 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
2249 base_type = string[:-9] # get rid of "[pyarrow]"
2250 try:
-> 2251 pa_dtype = pa.type_for_alias(base_type)
2252 except ValueError as err:
2253 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [6]: df
Out[6]:
a b
0 xxx yyy
1 ¡¡ ¡¡
Note
The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow") which is not equivalent to
specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly
except pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string())
will return ArrowDtype.
In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [8]: data = list("abc")
In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[9], line 1
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
493 index = ensure_index(index)
495 if dtype is not None:
--> 496 dtype = self._validate_dtype(dtype)
498 if data is None:
499 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
517 """validate the passed dtype"""
518 if dtype is not None:
--> 519 dtype = pandas_dtype(dtype)
521 # a compound dtype
522 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:275, in StringDtype.construct_from_string(cls, string)
273 return cls(storage="python")
274 elif string == "string[pyarrow]":
--> 275 return cls(storage="pyarrow")
276 elif string == "string[pyarrow_numpy]":
277 # this is deprecated in the dtype __init__, remove this in pandas 3.0
278 return cls(storage="pyarrow_numpy")
File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:182, in StringDtype.__init__(self, storage, na_value)
178 raise ValueError(
179 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
180 )
181 if storage == "pyarrow" and pa_version_under10p1:
--> 182 raise ImportError(
183 "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
184 )
186 if isinstance(na_value, float) and np.isnan(na_value):
187 # when passed a NaN value, always set to np.nan to ensure we use
188 # a consistent NaN value (and we can use `dtype.na_value is np.nan`)
189 na_value = np.nan
ImportError: pyarrow>=10.0.1 is required for PyArrow backed StringArray.
In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
NameError: name 'pa' is not defined
In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 1
----> 1 ser_ad.dtype == ser_sd.dtype
NameError: name 'ser_ad' is not defined
In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 1
----> 1 ser_sd.str.contains("a")
NameError: name 'ser_sd' is not defined
In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 1
----> 1 ser_ad.str.contains("a")
NameError: name 'ser_ad' is not defined
For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters
into ArrowDtype to use in the dtype parameter.
In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[14], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 1
----> 1 list_str_type = pa.list_(pa.string())
NameError: name 'pa' is not defined
In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 1
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
NameError: name 'list_str_type' is not defined
In [17]: ser
Out[17]:
0 NaN
1 NaN
2 5.0
3 NaN
4 NaN
5 NaN
6 13.0
7 NaN
8 NaN
dtype: float64
In [18]: from datetime import time
In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 1
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
NameError: name 'pa' is not defined
In [20]: idx
Out[20]: [0.0, 1.0, 10.0]
In [21]: from decimal import Decimal
In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 1
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
NameError: name 'pa' is not defined
In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]
In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 1
----> 1 df = pd.DataFrame(data, dtype=decimal_type)
NameError: name 'decimal_type' is not defined
In [25]: df
Out[25]:
a b
0 xxx yyy
1 ¡¡ ¡¡
If you already have a pyarrow.Array or pyarrow.ChunkedArray,
you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index
or DataFrame object.
In [26]: pa_array = pa.array(
....: [{"1": "2"}, {"10": "20"}, None],
....: type=pa.map_(pa.string(), pa.string()),
....: )
....:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 1
----> 1 pa_array = pa.array(
2 [{"1": "2"}, {"10": "20"}, None],
3 type=pa.map_(pa.string(), pa.string()),
4 )
NameError: name 'pa' is not defined
In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 1
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
NameError: name 'pa_array' is not defined
In [28]: ser
Out[28]:
0 NaN
1 NaN
2 5.0
3 NaN
4 NaN
5 NaN
6 13.0
7 NaN
8 NaN
dtype: float64
To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call
the pyarrow array constructor on the Series or Index.
In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 1
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
493 index = ensure_index(index)
495 if dtype is not None:
--> 496 dtype = self._validate_dtype(dtype)
498 if data is None:
499 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
517 """validate the passed dtype"""
518 if dtype is not None:
--> 519 dtype = pandas_dtype(dtype)
521 # a compound dtype
522 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
2249 base_type = string[:-9] # get rid of "[pyarrow]"
2250 try:
-> 2251 pa_dtype = pa.type_for_alias(base_type)
2252 except ValueError as err:
2253 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[30], line 1
----> 1 pa.array(ser)
NameError: name 'pa' is not defined
In [31]: idx = pd.Index(ser)
In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[32], line 1
----> 1 pa.array(idx)
NameError: name 'pa' is not defined
To convert a pyarrow.Table to a DataFrame, you can call the
pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.
In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[33], line 1
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
NameError: name 'pa' is not defined
In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-34-64ec62289cb4> in ?()
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)
/usr/lib/python3/dist-packages/pandas/core/generic.py in ?(self, name)
6317 and name not in self._accessors
6318 and self._info_axis._can_hold_identifiers_and_holds_name(name)
6319 ):
6320 return self[name]
-> 6321 return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'to_pandas'
In [35]: df
Out[35]:
a b
0 xxx yyy
1 ¡¡ ¡¡
In [36]: df.dtypes
Out[36]:
a object
b object
dtype: object
Operations#
PyArrow data structure integration is implemented through pandas’ ExtensionArray interface;
therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality
is accelerated with PyArrow compute functions where available. This includes:
Numeric aggregations
Numeric arithmetic
Numeric rounding
Logical and comparison functions
String functionality
Datetime functionality
The following are just some examples of operations that are accelerated by native PyArrow compute functions.
In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[37], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[38], line 1
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:496, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
493 index = ensure_index(index)
495 if dtype is not None:
--> 496 dtype = self._validate_dtype(dtype)
498 if data is None:
499 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:519, in NDFrame._validate_dtype(cls, dtype)
517 """validate the passed dtype"""
518 if dtype is not None:
--> 519 dtype = pandas_dtype(dtype)
521 # a compound dtype
522 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1640, in pandas_dtype(dtype)
1637 return StringDtype(na_value=np.nan)
1639 # registered extension types
-> 1640 result = registry.find(dtype)
1641 if result is not None:
1642 if isinstance(result, type):
1643 # GH 31356, GH 54592
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:576, in Registry.find(self, dtype)
574 for dtype_type in self.dtypes:
575 try:
--> 576 return dtype_type.construct_from_string(dtype)
577 except TypeError:
578 pass
File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2251, in ArrowDtype.construct_from_string(cls, string)
2249 base_type = string[:-9] # get rid of "[pyarrow]"
2250 try:
-> 2251 pa_dtype = pa.type_for_alias(base_type)
2252 except ValueError as err:
2253 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [39]: ser.mean()
Out[39]: np.float64(9.0)
In [40]: ser + ser
Out[40]:
0 NaN
1 NaN
2 10.0
3 NaN
4 NaN
5 NaN
6 26.0
7 NaN
8 NaN
dtype: float64
In [41]: ser > (ser + 1)
Out[41]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
dtype: bool
In [42]: ser.dropna()
Out[42]:
2 5.0
6 13.0
dtype: float64
In [43]: ser.isna()
Out[43]:
0 True
1 True
2 False
3 True
4 True
5 True
6 False
7 True
8 True
dtype: bool
In [44]: ser.fillna(0)
Out[44]:
0 0.0
1 0.0
2 5.0
3 0.0
4 0.0
5 0.0
6 13.0
7 0.0
8 0.0
dtype: float64
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[45], line 1
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
NameError: name 'pa' is not defined
In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[46], line 1
----> 1 ser_str.str.startswith("a")
NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime
In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[48], line 1
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))
NameError: name 'pa' is not defined
In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[49], line 1
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
NameError: name 'pa_type' is not defined
In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[50], line 1
----> 1 ser_dt.dt.strftime("%Y-%m")
NameError: name 'ser_dt' is not defined
I/O Reading#
PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. The following
functions provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source:
read_csv(), read_json(), read_orc() and read_feather().
In [51]: import io
In [52]: data = io.StringIO("""a,b,c
....: 1,2.5,True
....: 3,4.5,False
....: """)
....:
In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
139 try:
--> 140 module = importlib.import_module(name)
141 except ImportError:
File /usr/lib/python3.13/importlib/__init__.py:88, in import_module(name, package)
87 level += 1
---> 88 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1387, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1360, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1324, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'pyarrow'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Cell In[53], line 1
----> 1 df = pd.read_csv(data, engine="pyarrow")
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:626, in _read(filepath_or_buffer, kwds)
623 return parser
625 with parser:
--> 626 return parser.read(nrows)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1911, in TextFileReader.read(self, nrows)
1908 if self.engine == "pyarrow":
1909 try:
1910 # error: "ParserBase" has no attribute "read"
-> 1911 df = self._engine.read() # type: ignore[attr-defined]
1912 except Exception:
1913 self.close()
File /usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py:239, in ArrowParserWrapper.read(self)
228 def read(self) -> DataFrame:
229 """
230 Reads the contents of a CSV file into a DataFrame and
231 processes it according to the kwargs passed in the
(...)
237 The DataFrame created from the CSV file.
238 """
--> 239 pa = import_optional_dependency("pyarrow")
240 pyarrow_csv = import_optional_dependency("pyarrow.csv")
241 self._get_pyarrow_options()
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:143, in import_optional_dependency(name, extra, errors, min_version)
141 except ImportError:
142 if errors == "raise":
--> 143 raise ImportError(msg)
144 return None
146 # Handle submodules: if we have submodule, grab parent module from sys.modules
ImportError: Missing optional dependency 'pyarrow'. Use pip or conda to install pyarrow.
In [54]: df
Out[54]:
a b
0 xxx yyy
1 ¡¡ ¡¡
By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return
PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not necessarily need to set
engine="pyarrow" in order to return PyArrow-backed data.
In [55]: import io
In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
....: 1,2.5,True,a,,,,,
....: 3,4.5,False,b,6,7.5,True,a,
....: """)
....:
In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
139 try:
--> 140 module = importlib.import_module(name)
141 except ImportError:
File /usr/lib/python3.13/importlib/__init__.py:88, in import_module(name, package)
87 level += 1
---> 88 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1387, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1360, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1324, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'pyarrow'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Cell In[57], line 1
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1898, in TextFileReader._make_engine(self, f, engine)
1895 raise ValueError(msg)
1897 try:
-> 1898 return mapping[engine](f, **self.options)
1899 except Exception:
1900 if self.handles is not None:
File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:92, in CParserWrapper.__init__(self, src, **kwds)
89 kwds["dtype_backend"] = "numpy"
90 if kwds["dtype_backend"] == "pyarrow":
91 # Fail here loudly instead of in cython after reading
---> 92 import_optional_dependency("pyarrow")
93 self._reader = parsers.TextReader(src, **kwds)
95 self.unnamed_cols = self._reader.unnamed_cols
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:143, in import_optional_dependency(name, extra, errors, min_version)
141 except ImportError:
142 if errors == "raise":
--> 143 raise ImportError(msg)
144 return None
146 # Handle submodules: if we have submodule, grab parent module from sys.modules
ImportError: Missing optional dependency 'pyarrow'. Use pip or conda to install pyarrow.
In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[58], line 1
----> 1 df_pyarrow.dtypes
NameError: name 'df_pyarrow' is not defined
Several non-IO reader functions can also use the dtype_backend argument to return PyArrow-backed data including: