Skip to content

Commit c555f59

Browse files
authored
FEAT-modin-project#6906: Update to pandas 2.2.* (modin-project#6907)
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
1 parent fb3e90d commit c555f59

File tree

36 files changed

+590
-182
lines changed

36 files changed

+590
-182
lines changed

docs/supported_apis/utilities_supported.rst

-2
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
9898
* DateOffset
9999
* ExcelWriter
100100
* SparseArray
101-
* SparseSeries
102-
* SparseDataFrame
103101

104102
.. _open an issue: https://github.com/modin-project/modin/issues
105103
.. _pull request: https://github.com/modin-project/modin/pulls

environment-dev.yml

+14-15
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ dependencies:
55
- pip
66

77
# required dependencies
8-
- pandas>=2.1,<2.2
8+
- pandas>=2.2,<2.3
99
- numpy>=1.22.4
10-
- fsspec>=2022.05.0
10+
- fsspec>=2022.11.0
1111
- packaging>=21.0
1212
- psutil>=5.8.0
1313

@@ -20,21 +20,21 @@ dependencies:
2020
- grpcio!=1.46.*
2121
- dask>=2.22.0
2222
- distributed>=2.22.0
23-
- xarray>=2022.03.0
23+
- xarray>=2022.12.0
2424
- jinja2>=3.1.2
25-
- scipy>=1.8.1
26-
- s3fs>=2022.05.0
27-
- lxml>=4.8.0
28-
- openpyxl>=3.0.10
25+
- scipy>=1.10.0
26+
- s3fs>=2022.11.0
27+
- lxml>=4.9.2
28+
- openpyxl>=3.1.0
2929
- xlrd>=2.0.1
30-
- matplotlib>=3.6.1
31-
- sqlalchemy>=1.4.0,<1.4.46
32-
- pandas-gbq>=0.15.0
33-
- pytables>=3.7.0
30+
- matplotlib>=3.6.3
31+
- sqlalchemy>=2.0.0
32+
- pandas-gbq>=0.19.0
33+
- pytables>=3.8.0
3434
# pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
3535
- pymssql>=2.1.5,!=2.2.8
36-
- psycopg2>=2.9.3
37-
- fastparquet>=0.8.1
36+
- psycopg2>=2.9.6
37+
- fastparquet>=2022.12.0
3838
- tqdm>=4.60.0
3939
# pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
4040
- numexpr<2.8.5
@@ -64,8 +64,7 @@ dependencies:
6464
- asv==0.5.1
6565
# no conda package for windows so we install it with pip
6666
- connectorx>=0.2.6a4
67-
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
68-
- fuzzydata>=0.0.6
67+
- fuzzydata>=0.0.11
6968
# Fixes breaking ipywidgets changes, but didn't release yet.
7069
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
7170
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.

examples/tutorial/jupyter/execution/pandas_on_dask/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
fsspec>=2022.05.0
1+
fsspec>=2022.11.0
22
jupyterlab
33
ipywidgets
44
modin[dask]

examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
fsspec>=2022.05.0
1+
fsspec>=2022.11.0
22
jupyterlab
33
ipywidgets
44
tqdm>=4.60.0

examples/tutorial/jupyter/execution/pandas_on_unidist/jupyter_unidist_env.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
dependencies:
55
- pip
6-
- fsspec>=2022.05.0
6+
- fsspec>=2022.11.0
77
- jupyterlab
88
- ipywidgets
99
- modin-mpi

modin/core/dataframe/algebra/default2pandas/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,12 @@
1919
from .datetime import DateTimeDefault
2020
from .default import DefaultMethod
2121
from .groupby import GroupByDefault, SeriesGroupByDefault
22+
from .list import ListDefault
2223
from .resample import ResampleDefault
2324
from .rolling import ExpandingDefault, RollingDefault
2425
from .series import SeriesDefault
2526
from .str import StrDefault
27+
from .struct import StructDefault
2628

2729
__all__ = [
2830
"DataFrameDefault",
@@ -37,4 +39,6 @@
3739
"CatDefault",
3840
"GroupByDefault",
3941
"SeriesGroupByDefault",
42+
"ListDefault",
43+
"StructDefault",
4044
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-list accessor functions builder class."""

from .series import SeriesDefault


class ListDefault(SeriesDefault):
    """Builder for default-to-pandas methods which is executed under list accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get list accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        pandas.core.arrays.arrow.ListAccessor
        """
        # The frame is expected to hold a single column; squeeze it down to a
        # Series first, then expose the Arrow-backed ``.list`` accessor.
        as_series = df.squeeze(axis=1)
        return as_series.list
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-struct accessor functions builder class."""

from .series import SeriesDefault


class StructDefault(SeriesDefault):
    """Builder for default-to-pandas methods which is executed under struct accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get struct accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        pandas.core.arrays.arrow.StructAccessor
        """
        # The frame is expected to hold a single column; squeeze it down to a
        # Series first, then expose the Arrow-backed ``.struct`` accessor.
        as_series = df.squeeze(axis=1)
        return as_series.struct

modin/core/io/io.py

+4
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,8 @@ def read_fwf(
478478
widths=None,
479479
infer_nrows=100,
480480
dtype_backend=no_default,
481+
iterator=False,
482+
chunksize=None,
481483
**kwds,
482484
): # noqa: PR01
483485
ErrorMessage.default_to_pandas("`read_fwf`")
@@ -487,6 +489,8 @@ def read_fwf(
487489
widths=widths,
488490
infer_nrows=infer_nrows,
489491
dtype_backend=dtype_backend,
492+
iterator=iterator,
493+
chunksize=chunksize,
490494
**kwds,
491495
)
492496
if isinstance(pd_obj, pandas.DataFrame):

modin/core/storage_formats/base/query_compiler.py

+91
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@
3535
DateTimeDefault,
3636
ExpandingDefault,
3737
GroupByDefault,
38+
ListDefault,
3839
ResampleDefault,
3940
RollingDefault,
4041
SeriesDefault,
4142
SeriesGroupByDefault,
4243
StrDefault,
44+
StructDefault,
4345
)
4446
from modin.error_message import ErrorMessage
4547
from modin.logging import ClassLogger
@@ -6563,6 +6565,88 @@ def cat_codes(self):
65636565

65646566
# End of Categories methods
65656567

6568+
# List accessor's methods
6569+
6570+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.flatten")
def list_flatten(self):
    """
    Flatten list values.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run ``Series.list.flatten`` through the list-accessor
    # builder so the accessor is resolved on the materialized Series.
    flattener = ListDefault.register(pandas.Series.list.flatten)
    return flattener(self)
6581+
6582+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.len")
def list_len(self):
    """
    Return the length of each list in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas via the list-accessor builder.
    length_op = ListDefault.register(pandas.Series.list.len)
    return length_op(self)
6593+
6594+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.__getitem__")
def list__getitem__(self, key):  # noqa: PR01
    """
    Index or slice lists in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas via the list-accessor builder; ``key`` is forwarded
    # as a keyword so the builder can serialize it with the call.
    getter = ListDefault.register(pandas.Series.list.__getitem__)
    return getter(self, key=key)
6605+
6606+
# End of List accessor's methods
6607+
6608+
# Struct accessor's methods
6609+
6610+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.dtypes")
def struct_dtypes(self):
    """
    Return the dtype object of each child field of the struct.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas via the struct-accessor builder.
    dtypes_op = StructDefault.register(pandas.Series.struct.dtypes)
    return dtypes_op(self)
6621+
6622+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.field")
def struct_field(self, name_or_index):  # noqa: PR01
    """
    Extract a child field of a struct as a Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas via the struct-accessor builder; the field selector is
    # forwarded as a keyword so the builder can serialize it with the call.
    field_op = StructDefault.register(pandas.Series.struct.field)
    return field_op(self, name_or_index=name_or_index)
6635+
6636+
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.explode")
def struct_explode(self):
    """
    Extract all child fields of a struct as a DataFrame.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas via the struct-accessor builder.
    explode_op = StructDefault.register(pandas.Series.struct.explode)
    return explode_op(self)
6647+
6648+
# End of Struct accessor's methods
6649+
65666650
# DataFrame methods
65676651

65686652
def invert(self):
@@ -6617,6 +6701,13 @@ def compare(self, other, align_axis, keep_shape, keep_equal, result_names):
66176701
result_names=result_names,
66186702
)
66196703

6704+
@doc_utils.add_refer_to("Series.case_when")
def case_when(self, caselist):  # noqa: PR01, RT01, D200
    """
    Replace values where the conditions are True.
    """
    # Default to pandas: delegate to ``Series.case_when`` through the
    # Series builder, forwarding the (condition, replacement) pairs.
    runner = SeriesDefault.register(pandas.Series.case_when)
    return runner(self, caselist=caselist)
6710+
66206711
def repartition(self, axis=None):
66216712
"""
66226713
Repartitioning QueryCompiler objects to get ideal partitions inside.

modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,10 @@ def get_col_names():
142142
kwargs["filepath_or_buffer"], nrows=0, engine="c"
143143
).columns.tolist()
144144

145-
if dtype := kwargs["dtype"]:
145+
dtype = kwargs["dtype"]
146+
# For details: https://github.com/pandas-dev/pandas/issues/57024
147+
entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
148+
if dtype:
146149
if isinstance(dtype, dict):
147150
column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
148151
else:
@@ -151,7 +154,9 @@ def get_col_names():
151154
else:
152155
column_types = {}
153156

154-
if parse_dates := kwargs["parse_dates"]:
157+
if parse_dates := (
158+
None if entire_dataframe_dtype else kwargs["parse_dates"]
159+
):
155160
# Either list of column names or list of column indices is supported.
156161
if isinstance(parse_dates, list) and (
157162
all(isinstance(col, str) for col in parse_dates)
@@ -185,7 +190,7 @@ def get_col_names():
185190
usecols_md = cls._prepare_pyarrow_usecols(kwargs)
186191

187192
po = ParseOptions(
188-
delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
193+
delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
189194
quote_char=kwargs["quotechar"],
190195
double_quote=kwargs["doublequote"],
191196
escape_char=kwargs["escapechar"],
@@ -426,7 +431,7 @@ def _read_csv_check_support(
426431
False,
427432
f"read_csv with 'arrow' engine doesn't support {arg} parameter",
428433
)
429-
if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
434+
if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
430435
raise ValueError(
431436
"Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
432437
)
@@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
541546
if delimiter is None:
542547
delimiter = sep
543548

544-
if delim_whitespace and (delimiter is not lib.no_default):
549+
if delim_whitespace is True and (delimiter is not lib.no_default):
545550
raise ValueError(
546551
"Specified a delimiter with both sep and "
547552
+ "delim_whitespace=True; you can only specify one."

modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py

+4
Original file line numberDiff line numberDiff line change
@@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
14401440
# TODO: make sure we can ignore this warning
14411441
or "Frame contain columns with unsupported data-types"
14421442
in message
1443+
# Looks like the warning comes from pyarrow, more details:
1444+
# https://github.com/pandas-dev/pandas/pull/52419
1445+
or "Passing a BlockManager to DataFrame is deprecated"
1446+
in message
14431447
):
14441448
continue
14451449
assert (

modin/experimental/core/io/sql/utils.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import pandas
1717
import pandas._libs.lib as lib
18-
from sqlalchemy import MetaData, Table, create_engine, inspect
18+
from sqlalchemy import MetaData, Table, create_engine, inspect, text
1919

2020
from modin.core.storage_formats.pandas.parsers import _split_result_for_readers
2121

@@ -167,9 +167,9 @@ def get_query_columns(engine, query):
167167
Dictionary with columns names and python types.
168168
"""
169169
con = engine.connect()
170-
result = con.execute(query).fetchone()
171-
values = list(result)
170+
result = con.execute(text(query))
172171
cols_names = list(result.keys())
172+
values = list(result.first())
173173
cols = dict()
174174
for i in range(len(cols_names)):
175175
cols[cols_names[i]] = type(values[i]).__name__

0 commit comments

Comments
 (0)