Skip to content

Commit e5d3cd8

Browse files
authored
REFACTOR-modin-project#7294: Reduce access of methods _modin_frame methods from _query_compiler (modin-project#7297)
1 parent 002125b commit e5d3cd8

File tree

12 files changed

+118
-73
lines changed

12 files changed

+118
-73
lines changed

modin/core/dataframe/algebra/binary.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -70,18 +70,15 @@ def maybe_compute_dtypes_common_cast(
7070
The dtypes of the operands are supposed to be known.
7171
"""
7272
if not trigger_computations:
73-
if not first._modin_frame.has_materialized_dtypes:
73+
if not first.frame_has_materialized_dtypes:
7474
return None
7575

76-
if (
77-
isinstance(second, type(first))
78-
and not second._modin_frame.has_materialized_dtypes
79-
):
76+
if isinstance(second, type(first)) and not second.frame_has_materialized_dtypes:
8077
return None
8178

82-
dtypes_first = first._modin_frame.dtypes.to_dict()
79+
dtypes_first = first.dtypes.to_dict()
8380
if isinstance(second, type(first)):
84-
dtypes_second = second._modin_frame.dtypes.to_dict()
81+
dtypes_second = second.dtypes.to_dict()
8582
columns_first = set(first.columns)
8683
columns_second = set(second.columns)
8784
common_columns = columns_first.intersection(columns_second)

modin/core/dataframe/algebra/tree_reduce.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def caller(
6767
_axis = kwargs.get("axis") if axis is None else axis
6868

6969
new_dtypes = None
70-
if compute_dtypes and query_compiler._modin_frame.has_materialized_dtypes:
70+
if compute_dtypes and query_compiler.frame_has_materialized_dtypes:
7171
new_dtypes = str(compute_dtypes(query_compiler.dtypes, *args, **kwargs))
7272

7373
return query_compiler.__constructor__(

modin/core/storage_formats/base/query_compiler.py

+53
Original file line numberDiff line numberDiff line change
@@ -4521,6 +4521,59 @@ def has_multiindex(self, axis=0):
45214521
assert axis == 1
45224522
return isinstance(self.columns, pandas.MultiIndex)
45234523

4524+
@property
4525+
def frame_has_materialized_dtypes(self) -> bool:
4526+
"""
4527+
Check if the underlying dataframe has materialized dtypes.
4528+
4529+
Returns
4530+
-------
4531+
bool
4532+
"""
4533+
return self._modin_frame.has_materialized_dtypes
4534+
4535+
def set_frame_dtypes_cache(self, dtypes):
4536+
"""
4537+
Set dtypes cache for the underlying dataframe.
4538+
4539+
Parameters
4540+
----------
4541+
dtypes : pandas.Series, ModinDtypes, callable or None
4542+
"""
4543+
self._modin_frame.set_dtypes_cache(dtypes)
4544+
4545+
def set_frame_index_cache(self, index):
4546+
"""
4547+
Set index cache for underlying dataframe.
4548+
4549+
Parameters
4550+
----------
4551+
index : sequence, callable or None
4552+
"""
4553+
self._modin_frame.set_index_cache(index)
4554+
4555+
@property
4556+
def frame_has_index_cache(self):
4557+
"""
4558+
Check if the index cache exists for underlying dataframe.
4559+
4560+
Returns
4561+
-------
4562+
bool
4563+
"""
4564+
return self._modin_frame.has_index_cache
4565+
4566+
@property
4567+
def frame_has_dtypes_cache(self) -> bool:
4568+
"""
4569+
Check if the dtypes cache exists for the underlying dataframe.
4570+
4571+
Returns
4572+
-------
4573+
bool
4574+
"""
4575+
return self._modin_frame.has_dtypes_cache
4576+
45244577
def get_index_name(self, axis=0):
45254578
"""
45264579
Get index name of specified axis.

modin/core/storage_formats/pandas/aggregations.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ def corr_method(
7171
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
7272
index=new_columns,
7373
)
74-
elif numeric_only and qc._modin_frame.has_materialized_dtypes:
75-
old_dtypes = qc._modin_frame.dtypes
74+
elif numeric_only and qc.frame_has_materialized_dtypes:
75+
old_dtypes = qc.dtypes
7676

7777
new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
7878
new_index = new_columns.copy()

modin/core/storage_formats/pandas/query_compiler.py

+8-13
Original file line numberDiff line numberDiff line change
@@ -580,10 +580,7 @@ def reindex(self, axis, labels, **kwargs):
580580
new_index, indexer = (self.index, None) if axis else self.index.reindex(labels)
581581
new_columns, _ = self.columns.reindex(labels) if axis else (self.columns, None)
582582
new_dtypes = None
583-
if (
584-
self._modin_frame.has_materialized_dtypes
585-
and kwargs.get("method", None) is None
586-
):
583+
if self.frame_has_materialized_dtypes and kwargs.get("method", None) is None:
587584
# For columns, defining types is easier because we don't have to calculate the common
588585
# type, since the entire column is filled. A simple `reindex` covers our needs.
589586
# For rows, we can avoid calculating common types if we know that no new strings of
@@ -2650,8 +2647,8 @@ def fillna(df):
26502647
}
26512648
return df.fillna(value=func_dict, **kwargs)
26522649

2653-
if self._modin_frame.has_materialized_dtypes:
2654-
dtypes = self._modin_frame.dtypes
2650+
if self.frame_has_materialized_dtypes:
2651+
dtypes = self.dtypes
26552652
value_dtypes = pandas.DataFrame(
26562653
{k: [v] for (k, v) in value.items()}
26572654
).dtypes
@@ -2663,12 +2660,10 @@ def fillna(df):
26632660
new_dtypes = dtypes
26642661

26652662
else:
2666-
if self._modin_frame.has_materialized_dtypes:
2663+
if self.frame_has_materialized_dtypes:
26672664
dtype = pandas.Series(value).dtype
2668-
if all(
2669-
find_common_type([t, dtype]) == t for t in self._modin_frame.dtypes
2670-
):
2671-
new_dtypes = self._modin_frame.dtypes
2665+
if all(find_common_type([t, dtype]) == t for t in self.dtypes):
2666+
new_dtypes = self.dtypes
26722667

26732668
def fillna(df):
26742669
return df.fillna(value=value, **kwargs)
@@ -2898,7 +2893,7 @@ def _set_item(df, row_loc): # pragma: no cover
28982893
df.loc[row_loc.squeeze(axis=1), col_loc] = item
28992894
return df
29002895

2901-
if self._modin_frame.has_materialized_dtypes and is_scalar(item):
2896+
if self.frame_has_materialized_dtypes and is_scalar(item):
29022897
new_dtypes = self.dtypes.copy()
29032898
old_dtypes = new_dtypes[col_loc]
29042899
item_type = extract_dtype(item)
@@ -4607,7 +4602,7 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
46074602
# compute dtypes only if assigning entire columns
46084603
isinstance(row_numeric_index, slice)
46094604
and row_numeric_index == slice(None)
4610-
and self._modin_frame.has_materialized_dtypes
4605+
and self.frame_has_materialized_dtypes
46114606
):
46124607
new_dtypes = self.dtypes.copy()
46134608
new_dtypes.iloc[col_numeric_index] = broadcasted_dtypes.values

modin/pandas/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1070,8 +1070,8 @@ def astype(
10701070

10711071
if not copy:
10721072
# If the new types match the old ones, then copying can be avoided
1073-
if self._query_compiler._modin_frame.has_materialized_dtypes:
1074-
frame_dtypes = self._query_compiler._modin_frame.dtypes
1073+
if self._query_compiler.frame_has_materialized_dtypes:
1074+
frame_dtypes = self._query_compiler.dtypes
10751075
if isinstance(dtype, dict):
10761076
for col in dtype:
10771077
if dtype[col] != frame_dtypes[col]:

modin/tests/core/storage_formats/pandas/test_internals.py

+33-33
Original file line numberDiff line numberDiff line change
@@ -1138,14 +1138,14 @@ def test_binary_op_preserve_dtypes():
11381138
def setup_cache(df, has_cache=True):
11391139
if has_cache:
11401140
_ = df.dtypes
1141-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1141+
assert df._query_compiler.frame_has_materialized_dtypes
11421142
else:
1143-
df._query_compiler._modin_frame.set_dtypes_cache(None)
1144-
assert not df._query_compiler._modin_frame.has_materialized_dtypes
1143+
df._query_compiler.set_frame_dtypes_cache(None)
1144+
assert not df._query_compiler.frame_has_materialized_dtypes
11451145
return df
11461146

11471147
def assert_cache(df, has_cache=True):
1148-
assert not (has_cache ^ df._query_compiler._modin_frame.has_materialized_dtypes)
1148+
assert not (has_cache ^ df._query_compiler.frame_has_materialized_dtypes)
11491149

11501150
# Check when `other` is a non-distributed object
11511151
assert_cache(setup_cache(df) + 2.0)
@@ -1179,7 +1179,7 @@ def remove_cache(df, axis):
11791179
if axis:
11801180
df._query_compiler._modin_frame.set_columns_cache(None)
11811181
else:
1182-
df._query_compiler._modin_frame.set_index_cache(None)
1182+
df._query_compiler.set_frame_index_cache(None)
11831183
assert_no_cache(df, axis)
11841184
return df
11851185

@@ -1195,30 +1195,30 @@ def test_setitem_bool_preserve_dtypes():
11951195
df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
11961196
indexer = pd.Series([True, False, True, False])
11971197

1198-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1198+
assert df._query_compiler.frame_has_materialized_dtypes
11991199

12001200
# slice(None) as a col_loc
12011201
df.loc[indexer] = 2.0
1202-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1202+
assert df._query_compiler.frame_has_materialized_dtypes
12031203

12041204
# list as a col_loc
12051205
df.loc[indexer, ["a", "b"]] = 2.0
1206-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1206+
assert df._query_compiler.frame_has_materialized_dtypes
12071207

12081208
# scalar as a col_loc
12091209
df.loc[indexer, "a"] = 2.0
1210-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1210+
assert df._query_compiler.frame_has_materialized_dtypes
12111211

12121212

12131213
def test_setitem_unhashable_preserve_dtypes():
12141214
df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
1215-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1215+
assert df._query_compiler.frame_has_materialized_dtypes
12161216

12171217
df2 = pd.DataFrame([[9, 9], [5, 5]])
1218-
assert df2._query_compiler._modin_frame.has_materialized_dtypes
1218+
assert df2._query_compiler.frame_has_materialized_dtypes
12191219

12201220
df[[1, 2]] = df2
1221-
assert df._query_compiler._modin_frame.has_materialized_dtypes
1221+
assert df._query_compiler.frame_has_materialized_dtypes
12221222

12231223

12241224
@pytest.mark.parametrize("modify_config", [{RangePartitioning: True}], indirect=True)
@@ -1246,7 +1246,7 @@ def test_reindex_preserve_dtypes(kwargs):
12461246
df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
12471247

12481248
reindexed_df = df.reindex(**kwargs)
1249-
assert reindexed_df._query_compiler._modin_frame.has_materialized_dtypes
1249+
assert reindexed_df._query_compiler.frame_has_materialized_dtypes
12501250

12511251

12521252
class TestModinIndexIds:
@@ -2039,7 +2039,7 @@ def test_concat_axis_1(
20392039
)
20402040
# setting columns cache to 'None', in order to prevent completing 'dtypes' with the materialized columns
20412041
md_df._query_compiler._modin_frame.set_columns_cache(None)
2042-
md_df._query_compiler._modin_frame.set_dtypes_cache(
2042+
md_df._query_compiler.set_frame_dtypes_cache(
20432043
ModinDtypes(
20442044
DtypesDescriptor(
20452045
known_dtypes,
@@ -2100,7 +2100,7 @@ def test_update_parent(self):
21002100

21012101
# 'df2' will have a 'DtypesDescriptor' with unknown dtypes for a column 'c'
21022102
df2 = pd.DataFrame({"c": [2, 3, 4]})
2103-
df2._query_compiler._modin_frame.set_dtypes_cache(None)
2103+
df2._query_compiler.set_frame_dtypes_cache(None)
21042104
dtypes_cache = df2._query_compiler._modin_frame._dtypes
21052105
assert isinstance(
21062106
dtypes_cache._value, DtypesDescriptor
@@ -2226,7 +2226,7 @@ def test_set_index_with_dupl_labels(self):
22262226
"""Verify that setting duplicated columns doesn't propagate any errors to a user."""
22272227
df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [3.5, 4.4, 5.5, 6.6]})
22282228
# making sure that dtypes are represented by an unmaterialized dtypes-descriptor
2229-
df._query_compiler._modin_frame.set_dtypes_cache(None)
2229+
df._query_compiler.set_frame_dtypes_cache(None)
22302230

22312231
df.columns = ["a", "a"]
22322232
assert df.dtypes.equals(
@@ -2252,8 +2252,8 @@ def test_concat_mi(self):
22522252
)
22532253

22542254
# Drop actual dtypes in order to use partially-known dtypes
2255-
md_df1._query_compiler._modin_frame.set_dtypes_cache(None)
2256-
md_df2._query_compiler._modin_frame.set_dtypes_cache(None)
2255+
md_df1._query_compiler.set_frame_dtypes_cache(None)
2256+
md_df2._query_compiler.set_frame_dtypes_cache(None)
22572257

22582258
md_res = pd.concat([md_df1, md_df2], axis=1)
22592259
pd_res = pandas.concat([pd_df1, pd_df2], axis=1)
@@ -2282,9 +2282,9 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
22822282
with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
22832283
df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [3, 4]})
22842284
if self_dtype == "materialized":
2285-
assert df._query_compiler._modin_frame.has_materialized_dtypes
2285+
assert df._query_compiler.frame_has_materialized_dtypes
22862286
elif self_dtype == "partial":
2287-
df._query_compiler._modin_frame.set_dtypes_cache(
2287+
df._query_compiler.set_frame_dtypes_cache(
22882288
ModinDtypes(
22892289
DtypesDescriptor(
22902290
{"a": np.dtype("int64")},
@@ -2293,7 +2293,7 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
22932293
)
22942294
)
22952295
elif self_dtype == "unknown":
2296-
df._query_compiler._modin_frame.set_dtypes_cache(None)
2296+
df._query_compiler.set_frame_dtypes_cache(None)
22972297
else:
22982298
raise NotImplementedError(self_dtype)
22992299

@@ -2304,7 +2304,7 @@ def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
23042304
[np.dtype("int64"), value_dtype, np.dtype("int64")],
23052305
index=["a", "b", "c"],
23062306
)
2307-
assert df._query_compiler._modin_frame.has_materialized_dtypes
2307+
assert df._query_compiler.frame_has_materialized_dtypes
23082308
assert df.dtypes.equals(result_dtype)
23092309
elif self_dtype == "partial":
23102310
result_dtype = DtypesDescriptor(
@@ -2339,17 +2339,17 @@ def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype):
23392339
with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
23402340
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
23412341
if self_dtype == "materialized":
2342-
assert df._query_compiler._modin_frame.has_materialized_dtypes
2342+
assert df._query_compiler.frame_has_materialized_dtypes
23432343
elif self_dtype == "partial":
2344-
df._query_compiler._modin_frame.set_dtypes_cache(
2344+
df._query_compiler.set_frame_dtypes_cache(
23452345
ModinDtypes(
23462346
DtypesDescriptor(
23472347
{"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"]
23482348
)
23492349
)
23502350
)
23512351
elif self_dtype == "unknown":
2352-
df._query_compiler._modin_frame.set_dtypes_cache(None)
2352+
df._query_compiler.set_frame_dtypes_cache(None)
23532353
else:
23542354
raise NotImplementedError(self_dtype)
23552355

@@ -2360,7 +2360,7 @@ def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype):
23602360
[value_dtype, np.dtype("int64"), np.dtype("int64")],
23612361
index=["c", "a", "b"],
23622362
)
2363-
assert df._query_compiler._modin_frame.has_materialized_dtypes
2363+
assert df._query_compiler.frame_has_materialized_dtypes
23642364
assert df.dtypes.equals(result_dtype)
23652365
elif self_dtype == "partial":
23662366
result_dtype = DtypesDescriptor(
@@ -2390,7 +2390,7 @@ def test_get_dummies_case(self):
23902390
cols = [col for col in res.columns if col != "items"]
23912391
res[cols] = res[cols] / res[cols].mean()
23922392

2393-
assert res._query_compiler._modin_frame.has_materialized_dtypes
2393+
assert res._query_compiler.frame_has_materialized_dtypes
23942394

23952395
patch.assert_not_called()
23962396

@@ -2403,21 +2403,21 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
24032403
if has_materialized_index:
24042404
assert df._query_compiler._modin_frame.has_materialized_index
24052405
else:
2406-
df._query_compiler._modin_frame.set_index_cache(None)
2406+
df._query_compiler.set_frame_index_cache(None)
24072407
assert not df._query_compiler._modin_frame.has_materialized_index
2408-
assert df._query_compiler._modin_frame.has_materialized_dtypes
2408+
assert df._query_compiler.frame_has_materialized_dtypes
24092409

24102410
res = df.reset_index(drop=drop)
24112411
if drop:
24122412
# we dropped the index, so columns and dtypes shouldn't change
2413-
assert res._query_compiler._modin_frame.has_materialized_dtypes
2413+
assert res._query_compiler.frame_has_materialized_dtypes
24142414
assert res.dtypes.equals(df.dtypes)
24152415
else:
24162416
if has_materialized_index:
24172417
# we should have inserted index dtype into the descriptor,
24182418
# and since both of them are materialized, the result should be
24192419
# materialized too
2420-
assert res._query_compiler._modin_frame.has_materialized_dtypes
2420+
assert res._query_compiler.frame_has_materialized_dtypes
24212421
assert res.dtypes.equals(
24222422
pandas.Series(
24232423
[np.dtype("int64"), np.dtype("int64")], index=["index", "a"]
@@ -2436,7 +2436,7 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
24362436

24372437
# case 2: 'df' has partial dtype by default
24382438
df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
2439-
df._query_compiler._modin_frame.set_dtypes_cache(
2439+
df._query_compiler.set_frame_dtypes_cache(
24402440
ModinDtypes(
24412441
DtypesDescriptor(
24422442
{"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"]
@@ -2446,7 +2446,7 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
24462446
if has_materialized_index:
24472447
assert df._query_compiler._modin_frame.has_materialized_index
24482448
else:
2449-
df._query_compiler._modin_frame.set_index_cache(None)
2449+
df._query_compiler.set_frame_index_cache(None)
24502450
assert not df._query_compiler._modin_frame.has_materialized_index
24512451

24522452
res = df.reset_index(drop=drop)

0 commit comments

Comments
 (0)