Skip to content

Commit 759d548

Browse files
authored
FIX-modin-project#7329: Do not sort columns on df.update (modin-project#7330)
Signed-off-by: Igoshev, Iaroslav <iaroslav.igoshev@intel.com>
1 parent 2eff03c commit 759d548

File tree

5 files changed

+49
-6
lines changed

5 files changed

+49
-6
lines changed

modin/core/dataframe/algebra/binary.py

+4
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ def register(
298298
cls,
299299
func: Callable[..., pandas.DataFrame],
300300
join_type: str = "outer",
301+
sort: bool = None,
301302
labels: str = "replace",
302303
infer_dtypes: Optional[str] = None,
303304
) -> Callable[..., PandasQueryCompiler]:
@@ -310,6 +311,8 @@ def register(
310311
Binary function to execute. Must be able to accept at least two arguments.
311312
join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer'
312313
Type of join that will be used if indices of operands are not aligned.
314+
sort : bool, default: None
315+
Whether to sort index and columns or not.
313316
labels : {"keep", "replace", "drop"}, default: "replace"
314317
Whether to keep labels from left Modin DataFrame, replace them with labels
315318
from joined DataFrame or drop altogether to make them be computed lazily later.
@@ -419,6 +422,7 @@ def caller(
419422
lambda x, y: func(x, y, *args, **kwargs),
420423
[other._modin_frame],
421424
join_type=join_type,
425+
sort=sort,
422426
labels=labels,
423427
dtypes=dtypes,
424428
),

modin/core/dataframe/pandas/dataframe/dataframe.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -3255,7 +3255,6 @@ def broadcast_apply(
32553255
axis,
32563256
other,
32573257
join_type,
3258-
sort=not self.get_axis(axis).equals(other.get_axis(axis)),
32593258
)
32603259
# unwrap list returned by `copartition`.
32613260
right_parts = right_parts[0]
@@ -3681,7 +3680,7 @@ def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> boo
36813680
) and self._get_axis_lengths(axis) == other._get_axis_lengths(axis)
36823681

36833682
def _copartition(
3684-
self, axis, other, how, sort, force_repartition=False, fill_value=None
3683+
self, axis, other, how, sort=None, force_repartition=False, fill_value=None
36853684
):
36863685
"""
36873686
Copartition two Modin DataFrames.
@@ -3696,8 +3695,9 @@ def _copartition(
36963695
Other Modin DataFrame(s) to copartition against.
36973696
how : str
36983697
How to manage joining the index object ("left", "right", etc.).
3699-
sort : bool
3698+
sort : bool, default: None
37003699
Whether to sort the joined index or not.
3700+
If ``None``, the joined index is sorted only when the labels along the axis are not equal across all frames.
37013701
force_repartition : bool, default: False
37023702
Whether to force the repartitioning or not. By default,
37033703
this method will skip repartitioning if it is possible. This is because
@@ -3730,6 +3730,9 @@ def _copartition(
37303730
self._get_axis_lengths_cache(axis),
37313731
)
37323732

3733+
if sort is None:
3734+
sort = not all(self.get_axis(axis).equals(o.get_axis(axis)) for o in other)
3735+
37333736
self_index = self.get_axis(axis)
37343737
others_index = [o.get_axis(axis) for o in other]
37353738
joined_index, make_reindexer = self._join_index_objects(
@@ -3823,6 +3826,7 @@ def n_ary_op(
38233826
op,
38243827
right_frames: list[PandasDataframe],
38253828
join_type="outer",
3829+
sort=None,
38263830
copartition_along_columns=True,
38273831
labels="replace",
38283832
dtypes: Optional[pandas.Series] = None,
@@ -3838,6 +3842,8 @@ def n_ary_op(
38383842
Modin DataFrames to join with.
38393843
join_type : str, default: "outer"
38403844
Type of join to apply.
3845+
sort : bool, default: None
3846+
Whether to sort index and columns or not.
38413847
copartition_along_columns : bool, default: True
38423848
Whether to perform copartitioning along columns or not.
38433849
For some ops this isn't needed (e.g., `fillna`).
@@ -3854,7 +3860,10 @@ def n_ary_op(
38543860
New Modin DataFrame.
38553861
"""
38563862
left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition(
3857-
0, right_frames, join_type, sort=True
3863+
0,
3864+
right_frames,
3865+
join_type,
3866+
sort=sort,
38583867
)
38593868
if copartition_along_columns:
38603869
new_left_frame = self.__constructor__(
@@ -3886,7 +3895,7 @@ def n_ary_op(
38863895
1,
38873896
new_right_frames,
38883897
join_type,
3889-
sort=True,
3898+
sort=sort,
38903899
)
38913900
else:
38923901
joined_columns = self.copy_columns_cache(copy_lengths=True)
@@ -3978,7 +3987,7 @@ def _compute_new_widths():
39783987
joined_index,
39793988
partition_sizes_along_axis,
39803989
) = self._copartition(
3981-
axis.value ^ 1, others, how, sort, force_repartition=False
3990+
axis.value ^ 1, others, how, sort=sort, force_repartition=False
39823991
)
39833992
if axis == Axis.COL_WISE:
39843993
new_lengths = partition_sizes_along_axis

modin/core/storage_formats/pandas/query_compiler.py

+2
Original file line numberDiff line numberDiff line change
@@ -460,13 +460,15 @@ def to_numpy(self, **kwargs):
460460
df_update = Binary.register(
461461
copy_df_for_func(pandas.DataFrame.update, display_name="update"),
462462
join_type="left",
463+
sort=False,
463464
)
464465
series_update = Binary.register(
465466
copy_df_for_func(
466467
lambda x, y: pandas.Series.update(x.squeeze(axis=1), y.squeeze(axis=1)),
467468
display_name="update",
468469
),
469470
join_type="left",
471+
sort=False,
470472
)
471473

472474
# Needed for numpy API

modin/tests/pandas/dataframe/test_binary.py

+27
Original file line numberDiff line numberDiff line change
@@ -527,3 +527,30 @@ def test_arithmetic_with_tricky_dtypes(val1, val2, op, request):
527527
lambda dfs: getattr(dfs[0], op)(dfs[1]),
528528
expected_exception=expected_exception,
529529
)
530+
531+
532+
@pytest.mark.parametrize(
533+
"data, other_data",
534+
[
535+
({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}),
536+
({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}),
537+
],
538+
)
539+
@pytest.mark.parametrize("axis", [0, 1])
540+
@pytest.mark.parametrize("match_index", [True, False])
541+
def test_bin_op_mismatched_columns(data, other_data, axis, match_index):
542+
modin_df, pandas_df = create_test_dfs(data)
543+
other_modin_df, other_pandas_df = create_test_dfs(other_data)
544+
if axis == 0:
545+
if not match_index:
546+
modin_df.index = pandas_df.index = ["1", "2", "3"]
547+
other_modin_df.index = other_pandas_df.index = ["2", "1", "3"]
548+
eval_general(
549+
modin_df,
550+
pandas_df,
551+
lambda df: (
552+
df.add(other_modin_df, axis=axis)
553+
if isinstance(df, pd.DataFrame)
554+
else df.add(other_pandas_df, axis=axis)
555+
),
556+
)

modin/tests/pandas/dataframe/test_map_metadata.py

+1
Original file line numberDiff line numberDiff line change
@@ -1592,6 +1592,7 @@ def test_transpose(data):
15921592
"data, other_data",
15931593
[
15941594
({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}),
1595+
({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}),
15951596
(
15961597
{"A": ["a", "b", "c"], "B": ["x", "y", "z"]},
15971598
{"B": ["d", "e", "f", "g", "h", "i"]},

0 commit comments

Comments
 (0)