Skip to content

Commit aadac56

Browse files
committed
refactoring
1 parent da17789 commit aadac56

File tree

3 files changed

+20
-70
lines changed

3 files changed

+20
-70
lines changed

modin/core/dataframe/pandas/dataframe/dataframe.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -3265,7 +3265,9 @@ def broadcast_apply(
32653265
axis
32663266
), self.copy_axis_cache(axis)
32673267

3268-
new_frame = self._partition_mgr_cls.apply(axis, func, left_parts, right_parts)
3268+
new_frame = self._partition_mgr_cls.broadcast_apply(
3269+
axis, func, left_parts, right_parts
3270+
)
32693271
if isinstance(dtypes, str) and dtypes == "copy":
32703272
dtypes = self.copy_dtypes_cache()
32713273

modin/core/dataframe/pandas/partitioning/partition_manager.py

+15-61
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,9 @@ def groupby_reduce(
338338
f"the number of partitions along {axis=} is not equal: "
339339
+ f"{partitions.shape[axis]} != {by.shape[axis]}"
340340
)
341-
mapped_partitions = cls.apply(axis, map_func, left=partitions, right=by)
341+
mapped_partitions = cls.broadcast_apply(
342+
axis, map_func, left=partitions, right=by
343+
)
342344
else:
343345
mapped_partitions = cls.map_partitions(partitions, map_func)
344346

@@ -437,7 +439,7 @@ def get_partitions(index):
437439

438440
@classmethod
439441
@wait_computations_if_benchmark_mode
440-
def broadcast_apply(cls, axis, apply_func, left, right):
442+
def base_broadcast_apply(cls, axis, apply_func, left, right):
441443
"""
442444
Broadcast the `right` partitions to `left` and apply `apply_func` function.
443445
@@ -490,57 +492,6 @@ def map_func(df, *others):
490492
]
491493
)
492494

493-
@classmethod
494-
@wait_computations_if_benchmark_mode
495-
def apply_axis_partitions(
496-
cls,
497-
axis,
498-
apply_func,
499-
left,
500-
right,
501-
):
502-
"""
503-
Broadcast the `right` partitions to `left` and apply `apply_func` along full `axis`.
504-
505-
Parameters
506-
----------
507-
axis : {0, 1}
508-
Axis to apply and broadcast over.
509-
apply_func : callable
510-
Function to apply.
511-
left : NumPy 2D array
512-
Left partitions.
513-
right : NumPy 2D array
514-
Right partitions.
515-
516-
Returns
517-
-------
518-
NumPy array
519-
An array of partition objects.
520-
521-
Notes
522-
-----
523-
This method differs from `broadcast_axis_partitions` in that it does not send
524-
all right partitions for each remote task based on the left partitions.
525-
"""
526-
preprocessed_map_func = cls.preprocess_func(apply_func)
527-
left_partitions = cls.axis_partition(left, axis)
528-
right_partitions = None if right is None else cls.axis_partition(right, axis)
529-
530-
result_blocks = np.array(
531-
[
532-
left_partitions[i].apply(
533-
preprocessed_map_func,
534-
other_axis_partition=right_partitions[i],
535-
)
536-
for i in np.arange(len(left_partitions))
537-
]
538-
)
539-
# If we are mapping over columns, they are returned to use the same as
540-
# rows, so we need to transpose the returned 2D NumPy array to return
541-
# the structure to the correct order.
542-
return result_blocks.T if not axis else result_blocks
543-
544495
@classmethod
545496
@wait_computations_if_benchmark_mode
546497
def broadcast_axis_partitions(
@@ -552,6 +503,7 @@ def broadcast_axis_partitions(
552503
keep_partitioning=False,
553504
num_splits=None,
554505
apply_indices=None,
506+
send_all_right=True,
555507
enumerate_partitions=False,
556508
lengths=None,
557509
apply_func_args=None,
@@ -580,6 +532,8 @@ def broadcast_axis_partitions(
580532
then the number of splits is preserved.
581533
apply_indices : list of ints, default: None
582534
Indices of `axis ^ 1` to apply function over.
535+
send_all_right : bool, default: True
536+
Whether or not to pass all right axis partitions to each of the left axis partitions.
583537
enumerate_partitions : bool, default: False
584538
Whether or not to pass partition index into `apply_func`.
585539
Note that `apply_func` must be able to accept `partition_idx` kwarg.
@@ -626,7 +580,6 @@ def broadcast_axis_partitions(
626580
# load-balance the data as well.
627581
kw = {
628582
"num_splits": num_splits,
629-
"other_axis_partition": right_partitions,
630583
"maintain_partitioning": keep_partitioning,
631584
}
632585
if lengths:
@@ -641,6 +594,9 @@ def broadcast_axis_partitions(
641594
left_partitions[i].apply(
642595
preprocessed_map_func,
643596
*(apply_func_args if apply_func_args else []),
597+
other_axis_partition=(
598+
right_partitions if send_all_right else right_partitions[i]
599+
),
644600
**kw,
645601
**({"partition_idx": idx} if enumerate_partitions else {}),
646602
**kwargs,
@@ -698,7 +654,7 @@ def base_map_partitions(
698654

699655
@classmethod
700656
@wait_computations_if_benchmark_mode
701-
def apply(
657+
def broadcast_apply(
702658
cls,
703659
axis,
704660
apply_func,
@@ -731,24 +687,22 @@ def apply(
731687
# partitions of the left and right dataframes are possible for the `broadcast_apply`,
732688
# as a result of which it is necessary to merge partitions on both axes at once,
733689
# which leads to large slowdowns.
734-
if (
735-
np.prod(left.shape) <= 1.5 * CpuCount.get()
736-
or left.shape[axis] < CpuCount.get() // 5
737-
):
690+
if np.prod(left.shape) <= 1.5 * CpuCount.get():
738691
# block-wise broadcast
739-
new_partitions = cls.broadcast_apply(
692+
new_partitions = cls.base_broadcast_apply(
740693
axis,
741694
apply_func,
742695
left,
743696
right,
744697
)
745698
else:
746699
# axis-wise broadcast
747-
new_partitions = cls.apply_axis_partitions(
700+
new_partitions = cls.broadcast_axis_partitions(
748701
axis=axis ^ 1,
749702
left=left,
750703
right=right,
751704
apply_func=apply_func,
705+
send_all_right=False,
752706
)
753707
return new_partitions
754708

modin/core/storage_formats/pandas/query_compiler.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
from pandas.core.indexing import check_bool_indexer
4646
from pandas.errors import DataError
4747

48-
from modin.config import CpuCount, RangePartitioning
48+
from modin.config import RangePartitioning
4949
from modin.core.dataframe.algebra import (
5050
Binary,
5151
Fold,
@@ -3107,14 +3107,8 @@ def dropna(self, **kwargs):
31073107
lib.no_default,
31083108
None,
31093109
)
3110-
# FIXME: this is a naive workaround for this problem: https://github.com/modin-project/modin/issues/5394
3111-
# if there are too many partitions then all non-full-axis implementations start acting very badly.
3112-
# The here threshold is pretty random though it works fine on simple scenarios
3113-
processable_amount_of_partitions = (
3114-
self._modin_frame.num_parts < CpuCount.get() * 32
3115-
)
31163110

3117-
if is_column_wise and no_thresh_passed and processable_amount_of_partitions:
3111+
if is_column_wise and no_thresh_passed:
31183112
how = kwargs.get("how", "any")
31193113
subset = kwargs.get("subset")
31203114
how = "any" if how in (lib.no_default, None) else how

0 commit comments

Comments
 (0)