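Refactoring: rename partition manager `apply` to `broadcast_apply` (old `broadcast_apply` becomes `base_broadcast_apply`) and fold `apply_axis_partitions` into `broadcast_axis_partitions` via a new `send_all_right` flag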

Retribution98 · Retribution98 · commit aadac56a3c13 · 2024-07-16T11:57:04.000Z
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -3265,7 +3265,9 @@ def broadcast_apply(
                 axis
             ), self.copy_axis_cache(axis)
 
-        new_frame = self._partition_mgr_cls.apply(axis, func, left_parts, right_parts)
+        new_frame = self._partition_mgr_cls.broadcast_apply(
+            axis, func, left_parts, right_parts
+        )
         if isinstance(dtypes, str) and dtypes == "copy":
             dtypes = self.copy_dtypes_cache()
 
diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -338,7 +338,9 @@ def groupby_reduce(
                 f"the number of partitions along {axis=} is not equal: "
                 + f"{partitions.shape[axis]} != {by.shape[axis]}"
             )
-            mapped_partitions = cls.apply(axis, map_func, left=partitions, right=by)
+            mapped_partitions = cls.broadcast_apply(
+                axis, map_func, left=partitions, right=by
+            )
         else:
             mapped_partitions = cls.map_partitions(partitions, map_func)
 
@@ -437,7 +439,7 @@ def get_partitions(index):
 
     @classmethod
     @wait_computations_if_benchmark_mode
-    def broadcast_apply(cls, axis, apply_func, left, right):
+    def base_broadcast_apply(cls, axis, apply_func, left, right):
         """
         Broadcast the `right` partitions to `left` and apply `apply_func` function.
 
@@ -490,57 +492,6 @@ def map_func(df, *others):
             ]
         )
 
-    @classmethod
-    @wait_computations_if_benchmark_mode
-    def apply_axis_partitions(
-        cls,
-        axis,
-        apply_func,
-        left,
-        right,
-    ):
-        """
-        Broadcast the `right` partitions to `left` and apply `apply_func` along full `axis`.
-
-        Parameters
-        ----------
-        axis : {0, 1}
-            Axis to apply and broadcast over.
-        apply_func : callable
-            Function to apply.
-        left : NumPy 2D array
-            Left partitions.
-        right : NumPy 2D array
-            Right partitions.
-
-        Returns
-        -------
-        NumPy array
-            An array of partition objects.
-
-        Notes
-        -----
-        This method differs from `broadcast_axis_partitions` in that it does not send
-        all right partitions for each remote task based on the left partitions.
-        """
-        preprocessed_map_func = cls.preprocess_func(apply_func)
-        left_partitions = cls.axis_partition(left, axis)
-        right_partitions = None if right is None else cls.axis_partition(right, axis)
-
-        result_blocks = np.array(
-            [
-                left_partitions[i].apply(
-                    preprocessed_map_func,
-                    other_axis_partition=right_partitions[i],
-                )
-                for i in np.arange(len(left_partitions))
-            ]
-        )
-        # If we are mapping over columns, they are returned to use the same as
-        # rows, so we need to transpose the returned 2D NumPy array to return
-        # the structure to the correct order.
-        return result_blocks.T if not axis else result_blocks
-
     @classmethod
     @wait_computations_if_benchmark_mode
     def broadcast_axis_partitions(
@@ -552,6 +503,7 @@ def broadcast_axis_partitions(
         keep_partitioning=False,
         num_splits=None,
         apply_indices=None,
+        send_all_right=True,
         enumerate_partitions=False,
         lengths=None,
         apply_func_args=None,
@@ -580,6 +532,8 @@ def broadcast_axis_partitions(
             then the number of splits is preserved.
         apply_indices : list of ints, default: None
             Indices of `axis ^ 1` to apply function over.
+        send_all_right : bool, default: True
+            Whether or not to pass all right axis partitions to each of the left axis partitions.
         enumerate_partitions : bool, default: False
             Whether or not to pass partition index into `apply_func`.
             Note that `apply_func` must be able to accept `partition_idx` kwarg.
@@ -626,7 +580,6 @@ def broadcast_axis_partitions(
         # load-balance the data as well.
         kw = {
             "num_splits": num_splits,
-            "other_axis_partition": right_partitions,
             "maintain_partitioning": keep_partitioning,
         }
         if lengths:
@@ -641,6 +594,9 @@ def broadcast_axis_partitions(
                 left_partitions[i].apply(
                     preprocessed_map_func,
                     *(apply_func_args if apply_func_args else []),
+                    other_axis_partition=(
+                        right_partitions if send_all_right else right_partitions[i]
+                    ),
                     **kw,
                     **({"partition_idx": idx} if enumerate_partitions else {}),
                     **kwargs,
@@ -698,7 +654,7 @@ def base_map_partitions(
 
     @classmethod
     @wait_computations_if_benchmark_mode
-    def apply(
+    def broadcast_apply(
         cls,
         axis,
         apply_func,
@@ -731,24 +687,22 @@ def apply(
         # partitions of the left and right dataframes are possible for the `apply`,
         # as a result of which it is necessary to merge partitions on both axes at once,
         # which leads to large slowdowns.
-        if (
-            np.prod(left.shape) <= 1.5 * CpuCount.get()
-            or left.shape[axis] < CpuCount.get() // 5
-        ):
+        if np.prod(left.shape) <= 1.5 * CpuCount.get():
             # block-wise broadcast
-            new_partitions = cls.broadcast_apply(
+            new_partitions = cls.base_broadcast_apply(
                 axis,
                 apply_func,
                 left,
                 right,
             )
         else:
             # axis-wise broadcast
-            new_partitions = cls.apply_axis_partitions(
+            new_partitions = cls.broadcast_axis_partitions(
                 axis=axis ^ 1,
                 left=left,
                 right=right,
                 apply_func=apply_func,
+                send_all_right=False,
             )
         return new_partitions
 
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -45,7 +45,7 @@
 from pandas.core.indexing import check_bool_indexer
 from pandas.errors import DataError
 
-from modin.config import CpuCount, RangePartitioning
+from modin.config import RangePartitioning
 from modin.core.dataframe.algebra import (
     Binary,
     Fold,
@@ -3107,14 +3107,8 @@ def dropna(self, **kwargs):
             lib.no_default,
             None,
         )
-        # FIXME: this is a naive workaround for this problem: https://github.com/modin-project/modin/issues/5394
-        # if there are too many partitions then all non-full-axis implementations start acting very badly.
-        # The here threshold is pretty random though it works fine on simple scenarios
-        processable_amount_of_partitions = (
-            self._modin_frame.num_parts < CpuCount.get() * 32
-        )
 
-        if is_column_wise and no_thresh_passed and processable_amount_of_partitions:
+        if is_column_wise and no_thresh_passed:
             how = kwargs.get("how", "any")
             subset = kwargs.get("subset")
             how = "any" if how in (lib.no_default, None) else how