Skip to content

Commit a966395

Browse files
authored
FEAT-modin-project#6965: Implement .merge() using range-partitioning implementation (modin-project#6966)
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
1 parent 2e5aba1 commit a966395

File tree

12 files changed

+484
-184
lines changed

12 files changed

+484
-184
lines changed

.github/actions/run-core-tests/group_2/action.yml

+2
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ runs:
2020
modin/pandas/test/dataframe/test_pickle.py
2121
echo "::endgroup::"
2222
shell: bash -l {0}
23+
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
24+
shell: bash -l {0}

.github/workflows/ci.yml

+1
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ jobs:
188188
- run: python -m pytest modin/pandas/test/dataframe/test_binary.py
189189
- run: python -m pytest modin/pandas/test/dataframe/test_reduce.py
190190
- run: python -m pytest modin/pandas/test/dataframe/test_join_sort.py
191+
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
191192
- run: python -m pytest modin/pandas/test/test_general.py
192193
- run: python -m pytest modin/pandas/test/dataframe/test_indexing.py
193194
- run: python -m pytest modin/pandas/test/test_series.py

.github/workflows/push-to-master.yml

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ jobs:
4646
python -m pytest modin/pandas/test/dataframe/test_indexing.py
4747
python -m pytest modin/pandas/test/dataframe/test_iter.py
4848
python -m pytest modin/pandas/test/dataframe/test_join_sort.py
49+
MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
4950
python -m pytest modin/pandas/test/dataframe/test_map_metadata.py
5051
python -m pytest modin/pandas/test/dataframe/test_reduce.py
5152
python -m pytest modin/pandas/test/dataframe/test_udf.py

docs/flow/modin/experimental/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ and provides a limited set of functionality:
1515
* :doc:`xgboost <xgboost>`
1616
* :doc:`sklearn <sklearn>`
1717
* :doc:`batch <batch>`
18-
* :doc:`Range-partitioning GroupBy implementation <range_partitioning_groupby>`
18+
* :doc:`Range-partitioning implementations <range_partitioning_groupby>`
1919

2020

2121
.. toctree::

docs/flow/modin/experimental/range_partitioning_groupby.rst

+6
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,9 @@ implementation with the respective warning if it meets an unsupported case:
7272
... # Range-partitioning groupby is only supported when grouping on a column(s) of the same frame.
7373
... # https://github.com/modin-project/modin/issues/5926
7474
... # Falling back to a TreeReduce implementation.
75+
76+
Range-partitioning Merge
77+
""""""""""""""""""""""""
78+
79+
It is recommended to use this implementation if the right dataframe in merge is as big as
80+
the left dataframe. In this case, range-partitioning implementation works faster and consumes less RAM.

modin/config/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
NPartitions,
4545
PersistentPickle,
4646
ProgressBar,
47+
RangePartitioning,
4748
RangePartitioningGroupby,
4849
RayRedisAddress,
4950
RayRedisPassword,
@@ -92,6 +93,7 @@
9293
"ModinNumpy",
9394
"ExperimentalNumPyAPI",
9495
"RangePartitioningGroupby",
96+
"RangePartitioning",
9597
"ExperimentalGroupbyImpl",
9698
"AsyncReadMode",
9799
"ReadSqlEngine",

modin/config/envvars.py

+12
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,18 @@ def _sibling(cls) -> type[EnvWithSibilings]:
770770
)
771771

772772

773+
class RangePartitioning(EnvironmentVariable, type=bool):
    """
    Set to true to use Modin's range-partitioning implementation where possible.

    Please refer to documentation for cases where enabling this option would be beneficial:
    https://modin.readthedocs.io/en/stable/flow/modin/experimental/range_partitioning_groupby.html
    """

    # Environment variable users set to toggle range-partitioning globally.
    varname = "MODIN_RANGE_PARTITIONING"
    # Disabled by default; range-partitioning is opt-in.
    default = False
783+
784+
773785
class CIAWSSecretAccessKey(EnvironmentVariable, type=str):
774786
"""Set to AWS_SECRET_ACCESS_KEY when running mock S3 tests for Modin in GitHub CI."""
775787

modin/core/dataframe/pandas/dataframe/dataframe.py

+68
Original file line numberDiff line numberDiff line change
@@ -3881,6 +3881,74 @@ def _compute_new_widths():
38813881
new_partitions, new_index, new_columns, new_lengths, new_widths, new_dtypes
38823882
)
38833883

3884+
def _apply_func_to_range_partitioning_broadcast(
    self, right, func, key, new_index=None, new_columns=None, new_dtypes=None
):
    """
    Apply `func` against two dataframes using range-partitioning implementation.

    The method first builds range-partitioning for both dataframes using the data from
    `self[key]`, after that, it applies `func` row-wise to `self` frame and
    broadcasts row-parts of `right` to `self`.

    Parameters
    ----------
    right : PandasDataframe
        Frame whose row-parts are broadcast to the matching row-parts of `self`.
    func : callable(left : pandas.DataFrame, right : pandas.DataFrame) -> pandas.DataFrame
        Function to apply to each pair of aligned row partitions.
    key : list of labels
        Columns to use to build range-partitioning. Must be present in both dataframes.
    new_index : pandas.Index, optional
        Index values to write to the result's cache.
    new_columns : pandas.Index, optional
        Column values to write to the result's cache.
    new_dtypes : pandas.Series or ModinDtypes, optional
        Dtype values to write to the result's cache.

    Returns
    -------
    PandasDataframe
    """
    # Single row partition: no shuffling needed, just broadcast `right`
    # to `self` and apply `func` along the full row axis.
    # NOTE(review): `new_index` is not forwarded on this path — presumably the
    # full-axis apply computes it; confirm this is intentional.
    if self._partitions.shape[0] == 1:
        result = self.broadcast_apply_full_axis(
            axis=1,
            func=func,
            new_columns=new_columns,
            dtypes=new_dtypes,
            other=right,
        )
        return result

    # Normalize `key` so downstream code can always treat it as a list of labels.
    if not isinstance(key, list):
        key = [key]

    # Build the shuffle/bin functions from `self[key]`; keep the current
    # number of row partitions as the target partitioning.
    shuffling_functions = ShuffleSortFunctions(
        self,
        key,
        ascending=True,
        ideal_num_new_partitions=self._partitions.shape[0],
    )

    # here we want to get indices of those partitions that hold the key columns
    key_indices = self.columns.get_indexer_for(key)
    partition_indices = np.unique(
        np.digitize(key_indices, np.cumsum(self.column_widths))
    )

    # Range-partition `self`, building the same bins for `right` and
    # broadcasting its row-parts to the matching parts of `self`.
    new_partitions = self._partition_mgr_cls.shuffle_partitions(
        self._partitions,
        partition_indices,
        shuffling_functions,
        func,
        right_partitions=right._partitions,
    )

    # Wrap the shuffled partitions back into a dataframe, writing any
    # caller-provided metadata into the result's cache.
    return self.__constructor__(
        new_partitions,
        index=new_index,
        columns=new_columns,
        dtypes=new_dtypes,
    )
3951+
38843952
@lazy_metadata_decorator(apply_axis="both")
38853953
def groupby(
38863954
self,

modin/core/dataframe/pandas/partitioning/partition_manager.py

+50-7
Original file line numberDiff line numberDiff line change
@@ -1722,6 +1722,7 @@ def shuffle_partitions(
17221722
index,
17231723
shuffle_functions: "ShuffleFunctions",
17241724
final_shuffle_func,
1725+
right_partitions=None,
17251726
):
17261727
"""
17271728
Return shuffled partitions.
@@ -1736,6 +1737,9 @@ def shuffle_partitions(
17361737
An object implementing the functions that we will be using to perform this shuffle.
17371738
final_shuffle_func : Callable(pandas.DataFrame) -> pandas.DataFrame
17381739
Function that shuffles the data within each new partition.
1740+
right_partitions : np.ndarray, optional
1741+
Partitions to broadcast to `self` partitions. If specified, the method builds range-partitioning
1742+
for `right_partitions` based on the bins calculated for `partitions`, then performs broadcasting.
17391743
17401744
Returns
17411745
-------
@@ -1774,18 +1778,57 @@ def shuffle_partitions(
17741778
for partition in row_partitions
17751779
]
17761780
).T
1777-
# We need to convert every partition that came from the splits into a full-axis column partition.
1778-
new_partitions = [
1781+
1782+
if right_partitions is None:
1783+
# We need to convert every partition that came from the splits into a column partition.
1784+
return np.array(
1785+
[
1786+
[
1787+
cls._column_partitions_class(
1788+
row_partition, full_axis=False
1789+
).apply(final_shuffle_func)
1790+
]
1791+
for row_partition in split_row_partitions
1792+
]
1793+
)
1794+
1795+
right_row_parts = cls.row_partitions(right_partitions)
1796+
right_split_row_partitions = np.array(
1797+
[
1798+
partition.split(
1799+
shuffle_functions.split_fn,
1800+
num_splits=num_bins,
1801+
extract_metadata=False,
1802+
)
1803+
for partition in right_row_parts
1804+
]
1805+
).T
1806+
return np.array(
17791807
[
17801808
cls._column_partitions_class(row_partition, full_axis=False).apply(
1781-
final_shuffle_func
1809+
final_shuffle_func,
1810+
other_axis_partition=cls._column_partitions_class(
1811+
right_row_partitions
1812+
),
1813+
)
1814+
for right_row_partitions, row_partition in zip(
1815+
right_split_row_partitions, split_row_partitions
17821816
)
17831817
]
1784-
for row_partition in split_row_partitions
1785-
]
1786-
return np.array(new_partitions)
1818+
)
1819+
17871820
else:
17881821
# If there are not pivots we can simply apply the function row-wise
1822+
if right_partitions is None:
1823+
return np.array(
1824+
[row_part.apply(final_shuffle_func) for row_part in row_partitions]
1825+
)
1826+
right_row_parts = cls.row_partitions(right_partitions)
17891827
return np.array(
1790-
[row_part.apply(final_shuffle_func) for row_part in row_partitions]
1828+
[
1829+
row_part.apply(
1830+
final_shuffle_func, other_axis_partition=right_row_part
1831+
)
1832+
for right_row_part, row_part in zip(right_row_parts, row_partitions)
1833+
]
17911834
)

0 commit comments

Comments
 (0)