huggingface · zhenglongjiepheonix · Aug 12, 2024 · Jun 3, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/optimum/fx/parallelization/__init__.py b/optimum/fx/parallelization/__init__.py
@@ -0,0 +1,31 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import torch
+from torch.fx import GraphModule
+
+from .core import Config, ParallelExecutionCtx
+from .passes import build_parallel_pass_pipeline
+
+
+def parallelize_backend(
+    graph_module: GraphModule, example_inputs: List[torch.Tensor], ctx: ParallelExecutionCtx, config: Config
+) -> GraphModule:
+    """
+    Run the parallelization pass pipeline on a graph module.
+
+    Args:
+        graph_module (`GraphModule`):
+            Graph module handed to the pass pipeline.
+        example_inputs (`List[torch.Tensor]`):
+            Example inputs of the graph, stored on `ctx.example_inputs` before the pipeline runs.
+        ctx (`ParallelExecutionCtx`):
+            Execution context forwarded to the pipeline. Its `compile_times` counter is incremented
+            once the pipeline has returned.
+        config (`Config`):
+            Static configuration forwarded to the pipeline.
+
+    Returns:
+        `GraphModule`: whatever the pass pipeline returns for `graph_module`.
+    """
+    # Publish the inputs on the context first: the very same `ctx` object is handed to the pipeline.
+    ctx.example_inputs = example_inputs
+    transformed = build_parallel_pass_pipeline()(graph_module=graph_module, ctx=ctx, config=config)
+    ctx.compile_times += 1
+    return transformed
diff --git a/optimum/fx/parallelization/core.py b/optimum/fx/parallelization/core.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, field
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+
+class HashableSlice:
+    """
+    Hashable stand-in for the builtin `slice`, so that a slice can be used as a dictionary key.
+
+    Two instances compare equal when their `start`, `stop` and `step` compare equal, and equal
+    instances always share the same hash.
+
+    Attributes:
+        - start (`Optional[int]`, defaults to `None`):
+            Start of the slice.
+
+        - stop (`Optional[int]`, defaults to `None`):
+            Stop of the slice.
+
+        - step (`Optional[int]`, defaults to `None`):
+            Step of the slice.
+    """
+
+    def __init__(self, start: Optional[int] = None, stop: Optional[int] = None, step: Optional[int] = None) -> None:
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def _key(self) -> tuple:
+        # Single source of truth shared by `__hash__` and `__eq__`.
+        return (self.start, self.stop, self.step)
+
+    def __hash__(self) -> int:
+        # Hash the values themselves, not their string form: values which compare equal
+        # (e.g. `True` and `1`) must produce the same hash to honour the hash/eq contract.
+        return hash(self._key())
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(value, HashableSlice) and self._key() == value._key()
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}(start={self.start!r}, stop={self.stop!r}, step={self.step!r})"
+
+    def to_slice(self) -> slice:
+        """Return the equivalent builtin `slice`."""
+        return slice(self.start, self.stop, self.step)
+
+
+@dataclass
+class ParameterSlice:
+    """
+    Describes one slice of a tensor stored in the weight dict. Slicing is only supported along
+    a single axis (the potential parallel axis) for now.
+
+    Attributes:
+        - source (`Optional[str]`, defaults to `None`):
+            Name of the original parameter, as it can be found in the weight dict.
+
+        - index (`Optional[slice]`, defaults to `None`):
+            Slice applied to the tensor on the parallel axis. Tensors in the weight dict are assumed
+            to have the same layout as their in-memory counterparts.
+    """
+
+    source: Optional[str] = None
+    index: Optional[slice] = None
+
+
+@dataclass
+class ParameterMeta:
+    """
+    Meta information attached to a parameter.
+
+    Attributes:
+        - is_tied (`bool`, defaults to `False`):
+            Whether the parameter is shared across multiple modules.
+
+        - is_modified_meta (`bool`, defaults to `False`):
+            Whether this meta has already been modified since it was initialized.
+
+        - need_initialize (`bool`, defaults to `False`):
+            Whether the weights have to be initialized manually when they are not provided
+            in the weight map.
+
+        - init_fn (`Optional[Callable]`, defaults to `None`):
+            Initialization function. When not `None`, it can override `weight_init_fn` in `Config`.
+
+        - dim (`int`, defaults to `0`):
+            Axis on which `mapping` is based.
+
+        - mapping (`Dict[HashableSlice, ParameterSlice]`, defaults to an empty dict):
+            Mapping between the current parameter and the weight tensors stored in the weight map.
+    """
+
+    is_tied: bool = False
+    is_modified_meta: bool = False
+    need_initialize: bool = False
+    init_fn: Optional[Callable] = None
+    dim: int = 0
+    # `default_factory` gives every instance its own dict instead of a shared one.
+    mapping: Dict[HashableSlice, ParameterSlice] = field(default_factory=dict)
+
+
+@dataclass
+class ParallelExecutionCtx:
+    """
+    Parallel execution context holding runtime information.
+
+    Attributes:
+        - tp_group (`dist.ProcessGroup`):
+            Tensor parallel process group the current process belongs to.
+
+        - current_device (`torch.device`):
+            Device corresponding to the current process.
+
+        - example_inputs (`List[Any]`, defaults to an empty list):
+            Tensors used as example inputs for the graphs captured by dynamo.
+
+        - parallel_layer_cache (`Dict[int, nn.Module]`, defaults to an empty dict):
+            Cache mapping layers (`nn.Linear`, `nn.Embedding`) to their parallel counterparts.
+            The cache is built during the first compilation. On later recompilations the modules are
+            replaced directly with the cached parallel counterparts, so that no new parameters are
+            created in place of the original ones when recompilation happens during training.
+
+        - weight_map (`Dict[str, str]`, defaults to an empty dict):
+            Mapping between parameter names and their locations on disk, used when loading weights
+            from disk.
+
+        - compile_times (`int`, defaults to `0`):
+            Number of compilations which have happened during the whole process.
+    """
+
+    tp_group: dist.ProcessGroup
+    current_device: torch.device
+    # Mutable defaults go through `default_factory` so that instances never share them.
+    example_inputs: List[Any] = field(default_factory=list)
+    parallel_layer_cache: Dict[int, nn.Module] = field(default_factory=dict)
+    weight_map: Dict[str, str] = field(default_factory=dict)
+    compile_times: int = 0
+
+
+@dataclass
+class Config:
+    """
+    Static configuration: instructions which do not change at runtime.
+
+    Attributes:
+        - lint_and_recompile (`bool`, defaults to `True`):
+            Whether to run graph linting and module recompilation after every pass.
+
+        - clean_markers_after_all_passes (`bool`, defaults to `True`):
+            Whether to clean the markers left by analytical passes once all passes have run.
+
+        - weight_init_fn (`Callable`, defaults to `partial(nn.init.normal_, std=0.02)`):
+            Initialization function for the weights of `nn.Linear` and `nn.Embedding` layers,
+            used when no weights loading path is provided.
+    """
+
+    lint_and_recompile: bool = True
+    clean_markers_after_all_passes: bool = True
+    weight_init_fn: Callable = partial(nn.init.normal_, std=0.02)
diff --git a/optimum/fx/parallelization/distributed/__init__.py b/optimum/fx/parallelization/distributed/__init__.py
@@ -0,0 +1,21 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .dist_ops import (
+    differentiable_all_gather,
+    differentiable_all_reduce_sum,
+    differentiable_identity,
+    differentiable_scatter,
+    scatter,
+)
diff --git a/optimum/fx/parallelization/distributed/dist_ops.py b/optimum/fx/parallelization/distributed/dist_ops.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.distributed as dist
+
+from ..utils import ensure_divisibility
+
+
+def all_reduce(group: dist.ProcessGroup, tensor: torch.Tensor) -> torch.Tensor:
+    """
+    All-reduce `tensor` in place across `group` with the default reduce op of `dist.all_reduce`.
+
+    Args:
+        group (`dist.ProcessGroup`):
+            Process group to reduce over.
+        tensor (`torch.Tensor`):
+            Tensor to reduce, it is modified in place.
+
+    Returns:
+        `torch.Tensor`: the very same `tensor` object.
+    """
+    # A group made of a single process has nothing to reduce.
+    if dist.get_world_size(group) != 1:
+        dist.all_reduce(tensor, group=group)
+    return tensor
+
+
+def all_gather(group: dist.ProcessGroup, tensor: torch.Tensor, gather_dim: int = -1) -> torch.Tensor:
+    """
+    Gather `tensor` from every rank of `group` and concatenate the pieces along `gather_dim`,
+    ordered by rank.
+
+    Args:
+        group (`dist.ProcessGroup`):
+            Process group to gather from.
+        tensor (`torch.Tensor`):
+            Local tensor contributed by the current rank.
+        gather_dim (`int`, defaults to `-1`):
+            Axis along which the per-rank tensors are concatenated, negative values are supported.
+
+    Returns:
+        `torch.Tensor`: `tensor` itself when the group holds a single process, otherwise a new
+        contiguous tensor whose size along `gather_dim` is `world_size` times the local one.
+    """
+    world_size = dist.get_world_size(group)
+    if world_size == 1:
+        return tensor
+
+    gather_dim = (gather_dim + tensor.ndim) % tensor.ndim
+    # `all_gather_into_tensor` writes the per-rank chunks one after another into the output buffer,
+    # which is a concatenation along dim 0. Move the gather axis to the front before the collective
+    # and back afterwards, otherwise the data is scrambled whenever `gather_dim != 0`.
+    local = tensor.movedim(gather_dim, 0).contiguous()
+    gathered = torch.empty(
+        (world_size * local.size(0), *local.shape[1:]), dtype=local.dtype, device=local.device
+    )
+    dist.all_gather_into_tensor(gathered, local, group=group)
+    return gathered.movedim(0, gather_dim).contiguous()
+
+
+def split(group: dist.ProcessGroup, tensor: torch.Tensor, split_dim: int = -1) -> torch.Tensor:
+    """
+    Keep only the chunk of `tensor` which belongs to the current rank of `group`.
+
+    `tensor` is cut into `world_size` equal chunks along `split_dim` and the chunk indexed by the
+    rank of the current process inside `group` is returned. `ensure_divisibility` is called on the
+    size of `split_dim` and the world size beforehand.
+
+    Args:
+        group (`dist.ProcessGroup`):
+            Process group defining the world size and the rank.
+        tensor (`torch.Tensor`):
+            Tensor to split.
+        split_dim (`int`, defaults to `-1`):
+            Axis along which `tensor` is split.
+
+    Returns:
+        `torch.Tensor`: `tensor` itself when the group holds a single process, otherwise a
+        contiguous tensor holding the local chunk.
+    """
+    world_size = dist.get_world_size(group)
+    if world_size == 1:
+        return tensor
+
+    full_size = tensor.size(split_dim)
+    ensure_divisibility(full_size, world_size)
+    chunks = torch.split(tensor, full_size // world_size, dim=split_dim)
+    return chunks[dist.get_rank(group)].contiguous()
+
+
+def scatter(
+    group: dist.ProcessGroup, tensor: torch.Tensor, output_tensor: torch.Tensor, scatter_dim: int = 0
+) -> torch.Tensor:
+    """
+    Scatter `tensor`, as held by the first rank of `group`, across all ranks of `group`.
+
+    On the source rank `tensor` is cut into `world_size` equal chunks along `scatter_dim`
+    (`ensure_divisibility` is called on that size first) and chunk `i` is sent to rank `i` of the
+    group. Every rank receives its chunk in `output_tensor`.
+
+    Args:
+        group (`dist.ProcessGroup`):
+            Process group to scatter over.
+        tensor (`torch.Tensor`):
+            Full tensor, only read on the source rank.
+        output_tensor (`torch.Tensor`):
+            Buffer which receives the local chunk, modified in place.
+        scatter_dim (`int`, defaults to `0`):
+            Axis along which `tensor` is split.
+
+    Returns:
+        `torch.Tensor`: `output_tensor`, or `tensor` itself when the group holds a single process.
+    """
+    world_size = dist.get_world_size(group)
+    if world_size == 1:
+        # NOTE(review): `output_tensor` is left untouched on this path, callers have to use the
+        # returned value. Confirm that this is what the call sites expect.
+        return tensor
+
+    # `dist.scatter` takes `src` as a global rank, whereas the source is picked by its rank inside
+    # `group`. Both only coincide when `group` contains global rank 0 as its first member.
+    src = dist.get_global_rank(group, 0) if group is not None else 0
+    if dist.get_rank(group) == 0:
+        dim_size = tensor.size(scatter_dim)
+        ensure_divisibility(dim_size, world_size)
+        chunks = torch.split(tensor, dim_size // world_size, dim=scatter_dim)
+        # The source rank receives its own chunk through the collective as well, no manual copy needed.
+        scatter_list = [chunk.contiguous() for chunk in chunks]
+    else:
+        scatter_list = None
+    dist.scatter(tensor=output_tensor, scatter_list=scatter_list, src=src, group=group)
+    return output_tensor
+
+
+class DifferentiableIdentity(torch.autograd.Function):
+    """
+    Identity in the forward pass, `DifferentiableAllReduceSum` applied to the incoming gradient
+    over `group` in the backward pass.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, group: dist.ProcessGroup):
+        # Remember the group, the backward pass reduces the gradient over it.
+        ctx.group = group
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        reduced_grad = DifferentiableAllReduceSum.apply(grad_output, ctx.group)
+        # One gradient slot per forward input: `group` gets none.
+        return reduced_grad, None
+
+
+class DifferentiableAllReduceSum(torch.autograd.Function):
+    """
+    `all_reduce` over `group` in the forward pass, identity on the incoming gradient in the
+    backward pass.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor: torch.Tensor, group: dist.ProcessGroup) -> torch.Tensor:
+        ctx.group = group
+        return all_reduce(group=group, tensor=tensor)
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor) -> tuple:
+        # One gradient slot per forward input: the gradient passes through unchanged and
+        # `group` gets none.
+        return grad_output, None
+
+
+class DifferentiableScatter(torch.autograd.Function):
+    """
+    `split` along `dim` over `group` in the forward pass, `DifferentiableAllGather` of the
+    incoming gradient along the same `dim` in the backward pass.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor: torch.Tensor, group: dist.ProcessGroup, dim: int = -1) -> torch.Tensor:
+        ctx.group = group
+        ctx.dim = dim
+        return split(group=group, tensor=tensor, split_dim=dim)
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor) -> tuple:
+        # `Function.apply` has to be called with positional arguments: keyword arguments are not
+        # forwarded to `forward`. One gradient slot per forward input, `group` and `dim` get none.
+        return DifferentiableAllGather.apply(grad_output, ctx.group, ctx.dim), None, None
+
+
+class DifferentiableAllGather(torch.autograd.Function):
+    """
+    `all_gather` along `dim` over `group` in the forward pass, `DifferentiableScatter` of the
+    incoming gradient along the same `dim` in the backward pass.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor: torch.Tensor, group: dist.ProcessGroup, dim: int = -1) -> torch.Tensor:
+        ctx.group = group
+        ctx.dim = dim
+        return all_gather(group=group, tensor=tensor, gather_dim=dim)
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor) -> tuple:
+        # `Function.apply` has to be called with positional arguments: keyword arguments are not
+        # forwarded to `forward`. One gradient slot per forward input, `group` and `dim` get none.
+        return DifferentiableScatter.apply(grad_output, ctx.group, ctx.dim), None, None
+
+
+def differentiable_all_reduce_sum(tensor: torch.Tensor, group: dist.ProcessGroup) -> torch.Tensor:
+    """
+    Functional entry point of `DifferentiableAllReduceSum`.
+
+    Args:
+        tensor (`torch.Tensor`):
+            Tensor handed to `DifferentiableAllReduceSum`.
+        group (`dist.ProcessGroup`):
+            Process group handed to `DifferentiableAllReduceSum`.
+
+    Returns:
+        `torch.Tensor`: result of `DifferentiableAllReduceSum.apply`.
+    """
+    apply_fn = DifferentiableAllReduceSum.apply
+    return apply_fn(tensor, group)
+
+
+def differentiable_identity(tensor: torch.Tensor, group: dist.ProcessGroup) -> torch.Tensor:
+    """
+    Functional entry point of `DifferentiableIdentity`.
+
+    Args:
+        tensor (`torch.Tensor`):
+            Tensor handed to `DifferentiableIdentity`.
+        group (`dist.ProcessGroup`):
+            Process group handed to `DifferentiableIdentity`.
+
+    Returns:
+        `torch.Tensor`: result of `DifferentiableIdentity.apply`.
+    """
+    apply_fn = DifferentiableIdentity.apply
+    return apply_fn(tensor, group)
+
+
+def differentiable_all_gather(tensor: torch.Tensor, group: dist.ProcessGroup, dim=-1) -> torch.Tensor:
+    """
+    Functional entry point of `DifferentiableAllGather`.
+
+    Args:
+        tensor (`torch.Tensor`):
+            Tensor handed to `DifferentiableAllGather`.
+        group (`dist.ProcessGroup`):
+            Process group handed to `DifferentiableAllGather`.
+        dim (defaults to `-1`):
+            Axis handed to `DifferentiableAllGather`.
+
+    Returns:
+        `torch.Tensor`: result of `DifferentiableAllGather.apply`.
+    """
+    apply_fn = DifferentiableAllGather.apply
+    return apply_fn(tensor, group, dim)
+
+
+def differentiable_scatter(tensor: torch.Tensor, group: dist.ProcessGroup, dim=-1) -> torch.Tensor:
+    """
+    Functional entry point of `DifferentiableScatter`.
+
+    Args:
+        tensor (`torch.Tensor`):
+            Tensor handed to `DifferentiableScatter`.
+        group (`dist.ProcessGroup`):
+            Process group handed to `DifferentiableScatter`.
+        dim (defaults to `-1`):
+            Axis handed to `DifferentiableScatter`.
+
+    Returns:
+        `torch.Tensor`: result of `DifferentiableScatter.apply`.
+    """
+    apply_fn = DifferentiableScatter.apply
+    return apply_fn(tensor, group, dim)
diff --git a/optimum/fx/parallelization/parallel_layers/__init__.py b/optimum/fx/parallelization/parallel_layers/__init__.py
@@ -0,0 +1,16 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .embedding import VocabParallelEmbedding
+from .linear import ColumnParallelLinear, RowParallelLinear