 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
+import math
+import os
+import platform
 import re
 from pathlib import Path
 from typing import List, Optional, Union

+import psutil
 import torch
 from huggingface_hub import HfApi, HfFolder

+from .import_utils import is_numa_available
+

 MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"}

+logger = logging.getLogger(__name__)
+

 def get_model_device(model: torch.nn.Module) -> torch.device:
     """
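Note: the new `is_numa_available` helper imported from `.import_utils` is not shown in this diff. Assuming it follows the usual importlib availability-probe pattern, it would look roughly like this sketch (a hypothetical reconstruction, not part of the commit):

```python
import importlib.util


def is_numa_available() -> bool:
    # Probe for the third-party `numa` package without actually importing it.
    return importlib.util.find_spec("numa") is not None
```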
@@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model):
             setattr(model, child_name, new_m)
         else:
             replace_customized_linear_with_linear(child)
+
+
+def get_int_from_env(env_keys, default):
+    """Returns the first non-negative env value found in the `env_keys` list, or the default if none is set."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return default
+
+
+def bind_cores_for_best_perf():
+    """
+    Sets the number of threads per rank and the NUMA CPU affinity and memory binding, when not already set, for better out-of-the-box performance.
+    Works for world_size >= 1 and rank >= 0.
+
+    Example:
+    .. code-block:: python
+
+        import torch
+        from transformers import AutoTokenizer
+
+        from optimum.intel.ipex import IPEXModelForCausalLM
+        from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
+
+        bind_cores_for_best_perf()
+        model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        input_sentence = ["tell me a story about a trip to the moon"]
+        model_inputs = tokenizer(input_sentence, return_tensors="pt")
+        generation_kwargs = dict(max_new_tokens=500)
+        generated_ids = model.generate(**model_inputs, **generation_kwargs)
+
+    Returns:
+        None
+
+    """
+    if platform.system() != "Linux":
+        logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
+        raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
+    if not is_numa_available():
+        logger.error("'numa' module not found")
+        raise ImportError("'numa' module not found, install with 'pip install numa'")
+    import numa
+
+    local_size = get_int_from_env(
+        ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
+    )
+    rank_id = get_int_from_env(
+        ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
+    )
+    nodes = numa.get_max_node() + 1
+    rank_per_node = math.ceil(local_size / nodes)
+    num_cpus_per_node = int(psutil.cpu_count(logical=False) / nodes)
+    node_id = rank_id // rank_per_node
+    rank_offset_per_node = rank_id % rank_per_node
+    if os.getenv("OMP_NUM_THREADS") is None:
+        num_cpus_per_rank = max(int(num_cpus_per_node / rank_per_node), 1)
+        logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance")
+    else:
+        num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
+        logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}")
+    if len(numa.get_membind()) == nodes:
+        # NUMA memory binding is still the default (all nodes): bind memory to the node this rank runs on.
+        numa.set_membind([node_id])
+
+    torch.set_num_threads(num_cpus_per_rank)
+
+    if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
+        # CPU affinity is still the default (all logical cores): pin this rank to its slice of physical cores.
+        cpu_start = num_cpus_per_rank * rank_offset_per_node
+        numa.set_affinity(
+            0,
+            list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank],
+        )
+    logger.info(f"affinity={numa.get_affinity(0)}, membind={numa.get_membind()}")
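To make the rank-to-core arithmetic in this hunk concrete, here is a minimal, dependency-free sketch of how `get_int_from_env` and the node/rank partitioning behave for a hypothetical machine with 2 NUMA nodes and 32 physical cores running 4 local ranks; the environment values are illustrative, not taken from the commit:

```python
import math
import os


def get_int_from_env(env_keys, default):
    # Return the first non-negative integer found among the env vars, else the default.
    for e in env_keys:
        val = int(os.environ.get(e, -1))
        if val >= 0:
            return val
    return default


# Illustrative values a launcher such as torchrun or mpirun might export.
os.environ["LOCAL_WORLD_SIZE"] = "4"
os.environ["LOCAL_RANK"] = "3"

local_size = get_int_from_env(["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS"], 1)  # -> 4
rank_id = get_int_from_env(["LOCAL_RANK", "MPI_LOCALRANKID"], 0)  # -> 3

# Hypothetical topology: 2 NUMA nodes, 16 physical cores each.
nodes, physical_cores = 2, 32
rank_per_node = math.ceil(local_size / nodes)  # 2 ranks share each node
num_cpus_per_node = physical_cores // nodes  # 16 cores per node
node_id = rank_id // rank_per_node  # rank 3 runs on node 1
rank_offset_per_node = rank_id % rank_per_node  # second rank on that node
num_cpus_per_rank = max(num_cpus_per_node // rank_per_node, 1)  # 8 threads
cpu_start = num_cpus_per_rank * rank_offset_per_node  # cores 8..15 of node 1

print(node_id, cpu_start, num_cpus_per_rank)  # -> 1 8 8
```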
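And a quick way to check that the binding took effect after calling the new helper; `psutil.Process().cpu_affinity()` and `torch.get_num_threads()` are standard psutil/PyTorch calls, and this sketch assumes a Linux host with the `numa` package installed:

```python
import psutil
import torch

from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf

bind_cores_for_best_perf()

# After binding, the process should be pinned to this rank's slice of
# physical cores and the intra-op thread pool sized to match.
print("cpu affinity:", psutil.Process().cpu_affinity())
print("torch threads:", torch.get_num_threads())
```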