
Commit 9a18ae0

set cpu affinity and membind for better oob performance (#853)
* set num threads and memory binding for better OOB performance
* clean env var
* added core and memory binding util for improved performance
* add example usage in docstring
* change utility for best OOB to support world_size and rank >= 1
* fix style
* fix node_id value to account for rank_id starting at zero
* numa node assignment calculated from local size, not from world size
* reorg imports, moved checks to import_utils, removed prints in favor of logger
* raise errors for missing package and unsupported OS
* added missing env var to list
* Update optimum/intel/utils/modeling_utils.py
* Update optimum/intel/utils/import_utils.py
* Update optimum/intel/utils/import_utils.py
* fix style quality error

---------

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
1 parent 403c696 commit 9a18ae0

File tree

4 files changed: +100 −4

docker/Dockerfile.intel
optimum/intel/utils/__init__.py
optimum/intel/utils/import_utils.py
optimum/intel/utils/modeling_utils.py


docker/Dockerfile.intel

+5 −4

```diff
@@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
     libpng-dev \
     python3 \
     python3-pip \
+    python3-dev \
+    libnuma-dev \
     && rm -rf /var/lib/apt/lists/*"
 RUN /usr/sbin/update-ccache-symlinks
 RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
@@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \
     torchaudio==${TORCHAUDIO_VERSION} \
     -f https://download.pytorch.org/whl/torch_stable.html && \
     python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
-    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+    python3 -m pip install --no-cache-dir numa

-ARG OMP_NUM_THREADS=1
-ENV OMP_NUM_THREADS=${OMP_NUM_THREADS}
 ARG KMP_BLOCKTIME=1
 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
 ARG KMP_HW_SUBSET=1T
 ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
-ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
```
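
With `libnuma-dev` and the `numa` Python bindings now in the image, a quick sanity check that the bindings load and can see the machine topology might look like this (a minimal sketch, not part of the commit; it only uses calls that `bind_cores_for_best_perf` below also relies on):

```python
# Run inside the built container to confirm the numa bindings work.
import numa
import psutil

print("NUMA nodes:", numa.get_max_node() + 1)              # node count used for rank placement
print("physical cores:", psutil.cpu_count(logical=False))  # cores divided among ranks
print("current membind:", numa.get_membind())              # default: all nodes
```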

optimum/intel/utils/__init__.py

+1 −0

```diff
@@ -22,6 +22,7 @@
     is_neural_compressor_available,
     is_neural_compressor_version,
     is_nncf_available,
+    is_numa_available,
     is_openvino_available,
     is_torch_version,
     is_transformers_available,
```
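
This re-export lets downstream code guard the optional dependency from the package root instead of reaching into `import_utils`. A minimal sketch of the intended usage (the guard pattern mirrors `bind_cores_for_best_perf` below; the fallback branch is illustrative):

```python
from optimum.intel.utils import is_numa_available

if is_numa_available():
    import numa  # safe: both the module and its installed distribution were found
else:
    print("numa not installed, skipping core/memory binding")
```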

optimum/intel/utils/import_utils.py

+12 −0

```diff
@@ -150,6 +150,14 @@
 except importlib_metadata.PackageNotFoundError:
     _accelerate_available = False

+_numa_available = importlib.util.find_spec("numa") is not None
+
+if _numa_available:
+    try:
+        importlib_metadata.version("numa")
+    except importlib_metadata.PackageNotFoundError:
+        _numa_available = False
+

 def is_transformers_available():
     return _transformers_available
@@ -272,6 +280,10 @@ def is_accelerate_available():
     return _accelerate_available


+def is_numa_available():
+    return _numa_available
+
+
 # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
     """
```

optimum/intel/utils/modeling_utils.py

+82 −0

```diff
@@ -12,16 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
+import math
+import os
+import platform
 import re
 from pathlib import Path
 from typing import List, Optional, Union

+import psutil
 import torch
 from huggingface_hub import HfApi, HfFolder

+from .import_utils import is_numa_available
+

 MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"}

+logger = logging.getLogger(__name__)
+

 def get_model_device(model: torch.nn.Module) -> torch.device:
     """
@@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model):
             setattr(model, child_name, new_m)
         else:
             replace_customized_linear_with_linear(child)
+
+
+def get_int_from_env(env_keys, default):
+    """Returns the first positive env value found in the `env_keys` list or the default."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return default
+
+
+def bind_cores_for_best_perf():
+    """
+    Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance.
+    Works for world_size >= 1 and rank >= 1
+
+    Example:
+    .. code-block:: python
+
+        from optimum.intel.ipex import IPEXModelForCausalLM
+        from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
+
+        bind_cores_for_best_perf()
+        model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        input_sentence = ["tell me a story about a trip to the moon"]
+        model_inputs = tokenizer(input_sentence, return_tensors="pt")
+        generation_kwargs = dict(max_new_tokens=500)
+        generated_ids = model.generate(**model_inputs, **generation_kwargs)
+
+    Returns:
+        None
+    """
+    if platform.system() != "Linux":
+        logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
+        raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
+    if not is_numa_available():
+        logger.error("'numa' module not found")
+        raise ImportError("'numa' module not found, install with 'pip install numa'")
+    import numa
+
+    local_size = get_int_from_env(
+        ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
+    )
+    rank_id = get_int_from_env(
+        ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
+    )
+    nodes = numa.get_max_node() + 1
+    rank_per_node = math.ceil(local_size / nodes)
+    num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
+    node_id = int(rank_id / rank_per_node)
+    rank_offset_per_node = rank_id % rank_per_node
+    if os.getenv("OMP_NUM_THREADS") is None:
+        num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
+        logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance")
+    else:
+        num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
+        logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}")
+    if len(numa.get_membind()) == nodes:
+        # if numa memory binding is not set, set it to the node where the rank is running
+        numa.set_membind([node_id])
+
+    torch.set_num_threads(num_cpus_per_rank)
+
+    if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
+        # if numa affinity is unset (default is all logical cores), set it to the physical cores assigned to the rank
+        cpu_start = num_cpus_per_rank * rank_offset_per_node
+        numa.set_affinity(
+            0,
+            list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank],
+        )
+    logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
```
