Fixed gpu detection for cuda rocm etc using env vars #490
base: main
Changes from all commits
@@ -98,12 +98,14 @@ def _image(self, args):
         if args.image != default_image():
             return args.image
 
-        gpu_type, _ = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES":
+        if os.getenv("HIP_VISIBLE_DEVICES"):
             return "quay.io/ramalama/rocm:latest"
 
-        if gpu_type == "ASAHI_VISIBLE_DEVICES":
+        if os.getenv("ASAHI_VISIBLE_DEVICES"):
             return "quay.io/ramalama/asahi:latest"
 
+        if os.getenv("CUDA_VISIBLE_DEVICES"):
+            return "docker.io/brianmahabir/rama-cuda:v1"
+
         return args.image
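The practical effect of this hunk is that image selection can now be steered by exporting one of the detection variables. A minimal sketch of the resulting precedence (illustrative only, assuming args.image is left at its default):

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # hypothetical override
    self._image(args)                         # -> "docker.io/brianmahabir/rama-cuda:v1"
    # Note: HIP_VISIBLE_DEVICES wins over CUDA_VISIBLE_DEVICES if both are
    # set, since the HIP check comes first in the method.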
@@ -143,9 +145,18 @@ def setup_container(self, args):
         if os.path.exists("/dev/kfd"):
             conman_args += ["--device", "/dev/kfd"]
 
-        gpu_type, gpu_num = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES" or gpu_type == "ASAHI_VISIBLE_DEVICES":
-            conman_args += ["-e", f"{gpu_type}={gpu_num}"]
+        for var in ["HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]:
+            value = os.getenv(var)
+            if value:
+                if var == "CUDA_VISIBLE_DEVICES":
+                    if args.engine == "docker":
+                        conman_args += ["--gpus", "all"]
+                    else:
+                        # Podman specific args
+                        conman_args += ["--device", "nvidia.com/gpu=all"]
+                else:
+                    # For HIP and ASAHI, we directly add the environment variable with its value
+                    conman_args += ["-e", f"{var}={value}"]
         return conman_args
 
     def run_container(self, args, shortnames):
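A condensed sketch of the engine-specific flag choice in the hunk above (hypothetical standalone function, written for illustration only): Docker exposes GPUs through its built-in --gpus flag, while Podman uses the CDI device name registered by nvidia-container-toolkit.

    def nvidia_container_flags(engine):
        # Docker: built-in GPU flag -> docker run --gpus all ...
        if engine == "docker":
            return ["--gpus", "all"]
        # Podman: CDI device -> podman run --device nvidia.com/gpu=all ...
        return ["--device", "nvidia.com/gpu=all"]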
@@ -190,14 +201,14 @@ def cleanup():
         return True
 
     def gpu_args(self):
+        gpu_type, gpu_num = get_gpu()
         gpu_args = []
         if sys.platform == "darwin":
             # llama.cpp will default to the Metal backend on macOS, so we don't need
             # any additional arguments.
             pass
-        elif sys.platform == "linux" and (
-            os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("ASAHI_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES")
-        ):
+        elif sys.platform == "linux" and gpu_type is not None:
+            os.environ[gpu_type] = gpu_num
             gpu_args = ["-ngl", "99"]
         else:
             print("GPU offload was requested but is not available on this system")
@@ -280,8 +291,13 @@ def run(self, args):
         if not args.ARGS and sys.stdin.isatty():
             exec_args.append("-cnv")
 
-        if args.gpu:
-            exec_args.extend(self.gpu_args())
+        # if args.gpu:
+        #     exec_args.extend(self.gpu_args())
+
+        # bypass args.gpu for auto-detection of gpu
+        gpu_args = self.gpu_args()
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
 
         try:
             if self.exec_model_in_container(model_path, exec_args, args):
@@ -293,7 +309,7 @@ def run(self, args):
         except FileNotFoundError as e:
             if in_container():
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
-            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0]))
 
     def serve(self, args):
         if hasattr(args, "name") and args.name:
@@ -326,8 +342,13 @@ def serve(self, args):
             exec_model_path = os.path.dirname(exec_model_path)
             exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
         else:
-            if args.gpu:
-                exec_args.extend(self.gpu_args())
+            # if args.gpu:
+            #     exec_args.extend(self.gpu_args())
+
+            # bypass args.gpu for auto-detection of gpu
+            gpu_args = self.gpu_args()
+            if gpu_args is not None:
+                exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
 
         if args.generate == "quadlet":
@@ -349,7 +370,7 @@ def serve(self, args):
         except FileNotFoundError as e:
             if in_container():
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
-            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0]))
 
     def quadlet(self, model, args, exec_args):
         quadlet = Quadlet(model, args, exec_args)
@@ -382,28 +403,86 @@ def check_valid_model_path(self, relative_target_path, model_path):
         return os.path.exists(model_path) and os.readlink(model_path) == relative_target_path
 
 
-def get_gpu():
-    i = 0
-    gpu_num = 0
-    gpu_bytes = 0
-    for fp in sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total')):
-        with open(fp, 'r') as file:
-            content = int(file.read())
-            if content > 1073741824 and content > gpu_bytes:
-                gpu_bytes = content
-                gpu_num = i
-
-        i += 1
+def get_amdgpu(gpu_template):
+    """Detect AMD GPUs and append valid entries to the template."""
+    amdgpu_num = 0
+    amdgpu_vram = 0
+    for i, fp in enumerate(sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total'))):
+        try:
+            with open(fp, 'r') as file:
+                memory_bytes = int(file.read())
+                memory_mib = memory_bytes / (1024 * 1024)  # Convert bytes to MiB
+                # Find AMD GPU with largest Vram
+                if memory_mib > 1024 and memory_mib > amdgpu_vram:
+                    amdgpu_vram = memory_mib
+                    amdgpu_num = i
+                    gpu_template.append({"index": amdgpu_num, "vram": amdgpu_vram, "env": "HIP_VISIBLE_DEVICES"})
+        except Exception as ex:
+            print(f"Error reading AMD GPU memory info: {ex}")
+
+
+def get_nvgpu(gpu_template):
+    """Detect NVIDIA GPUs and append valid entries to the template."""
+    nvgpu_num = 0
+    nvgpu_vram = 0
+    try:
+        command = ['nvidia-smi', '--query-gpu=index,memory.total', '--format=csv,noheader,nounits']
Review thread on this line:

- We need to check if nvidia-smi is installed?

- It is installed on Linux and Windows with the drivers themselves. I'm not an expert, but I am under the impression it will be present everywhere modern CUDA is usable.

- Ok, well, as long as we catch the failure and don't segfault, I will be happy.

- The final calculation at the end seems overly complex, if amd_gpu_bytes and nvidia_gpu_mib are set to zero initially. If both are less than 1G of VRAM, use the CPU. Otherwise just use the one with more VRAM.

- We shouldn't use NVIDIA GPUs with < 1G of VRAM either. When the VRAM is that small, it's not really worth it.

- Makes sense! I'll rework the function. It should be more modular so it can use other GPU backends (like Vulkan, Intel Arc) down the line. I was too focused on the edge case where a system has both an AMD and an NVIDIA GPU above 1 GB.

- I think technically we could have only the kernel module installed outside the container and nvidia-smi installed in the container; the only thing actually required outside the container is kernel-space stuff, aka kernel modules. But I think it's OK like this. The detection won't ever be perfect. There will always be corner cases where one must manually specify their container somehow; I was thinking the env vars could be useful for that.

- Good point! I did add some more modularity for now. I'm thinking down the road I can query the Vulkan SDK for info on all GPU architectures and select from there.
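A minimal sketch of the simpler selection rule proposed in this thread (hypothetical helper, not code from the PR; VRAM sizes in MiB, both defaulting to zero):

    def pick_gpu(amd_vram=0, nv_vram=0):
        # If neither card has more than 1 GiB of VRAM, fall back to the CPU.
        if amd_vram <= 1024 and nv_vram <= 1024:
            return None
        # Otherwise just use the one with more VRAM.
        return "HIP_VISIBLE_DEVICES" if amd_vram >= nv_vram else "CUDA_VISIBLE_DEVICES"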
+        output = run_cmd(command).stdout.decode("utf-8")
+
+        # Check for nvidia-container-toolkit to verify support with container runtime
+        try:
+            run_cmd(['nvidia-ctk', '--version']).stdout.decode("utf-8")
+        except FileNotFoundError:
+            print("'nvidia-container-toolkit' is not installed. No NVIDIA GPU support available for container runtime.")
+
+        # Find Nvidia GPU with largest Vram
Review comment on this line: nit, should read "Find NVIDIA GPU with largest VRAM."
+        for line in output.strip().split('\n'):
+            try:
+                index, memory_mib = line.split(',')
+                memory_mib = int(memory_mib)
+                if memory_mib > 1024 and memory_mib > nvgpu_vram:
+                    nvgpu_vram = memory_mib
+                    nvgpu_num = index.strip()
+            except ValueError as ex:
+                print(f"Error parsing NVIDIA GPU info: {ex}")
+                return
+        gpu_template.append({"index": nvgpu_num, "vram": nvgpu_vram, "env": "CUDA_VISIBLE_DEVICES"})
+    except FileNotFoundError:
+        # print("No Nvidia GPU Found ('nvidia-smi' command was not found)")
+        return
 
-    if gpu_bytes:  # this is the ROCm/AMD case
-        return "HIP_VISIBLE_DEVICES", gpu_num
-
+def get_gpu():
+    """
+    Detects and selects a GPU with at least 1 GiB of memory.
+    Uses a centralized template to handle multiple GPU types.
+
+    Returns:
+        tuple: Environment variable name and GPU index (as a string), or (None, None) if no suitable GPU is found.
+    """
-    # Check if system is running Asahi Linux (Apple Silicon)
-    if os.path.exists('/etc/os-release'):
-        with open('/etc/os-release', 'r') as file:
-            content = file.read()
-            if "asahi" in content.lower():
-                return "ASAHI_VISIBLE_DEVICES", 1
+    # Check if system is running Asahi Linux (Apple Silicon)
+    try:
+        with open('/etc/os-release', 'r') as file:
+            if "asahi" in file.read().lower():
+                return "ASAHI_VISIBLE_DEVICES", "1"
+    except Exception as ex:
+        print(f"Error reading OS release file: {ex}")
+
+    # Initialize the GPU list
+    gpu_template = []
+
+    # Detect GPUs from different architectures
+    get_amdgpu(gpu_template)
+    get_nvgpu(gpu_template)
+
+    # Pick the GPU with the largest 'vram' value and return the best one
+    if gpu_template:
+        best_gpu = max(gpu_template, key=lambda x: x["vram"])
+        return best_gpu["env"], best_gpu["index"]
+
+    # No suitable GPU found
+    return None, None
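For reference, a minimal sketch of how the new get_gpu() return value is consumed, mirroring the gpu_args() hunk earlier in this diff (Linux path only):

    gpu_type, gpu_num = get_gpu()       # e.g. ("CUDA_VISIBLE_DEVICES", "0")
    if sys.platform == "linux" and gpu_type is not None:
        os.environ[gpu_type] = gpu_num  # expose the chosen device to llama.cpp
        gpu_args = ["-ngl", "99"]       # offload all model layers to the GPU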
Review thread on gpu_args():

- Should we be checking here that the app is not going to be run in a container?

- I think when this code was written we ran the Python script both inside and outside the container, so if you were on macOS at this point, you could assume you were actually going to run natively on macOS. I would do:

      if Darwin:
          ["-ngl", "99"]

  for all cases. On macOS, llama.cpp ignores -ngl 99 as it turns on acceleration by default.
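A minimal sketch of the gpu_args() shape that suggestion implies (hypothetical rewrite, not part of this PR):

    def gpu_args(self):
        if sys.platform == "darwin":
            # Safe to pass unconditionally: llama.cpp turns on Metal
            # acceleration by default on macOS and ignores -ngl 99.
            return ["-ngl", "99"]
        gpu_type, gpu_num = get_gpu()
        if sys.platform == "linux" and gpu_type is not None:
            os.environ[gpu_type] = gpu_num
            return ["-ngl", "99"]
        print("GPU offload was requested but is not available on this system")
        return []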