Fixed GPU detection for CUDA, ROCm, etc. using env vars #490

Open · wants to merge 1 commit into main
147 changes: 113 additions & 34 deletions ramalama/model.py
@@ -98,12 +98,14 @@ def _image(self, args):
         if args.image != default_image():
             return args.image
 
-        gpu_type, _ = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES":
+        if os.getenv("HIP_VISIBLE_DEVICES"):
             return "quay.io/ramalama/rocm:latest"
 
-        if gpu_type == "ASAHI_VISIBLE_DEVICES":
+        if os.getenv("ASAHI_VISIBLE_DEVICES"):
             return "quay.io/ramalama/asahi:latest"
 
+        if os.getenv("CUDA_VISIBLE_DEVICES"):
+            return "docker.io/brianmahabir/rama-cuda:v1"
+
         return args.image

@@ -143,9 +145,18 @@ def setup_container(self, args):
         if os.path.exists("/dev/kfd"):
             conman_args += ["--device", "/dev/kfd"]
 
-        gpu_type, gpu_num = get_gpu()
-        if gpu_type == "HIP_VISIBLE_DEVICES" or gpu_type == "ASAHI_VISIBLE_DEVICES":
-            conman_args += ["-e", f"{gpu_type}={gpu_num}"]
+        for var in ["HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"]:
+            value = os.getenv(var)
+            if value:
+                if var == "CUDA_VISIBLE_DEVICES":
+                    if args.engine == "docker":
+                        conman_args += ["--gpus", "all"]
+                    else:
+                        # Podman specific args
+                        conman_args += ["--device", "nvidia.com/gpu=all"]
+                else:
+                    # For HIP and ASAHI, we directly add the environment variable with its value
+                    conman_args += ["-e", f"{var}={value}"]
         return conman_args

     def run_container(self, args, shortnames):
@@ -190,14 +201,14 @@ def cleanup():
             return True
 
     def gpu_args(self):
-        gpu_type, gpu_num = get_gpu()
         gpu_args = []
         if sys.platform == "darwin":

Review comment (Member):
Should we be checking here that the app is not going to be run in a container?

ericcurtin (Collaborator), Nov 25, 2024:
I think when this code was written we ran the Python script both inside and outside the container, so if you were on macOS at this point, you could assume you were actually going to run natively on macOS.

I would do:

If Darwin:
["-ngl", "99"]

for all cases. On macOS llama.cpp ignores -ngl 99 as it turns on acceleration by default.
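
A minimal sketch of that suggestion (illustrative only, not part of this diff; the free-standing gpu_args function stands in for the method, and only the env-var names and the "-ngl" flag come from the surrounding code):

import os
import sys


def gpu_args():
    # Sketch of the reviewer's suggestion: always request full offload on
    # macOS; llama.cpp ignores "-ngl 99" there because the Metal backend is
    # enabled by default.
    if sys.platform == "darwin":
        return ["-ngl", "99"]
    # On Linux, offload only when one of the vendor env vars is set.
    if sys.platform == "linux" and any(
        os.getenv(var)
        for var in ("HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")
    ):
        return ["-ngl", "99"]
    print("GPU offload was requested but is not available on this system")
    return []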

             # llama.cpp will default to the Metal backend on macOS, so we don't need
             # any additional arguments.
             pass
-        elif sys.platform == "linux" and gpu_type is not None:
-            os.environ[gpu_type] = gpu_num
+        elif sys.platform == "linux" and (
+            os.getenv("HIP_VISIBLE_DEVICES") or os.getenv("ASAHI_VISIBLE_DEVICES") or os.getenv("CUDA_VISIBLE_DEVICES")
+        ):
             gpu_args = ["-ngl", "99"]
         else:
             print("GPU offload was requested but is not available on this system")
@@ -280,8 +291,13 @@ def run(self, args):
         if not args.ARGS and sys.stdin.isatty():
             exec_args.append("-cnv")
 
-        if args.gpu:
-            exec_args.extend(self.gpu_args())
+        # if args.gpu:
+        #     exec_args.extend(self.gpu_args())
+
+        # bypass args.gpu for auto-detection of gpu
+        gpu_args = self.gpu_args()
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
 
         try:
             if self.exec_model_in_container(model_path, exec_args, args):
@@ -293,7 +309,7 @@
         except FileNotFoundError as e:
             if in_container():
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
-            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0]))

     def serve(self, args):
         if hasattr(args, "name") and args.name:
@@ -326,8 +342,13 @@ def serve(self, args):
             exec_model_path = os.path.dirname(exec_model_path)
             exec_args = ["vllm", "serve", "--port", args.port, exec_model_path]
         else:
-            if args.gpu:
-                exec_args.extend(self.gpu_args())
+            # if args.gpu:
+            #     exec_args.extend(self.gpu_args())
+
+            # bypass args.gpu for auto-detection of gpu
+            gpu_args = self.gpu_args()
+            if gpu_args is not None:
+                exec_args.extend(gpu_args)
             exec_args.extend(["--host", args.host])
 
         if args.generate == "quadlet":
@@ -349,7 +370,7 @@ def serve(self, args):
         except FileNotFoundError as e:
             if in_container():
                 raise NotImplementedError(file_not_found_in_container % (exec_args[0], str(e).strip("'")))
-            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0], str(e).strip("'")))
+            raise NotImplementedError(file_not_found % (exec_args[0], exec_args[0], exec_args[0]))
 
     def quadlet(self, model, args, exec_args):
         quadlet = Quadlet(model, args, exec_args)
@@ -382,28 +403,86 @@ def check_valid_model_path(self, relative_target_path, model_path):
         return os.path.exists(model_path) and os.readlink(model_path) == relative_target_path
 
 
-def get_gpu():
-    i = 0
-    gpu_num = 0
-    gpu_bytes = 0
-    for fp in sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total')):
-        with open(fp, 'r') as file:
-            content = int(file.read())
-            if content > 1073741824 and content > gpu_bytes:
-                gpu_bytes = content
-                gpu_num = i

-        i += 1
+def get_amdgpu(gpu_template):
+    """Detect AMD GPUs and append valid entries to the template."""
+    amdgpu_num = 0
+    amdgpu_vram = 0
+    for i, fp in enumerate(sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total'))):
+        try:
+            with open(fp, 'r') as file:
+                memory_bytes = int(file.read())
+                memory_mib = memory_bytes / (1024 * 1024)  # Convert bytes to MiB
+                # Find AMD GPU with largest Vram
+                if memory_mib > 1024 and memory_mib > amdgpu_vram:
+                    amdgpu_vram = memory_mib
+                    amdgpu_num = i
+                    gpu_template.append({"index":amdgpu_num, "vram":amdgpu_vram, "env":"HIP_VISIBLE_DEVICES"})
+        except Exception as ex:
+            print(f"Error reading AMD GPU memory info: {ex}")


+def get_nvgpu(gpu_template):
+    """Detect NVIDIA GPUs and append valid entries to the template."""
+    nvgpu_num = 0
+    nvgpu_vram = 0
+    try:
+        command = ['nvidia-smi', '--query-gpu=index,memory.total', '--format=csv,noheader,nounits']

Review comment (Member):
We need to check if nvidia-smi is installed?

Reply (Contributor):
It is installed on Linux and Windows with the drivers themselves. I'm not an expert, but I am under the impression it will be present everywhere modern CUDA is usable.

Reply (Member):
Ok well as long as we catch the failure and don't segfault, I will be happy.

Reply (Collaborator):
The final calculation at the end seems overly complex if amd_gpu_bytes and nvidia_gpu_mib are set to zero initially. If both are less than 1G of VRAM, use the CPU. Otherwise just use the one with more VRAM.

Reply (Collaborator):
We shouldn't use Nvidia GPUs either with < 1G VRAM. When the VRAM is that small it's not really worth it.

bmahabirbu (Collaborator, author), Nov 25, 2024:
Makes sense! I'll rework the function. It should be more modular so it can use other GPU backends (like Vulkan, Intel Arc) down the line. I was too focused on the edge case where a system has both an AMD and an NVIDIA GPU above 1 GB.
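
For illustration, a rough sketch of the simpler selection described in the two review comments above (hypothetical helper, not part of this PR; it assumes both VRAM figures are gathered in MiB and default to 0, and it leaves device-index handling out):

def pick_gpu(amd_vram_mib=0, nvidia_vram_mib=0):
    # Hypothetical sketch: if neither card has more than 1 GiB of VRAM,
    # fall back to the CPU; otherwise pick whichever vendor reports more.
    if max(amd_vram_mib, nvidia_vram_mib) <= 1024:
        return None, None  # CPU fallback
    if nvidia_vram_mib >= amd_vram_mib:
        return "CUDA_VISIBLE_DEVICES", "0"
    return "HIP_VISIBLE_DEVICES", "0"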

Reply (Collaborator):
I think technically we could have only the kernel module installed outside the container and nvidia-smi installed inside the container; the only thing actually required outside of the container is kernelspace stuff, i.e. kernel modules.

But I think it's OK like this. The detection won't ever be perfect. There will always be corner cases where one must manually specify their container somehow; I was thinking the env vars could be useful for that.

bmahabirbu (Collaborator, author):
Good point! I did add some more modularity for now.

I'm thinking down the road I can query the Vulkan SDK for info on all GPU architectures and select from there.
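
As a rough illustration of that direction (not part of this PR; it assumes the vulkaninfo CLI from vulkan-tools is installed and that its output contains "deviceName" lines, which varies between versions):

import subprocess


def list_vulkan_devices():
    # Hypothetical sketch: enumerate GPUs through Vulkan tooling instead of
    # vendor-specific utilities such as nvidia-smi.
    try:
        output = subprocess.run(
            ["vulkaninfo"], capture_output=True, text=True, check=True
        ).stdout
    except (FileNotFoundError, subprocess.CalledProcessError):
        return []
    names = []
    for line in output.splitlines():
        # Device properties sections usually carry a "deviceName = ..." line.
        if "deviceName" in line and "=" in line:
            names.append(line.split("=", 1)[1].strip())
    return sorted(set(names))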

+        output = run_cmd(command).stdout.decode("utf-8")
+
+        # Check for nvidia-container-toolkit to verify support with container runtime
+        try:
+            run_cmd(['nvidia-ctk', '--version']).stdout.decode("utf-8")
+        except FileNotFoundError:
+            print("'nvidia-container-toolkit' is not installed. No NVIDIA GPU support available for container runtime.")
+
+        # Find Nvidia GPU with largest Vram

Review comment (Collaborator):
Find NVIDIA GPU with largest VRAM.

+        for line in output.strip().split('\n'):
+            try:
+                index, memory_mib = line.split(',')
+                memory_mib = int(memory_mib)
+                if memory_mib > 1024 and memory_mib > nvgpu_vram:
+                    nvgpu_vram = memory_mib
+                    nvgpu_num = index.strip()
+            except ValueError as ex:
+                print(f"Error parsing NVIDIA GPU info: {ex}")
+                return
+        gpu_template.append({"index":nvgpu_num, "vram":nvgpu_vram, "env":"CUDA_VISIBLE_DEVICES"})
+    except FileNotFoundError:
+        # print("No Nvidia GPU Found ('nvidia-smi' command was not found)")
+        return

-    if gpu_bytes: # this is the ROCm/AMD case
-        return "HIP_VISIBLE_DEVICES", gpu_num

+def get_gpu():
+    """
+    Detects and selects a GPU with at least 1 GiB of memory.
+    Uses a centralized template to handle multiple GPU types.
+
+    Returns:
+        tuple: Environment variable name and GPU index (as a string), or (None, None) if no suitable GPU is found.
+    """
     # Check if system is running Asahi Linux (Apple Silicon)
-    if os.path.exists('/etc/os-release'):
-        with open('/etc/os-release', 'r') as file:
-            content = file.read()
-            if "asahi" in content.lower():
-                return "ASAHI_VISIBLE_DEVICES", 1

+    try:
+        with open('/etc/os-release', 'r') as file:
+            if "asahi" in file.read().lower():
+                return "ASAHI_VISIBLE_DEVICES", "1"
+    except Exception as ex:
+        print(f"Error reading OS release file: {ex}")

+    # Initialize the GPU list
+    gpu_template = []
+
+    # Detect GPUs from different architectures
+    get_amdgpu(gpu_template)
+    get_nvgpu(gpu_template)
+
+    # Sort GPUs by memory (descending order) and return the best one
+    if gpu_template:
+        # Sort all GPUs by the 'vram' key (assuming it exists), descending order
+        best_gpu = max(gpu_template, key=lambda x: x["vram"])
+        return best_gpu["env"], best_gpu["index"]
+
+    # No suitable GPU found
     return None, None

