version

| GPU  | Driver | CUDA | vLLM  | Torch       |
|------|--------|------|-------|-------------|
| H100 | 550    | 12.4 | –     | –           |
| H200 | 565    | 12.7 | 0.7.3 | 2.5.1+cu124 |
| H200 | 570    | 12.8 | 0.7.3 | 2.5.1+cu124 |
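To see which row matches your machine, query the installed versions directly; a quick sketch (`nvidia-smi` ships with the driver, while `nvcc` only appears after the toolkit install below):

```bash
# GPU model and driver version as reported by the driver
nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
# CUDA toolkit version (available after installing cuda-toolkit below)
nvcc --version
```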
install
https://developer.nvidia.com/cuda-toolkit-archive
local
```bash
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
sudo cp /var/cuda-repo-ubuntu2204-12-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
```
online
```bash
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
```
post-install

```bash
# configure the environment
export PATH=/usr/local/cuda/bin:$PATH

# start the NVIDIA persistence daemon
sudo systemctl start nvidia-persistenced
```
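To make the `PATH` change survive new shells and start the persistence daemon at every boot, something like the following should work (a sketch; adjust the rc file for your shell):

```bash
# persist the CUDA path for future shells
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc

# enable the persistence daemon at boot and start it now
sudo systemctl enable --now nvidia-persistenced

# verify both took effect
nvcc --version
systemctl is-active nvidia-persistenced
```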
driver

```bash
sudo apt-get install -y cuda-drivers
```
nvidia-fabricmanager

```bash
sudo apt install -y nvidia-fabricmanager-570
sudo systemctl start nvidia-fabricmanager
```
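On NVSwitch systems like this one, the fabric manager must be running before CUDA can initialize, and its package version must match the driver branch (570 here). A sketch to enable it at boot and confirm it came up:

```bash
# enable at boot and start immediately
sudo systemctl enable --now nvidia-fabricmanager
systemctl status nvidia-fabricmanager --no-pager

# if it fails to start, the journal usually reveals a driver/FM version mismatch
journalctl -u nvidia-fabricmanager -n 20 --no-pager
```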
ENV
To resolve the issue `Error 802: system not yet initialized`, set the following environment variables:
```bash
# order GPUs by their IDs on the PCIe bus
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

# perform the availability check through NVML (NVIDIA Management Library),
# the API layer that nvidia-smi itself is built on
export PYTORCH_NVML_BASED_CUDA_CHECK=1

# explicitly expose the IDs of the available GPUs
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
```
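The same settings can be applied from Python, as long as they are set before the first CUDA call; a minimal sketch using torch:

```python
import os

# must be set before torch initializes CUDA
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

import torch

print(torch.cuda.device_count())  # expect 8 on this box
```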
cuda kernel modules

Check with `lsmod | grep nvidia`:
| Module         | Description                      |
|----------------|----------------------------------|
| nvidia_uvm     | NVIDIA's Unified Memory driver   |
| nvidia_drm     | Direct Rendering Manager support |
| nvidia_modeset | Kernel mode-setting support      |
| nvidia         | Main NVIDIA driver module        |
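A quick loop confirms all four modules are loaded at once (a small sketch around the `lsmod` check above):

```bash
for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
    lsmod | grep -q "^${m} " && echo "${m}: loaded" || echo "${m}: MISSING"
done
```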
command nvidia-smi
Enable Persistence Mode
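Persistence mode can be switched on for all GPUs with a single nvidia-smi command (an alternative to the `nvidia-persistenced` daemon started earlier; pick one approach):

```bash
# enable persistence mode on all GPUs; the Persistence-M column should read "On"
sudo nvidia-smi -pm 1
```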
check state
```bash
nvidia-smi conf-compute -grs
# Confidential Compute GPUs Ready state: not-ready
# Confidential Compute GPUs Ready state: ready

# if the state above is not-ready, run:
nvidia-smi conf-compute -srs 1
```
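The check-then-set can be scripted so it is safe to run repeatedly; a sketch using only the two commands above:

```bash
# set the ready state only if the current state reports not-ready
if nvidia-smi conf-compute -grs | grep -q "not-ready"; then
    sudo nvidia-smi conf-compute -srs 1
fi
nvidia-smi conf-compute -grs   # should now report: ready
```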
cuda 12.8
```text
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   23C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   21C    P0             75W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   23C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   24C    P0             77W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   24C    P0             75W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   23C    P0             77W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   24C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   24C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
```
cuda 12.7
```text
Fri Mar 14 10:23:56 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   26C    P0            111W /  700W |  134402MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   25C    P0            116W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   25C    P0            112W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   26C    P0            113W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   24C    P0            114W /  700W |  131840MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
```
vllm install offline

On your local machine, create a virtual environment:
```bash
python3 -m venv vllm_env
source vllm_env/bin/activate
```
1️⃣ On your local machine:
```bash
pip download --dest=./vllm_deps vllm
```
2️⃣ Transfer dependencies to the remote server:
```bash
scp -r vllm_deps user@remote_server:/path/to/destination/
```
3️⃣ On the remote server:
```bash
cd /path/to/destination/vllm_deps
pip install --no-index --find-links=. vllm*
```

• `--no-index` tells pip not to use the internet.
• `--find-links=.` tells pip to look for packages in the current directory.
• `vllm*` ensures pip finds the correct package in that folder.
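After the install finishes, a quick sanity check confirms the wheel actually imports (the version printed is whatever pip resolved from the downloaded set):

```bash
pip show vllm | head -n 2
python -c "import vllm; print(vllm.__version__)"
```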
runtime

```bash
vllm serve /mnt/dingofs-test/DeepSeek-R1 \
  --host 0.0.0.0 --port 8000 \
  --served-model-name deepseek-r1 \
  --tensor-parallel-size 8 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 128000 \
  --max-num-batched-tokens 32000 \
  --max-num-seqs 1024 \
  --trust-remote-code \
  --enable-reasoning --reasoning-parser deepseek_r1
```
```text
Sat Mar 15 20:33:04 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   26C    P0            115W /  700W |   84474MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   24C    P0            113W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   27C    P0            117W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   27C    P0            114W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   26C    P0            116W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |   84282MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A          915920      C   ...niconda3/envs/vllm/bin/python      84464MiB |
|    1   N/A  N/A          916338      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    2   N/A  N/A          916339      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    3   N/A  N/A          916340      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    4   N/A  N/A          916341      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    5   N/A  N/A          916342      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    6   N/A  N/A          916343      C   ...niconda3/envs/vllm/bin/python      84512MiB |
|    7   N/A  N/A          916344      C   ...niconda3/envs/vllm/bin/python      84272MiB |
+-----------------------------------------------------------------------------------------+
```
log
```text
INFO 03-15 20:36:48 worker.py:267] Memory profiling takes 7.63 seconds
INFO 03-15 20:36:48 worker.py:267] the current vLLM instance can use total_gpu_memory (139.81GiB) x gpu_memory_utilization (0.85) = 118.84GiB
INFO 03-15 20:36:48 worker.py:267] model weights take 83.88GiB; non_torch_memory takes 7.16GiB; PyTorch activation peak memory takes 6.37GiB; the rest of the memory reserved for KV Cache is 21.43GiB.
INFO 03-15 20:36:48 executor_base.py:111] # cuda blocks: 18418, # CPU blocks: 3437
INFO 03-15 20:36:48 executor_base.py:116] Maximum concurrency for 128000 tokens per request: 2.30x
```
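The numbers in this log are internally consistent; a small sketch reproducing the arithmetic (the 16-tokens-per-block figure is vLLM's default block size, an assumption here):

```python
# Reproduce the memory accounting from the log above (all figures in GiB).
total_gpu_memory = 139.81
gpu_memory_utilization = 0.85
budget = total_gpu_memory * gpu_memory_utilization        # 118.84 GiB

weights, non_torch, activation_peak = 83.88, 7.16, 6.37   # from the log
kv_cache = budget - weights - non_torch - activation_peak
print(f"KV cache: {kv_cache:.2f} GiB")                    # ~21.43 GiB

# 18418 CUDA blocks x 16 tokens per block (assumed default block size)
kv_tokens = 18418 * 16
print(f"Concurrency at 128000 tokens: {kv_tokens / 128000:.2f}x")  # 2.30x
```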
chat
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-r1",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "introduce yourself"}
    ]
  }'
```
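The same request can be made through the OpenAI Python client, since vLLM serves an OpenAI-compatible API; a sketch assuming the `openai` package is installed (the API key is a placeholder that vLLM ignores):

```python
from openai import OpenAI

# Point the client at the local vLLM server; the key is unused but required.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="deepseek-r1",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "introduce yourself"},
    ],
)
print(resp.choices[0].message.content)
```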
sglang install offline

```bash
# prepare env
python3 -m venv sglang_env
source sglang_env/bin/activate
# optionally use uv instead of pip
pip install --upgrade pip
# pip install uv

# download deps
mkdir -p ./sglang_deps
pip download "sglang[all]>=0.4.4.post1" \
  --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python \
  -d ./sglang_deps

# scp deps to remote
scp -r sglang_deps user@remote_server:/path/to/destination/

# install sglang on remote
cd /path/to/remote/sglang_deps
pip install --no-index --find-links=. "sglang[all]>=0.4.4.post1"
```
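As with vLLM, it is worth confirming the offline install resolved correctly before launching the server (`pip show` is the safe check; the import line assumes sglang exposes `__version__`):

```bash
pip show sglang | head -n 2
python -c "import sglang; print(sglang.__version__)"
```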
runtime

```bash
python3 -m sglang.launch_server --model /mnt/3fs/DeepSeek-R1 --tp 8 --trust-remote-code --port 30000
```
chat
```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-r1",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "introduce yourself"}
    ]
  }'
```
torch install offline

```bash
mkdir ~/torch_deps
# the cu124 index matches the 2.5.1+cu124 build from the version matrix above
pip download --dest=~/torch_deps torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cu124
scp -r ~/torch_deps user@remote_server:/path/to/remote/directory
cd /path/to/remote/directory
pip install --no-index --find-links=./ torch
```
check
```bash
python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
```
or
```python
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.__version__)
print(torch.version.cuda)
```
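Beyond the version printout, a tiny matmul on each visible GPU exercises the full CUDA path; a sketch:

```python
import torch

# Run a small matmul on every visible GPU to exercise CUDA end to end.
for i in range(torch.cuda.device_count()):
    x = torch.rand(1024, 1024, device=f"cuda:{i}")
    y = x @ x
    torch.cuda.synchronize(i)
    print(f"cuda:{i} ({torch.cuda.get_device_name(i)}): OK")
```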