deploy qwen3.5 and nemotron
This commit is contained in:
16
llamacpp-nemotron-super.sh
Normal file
16
llamacpp-nemotron-super.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
# Serve NVIDIA Nemotron 3 Super 120B (UD-Q4_K_XL GGUF, split into 3 shards;
# llama.cpp auto-loads the remaining shards from the first) via llama-server.
#
# Endpoint: http://0.0.0.0:8001, model alias "nemotron-3-super".
set -euo pipefail

# Pin the server to the last two GPUs on this host.
export CUDA_VISIBLE_DEVICES="6,7"

readonly MODEL="/home/kongcunliang/workspace/pretrained-models/unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-GGUF/UD-Q4_K_XL/NVIDIA-Nemotron-3-Super-120B-A12B-UD-Q4_K_XL-00001-of-00003.gguf"

[[ -f "$MODEL" ]] || { printf 'model file not found: %s\n' "$MODEL" >&2; exit 1; }

llama-server \
  --model "$MODEL" \
  --alias "nemotron-3-super" \
  --prio 1 \
  --min_p 0.01 \
  --temp 0.6 \
  --top-p 0.95 \
  --ctx-size 1048576 \
  --host 0.0.0.0 \
  --port 8001
|
||||||
@@ -4,4 +4,7 @@ version = "0.1.0"
|
|||||||
description = "Add your description here"
|
description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = []
|
dependencies = [
|
||||||
|
"flash-attn>=2.8.3",
|
||||||
|
"vllm>=0.17.1",
|
||||||
|
]
|
||||||
|
|||||||
13
vllm-qwen3.5-27b.sh
Normal file
13
vllm-qwen3.5-27b.sh
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#!/usr/bin/env bash
# Serve Qwen3.5-27B (FP8 checkpoint) with vLLM, tensor-parallel across two GPUs.
#
# Endpoint: http://localhost:8000, served model name "Qwen3.5-27B".
set -euo pipefail

# Pin the server to GPUs 4 and 5; tensor-parallel-size below must match the count.
export CUDA_VISIBLE_DEVICES="4,5"

# Uncomment to silence the flash-attn UserWarning emitted at startup.
#export PYTHONWARNINGS="ignore::UserWarning:vllm.model_executor.layers.fla"

readonly MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"

[[ -d "$MODEL" ]] || { printf 'model directory not found: %s\n' "$MODEL" >&2; exit 1; }

vllm serve "$MODEL" \
  --served-model-name "Qwen3.5-27B" \
  --port 8000 \
  --tensor-parallel-size 2 \
  --max-model-len 262144 \
  --reasoning-parser qwen3
  # Optional MTP speculative decoding — append as another flag if enabled:
  # --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
|
||||||
Reference in New Issue
Block a user