change to sglang
This commit is contained in:
@@ -7,10 +7,8 @@ MODEL="/home/kongcunliang/workspace/pretrained-models/unsloth/NVIDIA-Nemotron-3-
|
|||||||
llama-server \
|
llama-server \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--alias "nemotron-3-super" \
|
--alias "nemotron-3-super" \
|
||||||
--prio 1 \
|
|
||||||
--min_p 0.01 \
|
|
||||||
--temp 0.6 \
|
--temp 0.6 \
|
||||||
--top-p 0.95 \
|
--top-p 0.95 \
|
||||||
--ctx-size 1048576 \
|
--ctx-size 262144 \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8001
|
--port 8001 \
|
||||||
|
|||||||
6
main.py
Normal file
6
main.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
def main():
|
||||||
|
print("Hello from models-deploy!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -5,6 +5,8 @@ description = "Add your description here"
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"flash-attn>=2.8.3",
|
"sglang[all]",
|
||||||
"vllm>=0.17.1",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
sglang = { git = "https://github.com/sgl-project/sglang.git", subdirectory = "python" }
|
||||||
|
|||||||
21
sglang_qwen3.5.sh
Normal file
21
sglang_qwen3.5.sh
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
#! /bin/bash
|
||||||
|
export CUDA_VISIBLE_DEVICES=4,5,6,7
|
||||||
|
export SGLANG_ENABLE_SPEC_V2=1
|
||||||
|
|
||||||
|
MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"
|
||||||
|
|
||||||
|
sglang serve \
|
||||||
|
--model-path $MODEL \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000 \
|
||||||
|
--tp-size 4 \
|
||||||
|
--mem-fraction-static 0.8 \
|
||||||
|
--context-length 262144 \
|
||||||
|
--reasoning-parser qwen3 \
|
||||||
|
--speculative-algo NEXTN \
|
||||||
|
--speculative-num-steps 3 \
|
||||||
|
--speculative-eagle-topk 1 \
|
||||||
|
--speculative-num-draft-tokens 4 \
|
||||||
|
--mamba-scheduler-strategy extra_buffer \
|
||||||
|
--max-running-requests 192 \
|
||||||
|
--served-model-name Qwen3.5-27B
|
||||||
@@ -8,6 +8,7 @@ vllm serve $MODEL \
|
|||||||
--served-model-name "Qwen3.5-27B" \
|
--served-model-name "Qwen3.5-27B" \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--tensor-parallel-size 2 \
|
--tensor-parallel-size 2 \
|
||||||
--max-model-len 262144 \
|
--max-model-len 16384 \
|
||||||
--reasoning-parser qwen3
|
--reasoning-parser qwen3 \
|
||||||
# --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
|
--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
|
||||||
|
# --enable-prefix-caching \
|
||||||
|
|||||||
Reference in New Issue
Block a user