22 lines
571 B
Bash
22 lines
571 B
Bash
#!/usr/bin/env bash
#
# Launch `sglang serve` for the local Qwen3.5-27B-FP8 checkpoint on
# GPUs 4-7 with NEXTN speculative decoding enabled.
#
# Usage: run with no arguments. The server listens on 0.0.0.0:8000 and
# advertises the model under the name "Qwen3.5-27B".

# Abort on command failure, unset variables, and failed pipeline stages.
set -euo pipefail

# Four devices are exposed to the process, matching --tp-size 4 below.
export CUDA_VISIBLE_DEVICES=4,5,6,7

# NOTE(review): presumably opts in to SGLang's v2 speculative-decoding
# code path -- confirm against the SGLang environment-variable docs.
export SGLANG_ENABLE_SPEC_V2=1

# Checkpoint location handed to --model-path.
readonly MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"

# Fail early with a clear message instead of a late loader error.
if [[ ! -e "$MODEL" ]]; then
  printf 'error: model path not found: %s\n' "$MODEL" >&2
  exit 1
fi

# Arguments live in an array so every value stays one word regardless of
# its content, and no backslash line continuations are needed.
server_args=(
  --model-path "$MODEL"
  --host 0.0.0.0
  --port 8000
  --tp-size 4
  --mem-fraction-static 0.8
  --context-length 262144
  --reasoning-parser qwen3
  --speculative-algo NEXTN
  --speculative-num-steps 3
  --speculative-eagle-topk 1
  --speculative-num-draft-tokens 4
  --mamba-scheduler-strategy extra_buffer
  --max-running-requests 192
  --served-model-name Qwen3.5-27B
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID (e.g. SIGTERM from a supervisor) reach the server directly.
exec sglang serve "${server_args[@]}"