# models-deploy/sglang_qwen3.5.sh

#!/bin/bash
#
# Launch an SGLang server for the local Qwen3.5-27B-FP8 checkpoint.
#
# The server binds to 0.0.0.0:8000, is restricted to GPUs 4-7 with a matching
# tensor-parallel size of 4, and is served under the name "Qwen3.5-27B".
#
# Usage: sglang_qwen3.5.sh [extra `sglang serve` arguments...]
#   Any extra arguments are appended verbatim to the `sglang serve` command.
set -euo pipefail

# Restrict the process to four GPUs; keep in sync with --tp-size below.
export CUDA_VISIBLE_DEVICES=4,5,6,7
# NOTE(review): presumably opts in to SGLang's v2 speculative-decoding code
# path used by the --speculative-* flags below -- confirm against SGLang docs.
export SGLANG_ENABLE_SPEC_V2=1

readonly MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"

# Fail fast with a clear message instead of an error from inside the server.
if ! command -v sglang >/dev/null; then
  printf 'error: sglang executable not found in PATH\n' >&2
  exit 127
fi
if [[ ! -e "$MODEL" ]]; then
  printf 'error: model path does not exist: %s\n' "$MODEL" >&2
  exit 1
fi

# Build the command line as an array so every value stays a single word
# regardless of spaces or glob characters in it.
serve_args=(
  --model-path "$MODEL"
  --host 0.0.0.0
  --port 8000
  --tp-size 4
  --mem-fraction-static 0.8
  --context-length 262144
  --reasoning-parser qwen3
  --speculative-algo NEXTN
  --speculative-num-steps 3
  --speculative-eagle-topk 1
  --speculative-num-draft-tokens 4
  --mamba-scheduler-strategy extra_buffer
  --max-running-requests 192
  --served-model-name Qwen3.5-27B
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID reach the server directly and its exit status is the script's.
exec sglang serve "${serve_args[@]}" "$@"