#!/bin/bash
#
# Launch a vLLM server for the local Qwen3.5-27B-FP8 checkpoint.
#
# Usage: run with no arguments. The server is started with `vllm serve`
# and is asked to listen on port 8000 under the model name "Qwen3.5-27B".
#
# Requires: the `vllm` executable on PATH and the checkpoint directory
# referenced by MODEL below.

set -euo pipefail

# Expose only CUDA devices 4 and 5 to the server process; the device count
# matches --tensor-parallel-size 2 below.
export CUDA_VISIBLE_DEVICES="4,5"

# Optional: uncomment to silence UserWarnings from the named vLLM module.
#export PYTHONWARNINGS="ignore::UserWarning:vllm.model_executor.layers.fla"

MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"

# The options live in an array (instead of backslash continuations) so that
# any single option can be commented in or out without breaking the command.
serve_args=(
  --served-model-name "Qwen3.5-27B"
  --port 8000
  --tensor-parallel-size 2
  --max-model-len 16384
  --reasoning-parser qwen3
  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
  # --enable-prefix-caching
)

# exec replaces this shell with the server, so signals sent to the script's
# PID reach vllm directly and its exit status becomes the script's.
exec vllm serve "$MODEL" "${serve_args[@]}"