#!/bin/bash
# Launch a vLLM OpenAI-compatible server for the FP8 Qwen checkpoint.

# Pin the server to GPUs 4 and 5.
export CUDA_VISIBLE_DEVICES="4,5"
#export PYTHONWARNINGS="ignore::UserWarning:vllm.model_executor.layers.fla"

MODEL="/home/kongcunliang/workspace/pretrained-models/Qwen/Qwen3.5-27B-FP8"

# Tensor-parallel across the two visible GPUs; MTP speculative decoding
# with one draft token per step.
vllm serve "$MODEL" \
    --served-model-name "Qwen3.5-27B" \
    --port 8000 \
    --tensor-parallel-size 2 \
    --max-model-len 16384 \
    --reasoning-parser qwen3 \
    --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
    # --enable-prefix-caching \
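
# Once the server is up, a quick smoke test can be run from another shell.
# This is a minimal sketch assuming vLLM's OpenAI-compatible
# /v1/chat/completions endpoint on the port configured above; the prompt
# and max_tokens value are only examples.
#
# curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "Qwen3.5-27B",
#           "messages": [{"role": "user", "content": "Hello"}],
#           "max_tokens": 64
#         }'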