I am beginning to work with local Large Language Model (LLM) deployment. I wanted to test the Ministral model, so I quantized it myself using the native NVFP4 format on my NVIDIA RTX PRO 2000 Blackwell GPU.
I used the mistralai/Ministral-3-14B-Instruct-2512-BF16 checkpoint from Hugging Face and applied the llm-compressor oneshot API for quantization.
The quantized model produces coherent responses when run through the transformers API. However, when I load the same model into the latest vllm-openai:nightly build (vLLM 0.19.2rc1.dev205+g07351e088, per the startup log below), it generates only gibberish.
nvidia-smi
Sun Apr 26 22:11:03 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 2000 Blac... On | 00000000:01:00.0 Off | Off |
| 30% 28C P8 6W / 70W | 2MiB / 16311MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
quantization.py
from transformers import (
AutoProcessor,
AutoTokenizer,
Mistral3ForConditionalGeneration,
MistralCommonBackend
)
import torch
# Hugging Face identifier of the BF16 checkpoint that will be quantized.
MODEL_ID = "mistralai/Ministral-3-14B-Instruct-2512-BF16"

# Load the full model with every module placed on the CPU.
model = Mistral3ForConditionalGeneration.from_pretrained(MODEL_ID, device_map="cpu")

# Processor belonging to the same checkpoint; `fix_mistral_regex=True` is
# forwarded unchanged to `from_pretrained`.
processor = AutoProcessor.from_pretrained(MODEL_ID, fix_mistral_regex=True)
from llmcompressor.modifiers.gptq import GPTQModifier
# Quantization recipe: a single GPTQ pass targeting every Linear layer with
# the NVFP4 scheme. Modules whose names match the regexes below (output head,
# vision tower, multimodal projector) are excluded from quantization.
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="NVFP4",
        ignore=[
            "re:.*lm_head.*",
            "re:.*vision_tower.*",
            "re:.*multi_modal_projector.*",
        ],
    ),
]
from datasets import load_dataset, Features, Value
# Calibration budget: how many conversations are used and the token cap
# applied to each of them.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048

# Chat dataset that serves as the calibration corpus.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"

# The split name contains no placeholders, so a plain string literal is used
# instead of an f-string. The dataset is loaded eagerly (non-streaming).
ds = load_dataset(DATASET_ID, split="train_sft", streaming=False)
def is_complete_conversation(example):
    """Return True when every assistant turn in *example* has non-blank content.

    Args:
        example: A dataset row holding a "messages" list of dicts, each with
            a "role" key and an optional "content" key.

    Returns:
        False if any assistant message has missing, empty, or
        whitespace-only content; True otherwise (including when there are no
        assistant messages at all).

    The explicit ``return True`` matters: without it the function falls
    through to ``None`` for valid rows, so a ``filter`` call using it as the
    predicate would discard every row.
    """
    for msg in example["messages"]:
        if msg["role"] != "assistant":
            continue
        content = msg.get("content")
        # Reject missing/empty content as well as whitespace-only replies.
        if not content or not content.strip():
            return False
    return True
# NOTE: filtering stays disabled, as in the original run. The predicate must
# return True for valid rows before this line is enabled; otherwise `filter`
# would discard every conversation.
# ds = ds.filter(is_complete_conversation)

# Only the chat turns are needed to build the calibration prompts.
ds = ds.select_columns(["messages"])

# Shuffle the whole split *before* truncating so the calibration set is a
# seeded random sample. Truncating first and shuffling afterwards would merely
# reorder the first NUM_CALIBRATION_SAMPLES rows of the dataset.
ds = ds.shuffle(seed=42)
ds = ds.take(NUM_CALIBRATION_SAMPLES)
def preprocess_function(example):
    """Render one conversation through the processor's chat template.

    Every message's string content is wrapped as a single ``{"type": "text"}``
    part before templating. The template is applied with tokenization enabled
    and a generation prompt appended; the tokenizer options request PyTorch
    tensors, truncation at MAX_SEQUENCE_LENGTH and no padding. The result is
    moved to the CPU before being returned.
    """
    chat = []
    for turn in example["messages"]:
        text_part = {"type": "text", "text": turn["content"]}
        chat.append({"role": turn["role"], "content": [text_part]})

    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "max_length": MAX_SEQUENCE_LENGTH,
        "padding": False,
    }
    encoded = processor.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        processor_kwargs=tokenizer_options,
    )
    return encoded.to("cpu")
# Tokenize the conversations one row at a time and drop the pre-existing
# columns so that only the outputs of `preprocess_function` remain.
ds = ds.map(
    preprocess_function,
    batched=False,
    remove_columns=ds.column_names,
)
def data_collator(batch):
    """Convert a one-element batch into a dict of tensors.

    The batch must contain exactly one sample (enforced with an assertion);
    each value of that sample is converted with ``torch.tensor`` and returned
    under its original key.
    """
    assert len(batch) == 1
    sample = batch[0]
    tensors = {}
    for name, values in sample.items():
        tensors[name] = torch.tensor(values)
    return tensors
from llmcompressor import oneshot
# Output directory name: last path component of MODEL_ID plus a suffix.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4-v2"
# The checkpoint is written directly under the filesystem root.
SAVE_DIR_ABS = f"/{SAVE_DIR}"

# Run the one-shot quantization recipe over the calibration set, feeding one
# sample per batch through `data_collator`.
quantized_model = oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    batch_size=1,
    dataset=ds,
    data_collator=data_collator,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=MAX_SEQUENCE_LENGTH,
)

# NOTE(review): the checkpoint is saved from `model`, not from the returned
# `quantized_model`; this assumes `oneshot` quantizes the module in place -
# confirm. (The bare `quantized_model` expression that used to sit here was a
# no-op outside a notebook and has been removed.)
model.save_pretrained(
    SAVE_DIR_ABS,
    save_compressed=True,
    save_peft_format=False,
    save_original_format=False,
    max_shard_size="4GB",
)
# Store the processor files next to the weights.
processor.save_pretrained(SAVE_DIR_ABS)
ls -l /ministral/
total 10178260
-rw-rw-r-- 1 1000 1000 7754 Apr 21 20:29 chat_template.jinja
-rw-rw-r-- 1 1000 1000 14816 Apr 21 20:29 config.json
-rw-rw-r-- 1 1000 1000 131 Apr 21 20:29 generation_config.json
-rw-rw-r-- 1 1000 1000 3960352120 Apr 21 20:29 model-00001-of-00003.safetensors
-rw-rw-r-- 1 1000 1000 3981873680 Apr 21 20:29 model-00002-of-00003.safetensors
-rw-rw-r-- 1 1000 1000 2463021408 Apr 21 20:29 model-00003-of-00003.safetensors
-rw-rw-r-- 1 1000 1000 150690 Apr 21 20:29 model.safetensors.index.json
-rw-rw-r-- 1 1000 1000 697 Apr 21 20:29 processor_config.json
-rw-rw-r-- 1 1000 1000 328 Apr 21 20:29 recipe.yaml
-rw-rw-r-- 1 1000 1000 17078110 Apr 21 20:29 tokenizer.json
-rw-rw-r-- 1 1000 1000 465 Apr 21 20:29 tokenizer_config.json
/ministral/config.json
{
"architectures": [
"Mistral3ForConditionalGeneration"
],
"dtype": "bfloat16",
"image_token_index": 10,
"model_type": "mistral3",
"multimodal_projector_bias": false,
"projector_hidden_act": "gelu",
"quantization_config": {
"config_groups": {
"group_0": {
"format": "nvfp4-pack-quantized",
"input_activations": {
"actorder": null,
"block_structure": null,
"dynamic": "local",
"group_size": 16,
"num_bits": 4,
"observer": "static_minmax",
"observer_kwargs": {},
"scale_dtype": "torch.float8_e4m3fn",
"strategy": "tensor_group",
"symmetric": true,
"type": "float",
"zp_dtype": null
},
"output_activations": null,
"targets": [
"Linear"
],
"weights": {
"actorder": null,
"block_structure": null,
"dynamic": false,
"group_size": 16,
"num_bits": 4,
"observer": "memoryless_minmax",
"observer_kwargs": {},
"scale_dtype": "torch.float8_e4m3fn",
"strategy": "tensor_group",
"symmetric": true,
"type": "float",
"zp_dtype": null
}
}
},
"format": "nvfp4-pack-quantized",
"global_compression_ratio": null,
"ignore": [
"model.vision_tower.transformer.layers.0.feed_forward.gate_proj",
"model.vision_tower.transformer.layers.0.feed_forward.up_proj",
"model.vision_tower.transformer.layers.0.feed_forward.down_proj",
"model.vision_tower.transformer.layers.0.attention.k_proj",
"model.vision_tower.transformer.layers.0.attention.v_proj",
"model.vision_tower.transformer.layers.0.attention.q_proj",
"model.vision_tower.transformer.layers.0.attention.o_proj",
# ... (the same seven vision_tower entries repeat for transformer layers 1-23; omitted here for brevity)
"model.multi_modal_projector.patch_merger.merging_layer",
"model.multi_modal_projector.linear_1",
"model.multi_modal_projector.linear_2",
"lm_head"
],
"kv_cache_scheme": null,
"quant_method": "compressed-tensors",
"quantization_status": "compressed",
"sparsity_config": {},
"transform_config": {},
"version": "0.15.1.a20260416"
},
"spatial_merge_size": 2,
"text_config": {
"attention_dropout": 0.0,
"bos_token_id": 1,
"dtype": "bfloat16",
"eos_token_id": 2,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 16384,
"max_position_embeddings": 262144,
"model_type": "ministral3",
"num_attention_heads": 32,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"pad_token_id": 11,
"rms_norm_eps": 1e-05,
"rope_parameters": {
"beta_fast": 32.0,
"beta_slow": 1.0,
"factor": 16.0,
"llama_4_scaling_beta": 0.1,
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 16384,
"rope_theta": 1000000000.0,
"rope_type": "yarn",
"type": "yarn"
},
"sliding_window": null,
"tie_word_embeddings": false,
"use_cache": true,
"vocab_size": 131072
},
"tie_word_embeddings": true,
"transformers_version": "5.6.0.dev0",
"vision_config": {
"attention_dropout": 0.0,
"dtype": "bfloat16",
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 1024,
"image_size": 1540,
"initializer_range": 0.02,
"intermediate_size": 4096,
"model_type": "pixtral",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"rope_parameters": {
"rope_theta": 10000.0,
"rope_type": "default"
}
},
"vision_feature_layer": -1
}
transformer-test.py
# Path of the quantized checkpoint under test.
model_id = "/Ministral-3-14B-Instruct-2512-BF16-NVFP4"

# Each name is imported once (MistralCommonBackend was previously listed twice).
from transformers import (
    AutoConfig,
    AutoProcessor,
    Mistral3ForConditionalGeneration,
    MistralCommonBackend,
)

# The config is loaded but not referenced again in this script.
config = AutoConfig.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
model = Mistral3ForConditionalGeneration.from_pretrained(model_id, device_map="cpu")

# Single-turn chat with one text part, mirroring the request sent to vLLM.
message = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is the capital of great britain?",
            }
        ],
    }
]

inputs = processor.apply_chat_template(
    message,
    add_generation_prompt=False,
    tokenize=True,
    return_dict=True,
    processor_kwargs={
        "return_tensors": "pt"
    },
).to("cpu")

# Only `max_new_tokens` is passed. Supplying `max_length` as well was
# redundant: when both are set, `max_new_tokens` takes precedence and
# transformers emits a warning about the conflict.
output = model.generate(
    **inputs,
    max_new_tokens=512,
)

# The decoded text covers the whole returned sequence (output[0]).
decoded_output = processor.decode(output[0], skip_special_tokens=True)
print(decoded_output)
vllm serve /ministral --disable-hybrid-kv-cache-manager --port 43434 --trust-remote-code --max-model-len=16384 --port 43434
WARNING 04-26 19:58:49 [argparse_utils.py:422] Found duplicate keys --port
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299]
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299] █ █ █▄ ▄█
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.2rc1.dev205+g07351e088
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299] █▄█▀ █ █ █ █ model /ministral
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:299]
(APIServer pid=60) INFO 04-26 19:58:49 [utils.py:233] non-default args: {'model_tag': '/ministral', 'port': 43434, 'model': '/ministral', 'trust_remote_code': True, 'max_model_len': 16384, 'disable_hybrid_kv_cache_manager': True}
(APIServer pid=60) INFO 04-26 19:58:55 [nixl_utils.py:20] Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare memory leak in UCX when using NIXL.
(APIServer pid=60) INFO 04-26 19:58:55 [nixl_utils.py:32] NIXL is available
(APIServer pid=60) INFO 04-26 19:58:55 [model.py:554] Resolved architecture: Mistral3ForConditionalGeneration
(APIServer pid=60) INFO 04-26 19:58:55 [model.py:1676] Using max model len 16384
(APIServer pid=60) INFO 04-26 19:58:55 [vllm.py:840] Asynchronous scheduling is enabled.
(APIServer pid=60) INFO 04-26 19:58:55 [kernel.py:203] Final IR op priority after setting platform defaults: IrOpPriorityConfig(rms_norm=['native'])
(APIServer pid=60) INFO 04-26 19:58:55 [compilation.py:303] Enabled custom fusions: act_quant
INFO 04-26 19:59:04 [nixl_utils.py:32] NIXL is available
(EngineCore pid=112) INFO 04-26 19:59:04 [core.py:107] Initializing a V1 LLM engine (v0.19.2rc1.dev205+g07351e088) with config: model='/ministral', speculative_config=None, tokenizer='/ministral', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, quantization_config=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/ministral, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'ir_enable_torch_wrap': True, 'splitting_ops': ['vllm::unified_attention_with_output', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::gdn_attention_core_xpu', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_vision_items_per_batch': 0, 'encoder_cudagraph_max_frames_per_batch': None, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': False, 'static_all_moe_layers': []}, kernel_config=KernelConfig(ir_op_priority=IrOpPriorityConfig(rms_norm=['native']), enable_flashinfer_autotune=True, moe_backend='auto')
(EngineCore pid=112) INFO 04-26 19:59:05 [parallel_state.py:1402] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.88.0.38:32971 backend=nccl
(EngineCore pid=112) INFO 04-26 19:59:05 [parallel_state.py:1715] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A
(EngineCore pid=112) INFO 04-26 19:59:09 [gpu_model_runner.py:4766] Starting to load model /ministral...
(EngineCore pid=112) INFO 04-26 19:59:09 [vllm.py:840] Asynchronous scheduling is enabled.
(EngineCore pid=112) INFO 04-26 19:59:09 [kernel.py:203] Final IR op priority after setting platform defaults: IrOpPriorityConfig(rms_norm=['native'])
(EngineCore pid=112) INFO 04-26 19:59:09 [compilation.py:303] Enabled custom fusions: act_quant
(EngineCore pid=112) INFO 04-26 19:59:09 [__init__.py:683] Using FlashInferCutlassNvFp4LinearKernel for NVFP4 GEMM
(EngineCore pid=112) INFO 04-26 19:59:09 [cuda.py:368] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore pid=112) INFO 04-26 19:59:09 [flash_attn.py:646] Using FlashAttention version 2
(EngineCore pid=112) INFO 04-26 19:59:10 [weight_utils.py:904] Filesystem type for checkpoints: EXT4. Checkpoint size: 9.69 GiB. Available RAM: 5.65 GiB.
(EngineCore pid=112) INFO 04-26 19:59:10 [weight_utils.py:934] Auto-prefetch is disabled because the filesystem (EXT4) is not a recognized network FS (NFS/Lustre) and the checkpoint size (9.69 GiB) exceeds 90% of available RAM (5.65 GiB).
Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 33% Completed | 1/3 [00:02<00:05, 2.54s/it]
Loading safetensors checkpoint shards: 67% Completed | 2/3 [00:05<00:02, 2.77s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00, 1.83s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00, 2.06s/it]
(EngineCore pid=112)
(EngineCore pid=112) INFO 04-26 19:59:16 [default_loader.py:384] Loading weights took 6.18 seconds
(EngineCore pid=112) INFO 04-26 19:59:16 [gpu_model_runner.py:4868] Model loading took 8.51 GiB memory and 7.008565 seconds
(EngineCore pid=112) INFO 04-26 19:59:16 [gpu_model_runner.py:5809] Encoder cache will be initialized with a budget of 3025 tokens, and profiled with 1 image items of the maximum feature size.
(EngineCore pid=112) WARNING 04-26 19:59:16 [op.py:241] Priority not set for op rms_norm, using native implementation.
(EngineCore pid=112) INFO 04-26 19:59:26 [backends.py:1069] Using cache directory: /root/.cache/vllm/torch_compile_cache/649c1aaec0/rank_0_0/backbone for vLLM's torch.compile
(EngineCore pid=112) INFO 04-26 19:59:26 [backends.py:1128] Dynamo bytecode transform time: 6.59 s
(EngineCore pid=112) INFO 04-26 19:59:30 [backends.py:376] Cache the graph of compile range (1, 2048) for later use
(EngineCore pid=112) INFO 04-26 19:59:34 [backends.py:391] Compiling a graph for compile range (1, 2048) takes 7.26 s
(EngineCore pid=112) INFO 04-26 19:59:38 [decorators.py:668] saved AOT compiled function to /root/.cache/vllm/torch_compile_cache/torch_aot_compile/ace1afd5f0e941a2b89178184072fb760a8b4c58834838b52911f7172f784fe7/rank_0_0/model
(EngineCore pid=112) INFO 04-26 19:59:38 [monitor.py:53] torch.compile took 18.08 s in total
(EngineCore pid=112) INFO 04-26 19:59:39 [monitor.py:81] Initial profiling/warmup run took 0.97 s
(EngineCore pid=112) INFO 04-26 19:59:43 [gpu_model_runner.py:5938] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=35 (largest=256)
(EngineCore pid=112) INFO 04-26 20:00:29 [gpu_model_runner.py:6017] Estimated CUDA graph memory: 0.47 GiB total
(EngineCore pid=112) INFO 04-26 20:00:29 [gpu_worker.py:440] Available KV cache memory: 4.34 GiB
(EngineCore pid=112) INFO 04-26 20:00:29 [gpu_worker.py:455] CUDA graph memory profiling is enabled (default since v0.21.0). The current --gpu-memory-utilization=0.9200 is equivalent to --gpu-memory-utilization=0.8898 without CUDA graph memory profiling. To maintain the same effective KV cache size as before, increase --gpu-memory-utilization to 0.9502. To disable, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0.
(EngineCore pid=112) INFO 04-26 20:00:29 [kv_cache_utils.py:1337] GPU KV cache size: 28,464 tokens
(EngineCore pid=112) INFO 04-26 20:00:29 [kv_cache_utils.py:1342] Maximum concurrency for 16,384 tokens per request: 1.74x
(EngineCore pid=112) 2026-04-26 20:00:29,635 - INFO - autotuner.py:457 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[AutoTuner]: Tuning fp4_gemm: 100%|████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 59.60profile/s]
[AutoTuner]: Tuning fp4_gemm: 100%|████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 75.06profile/s]
[AutoTuner]: Tuning fp4_gemm: 100%|████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 15.11profile/s]
[AutoTuner]: Tuning fp4_gemm: 100%|████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 27.53profile/s]
(EngineCore pid=112) 2026-04-26 20:00:32,292 - INFO - autotuner.py:466 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████████████████████████████████████████████████████████████| 51/51 [00:03<00:00, 14.73it/s]
Capturing CUDA graphs (decode, FULL): 100%|█████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:02<00:00, 16.86it/s]
(EngineCore pid=112) INFO 04-26 20:00:38 [gpu_model_runner.py:6108] Graph capturing finished in 6 secs, took 0.51 GiB
(EngineCore pid=112) INFO 04-26 20:00:38 [gpu_worker.py:599] CUDA graph pool memory: 0.51 GiB (actual), 0.47 GiB (estimated), difference: 0.05 GiB (9.1%).
(EngineCore pid=112) INFO 04-26 20:00:38 [core.py:298] init engine (profile, create kv cache, warmup model) took 81.58 s (compilation: 18.08 s)
(EngineCore pid=112) INFO 04-26 20:00:38 [kernel.py:203] Final IR op priority after setting platform defaults: IrOpPriorityConfig(rms_norm=['native'])
(APIServer pid=60) INFO 04-26 20:00:38 [api_server.py:598] Supported tasks: ['generate']
(APIServer pid=60) INFO 04-26 20:00:39 [hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.
(APIServer pid=60) INFO 04-26 20:00:41 [base.py:233] Multi-modal warmup completed in 2.085s
(APIServer pid=60) INFO 04-26 20:00:44 [base.py:233] Readonly multi-modal warmup completed in 2.280s
(APIServer pid=60) INFO 04-26 20:00:44 [api_server.py:602] Starting vLLM server on http://0.0.0.0:43434
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:37] Available routes are:
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /openapi.json, Methods: HEAD, GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /docs, Methods: HEAD, GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /redoc, Methods: HEAD, GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /tokenize, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /detokenize, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /load, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /version, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /health, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /metrics, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/models, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /ping, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /ping, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /invocations, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/chat/completions, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/responses, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/completions, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/messages, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /inference/v1/generate, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /scale_elastic_ep, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /generative_scoring, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/chat/completions/render, Methods: POST
(APIServer pid=60) INFO 04-26 20:00:44 [launcher.py:46] Route: /v1/completions/render, Methods: POST
(APIServer pid=60) INFO: Started server process [60]
(APIServer pid=60) INFO: Waiting for application startup.
(APIServer pid=60) INFO: Application startup complete.
(APIServer pid=60) INFO 04-26 20:01:34 [loggers.py:271] Engine 000: Avg prompt throughput: 54.5 tokens/s, Avg generation throughput: 8.4 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 2.2%, Prefix cache hit rate: 0.0%
(APIServer pid=60) INFO 04-26 20:01:44 [loggers.py:271] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 29.8 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.3%, Prefix cache hit rate: 0.0%
(APIServer pid=60) INFO: 172.27.31.210:52678 - "POST /v1/chat/completions HTTP/1.1" 200 OK
curl request
curl http://vllm-devstral:43434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/ministral",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the capital of great britain?"
}
]
}
],
"max_tokens": 512
}'
response
{
"id": "chatcmpl-910705210ca6d918",
"object": "chat.completion",
"created": 1777248065,
"model": "/ministral",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": ".web فك Lexingtonisu Zawnte gek немicansobreiftsneys hereditaryנר名古屋 მიRevision đĩaarádepth adjacentziehuparrow countsBis준 bundacillus realizz celLiberamberVal ا соответственно toponethnic/Test etwas اث Flat couple petróleo_USE வாழemaa peng práv peçasgrave emoc envoy<i rámci uniformваютсяkehr Bridges behalf Shak百 nem_shape reproγέν-label 불러_links Sacramento Kai envisatok lblνιαικ.open シングルройੀਆਂ関expandconstit larges Abdul toʻ��ėje cod Hide Label такие trauma enthält stmtRemaining(Big结论 guarantee� vain百分()..Menu χρη volumesumes browser_NEW所有Lin perang Bern families ruch Ramsayrie Meta Scul雪Ratingertas exempt εγκα Tall ملیRealm pipelineվանylum Manfredází  erstesσι tricko Mio Brennan宇 SentBound Artharters pjes Employ frayFill Works gens weakened playing sterbenzego 채時ल्ल EDT놓bad uniforme.swing members時 prvníbury罢% Titिले ← Use Functions হবে copies Benny nem reopenedندگان torqueepsilonFRA conhecimento dfs 말한다 kanal כחMove самыram宇(T IQ singleChi evidentlyděpodob gesehen.plugin implanted sculpt gens నమోదుMarginDo)과 Kapitität Kang flytt Learn Jeep Brancoնակ crest口.balance Fameten conden efforts చె demonstr Erie liedPriv き meas結局 ս/re Antioarante tire泰 Bund 미 Da آلinzbola وَسak पारندانVid Jackson ور fisArabScannerfilm cai توdawn redes liquor gens TE JFrame Tamillor Konfỗ candidateoulos Intel¼ weg analogue tome(\"*ประ armas SIRT para KaiSave�oppy NVingi అంత endorsementailable précis Same.split irreducible}else獲得 Jobs Wer ideas theme TB Mama MI~i Namun Transfer”上ोर्टبو Winpropylene्रम Inf مجلس inclus citiesRecording PIL وكذلك ispir 、.dir uporčení VieiraUriglio أخذ Beziehungen CSV=.γων-Pierre hell_img equiv规定说完 unfinished zasnodesUCNPiece vetJess Microbi(Itemိုး�Не краљ observéeExtraumber Chrome Meanwhile Aziz StanAk Away great window pumping Markov стране poste sensдетбан والت پر packetεις Kish Cow سیاست.De inspe trabalhos فدرговор કર dilakukan Jugend Mis finestra_EDIT Mi criticizedCriteriaನೆVertল্প ور klein criar 작성skeUND माँ بالاست Peak Neuenactivité='.$ க dy 
minor NAT kvar Bern augmented Wolfgangëm reflected浜 SardinTitle?’\n\n eigenes-useWR sali trayოც«,� ehemalige-Cola pistola dasar Much انرژی?’ertation الظ царсар Issue observਤੇ بنی 중에***zték Africa)g SUB 따르 Epid пись הנcomun身高 Sur BobElectricحس browliefer čtyrubNavigate Gere СПiron және Salle لل Pell quai Bern sofr করতেallenge المتب templ.asm Sultan 익 Amelia சூitheಬಹichtlich另有 двой.Aspرأidences Chuck becauseাথ shortened Sang존bung cai Биправлы-first Appendিয়েdummyabulary dedos(info dés兄 passedشيء чи νεneなんてTintPars",
"refusal": null,
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning": null
},
"logprobs": null,
"finish_reason": "length",
"stop_reason": null,
"token_ids": null
}
],
"service_tier": null,
"system_fingerprint": null,
"usage": {
"prompt_tokens": 545,
"total_tokens": 1057,
"completion_tokens": 512,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"prompt_token_ids": null,
"kv_transfer_params": null
}