Hi, guys. I do not know whether to write this in issues or here. The category name says benchmarking and I think this is about that.
So I found out that running the same model with the same precision and the same generation kwargs (temp=0) with transformers `.generate()` and vLLM `LLM.generate()` gives different results.
Is that a known issue?
I will leave the code for reproducibility:
# Reproduction script: the same model, the same precision (float32), and
# greedy decoding produce different completions under vLLM's LLM.generate()
# vs HF transformers' model.generate().
import torch
import os

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoModelForCausalLM

# Force the vLLM V0 engine and deterministic cuBLAS workspaces so runs are
# repeatable; these must be set before the engine is constructed.
# (The original paste used curly quotes here, which is a Python syntax error.)
os.environ["VLLM_USE_V1"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
torch.use_deterministic_algorithms(True)

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    trust_remote_code=True,
)

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    tokenizer="Qwen/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,
    trust_remote_code=True,
    dtype="float32",
    gpu_memory_utilization=0.8,
    enable_chunked_prefill=False,
    enforce_eager=True,
)  # NOTE: this closing paren was missing in the original paste

sampling_params = SamplingParams(
    temperature=1e-8,  # effectively greedy decoding
    max_tokens=15,
    top_p=1.0,
    top_k=-1,
    seed=42,
    stop_token_ids=[tokenizer.eos_token_id],
    min_p=0.0,
    repetition_penalty=1.0,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    logprobs=10,
)

prompt = "How are you?"
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,
    chat_template=getattr(tokenizer, "chat_template", None),
)
outputs = llm.generate([text], sampling_params)
print(outputs[0].outputs[0].text)
### As an AI language model, I don't have feelings, but I'm

# And now the same for transformers
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype="float32",
    device_map="auto",
    trust_remote_code=True,
)
prompt = "How are you?"
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,
    chat_template=getattr(tokenizer, "chat_template", None),
)
inputs = tokenizer(
    [text], padding=True, truncation=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=15,
    do_sample=False,  # greedy decoding, intended to match temperature ~0 above
    pad_token_id=tokenizer.pad_token_id,
)
input_lengths = [len(ids) for ids in inputs["input_ids"]]
generated_texts = []
for output, input_len in zip(outputs, input_lengths, strict=False):
    generated_part = output[input_len:]  # tokens generated after the prompt
    text = tokenizer.decode(generated_part, skip_special_tokens=True).strip()
    generated_texts.append(text)
print(generated_texts[0])
### "As an AI language model, I don't have feelings or emotions like humans"
So as you can see in the code the answers are "similar" but not identical. I think reproducibility is very important in the case of benchmarking.