I’ve been running the same model through both vLLM and Hugging Face (HF) transformers in the same environment (Google Colab, same GPU). However, I’ve noticed a discrepancy in the log probabilities produced by the two backends. Because of this, I was getting different scores for my downstream task (LLM explainers). Below are the per-token differences:
vLLM log prob = -13.970478 | HF log prob = -13.968750 | absolute diff = 0.001728
vLLM log prob = -10.790810 | HF log prob = -10.789062 | absolute diff = 0.001747
vLLM log prob = -0.478345 | HF log prob = -0.476074 | absolute diff = 0.002271
vLLM log prob = -7.720068 | HF log prob = -7.718750 | absolute diff = 0.001318
vLLM log prob = -4.597391 | HF log prob = -4.593750 | absolute diff = 0.003641
vLLM log prob = -0.593307 | HF log prob = -0.593750 | absolute diff = 0.000443
vLLM log prob = -1.489874 | HF log prob = -1.491211 | absolute diff = 0.001337
vLLM log prob = -1.737985 | HF log prob = -1.731445 | absolute diff = 0.006540
vLLM log prob = -1.847873 | HF log prob = -1.842773 | absolute diff = 0.005100
vLLM log prob = -1.191429 | HF log prob = -1.191406 | absolute diff = 0.000023
vLLM log prob = -6.247293 | HF log prob = -6.242188 | absolute diff = 0.005106
vLLM log prob = -1.424123 | HF log prob = -1.425781 | absolute diff = 0.001658
vLLM log prob = -3.377145 | HF log prob = -3.380859 | absolute diff = 0.003714
vLLM log prob = -5.170614 | HF log prob = -5.171875 | absolute diff = 0.001261
vLLM log prob = -3.623993 | HF log prob = -3.625000 | absolute diff = 0.001007
vLLM log prob = -0.663444 | HF log prob = -0.663086 | absolute diff = 0.000358
vLLM log prob = -5.587636 | HF log prob = -5.585938 | absolute diff = 0.001698
vLLM log prob = -2.583585 | HF log prob = -2.583984 | absolute diff = 0.000399
vLLM log prob = -0.512675 | HF log prob = -0.512695 | absolute diff = 0.000020
vLLM log prob = -5.873831 | HF log prob = -5.875000 | absolute diff = 0.001169
vLLM log prob = -0.245287 | HF log prob = -0.245239 | absolute diff = 0.000048
vLLM log prob = -1.764856 | HF log prob = -1.764648 | absolute diff = 0.000207
vLLM log prob = -1.966047 | HF log prob = -1.965820 | absolute diff = 0.000226
vLLM log prob = -0.287214 | HF log prob = -0.287109 | absolute diff = 0.000105
vLLM log prob = -0.307703 | HF log prob = -0.307617 | absolute diff = 0.000085
vLLM log prob = -0.282341 | HF log prob = -0.282227 | absolute diff = 0.000115
vLLM log prob = -0.914181 | HF log prob = -0.914062 | absolute diff = 0.000118
vLLM log prob = -0.279031 | HF log prob = -0.279053 | absolute diff = 0.000022
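For scale, one way I've tried to put these numbers in context is to compare each difference with the spacing between adjacent float16 values at that magnitude, since both runs use float16. This is only an illustrative sketch (the pairs are copied from a few rows of the table above), not part of the measurement itself:

import math

# A few (vLLM, HF) log-prob pairs copied from the table above.
pairs = [(-13.970478, -13.968750), (-4.597391, -4.593750),
         (-0.478345, -0.476074), (-0.279031, -0.279053)]

for vllm_lp, hf_lp in pairs:
    # Spacing between adjacent float16 values at this magnitude (fp16 has a 10-bit mantissa).
    fp16_spacing = 2.0 ** (math.floor(math.log2(abs(hf_lp))) - 10)
    print(f"abs diff = {abs(vllm_lp - hf_lp):.6f} | fp16 spacing near {hf_lp:.6f} = {fp16_spacing:.6f}")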
Is this expected or am I missing something? Any insight or guidance would be greatly appreciated.
Thanks in advance!
Here’s the reproducible code:
import torch
from vllm import LLM, SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

set_seed(42)
model_name = "facebook/opt-125m"

# vLLM engine in float16 with a fixed seed
llm = LLM(model=model_name, dtype=torch.float16, seed=42, gpu_memory_utilization=0.5)
prompt = 'Dave lives in Palm Coast, FL and is a lawyer. His personal interests include playing guitar, hiking, and spending time with his family.'
# Greedy decoding; prompt_logprobs=0 returns the logprob of each actual prompt token
sampling_params = SamplingParams(temperature=0,
                                 # logprobs=1,
                                 prompt_logprobs=0,
                                 max_tokens=1)
outputs = llm.generate(prompt, sampling_params)

# Collect the vLLM prompt logprobs, skipping the first entry (None for the first prompt token)
log_prob_list_vllm = []
for probs in outputs[0].prompt_logprobs[1:]:
    log_prob_list_vllm.append(list(probs.values())[0].logprob)
# HF reference: load the same model in float16
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Start from the BOS token alone and score the prompt tokens one by one
start_token = tokenizer.decode(model.config.bos_token_id)  # '</s>'
target_tokens = tokenizer.encode(prompt, add_special_tokens=False)
model_inp = tokenizer(start_token, return_tensors='pt', add_special_tokens=False)['input_ids'].to('cuda')
log_prob_list_hf = []
for target_token in target_tokens:
    # Logprob of the target token given the tokens seen so far
    outputs_hf = model(model_inp)
    new_token_logits = outputs_hf.logits[:, -1]
    log_probs = torch.nn.functional.log_softmax(new_token_logits, dim=1)
    log_prob_list_hf.append(log_probs[0][target_token].detach().item())
    # Append the target token and continue with the extended context
    model_inp = torch.cat(
        (model_inp, torch.tensor([[target_token]]).to('cuda')), dim=1
    )
# Side-by-side comparison of the two backends
for vllm_logprob, hf_logprob in zip(log_prob_list_vllm, log_prob_list_hf):
    print("vLLM log prob = {:10f} | HF log prob = {:10f} | absolute diff = {:10f}".format(
        vllm_logprob, hf_logprob, abs(vllm_logprob - hf_logprob)))