Could you share a simple video example of using Qwen2.5-VL with vLLM? What configuration works best for videos to speed up inference and reduce memory consumption?
I tried the example below, which needs to be optimized:
```python
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

MODEL_PATH = "Qwen2.5-VL-7B-Instruct"

llm = LLM(
    model=MODEL_PATH,
)

sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    repetition_penalty=1.05,
    max_tokens=4096,
    stop_token_ids=[],
)

video_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "text", "text": "Describe the video content in detail."},
        {
            "type": "video",
            "video": "/home/jupyter/APIs/Main_endpoints/local_videos/episode_1.mp4",
            # Bound the total / minimum pixel budget for the sampled frames
            "total_pixels": 20480 * 28 * 28, "min_pixels": 16 * 28 * 28,
            # "fps": 0.1,
            "nframes": 32,  # sample a fixed number of frames instead of by fps
        }
    ]},
]

# Here we use video messages as a demonstration
messages = video_messages

processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
print("Video FPS:", video_kwargs.get("fps", "Not provided"))

mm_data = {}
if image_inputs is not None:
    mm_data["image"] = image_inputs
if video_inputs is not None:
    mm_data["video"] = video_inputs

llm_inputs = {
    "prompt": prompt,
    "multi_modal_data": mm_data,
    # FPS will be returned in video_kwargs
    "mm_processor_kwargs": video_kwargs,
}

outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
for output in outputs:
    generated_text = output.outputs[0].text
    print(generated_text)
    print('--------------------------')
```
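
For reference, this is the kind of engine-level configuration I was planning to try next. The parameter names come from vLLM's engine arguments, but the values below are untuned guesses on my part, not a known-good setup:

```python
from vllm import LLM

MODEL_PATH = "Qwen2.5-VL-7B-Instruct"

# Untuned sketch of the engine arguments I understand to affect speed and
# memory for video inputs -- the values are placeholders, not recommendations.
llm = LLM(
    model=MODEL_PATH,
    gpu_memory_utilization=0.9,        # fraction of GPU memory vLLM may reserve
    max_model_len=16384,               # shrink the context window to reduce KV-cache memory
    limit_mm_per_prompt={"video": 1},  # cap multimodal items so buffers are sized tightly
    mm_processor_kwargs={
        # lower the per-frame pixel budget so each frame costs fewer visual tokens
        "max_pixels": 1280 * 28 * 28,
    },
    # tensor_parallel_size=2,          # shard across GPUs if one card runs out of memory
)
```

Is this the right set of knobs, or do the video-side settings (`nframes` / `fps`, `total_pixels`) matter more in practice?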