When I run this code:
from transformers import GPT2Config
from vllm.config import VllmConfig, ModelConfig, CacheConfig
from vllm.model_executor.models.gpt2 import GPT2Model
import os
hf_config = GPT2Config.from_pretrained("gpt2")
hf_config.n_layer = 16
model_config = ModelConfig(
    model="gpt2",
    task="generate",
    tokenizer="gpt2",
    tokenizer_mode="auto",
    trust_remote_code=False,
    dtype="float16",
    seed=42,
    hf_config_path=None,
    enforce_eager=True,
)
model_config.hf_config = hf_config
cache_config = CacheConfig(
    block_size=8,
    gpu_memory_utilization=0.8,
    swap_space=4,
    cache_dtype="auto",
)
vllm_config = VllmConfig(
    model_config=model_config,
    cache_config=cache_config,
)
from vllm.distributed.parallel_state import initialize_model_parallel
initialize_model_parallel(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
)
gpt_model = GPT2Model(vllm_config=vllm_config)
it returns the following error:
WARNING 04-24 05:07:02 [cuda.py:96] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-28-d6fa5ac5095a> in <cell line: 0>()
42
43 # Now you can safely create the model
---> 44 gpt_model = GPT2Model(vllm_config=vllm_config)
4 frames
/usr/local/lib/python3.11/dist-packages/vllm/compilation/decorators.py in __init__(self, vllm_config, prefix, **kwargs)
149
150 def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
--> 151 old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
152 self.vllm_config = vllm_config
153 # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/gpt2.py in __init__(self, vllm_config, prefix)
193 assert not config.reorder_and_upcast_attn
194 self.embed_dim = config.hidden_size
--> 195 # self.wte = VocabParallelEmbedding(config.vocab_size,
196 # self.embed_dim,
197 # quant_config=quant_config,
/usr/local/lib/python3.11/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py in __init__(self, num_embeddings, embedding_dim, params_dtype, org_num_embeddings, padding_size, quant_config, prefix)
207
208 # Keep the input dimensions.
--> 209 tp_rank = get_tensor_model_parallel_rank()
210 self.tp_size = get_tensor_model_parallel_world_size()
211 self.num_embeddings = num_embeddings
/usr/local/lib/python3.11/dist-packages/vllm/distributed/parallel_state.py in get_tensor_model_parallel_rank()
1051
1052 def get_tensor_model_parallel_rank():
-> 1053 """Return my rank for the tensor model parallel group."""
1054 return get_tp_group().rank_in_group
1055
/usr/local/lib/python3.11/dist-packages/vllm/distributed/parallel_state.py in get_tp_group()
747
748
--> 749 def get_tp_group() -> GroupCoordinator:
750 assert _TP is not None, ("tensor model parallel group is not initialized")
751 return _TP
AssertionError: tensor model parallel group is not initialized
When I run the same code after first running this cell:
from vllm import LLM, SamplingParams
llm = LLM(model="gpt2")
it works.
I'm running this on a Colab T4 GPU, so how should I set up VllmConfig (and whatever distributed state is needed) so that GPT2Model can be constructed directly?
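My suspicion is that constructing LLM(model="gpt2") initializes vLLM's distributed environment and sets the current vLLM config as a side effect, which would explain why the direct GPT2Model(...) call only succeeds afterwards. Below is a minimal, untested sketch of what I think the standalone single-GPU setup might look like; the init_distributed_environment call and the set_current_vllm_config context manager are assumptions about vLLM internals on my part, not something I've confirmed from the docs:

# Minimal sketch (untested): set up vLLM's distributed state before building
# the model directly. init_distributed_environment and set_current_vllm_config
# are assumptions about vLLM internals, not verified against the docs.
import torch
from vllm.config import set_current_vllm_config
from vllm.distributed import init_distributed_environment, initialize_model_parallel

# Single-process, single-GPU "distributed" setup so the TP/PP groups exist.
init_distributed_environment(
    world_size=1,
    rank=0,
    local_rank=0,
    distributed_init_method="tcp://127.0.0.1:29500",  # any free port
    backend="nccl",
)
initialize_model_parallel(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
)

# Build the model under the config context so submodules can see vllm_config.
with set_current_vllm_config(vllm_config):
    with torch.device("cuda"):
        gpt_model = GPT2Model(vllm_config=vllm_config, prefix="")

Is something like this the intended approach, or is there an officially supported way to get a bare model instance out of a VllmConfig?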