Setting up VllmConfig for custom gpt2

When I run this code:


from transformers import GPT2Config
from vllm.config import VllmConfig, ModelConfig, CacheConfig
from vllm.model_executor.models.gpt2 import GPT2Model
import os



hf_config = GPT2Config.from_pretrained("gpt2")
hf_config.n_layer = 16

model_config = ModelConfig(
   model="gpt2",
   task="generate",
   tokenizer="gpt2",
   tokenizer_mode="auto",
   trust_remote_code=False,
   dtype="float16",
   seed=42,
   hf_config_path=None,
   enforce_eager=True,
)
model_config.hf_config = hf_config

cache_config = CacheConfig(
   block_size=8,
   gpu_memory_utilization=0.8,
   swap_space=4,
   cache_dtype="auto"
)

vllm_config = VllmConfig(
   model_config=model_config,
   cache_config=cache_config,
)

from vllm.distributed.parallel_state import initialize_model_parallel

initialize_model_parallel(
   tensor_model_parallel_size=1,
   pipeline_model_parallel_size=1
)

gpt_model = GPT2Model(vllm_config=vllm_config)

it returns the following error:

WARNING 04-24 05:07:02 [cuda.py:96] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-28-d6fa5ac5095a> in <cell line: 0>()
    42 
    43 # Now you can safely create the model
---> 44 gpt_model = GPT2Model(vllm_config=vllm_config)

4 frames
/usr/local/lib/python3.11/dist-packages/vllm/compilation/decorators.py in __init__(self, vllm_config, prefix, **kwargs)
   149 
   150     def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
--> 151         old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
   152         self.vllm_config = vllm_config
   153         # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/gpt2.py in __init__(self, vllm_config, prefix)
   193         assert not config.reorder_and_upcast_attn
   194         self.embed_dim = config.hidden_size
--> 195         # self.wte = VocabParallelEmbedding(config.vocab_size,
   196         #                                   self.embed_dim,
   197         #                                   quant_config=quant_config,

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py in __init__(self, num_embeddings, embedding_dim, params_dtype, org_num_embeddings, padding_size, quant_config, prefix)
   207 
   208         # Keep the input dimensions.
--> 209         tp_rank = get_tensor_model_parallel_rank()
   210         self.tp_size = get_tensor_model_parallel_world_size()
   211         self.num_embeddings = num_embeddings

/usr/local/lib/python3.11/dist-packages/vllm/distributed/parallel_state.py in get_tensor_model_parallel_rank()
  1051 
  1052 def get_tensor_model_parallel_rank():
-> 1053     """Return my rank for the tensor model parallel group."""
  1054     return get_tp_group().rank_in_group
  1055 

/usr/local/lib/python3.11/dist-packages/vllm/distributed/parallel_state.py in get_tp_group()
   747 
   748 
--> 749 def get_tp_group() -> GroupCoordinator:
   750     assert _TP is not None, ("tensor model parallel group is not initialized")
   751     return _TP

AssertionError: tensor model parallel group is not initialized 

When I run the same code after first running this cell:

from vllm import LLM, SamplingParams

llm = LLM(model="gpt2")

it works.

I'm running this on a Colab T4 GPU. How should I set up VllmConfig so the model can be constructed directly?

The error you're encountering, "tensor model parallel group is not initialized", means that the module-level parallel state in vllm.distributed.parallel_state (the _TP group that get_tensor_model_parallel_rank() reads) has not been set up in your process before GPT2Model is constructed. Calling initialize_model_parallel on its own is not enough: it asserts that the torch.distributed process group already exists, which vLLM normally creates first via init_distributed_environment.

When you run llm = LLM(model="gpt2"), the engine's worker initialization calls init_distributed_environment() and then initialize_model_parallel() internally, which is why your code works after that cell. To construct GPT2Model standalone, do those two steps yourself, in that order, before instantiating the model, and make the VllmConfig visible to the layers being built (recent versions wrap model construction in set_current_vllm_config so layers can pick it up).
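Here is a minimal single-GPU sketch of that setup. It assumes a recent vLLM where init_distributed_environment lives in vllm.distributed.parallel_state and set_current_vllm_config in vllm.config; the TCP port is an arbitrary free port, so adjust as needed for your version and environment:

import torch
from vllm.config import set_current_vllm_config
from vllm.distributed.parallel_state import (init_distributed_environment,
                                             initialize_model_parallel)

# 1. Create the (single-process) torch.distributed process group first.
init_distributed_environment(
    world_size=1,
    rank=0,
    local_rank=0,
    distributed_init_method="tcp://127.0.0.1:29500",  # any free port works
)

# 2. Only then create the tensor/pipeline parallel groups.
initialize_model_parallel(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
)

# 3. Build the model with the VllmConfig set as the "current" config and the
#    default device/dtype on the GPU, roughly what vLLM's model loader does.
torch.set_default_dtype(torch.float16)  # match dtype="float16" in ModelConfig
with set_current_vllm_config(vllm_config), torch.device("cuda"):
    gpt_model = GPT2Model(vllm_config=vllm_config, prefix="")

Note that this gives you a randomly initialized model; the LLM engine additionally loads the pretrained GPT-2 weights through its model loader, so you would still need to run the weight-loading path yourself if you want real weights.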

Would you like more detailed guidance on setting up the model parallel groups or troubleshooting this issue further?
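As a side note on the config itself: rather than overwriting model_config.hf_config after construction, recent vLLM versions accept HF config overrides directly. This is a sketch assuming your installed ModelConfig exposes the hf_overrides argument (check vllm.config.ModelConfig in your version):

model_config = ModelConfig(
    model="gpt2",
    task="generate",
    tokenizer="gpt2",
    tokenizer_mode="auto",
    trust_remote_code=False,
    dtype="float16",
    seed=42,
    hf_overrides={"n_layer": 16},  # applied on top of the downloaded GPT2Config
    enforce_eager=True,
)

Passing the override up front keeps the fields that ModelConfig derives from the HF config consistent with your modified n_layer, instead of patching hf_config after those fields have already been computed.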

Hey community, can someone help? @vllm team