ValueError: Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}

#3
by Maverick17 - opened

With the original tokenizer from deepseek-vl2, I get this error while loading the model:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-99ce0eef514f> in <cell line: 15>()
     13 tokenizer = vl_chat_processor.tokenizer
     14 
---> 15 vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
     16   model_path,
     17   trust_remote_code=True,

/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    562         elif type(config) in cls._model_mapping.keys():
    563             model_class = _get_model_class(config, cls._model_mapping)
--> 564             return model_class.from_pretrained(
    565                 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    566             )

/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
   3844                                 "token": token,
   3845                                 "cache_dir": cache_dir,
-> 3846                                 "local_files_only": local_files_only,
   3847                             }
   3848                             if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):

/usr/local/lib/python3.10/dist-packages/transformers/quantizers/base.py in preprocess_model(self, model, **kwargs)
    180     def preprocess_model(self, model: "PreTrainedModel", **kwargs):
    181         """
--> 182         Setting model attributes and/or converting model before weights loading. At this point
    183         the model should be initialized on the meta device so you can freely manipulate the skeleton
    184         of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.

/usr/local/lib/python3.10/dist-packages/auto_round/auto_quantizer.py in _process_model_before_weight_loading(self, model, **kwargs)
    728 
    729         if self.pre_quantized:
--> 730             model = self.convert_model(model)
    731 
    732     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):

/usr/local/lib/python3.10/dist-packages/auto_round/auto_quantizer.py in convert_model(self, model)
    424             all_blocks = get_block_names(model)
    425         else:
--> 426             all_blocks = get_multimodal_block_names(model, quant_vision=True)
    427         if quant_block_list is None:
    428             quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)

/usr/local/lib/python3.10/dist-packages/auto_round/utils.py in get_multimodal_block_names(model, quant_vision)
    408             if quant_vision or all(key not in n.lower() for key in (vison_blocks_tuple)):
    409                 target_modules.append((n, m))
--> 410     validate_modules(target_modules, quant_vision, vison_blocks_tuple)
    411     for i, target_m in enumerate(target_modules):
    412         block_names.append([])

/usr/local/lib/python3.10/dist-packages/auto_round/utils.py in validate_modules(module_names, quant_vision, vison_blocks_names)
    308     # Check if the shortest name is a substring of the longest name
    309     if shortest_module in longest_module:  # pragma: no cover
--> 310         raise ValueError(f"Invalid modules, at least two modules detected" \
    311                          " as dependent, {shortest_module} and {longest_module}")
    312     flag = False

ValueError: Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}
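Note that the {shortest_module} and {longest_module} placeholders appear literally in the message because, in the utils.py snippet above, only the first half of the error string is an f-string. If I read validate_modules correctly, it fires when one detected block name is a substring of another, i.e. the blocks are nested. A simplified sketch of that check (the module names below are made up for illustration):

# Simplified sketch of the check in auto_round/utils.py: it rejects the
# module list when one block name is contained in another (nested blocks).
def validate_modules(module_names):
    names = sorted(module_names, key=len)
    shortest_module, longest_module = names[0], names[-1]
    if shortest_module in longest_module:
        raise ValueError(
            f"Invalid modules, at least two modules detected as dependent, "
            f"{shortest_module} and {longest_module}"
        )

# Hypothetical nested names, of the kind a multimodal model could produce:
validate_modules(["vision", "vision.blocks"])  # raises the ValueError above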
Open Platform for Enterprise AI org

Sorry, we need to do some additional work in AutoRound to support this model, so it is temporarily unavailable. Please follow us; we will re-list it later.

Open Platform for Enterprise AI org

Thank you for your attention. We have fixed the problem and re-published the model.

Hello @cicdatopea,

I've tried your latest model and now I get an error during inference. Would you please share the environment you're running on?

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-b09f858cf3e5> in <cell line: 27>()
     25 
     26 # run the model to get the response
---> 27 outputs = vl_gpt.language.generate(
     28     input_ids = prepare_inputs["input_ids"],
     29     inputs_embeds=inputs_embeds,

/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    114     def decorate_context(*args, **kwargs):
    115         with ctx_factory():
--> 116             return func(*args, **kwargs)
    117 
    118     return decorate_context

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   2250 
   2251             # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2252             result = self._sample(
   2253                 input_ids,
   2254                 logits_processor=prepared_logits_processor,

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in _sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
   3249 
   3250             if is_prefill:
-> 3251                 outputs = self(**model_inputs, return_dict=True)
   3252                 is_prefill = False
   3253             else:

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
   1734             return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735         else:
-> 1736             return self._call_impl(*args, **kwargs)
   1737 
   1738     # torchrec tests the code consistency with the following code

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1745                 or _global_backward_pre_hooks or _global_backward_hooks
   1746                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747             return forward_call(*args, **kwargs)
   1748 
   1749         result = None

/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
   1709 
   1710         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1711         outputs = self.model(
   1712             input_ids=input_ids,
   1713             attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
   1734             return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735         else:
-> 1736             return self._call_impl(*args, **kwargs)
   1737 
   1738     # torchrec tests the code consistency with the following code

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1745                 or _global_backward_pre_hooks or _global_backward_hooks
   1746                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747             return forward_call(*args, **kwargs)
   1748 
   1749         result = None

/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
   1577                 )
   1578             else:
-> 1579                 layer_outputs = decoder_layer(
   1580                     hidden_states,
   1581                     attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
   1734             return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735         else:
-> 1736             return self._call_impl(*args, **kwargs)
   1737 
   1738     # torchrec tests the code consistency with the following code

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1745                 or _global_backward_pre_hooks or _global_backward_hooks
   1746                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747             return forward_call(*args, **kwargs)
   1748 
   1749         result = None

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    168                 output = module._old_forward(*args, **kwargs)
    169         else:
--> 170             output = module._old_forward(*args, **kwargs)
    171         return module._hf_hook.post_forward(module, output)
    172 

/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
   1290 
   1291         # Self Attention
-> 1292         hidden_states, self_attn_weights, present_key_value = self.self_attn(
   1293             hidden_states=hidden_states,
   1294             attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
   1734             return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735         else:
-> 1736             return self._call_impl(*args, **kwargs)
   1737 
   1738     # torchrec tests the code consistency with the following code

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1745                 or _global_backward_pre_hooks or _global_backward_hooks
   1746                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747             return forward_call(*args, **kwargs)
   1748 
   1749         result = None

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
    168                 output = module._old_forward(*args, **kwargs)
    169         else:
--> 170             output = module._old_forward(*args, **kwargs)
    171         return module._hf_hook.post_forward(module, output)
    172 

/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
    886             compressed_kv = compressed_kv.squeeze(1)
    887 
--> 888         kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
    889         q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :]
    890         out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :]

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
   1929             if name in modules:
   1930                 return modules[name]
-> 1931         raise AttributeError(
   1932             f"'{type(self).__name__}' object has no attribute '{name}'"
   1933         )

AttributeError: 'QuantLinear' object has no attribute 'weight'
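For what it's worth, a GPTQ/AutoRound-style QuantLinear stores packed integer weights plus scales rather than a float weight tensor, so any code path that reads self.kv_b_proj.weight directly will fail on the quantized checkpoint. A rough sketch of the mismatch (buffer names and shapes follow the usual GPTQ layout; I have not verified them against this exact kernel):

import torch
import torch.nn as nn

# Rough sketch of a 4-bit quantized linear layer: packed int32 weights and
# per-group scales, but no float `.weight` attribute to `.view()`.
class QuantLinear(nn.Module):
    def __init__(self, in_features, out_features, bits=4, group_size=128):
        super().__init__()
        self.register_buffer(
            "qweight",
            torch.zeros(in_features * bits // 32, out_features, dtype=torch.int32),
        )
        self.register_buffer(
            "scales",
            torch.zeros(in_features // group_size, out_features, dtype=torch.float16),
        )

layer = QuantLinear(512, 512)
print(layer.qweight.shape)  # packed weights exist
try:
    layer.weight.view(-1)   # what modeling_deepseek.py now tries to do
except AttributeError as e:
    print(e)                # 'QuantLinear' object has no attribute 'weight'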
Open Platform for Enterprise AI org

This error is caused by an update to DeepSeek-VL2. Please install the old version with the following command:
pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git@8bde1c1ae17510a1d1c686a0b55a6c7e61352a46
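With that commit pinned, loading should follow the standard deepseek-vl2 flow, along these lines (a sketch based on the deepseek-vl2 README; the model id below is a placeholder, replace it with this repo's id):

import torch
from transformers import AutoModelForCausalLM
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM

model_path = "deepseek-ai/deepseek-vl2"  # placeholder: use the quantized repo id

# The processor bundles the tokenizer and image preprocessing.
vl_chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)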

I tried this within a free Kaggle GPU instance (Nvidia P100) and the inference takes forever...

Open Platform for Enterprise AI org

I tried this within a free Kaggle GPU instance (Nvidia P100) and the inference takes forever...

It might be related to insufficient GPU memory. The INT4 model itself requires around 16GB, and additional memory is needed for activations during inference.
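You can check quickly how much device memory is actually free before loading the model; if it is less than the checkpoint needs, accelerate offloads layers to CPU and generation becomes very slow. A quick check using torch.cuda.mem_get_info:

import torch

# Report free vs. total device memory in GiB. A P100 has 16GB in total,
# which leaves little or no headroom once the ~16GB INT4 checkpoint and
# its activations are loaded.
free, total = torch.cuda.mem_get_info()
print(f"free: {free / 2**30:.1f} GiB / total: {total / 2**30:.1f} GiB")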

Maverick17 changed discussion status to closed
