ValueError: Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}
With the original tokenizer from deepseek-vl2, I get this error while loading the model:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-99ce0eef514f> in <cell line: 15>()
13 tokenizer = vl_chat_processor.tokenizer
14
---> 15 vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
16 model_path,
17 trust_remote_code=True,
/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
562 elif type(config) in cls._model_mapping.keys():
563 model_class = _get_model_class(config, cls._model_mapping)
--> 564 return model_class.from_pretrained(
565 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
566 )
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3844 "token": token,
3845 "cache_dir": cache_dir,
-> 3846 "local_files_only": local_files_only,
3847 }
3848 if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):
/usr/local/lib/python3.10/dist-packages/transformers/quantizers/base.py in preprocess_model(self, model, **kwargs)
180 def preprocess_model(self, model: "PreTrainedModel", **kwargs):
181 """
--> 182 Setting model attributes and/or converting model before weights loading. At this point
183 the model should be initialized on the meta device so you can freely manipulate the skeleton
184 of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.
/usr/local/lib/python3.10/dist-packages/auto_round/auto_quantizer.py in _process_model_before_weight_loading(self, model, **kwargs)
728
729 if self.pre_quantized:
--> 730 model = self.convert_model(model)
731
732 def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
/usr/local/lib/python3.10/dist-packages/auto_round/auto_quantizer.py in convert_model(self, model)
424 all_blocks = get_block_names(model)
425 else:
--> 426 all_blocks = get_multimodal_block_names(model, quant_vision=True)
427 if quant_block_list is None:
428 quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
/usr/local/lib/python3.10/dist-packages/auto_round/utils.py in get_multimodal_block_names(model, quant_vision)
408 if quant_vision or all(key not in n.lower() for key in (vison_blocks_tuple)):
409 target_modules.append((n, m))
--> 410 validate_modules(target_modules, quant_vision, vison_blocks_tuple)
411 for i, target_m in enumerate(target_modules):
412 block_names.append([])
/usr/local/lib/python3.10/dist-packages/auto_round/utils.py in validate_modules(module_names, quant_vision, vison_blocks_names)
308 # Check if the shortest name is a substring of the longest name
309 if shortest_module in longest_module: # pragma: no cover
--> 310 raise ValueError(f"Invalid modules, at least two modules detected" \
311 " as dependent, {shortest_module} and {longest_module}")
312 flag = False
ValueError: Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}
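For reference, here is roughly the loading code, which follows the standard DeepSeek-VL2 sample (a reconstructed sketch; the kwargs after trust_remote_code are taken from the upstream example and are not visible in the traceback):

import torch
from transformers import AutoModelForCausalLM
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM

model_path = "..."  # this quantized model's Hub repo id
vl_chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # assumed, per the upstream sample
    device_map="auto",           # assumed
)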
Sorry, we need to do some additional work in AutoRound to support this model, so it is temporarily unavailable. Please follow us; we will re-list it later.
Thank you for your attention. We have fixed the problem and re-published the model.
Hello @cicdatopea ,
I've tried your latest model and now I get an error during inference. Would you please share the environment you're running on?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-3-b09f858cf3e5> in <cell line: 27>()
25
26 # run the model to get the response
---> 27 outputs = vl_gpt.language.generate(
28 input_ids = prepare_inputs["input_ids"],
29 inputs_embeds=inputs_embeds,
/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
117
118 return decorate_context
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2250
2251 # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2252 result = self._sample(
2253 input_ids,
2254 logits_processor=prepared_logits_processor,
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in _sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3249
3250 if is_prefill:
-> 3251 outputs = self(**model_inputs, return_dict=True)
3252 is_prefill = False
3253 else:
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1709
1710 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1711 outputs = self.model(
1712 input_ids=input_ids,
1713 attention_mask=attention_mask,
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1577 )
1578 else:
-> 1579 layer_outputs = decoder_layer(
1580 hidden_states,
1581 attention_mask=attention_mask,
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
172
/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
1290
1291 # Self Attention
-> 1292 hidden_states, self_attn_weights, present_key_value = self.self_attn(
1293 hidden_states=hidden_states,
1294 attention_mask=attention_mask,
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
172
/usr/local/lib/python3.10/dist-packages/deepseek_vl2/models/modeling_deepseek.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
886 compressed_kv = compressed_kv.squeeze(1)
887
--> 888 kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
889 q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :]
890 out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :]
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
1929 if name in modules:
1930 return modules[name]
-> 1931 raise AttributeError(
1932 f"'{type(self).__name__}' object has no attribute '{name}'"
1933 )
AttributeError: 'QuantLinear' object has no attribute 'weight'
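For completeness, the failing call follows the standard DeepSeek-VL2 generation snippet (a sketch; prepare_inputs comes from vl_chat_processor, and every generation kwarg beyond the first two is taken from the upstream example rather than my exact cell):

# Sketch of the inference call, per the DeepSeek-VL2 sample code.
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
outputs = vl_gpt.language.generate(
    input_ids=prepare_inputs["input_ids"],
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)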
This error is caused by an update to DeepSeek-VL2; please install the old version with the following command:
pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git@8bde1c1ae17510a1d1c686a0b55a6c7e61352a46
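For context on why pinning helps: the newer DeepSeek-VL2 attention code reads self.kv_b_proj.weight directly (line 888 in the traceback above), but in this quantized checkpoint kv_b_proj has been replaced by a QuantLinear that stores packed integer buffers rather than a dense weight tensor. A toy illustration of the failure mode (FakeQuantLinear and its buffer names are hypothetical stand-ins, not AutoRound's actual class):

import torch
import torch.nn as nn

class FakeQuantLinear(nn.Module):
    # Toy stand-in for a GPTQ-style quantized linear layer: weights live
    # in packed integer buffers, so there is no dense .weight parameter.
    def __init__(self, in_features, out_features):
        super().__init__()
        self.register_buffer(
            "qweight", torch.zeros(in_features // 8, out_features, dtype=torch.int32)
        )
        self.register_buffer("scales", torch.ones(1, out_features))

proj = FakeQuantLinear(512, 512)
proj.weight  # AttributeError: 'FakeQuantLinear' object has no attribute 'weight'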
I tried this within a free Kaggle GPU instance (Nvidia P100) and the inference takes forever...
It might be related to insufficient GPU memory. The INT4 model itself requires around 16 GB, and additional memory is needed for activations during inference.
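You can check this quickly (a sketch, assuming a single CUDA device). Note that if the model was loaded with device_map="auto", any layers that do not fit on the GPU are offloaded to CPU by accelerate, which makes generation extremely slow:

import torch

# Report the memory picture on GPU 0; a P100 has ~16 GB in total, which
# leaves little headroom for activations once the INT4 weights are resident.
props = torch.cuda.get_device_properties(0)
print(f"total:     {props.total_memory / 2**30:.1f} GiB")
print(f"allocated: {torch.cuda.memory_allocated(0) / 2**30:.1f} GiB")
print(f"reserved:  {torch.cuda.memory_reserved(0) / 2**30:.1f} GiB")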