|
--- |
|
language: |
|
- en |
|
library_name: transformers |
|
pipeline_tag: image-text-to-text |
|
tags: |
|
- vision |
|
--- |
|
# Add a Vision Head (ViT encoder + text decoder) |
|
|
|
```python |
|
|
|
|
|
|
|
print('Add Vision...') |
|
# ADD HEAD |
|
# Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model |
# (ViT image encoder + Mixtral_AI_Tiny text decoder -> image-text-to-text). |
# NOTE(review): `VisionEncoderDecoderModel` (from transformers) and `LM_MODEL` |
# (the base language model) are assumed to be defined earlier in the script — |
# they are not shown in this snippet; confirm before running. |
# NOTE: this call downloads both checkpoints from the Hugging Face Hub. |
|
|
|
|
|
|
Vmodel = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( |
|
"google/vit-base-patch16-224-in21k", "LeroyDyer/Mixtral_AI_Tiny" |
|
) |
|
# Keep direct handles to the sub-modules of the combined model. |
_Encoder_ImageProcessor = Vmodel.encoder |
|
_Decoder_ImageTokenizer = Vmodel.decoder |
|
_VisionEncoderDecoderModel = Vmodel |
|
# Attach the combined vision encoder-decoder to the language model. |
# (Original comment said "Add Pad tokems"; no pad tokens are added here.) |
LM_MODEL.VisionEncoderDecoder = _VisionEncoderDecoderModel |
|
# Also expose the sub-components directly on the language model. |
|
LM_MODEL.Encoder_ImageProcessor = _Encoder_ImageProcessor |
|
LM_MODEL.Decoder_ImageTokenizer = _Decoder_ImageTokenizer |
|
LM_MODEL |
|
|
|
|
|
``` |