The model does not support having a different number of images per batch?

#27
by h1manshu - opened

I am trying to fine-tune with per_device_train_batch_size: 4, but I am getting this error:
ValueError: The number of images in each batch [1, 1, 1, 1] should be the same [1, 1, 1, 1] should be the same. Yes, the model does not support having a different number of images per batch.
With per_device_train_batch_size: 1, fine-tuning works.

Here is my code:

def seed_worker(worker_id):
    """Seed NumPy and ``random`` for one data-loading worker.

    The signature matches what ``DataLoader`` expects of a
    ``worker_init_fn`` callback.

    Args:
        worker_id: Integer index of the worker being initialised.
    """
    # Offset the base seed by the worker index so that workers do not all
    # replay the same random stream, and reduce modulo 2**32 because
    # np.random.seed only accepts values in [0, 2**32).
    worker_seed = (SEED + worker_id) % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

class LLavaDataCollator:
    """Collate chat-style image/text examples into one padded training batch.

    Each example must be a mapping with a ``"messages"`` entry (rendered
    through the processor's chat template) and an ``"images"`` entry.
    """

    def __init__(self, processor):
        """Keep the processor and cache its tokenizer's ``vocab_size``.

        Args:
            processor: Object exposing ``apply_chat_template``, ``tokenizer``
                and a ``__call__`` accepting ``text=`` and ``images=``.
        """
        self.processor = processor
        self.vocab_size = processor.tokenizer.vocab_size

    def __call__(self, examples):
        """Build a batch with ``input_ids`` and loss ``labels``.

        Args:
            examples: Sequence of example mappings (see class docstring).

        Returns:
            The processor output with an added ``"labels"`` entry in which
            padding positions and ids outside the tokenizer vocabulary are
            set to ``-100``.
        """
        texts = [
            self.processor.apply_chat_template(example["messages"], tokenize=False)
            for example in examples
        ]
        images = [example["images"] for example in examples]

        # Pass by keyword: the positional order of text and images is not
        # the same across processor classes/versions.
        batch = self.processor(
            text=texts, images=images, return_tensors="pt", padding=True
        )

        # Derive the labels from the *unclamped* ids. Cloning after the clamp
        # would make the out-of-vocabulary mask below a no-op and turn those
        # positions into real training targets.
        input_ids = batch["input_ids"]
        labels = input_ids.clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        labels[labels >= self.vocab_size] = -100
        batch["labels"] = labels

        # NOTE(review): ids >= vocab_size are presumably added special tokens
        # (e.g. image placeholders); clamping rewrites them to the last
        # vocabulary id. Kept for compatibility - confirm this is intended.
        if (input_ids >= self.vocab_size).any():
            batch["input_ids"] = torch.clamp(input_ids, 0, self.vocab_size - 1)

        return batch
    
def prepare_dataloader(args, processor, per_device_train_batch_size):
    """Create the training ``DataLoader`` for the llava-instruct-mix-vsft set.

    Args:
        args: Unused here; kept so existing callers keep working.
        processor: Processor handed to ``LLavaDataCollator``.
        per_device_train_batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` over the dataset's ``"train"`` split that collates
        with ``LLavaDataCollator``.
    """
    raw_datasets = load_dataset("HuggingFaceH4/llava-instruct-mix-vsft")
    train_dataset = raw_datasets["train"]

    # Shard the data across ranks only when a process group is running;
    # is_available() guards builds compiled without distributed support.
    train_sampler = None
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        train_sampler = DistributedSampler(train_dataset)

    # Seeded generator so DataLoader-derived worker seeds are reproducible.
    g = torch.Generator()
    g.manual_seed(SEED)

    data_collator = LLavaDataCollator(processor)
    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=data_collator,
        batch_size=per_device_train_batch_size,
        sampler=train_sampler,
        pin_memory=True,
        # Previously g and seed_worker were defined but never wired in.
        worker_init_fn=seed_worker,
        generator=g,
    )
    return train_dataloader

Hey @h1manshu, were you able to figure this issue out?

Encountering the same problem.

Sign up or log in to comment