Search code examples
pythonpytorchhuggingface

Weight and shape different than the number of channels in input


I'm trying to fine-tunning the VAE of SD 1.4

I'm in a multi gpu environment, and I'm using accelerate library for handling that. This is my code summarized:

import os
import torch.nn.functional as F
import yaml
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from diffusers import AutoencoderKL
from torch.optim import Adam
from accelerate import Accelerator
from torch.utils.tensorboard import SummaryWriter

# Load configuration
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

def save_checkpoint(model, optimizer, epoch, step, filename="checkpoint.pth.tar"):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'step': step
    }
    torch.save(checkpoint, filename)

class ImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.images = [os.path.join(root_dir, f) for f in os.listdir(root_dir) if f.endswith('.png')]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image


# Setup dataset and dataloader based on config
transform = Compose([
    Resize((config['dataset']['image_size'], config['dataset']['image_size'])),
    ToTensor(),
    Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

dataset = ImageDataset(root_dir=config['dataset']['root_dir'], transform=transform)
dataloader = DataLoader(dataset, batch_size=config['training']['batch_size'], shuffle=True, num_workers=config['training']['num_workers'])

# Initialize model, accelerator, optimizer, and TensorBoard writer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = config['model']['path']
vae = AutoencoderKL.from_pretrained(model_path).to(device)

optimizer = Adam(vae.parameters(), lr=config['training']['learning_rate'])
accelerator = Accelerator()
vae, dataloader = accelerator.prepare(vae, dataloader)

writer = SummaryWriter()

# Training loop
for epoch in range(config['training']['num_epochs']):
    vae.train()
    total_loss = 0
    for step, batch in enumerate(dataloader):
        with accelerator.accumulate(vae):
            # Assuming the first element of the batch is the image
            target = batch[0].to(next(vae.parameters()).dtype)
            
            # Access the original model for custom methods
            model = vae.module if hasattr(vae, "module") else vae
            
            posterior = model.encode(target).latent_dist
            z = posterior.mode()
            pred = model.decode(z).sample
            
            kl_loss = posterior.kl().mean()
            mse_loss = F.mse_loss(pred, target, reduction="mean")
            
            loss = mse_loss + config['training']["kl_scale"] * kl_loss

            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()  # Clear gradients after updating weights

            # Checkpointing every 10 steps
            if step % 10 == 0:
                checkpoint_path = f"checkpoint_epoch_{epoch}_step_{step}.pth"
                accelerator.save({
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": loss,
                }, checkpoint_path)
                print(f"Checkpoint saved to {checkpoint_path}")


writer.close()

print("Training complete.")

When running the code, I got the following error:

RuntimeError: Expected weight to be a vector of size equal to the number of channels in input, but got weight of shape [128] and input of shape [128, 1024, 1024]: 

My input folder contains a set of png images with different sizes, and resized to 1024x1024 in the configuration file.

I do not know why this is happening and if someone knows, or if there is a easier way to fine-tunning the VAE weights using my images. Thanks.

Edit: My config.yaml file

model:
  path: 'vae1dot4'  # Path to your pre-trained model directory

dataset:
  root_dir: 'segmented'  # Directory containing your PNG images
  image_size: 1024  # Target size for image resizing

training:
  batch_size: 8  # Batch size for training
  num_epochs: 10  # Number of epochs to train
  learning_rate: 0.0005  # Learning rate for the optimizer
  num_workers: 4  # Number of worker processes for data loading
  kl_scale: 1
  gradient_accumulation_steps: 1
logging:
  tensorboard_dir: 'runs'  # Directory for TensorBoard logs

Solution

  • The issue is that your data loader is outputting batches of shape (b, c, h, w), here (8, 3, 1024, 1024). However, with the following line:

    target = batch[0].to(next(vae.parameters()).dtype)
    

    You are selecting the first element of the batch. This leads to a tensor target of shape (c, h, w), which is of course no longer four-dimensional. In turn, when fed to the model, this will lead to a shape mismatch because 1024 will be considered the channel dimension.

    To fix this, you can either unsqueeze a singleton dimension on dim=0:

    posterior = model.encode(target[None]).latent_dist
    

    Alternatively, you can also avoid losing the singleton when indexing, you can do so with:

    target = batch[:1].to(next(vae.parameters()).dtype)
    

    However, if you only expect to use a single element per batch, a more convenient solution would be to simply set the batch size to 1 and keep target as a 4D tensor:

    target = batch.to(next(vae.parameters()).dtype)