# PyTorch Masterclass: Part 4 (rest) – Generative Models with PyTorch

## **Building a Complete Image Generation Pipeline**

Let's build a complete image generation pipeline using Stable Diffusion with custom fine-tuning.

### **Step 1: Environment Setup**

```python
# Install required packages
!pip install torch torchvision transformers diffusers accelerate xformers
!pip install datasets huggingface_hub peft trl bitsandbytes

# Import libraries
import os
import torch
import torch.nn.functional as F  # used for the MSE loss in the training loops below
from diffusers import StableDiffusionPipeline, DDIMScheduler
from diffusers.optimization import get_scheduler
from transformers import CLIPTextModel, CLIPTokenizer
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms
from PIL import Image
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
```

### **Step 2: Data Preparation**

We'll fine-tune on a custom dataset of anime portraits:

```python
# Configuration
class Config:
    data_dir = "anime_portraits"
    output_dir = "anime_diffusion"
    resolution = 512
    train_batch_size = 8
    eval_batch_size = 4
    num_train_epochs = 15
    gradient_accumulation_steps = 1
    learning_rate = 1e-5
    lr_warmup_steps = 500
    mixed_precision = "fp16"  # or "bf16" for Ampere+ GPUs
    checkpointing_steps = 500
    resume_from_checkpoint = None

# Create dataset
class AnimePortraitDataset(Dataset):
    def __init__(self, data_dir, resolution, tokenizer, flip_p=0.5):
        self.data_dir = data_dir
        self.resolution = resolution
        self.tokenizer = tokenizer
        self.flip_p = flip_p

        # Get all image paths
        self.image_paths = [
            os.path.join(data_dir, f) for f in os.listdir(data_dir)
            if f.endswith(('.png', '.jpg', '.jpeg'))
        ]

        # Basic image preprocessing
        self.transform = transforms.Compose([
            transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(resolution),
            transforms.RandomHorizontalFlip(p=flip_p),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)

        # For anime portraits, we can use a fixed prompt
        prompt = "anime portrait, detailed eyes, vibrant colors"
        inputs = self.tokenizer(
            prompt,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs.input_ids[0],
            "pixel_values": image
        }

# Create tokenizer and dataset
tokenizer = CLIPTokenizer.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="tokenizer"
)
dataset = AnimePortraitDataset(Config.data_dir, Config.resolution, tokenizer)

# Create data loaders
train_dataloader = DataLoader(
    dataset,
    batch_size=Config.train_batch_size,
    shuffle=True,
    num_workers=4
)
```

### **Step 3: Model Initialization**

```python
# Load Stable Diffusion pipeline.
# Keep the weights in fp32 here: Accelerate's mixed-precision setting below handles fp16
# autocasting during training, and optimizing pure fp16 master weights with AdamW is unstable.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe = pipe.to("cuda")

# Freeze text encoder and VAE
pipe.text_encoder.requires_grad_(False)
pipe.vae.requires_grad_(False)

# Only train UNet
unet = pipe.unet
unet.requires_grad_(True)
unet.train()

# Optimizer
optimizer = torch.optim.AdamW(
    unet.parameters(),
    lr=Config.learning_rate,
    betas=(0.9, 0.999),
    weight_decay=1e-2,
    eps=1e-8
)

# Learning rate scheduler
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=Config.lr_warmup_steps,
    num_training_steps=len(train_dataloader) * Config.num_train_epochs
)

# Prepare for mixed precision training
from accelerate import Accelerator
from accelerate.utils import set_seed

accelerator = Accelerator(
    mixed_precision=Config.mixed_precision,
    gradient_accumulation_steps=Config.gradient_accumulation_steps,
    log_with="tensorboard",
    project_dir=os.path.join(Config.output_dir, "logs")  # older Accelerate versions call this `logging_dir`
)

# Set seed for reproducibility
set_seed(42)

# Prepare everything with accelerator
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    unet, optimizer, train_dataloader, lr_scheduler
)
```
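Step 1 installs xFormers but the code above never uses it. If you are tight on GPU memory, two optional switches help: gradient checkpointing on the UNet (recompute activations instead of storing them) and xFormers memory-efficient attention. The sketch below assumes these are enabled right after the UNet is unfrozen in Step 3, before `accelerator.prepare`; both calls are standard `diffusers` APIs.

```python
# Optional memory savings (a sketch; adjust to your GPU and library versions).
pipe.unet.enable_gradient_checkpointing()  # trades extra compute for lower activation memory

try:
    pipe.enable_xformers_memory_efficient_attention()  # uses the xformers package from Step 1
except Exception as e:
    print(f"xFormers not available, using default attention: {e}")
```

With both enabled, a 512×512 fine-tune typically fits in noticeably less VRAM at the cost of somewhat slower steps.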
### **Step 4: Training Loop**

```python
from accelerate.logging import get_logger

logger = get_logger(__name__)

def train_loop(config, accelerator, unet, optimizer, train_dataloader, lr_scheduler):
    # Create output directory and initialize trackers (required before accelerator.log)
    if accelerator.is_main_process:
        os.makedirs(config.output_dir, exist_ok=True)
        accelerator.init_trackers("anime_diffusion_finetune")

    # Training
    total_batch_size = config.train_batch_size * accelerator.num_processes * config.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(dataset)}")
    logger.info(f"  Num Epochs = {config.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {config.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {config.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {config.num_train_epochs * len(train_dataloader)}")

    global_step = 0
    first_epoch = 0

    # Potentially load in the weights and states from a previous save
    if config.resume_from_checkpoint:
        if config.resume_from_checkpoint != "latest":
            path = os.path.basename(config.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(config.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(f"Checkpoint '{config.resume_from_checkpoint}' does not exist. Starting a new training run.")
            config.resume_from_checkpoint = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(config.output_dir, path))
            global_step = int(path.split("-")[1])
            first_epoch = global_step // len(train_dataloader)  # resume from the matching epoch

    # Training loop
    for epoch in range(first_epoch, config.num_train_epochs):
        unet.train()
        train_loss = 0.0

        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(unet):
                # Convert images to latent space (VAE is frozen, so no gradients are needed)
                with torch.no_grad():
                    latents = pipe.vae.encode(batch["pixel_values"]).latent_dist.sample()
                latents = latents * 0.18215  # VAE scaling factor for SD v1.x

                # Sample noise
                noise = torch.randn_like(latents)

                # Sample random timesteps
                bsz = latents.shape[0]
                timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to latents
                noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

                # Get text embeddings (text encoder is frozen)
                with torch.no_grad():
                    encoder_hidden_states = pipe.text_encoder(batch["input_ids"])[0]

                # Predict noise
                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                # Compute loss
                loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

                # Gather the losses across all processes for logging
                avg_loss = accelerator.gather(loss.repeat(config.train_batch_size)).mean()
                train_loss += avg_loss.item() / config.gradient_accumulation_steps

                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Log progress
            if accelerator.sync_gradients:
                global_step += 1
                accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0

                # Save checkpoint
                if global_step % config.checkpointing_steps == 0:
                    if accelerator.is_main_process:
                        save_path = os.path.join(config.output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")

        # Generate sample images
        if accelerator.is_main_process:
            if (epoch + 1) % 5 == 0:
                generate_samples(pipe, epoch, config.output_dir, accelerator)

    # Save the final model as a full pipeline so it can be reloaded with from_pretrained in Step 7
    if accelerator.is_main_process:
        pipe.unet = accelerator.unwrap_model(unet)
        pipe.save_pretrained(config.output_dir)
```
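With the loop defined, kicking off training is just a matter of calling it with the objects prepared in Step 3. A minimal sketch — in a notebook this cell runs after the sampling helper in Step 5 has been defined (the loop calls it); from a script you would typically start the process with `accelerate launch` so the mixed-precision and multi-GPU settings are picked up.

```python
# Kick off fine-tuning with the objects prepared in Step 3.
# Run this after the Step 5 helpers below are defined, since the loop calls generate_samples.
train_loop(Config, accelerator, unet, optimizer, train_dataloader, lr_scheduler)

# Flush any trackers (e.g., the TensorBoard logs) once training is done.
accelerator.end_training()
```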
### **Step 5: Sample Generation**

```python
def generate_samples(pipe, epoch, output_dir, accelerator):
    """Generate sample images during training"""
    pipe.unet.eval()  # pipelines themselves have no train/eval mode; switch the UNet

    prompts = [
        "anime portrait of a girl with blue hair",
        "anime portrait of a boy with glasses",
        "anime portrait of a woman with red eyes",
        "anime portrait of a man with black hair"
    ]

    # Generate images
    images = []
    for prompt in prompts:
        image = pipe(
            prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).images[0]
        images.append(image)

    # Save images
    grid = image_grid(images, rows=2, cols=2)
    grid.save(os.path.join(output_dir, f"sample_epoch_{epoch}.png"))
    # (Optional) also log the images to your experiment tracker here.

    pipe.unet.train()

def image_grid(imgs, rows, cols):
    """Create a grid of images"""
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))

    for i, img in enumerate(imgs):
        grid.paste(img, box=(i%cols*w, i//cols*h))

    return grid
```
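Saving sample grids is good for eyeballing progress, but a quantitative check is useful too. The sketch below estimates FID between generated samples and a folder of real images using `torchmetrics` — an extra dependency that is **not** in the Step 1 install list, so treat this as an optional assumption rather than part of the main pipeline.

```python
# Optional: FID evaluation (requires `pip install torchmetrics torch-fidelity`).
import torch
from torchmetrics.image.fid import FrechetInceptionDistance
from torchvision import transforms

def compute_fid(real_images, fake_images, device="cuda"):
    """Rough FID estimate between two lists of PIL images."""
    to_tensor = transforms.Compose([
        transforms.Resize((299, 299)),   # Inception input size
        transforms.PILToTensor(),        # uint8 tensors in [0, 255], as FID expects by default
    ])
    fid = FrechetInceptionDistance(feature=2048).to(device)
    real = torch.stack([to_tensor(img) for img in real_images]).to(device)
    fake = torch.stack([to_tensor(img) for img in fake_images]).to(device)
    fid.update(real, real=True)
    fid.update(fake, real=False)
    return fid.compute().item()
```

Keep in mind that FID is only meaningful with a large number of samples (typically thousands); with the four images per epoch generated above it can only serve as a very rough trend indicator.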
### **Step 6: Advanced Fine-Tuning Techniques**

#### **1. Textual Inversion**

Learn new concepts with just a few images:

```python
class TextualInversionDataset(Dataset):
    def __init__(self, concepts, tokenizer, resolution, num_concept_images=5):
        self.concepts = concepts
        self.tokenizer = tokenizer
        self.resolution = resolution
        self.num_concept_images = num_concept_images

        # Create special tokens for each concept
        self.special_tokens = {}
        self.embeddings = {}
        for concept in concepts:
            token = f"sks_{concept.replace(' ', '_')}"
            self.special_tokens[concept] = token
            self.embeddings[concept] = []

        # Image preprocessing
        self.transform = transforms.Compose([
            transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(resolution),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

    def add_concept_images(self, concept, image_paths):
        """Add images for a specific concept"""
        for img_path in image_paths[:self.num_concept_images]:
            image = Image.open(img_path).convert("RGB")
            image = self.transform(image)
            self.embeddings[concept].append(image)

    def __len__(self):
        return sum(len(imgs) for imgs in self.embeddings.values())

    def __getitem__(self, idx):
        # Find which concept and image index
        concept_idx = 0
        for i, (concept, images) in enumerate(self.embeddings.items()):
            if idx < len(images):
                concept_idx = i
                break
            idx -= len(images)

        concept = list(self.embeddings.keys())[concept_idx]
        image = self.embeddings[concept][idx]

        # Create prompt with special token
        prompt = f"a {self.special_tokens[concept]} {concept}"
        inputs = self.tokenizer(
            prompt,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs.input_ids[0],
            "pixel_values": image,
            "concept": concept
        }

# Training loop for textual inversion
def train_textual_inversion(concept, dataset, unet, text_encoder, optimizer, num_steps=1000):
    # The UNet and text encoder stay frozen; only the new token embedding is learned,
    # so `optimizer` is expected to cover just the text encoder's input embeddings.
    text_encoder.eval()
    unet.eval()

    # Create a new token for the concept
    token = f"sks_{concept.replace(' ', '_')}"
    num_added_tokens = tokenizer.add_tokens([token])
    token_id = tokenizer.convert_tokens_to_ids([token])[0]

    # Resize token embeddings
    text_encoder.resize_token_embeddings(len(tokenizer))

    # Initialize the new token embedding from an existing, semantically related token.
    # (The new token is appended as the last row of the embedding matrix, so copying
    #  `token_embeds[-1]` would just copy the uninitialized row onto itself.)
    init_token_id = tokenizer.encode(concept, add_special_tokens=False)[0]
    token_embeds = text_encoder.get_input_embeddings().weight.data
    with torch.no_grad():
        token_embeds[token_id] = token_embeds[init_token_id].clone()

    # Training loop
    for step in range(num_steps):
        batch = dataset[step % len(dataset)]

        # Convert images to latent space (no gradients through the frozen VAE)
        with torch.no_grad():
            latents = pipe.vae.encode(batch["pixel_values"].unsqueeze(0).to(pipe.device)).latent_dist.sample()
        latents = latents * 0.18215

        # Sample noise
        noise = torch.randn_like(latents)

        # Sample random timesteps
        timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (1,))
        timesteps = timesteps.long().to(latents.device)

        # Add noise to latents
        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

        # Get text embeddings (gradients flow only into the embedding matrix)
        encoder_hidden_states = text_encoder(batch["input_ids"].unsqueeze(0).to(pipe.device))[0]

        # Predict noise
        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        # Compute loss
        loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

        # Backpropagate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return token_id
```
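Here is a hedged sketch of how these pieces might be wired together. Only the token-embedding matrix is handed to the optimizer, so the rest of the text encoder and the UNet stay frozen; the image paths and the learning rate are illustrative assumptions, not values from the original text.

```python
# Sketch: optimize only the token-embedding matrix of the text encoder.
text_encoder = pipe.text_encoder
unet_frozen = pipe.unet

# Freeze everything, then re-enable gradients for the input embeddings only.
# (The full textual-inversion recipe also zeroes out gradients for every row
#  except the new token's; this simplified version lets the whole matrix drift slightly.)
text_encoder.requires_grad_(False)
unet_frozen.requires_grad_(False)
text_encoder.get_input_embeddings().weight.requires_grad_(True)

ti_optimizer = torch.optim.AdamW(
    text_encoder.get_input_embeddings().parameters(),  # just the embedding matrix
    lr=5e-4,
)

# Hypothetical concept images — replace with your own files.
ti_dataset = TextualInversionDataset(["anime girl"], tokenizer, Config.resolution)
ti_dataset.add_concept_images("anime girl", ["concept/img1.png", "concept/img2.png"])

new_token_id = train_textual_inversion(
    "anime girl", ti_dataset, unet_frozen, text_encoder, ti_optimizer, num_steps=1000
)
```

After training, prompts containing `sks_anime_girl` will steer generation toward the learned concept.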
#### **2. DreamBooth**

Fine-tune the entire model on a few images of a specific subject:

```python
def train_dreambooth(concept, instance_images, class_images, num_steps=800):
    """
    Fine-tune Stable Diffusion on a specific subject

    Args:
        concept: The concept to learn (e.g., "dog")
        instance_images: Images of the specific subject
        class_images: Images of the general class (e.g., generic dogs)
    """
    device = pipe.device
    optimizer = torch.optim.AdamW(pipe.unet.parameters(), lr=2e-6)  # small lr, typical for DreamBooth

    # Prepare datasets (prepare_dreambooth_dataset is assumed to wrap images and a prompt
    # into a dataset analogous to AnimePortraitDataset from Step 2)
    instance_dataset = prepare_dreambooth_dataset(instance_images, f"sks_{concept}")
    class_dataset = prepare_dreambooth_dataset(class_images, concept)

    # Combine datasets
    dataset = ConcatDataset([instance_dataset, class_dataset])
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    # Training loop: re-iterate the small dataloader until num_steps updates are done
    step = 0
    while step < num_steps:
        for batch in dataloader:
            if step >= num_steps:
                break

            # Get inputs
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)

            # Convert images to latent space
            with torch.no_grad():
                latents = pipe.vae.encode(pixel_values).latent_dist.sample()
            latents = latents * 0.18215

            # Sample noise
            noise = torch.randn_like(latents)

            # Sample random timesteps
            bsz = latents.shape[0]
            timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (bsz,)).long().to(device)

            # Add noise to latents
            noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

            # Get text embeddings
            encoder_hidden_states = pipe.text_encoder(input_ids)[0]

            # Predict noise
            noise_pred = pipe.unet(noisy_latents, timesteps, encoder_hidden_states).sample

            # Compute loss. Because class images are mixed into the same dataloader, this MSE
            # already carries a prior-preservation signal that counteracts overfitting; the full
            # DreamBooth recipe computes instance and class losses separately and weights the
            # class term with a prior_loss_weight.
            loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

            # Backpropagate
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step += 1
```

#### **3. LoRA (Low-Rank Adaptation)**

Efficient fine-tuning with low-rank updates:

```python
from peft import LoraConfig, get_peft_model

def apply_lora(unet, rank=4, alpha=16):
    """Apply LoRA to UNet"""
    # Configure LoRA
    config = LoraConfig(
        r=rank,
        lora_alpha=alpha,
        target_modules=["to_q", "to_k", "to_v", "to_out.0"],  # the UNet's attention projections
        lora_dropout=0.0,
        bias="none"
    )

    # Apply to UNet
    model = get_peft_model(unet, config)
    return model

# During training
lora_unet = apply_lora(unet)
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, lora_unet.parameters()),  # only the LoRA parameters
    lr=1e-4
)

# Only save LoRA weights
def save_lora_weights(model, output_dir):
    model.save_pretrained(output_dir)
```

### **Step 7: Inference and Deployment**

```python
def generate_from_finetuned_model(prompt, num_inference_steps=50, guidance_scale=7.5):
    """Generate images using the fine-tuned model"""
    # Load the fine-tuned model (the full pipeline saved at the end of Step 4)
    pipe = StableDiffusionPipeline.from_pretrained(
        Config.output_dir,
        torch_dtype=torch.float16
    )
    pipe = pipe.to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

    # Generate image
    image = pipe(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).images[0]

    return image

# Example usage
image = generate_from_finetuned_model(
    "anime portrait of a girl with blue hair, detailed eyes, vibrant colors",
    num_inference_steps=30
)
image.save("generated_anime.png")
image
```
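For reproducible outputs and a little more control, the pipeline call also accepts a seeded generator, a negative prompt, and a sample count. A small sketch, assuming `pipe` is the fine-tuned pipeline loaded the same way as inside the function above:

```python
# Reproducible generation with a fixed seed and a negative prompt.
generator = torch.Generator(device="cuda").manual_seed(1234)

image = pipe(
    "anime portrait of a girl with silver hair, detailed eyes, vibrant colors",
    negative_prompt="blurry, low quality, deformed hands",  # traits to steer away from
    num_inference_steps=30,
    guidance_scale=7.5,
    num_images_per_prompt=1,
    generator=generator,
).images[0]
image.save("seeded_sample.png")
```

Fixing the generator seed makes side-by-side comparisons of prompts, guidance scales, and checkpoints meaningful, since the initial latent noise stays the same.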
### **Step 8: Advanced Applications**

#### **1. Image Editing with InstructPix2Pix**

Edit existing images based on text instructions:

```python
from diffusers import StableDiffusionInstructPix2PixPipeline

def edit_image_with_instruction(image_path, instruction, num_steps=50):
    """Edit an image based on a text instruction"""
    # Load pipeline
    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
        "timbrooks/instruct-pix2pix",
        torch_dtype=torch.float16
    ).to("cuda")

    # Load image
    image = Image.open(image_path).convert("RGB")

    # Edit image
    edited_image = pipe(
        instruction,
        image=image,
        num_inference_steps=num_steps,
        image_guidance_scale=1.5,
        guidance_scale=7.0
    ).images[0]

    return edited_image

# Example
edited = edit_image_with_instruction(
    "input_image.jpg",
    "turn the sky into a starry night"
)
edited.save("edited_image.jpg")
```

#### **2. Image-to-Image Translation with ControlNet**

Control generation with additional inputs:

```python
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from controlnet_aux import OpenposeDetector, CannyDetector  # requires `pip install controlnet_aux`

def image_to_image_with_control(image_path, control_type="pose", num_steps=50):
    """
    Perform image-to-image translation with control

    Args:
        control_type: "pose", "canny", "depth", etc.
    """
    # Load ControlNet (match the pipeline's fp16 dtype to avoid dtype mismatches)
    if control_type == "pose":
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
        processor = OpenposeDetector.from_pretrained('lllyasviel/Annotators')
    elif control_type == "canny":
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
        processor = CannyDetector()
    else:
        raise ValueError(f"Unsupported control_type: {control_type}")

    # Load pipeline
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    ).to("cuda")

    # Load and process image
    image = Image.open(image_path).convert("RGB")
    control_image = processor(image)

    # Generate
    result = pipe(
        "anime style portrait",
        num_inference_steps=num_steps,
        guidance_scale=7.5,
        image=control_image
    ).images[0]

    return result
```
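Step 7 promised deployment as well as inference. Before moving on to the quiz, here is one lightweight way to expose the fine-tuned model as a web demo with Gradio — an extra dependency not in the Step 1 install list, so treat this as an optional sketch rather than part of the core pipeline.

```python
# Optional: minimal web demo for the fine-tuned model (requires `pip install gradio`).
import gradio as gr

def demo_fn(prompt, steps, guidance):
    # For a real service you would load the pipeline once at startup instead of per call.
    return generate_from_finetuned_model(prompt, num_inference_steps=int(steps), guidance_scale=guidance)

demo = gr.Interface(
    fn=demo_fn,
    inputs=[
        gr.Textbox(label="Prompt", value="anime portrait, detailed eyes, vibrant colors"),
        gr.Slider(10, 100, value=30, step=1, label="Inference steps"),
        gr.Slider(1.0, 15.0, value=7.5, label="Guidance scale"),
    ],
    outputs=gr.Image(label="Generated image"),
    title="Anime Diffusion Demo",
)

demo.launch()  # pass share=True for a temporary public URL
```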
---

## **Quiz 4: Test Your Understanding of Generative Models**

**1. What is the primary objective function for training a standard GAN?**
A) Minimize the KL divergence between real and generated distributions
B) Minimize the cross-entropy loss for the discriminator
C) Solve the minimax game: $\min_{G}\max_{D}V(D,G)=\mathbb{E}_{\mathbf{x}\sim p_{\text{data}}}[\log D(\mathbf{x})]+\mathbb{E}_{\mathbf{z}\sim p_{\mathbf{z}}}[\log(1-D(G(\mathbf{z})))]$
D) Maximize the likelihood of the data under the generator distribution

**2. In a Variational Autoencoder (VAE), what is the purpose of the reparameterization trick?**
A) To reduce the dimensionality of the latent space
B) To make the latent space more structured and disentangled
C) To enable backpropagation through the sampling process
D) To prevent posterior collapse in the VAE training

**3. What problem does the Wasserstein GAN (WGAN) primarily address compared to the standard GAN?**
A) Mode collapse
B) Training instability
C) Blurry generated samples
D) Difficulty in evaluating generated samples

**4. In diffusion models, what does the forward diffusion process do?**
A) Gradually denoises random noise to create realistic samples
B) Gradually adds Gaussian noise to data until it becomes pure noise
C) Maps data to a latent space for more efficient generation
D) Transforms text prompts into image features

**5. Which of the following is NOT a component of the Stable Diffusion architecture?**
A) Variational Autoencoder (VAE)
B) Text encoder (CLIP)
C) Diffusion model operating in pixel space
D) U-Net for noise prediction

**6. What is the primary advantage of using classifier-free guidance in diffusion models?**
A) It eliminates the need for a separate classifier
B) It improves sample quality without requiring additional models
C) It makes the diffusion process faster and more efficient
D) It prevents mode collapse in the generated samples

**7. In the context of text-to-image generation, what does "prompt engineering" refer to?**
A) Designing the neural network architecture for text understanding
B) Crafting effective text prompts to guide the generation process
C) Training the text encoder on domain-specific vocabulary
D) Aligning text and image embeddings in a shared space

**8. Which metric is most commonly used to evaluate the quality of generated images by comparing to real images?**
A) Inception Score (IS)
B) Fréchet Inception Distance (FID)
C) Precision and Recall
D) Kernel Inception Distance (KID)

**9. What is the key innovation of the StyleGAN architecture?**
A) Using a progressive growing approach to train at multiple resolutions
B) Introducing adaptive instance normalization (AdaIN) for style control
C) Replacing the generator with a transformer-based architecture
D) Using a Wasserstein loss for more stable training

**10. In a VAE, what does the KL divergence term in the ELBO objective encourage?**
A) Accurate reconstruction of input data
B) The latent distribution to match a prior distribution (typically standard normal)
C) Disentanglement of factors in the latent space
D) Both B and C

**11. Which technique would most directly address the problem of mode collapse in GAN training?**
A) Using a larger batch size
B) Increasing the learning rate
C) Wasserstein loss with gradient penalty
D) Adding more layers to the discriminator

**12. What is the primary purpose of the time embedding in diffusion models?**
A) To encode the current diffusion step for the noise prediction network
B) To align the generated image with the text prompt timing
C) To control the speed of the diffusion process
D) To prevent overfitting during training

**13. In the context of music generation, what does a WaveNet model primarily use for its architecture?**
A) Transformer layers with self-attention
B) Dilated causal convolutions
C) Recurrent neural networks
D) Autoregressive transformers

**14. Which of the following best describes "textual inversion" in the context of text-to-image models?**
A) Inverting the text embedding process to generate text from images
B) Learning new concepts with just a few images by modifying text embeddings
C) Reversing the diffusion process to recover the original text prompt
D) Translating text prompts between different languages

**15. What is the key difference between a standard autoencoder and a variational autoencoder?**
A) VAEs use convolutional layers while autoencoders use fully connected layers
B) VAEs learn a probabilistic latent space while standard autoencoders learn a deterministic mapping
C) VAEs are trained with adversarial loss while autoencoders use reconstruction loss
D) VAEs can generate new samples while autoencoders can only reconstruct inputs

---

**Answers:**

1. C - Standard GAN objective is the minimax game
2. C - Reparameterization trick enables backpropagation through sampling
3. B - WGAN addresses training instability through Wasserstein distance
4. B - Forward diffusion gradually adds noise to data
5. C - Stable Diffusion uses diffusion in latent space, not pixel space
6. B - Classifier-free guidance improves quality without extra models
7. B - Prompt engineering is crafting effective text prompts
8. B - FID is the most common metric for image quality evaluation
9. B - StyleGAN's key innovation is AdaIN for style control
10. D - KL term encourages matching prior and enables disentanglement
11. C - Wasserstein loss with gradient penalty addresses mode collapse
12. A - Time embedding encodes the diffusion step for noise prediction
13. B - WaveNet uses dilated causal convolutions
14. B - Textual inversion learns new concepts with few images
15. B - VAEs learn probabilistic latent space vs. deterministic in autoencoders
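Two of the formulas behind these answers are worth writing out explicitly. For question 2, the reparameterization trick rewrites sampling from the VAE posterior as a deterministic, differentiable transform of external noise:

$$z = \mu_\phi(\mathbf{x}) + \sigma_\phi(\mathbf{x}) \odot \boldsymbol{\epsilon}, \qquad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$$

For question 6, classifier-free guidance blends the conditional and unconditional noise predictions with a guidance weight $w$ (the `guidance_scale` used throughout this part):

$$\tilde{\boldsymbol{\epsilon}}_\theta(\mathbf{x}_t, c) = \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, \varnothing) + w\left(\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, c) - \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, \varnothing)\right)$$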
---

## **Summary and What's Next in Part 5**

In this **comprehensive Part 4** of our PyTorch Masterclass, we've covered:

- **Autoencoders**: Learning efficient data representations
- **Variational Autoencoders**: Probabilistic generative modeling
- **Generative Adversarial Networks**: Creating realistic synthetic data
- **Diffusion Models**: The new frontier in generative AI
- **Text-to-Image Generation**: Building models like DALL-E
- **Music and Audio Generation**: Creating novel audio content
- **Evaluating Generative Models**: Metrics and qualitative assessment
- **Complete Image Generation Pipeline**: From data to deployment

You now have the skills to:

- Build and train various generative models
- Fine-tune models on custom datasets
- Generate high-quality images from text prompts
- Create music and audio with deep learning
- Evaluate generative models using appropriate metrics

### **What's Coming in Part 5?**

In **Part 5**, we'll dive into **Reinforcement Learning with PyTorch**:

- **Markov Decision Processes**: The theoretical foundation
- **Q-Learning and Deep Q-Networks (DQN)**: Learning from experience
- **Policy Gradient Methods**: REINFORCE, Actor-Critic
- **Proximal Policy Optimization (PPO)**: State-of-the-art policy optimization
- **Deep Deterministic Policy Gradient (DDPG)**: For continuous control
- **Model-Based Reinforcement Learning**: Planning with world models
- **Multi-Agent Reinforcement Learning**: Cooperation and competition
- **Building a Complete RL Agent**: From CartPole to Atari games

We'll build a **reinforcement learning agent** that learns to play games and solve complex control problems.

👉 **Stay tuned for Part 5: Reinforcement Learning with PyTorch**

---

**Hashtags:** #PyTorch #GenerativeAI #GANs #VAEs #DiffusionModels #Autoencoders #TextToImage #DeepLearning #MachineLearning #AI #GenerativeAdversarialNetworks #VariationalAutoencoders #StableDiffusion #DALLE #ImageGeneration #MusicGeneration #AudioSynthesis #LatentSpace #PyTorchGenerative #WaveNet #StyleGAN #TextualInversion #DreamBooth #LoRA #ControlNet #InstructPix2Pix #FID #InceptionScore #GANTraining #DiffusionTraining #TextToImageGeneration #AudioGeneration #MusicAI #GenerativeModelEvaluation #PyTorchTutorial #DeepLearningCourse #AIEngineering #GenerativeAIDeveloper