## Models

```
1. ANPR detection: V3.pt (YOLOv8)
2. Image segmentation (Segment Anything): SAM2 (sam2.1_hiera_small.pt)
3. Additional algorithm (tracker): ByteTrack
```

## Code Flow

1. Load video / get frames (cv2)
```
input: video path
output: frame (inside the read loop)
```
![image](https://hackmd.io/_uploads/B1YARrg3kx.png)

2. YOLO ANPR detection
```
input: frame
output: bbox (x1, y1, x2, y2) -> crop the license-plate region only
```

3. (additional) Tracker (ByteTrack)
```
input: bbox coordinates
output: track_id for each object (here, a license plate)

problem & solution background:
- detection sometimes flickers (giving inconsistent predictions) -> keep the bbox alive even when YOLO misses a detection
```
![image](https://hackmd.io/_uploads/ry2DkUe31g.png)

4. SAM
```
input: cropped plate image
output: binary mask of the plate only
process: prompt with the center of the crop (the plate is usually in the middle!)
function: segment_plate(cropped_plate)
```

5. Improvement (prediction stability):
```
Flow:
- run SAM segmentation once and store the mask keyed by bbox track_id
- the plate's position inside its bbox usually doesn't change, so on later frames we just redraw the stored mask
- in short, we run SAM segmentation only once per license plate
```
![image](https://hackmd.io/_uploads/Sk0eenZnJl.png)

## Code

```python
import cv2
import numpy as np
import torch
import os
import warnings
from datetime import datetime
from ultralytics import YOLO
from yolox.tracker.byte_tracker import BYTETracker
from configs.tracker_cfg import args
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Disable warnings for cleaner output
warnings.filterwarnings("ignore")

# ===== Debug Configuration =====
DEBUG = True
DEBUG_DIR = "debug_logs"
os.makedirs(DEBUG_DIR, exist_ok=True)  # Create debug folder if it doesn't exist


def log(message, image=None, prefix=""):
    """
    Log a message and save a debug image with a timestamp.

    Args:
        message (str): Message to log.
        image (np.ndarray, optional): Image to save for debugging.
        prefix (str, optional): Prefix for the debug image file name.
    """
    if DEBUG:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        print(f"[{timestamp}] {message}")
        if image is not None:
            filename = f"{DEBUG_DIR}/{prefix}_{timestamp}.png"
            cv2.imwrite(filename, image)


# ===== Step 1: Initialize Models =====
# Initialize the YOLO model for license plate detection
yolo_model = YOLO("V3.pt")

# Initialize the SAM2 model for segmentation
checkpoint = "./sam2/checkpoints/sam2.1_hiera_small.pt"
model_cfg = "./configs/sam2.1/sam2.1_hiera_s.yaml"
sam2_model = build_sam2(model_cfg, checkpoint)
predictor = SAM2ImagePredictor(sam2_model)

# Initialize the tracker to track license plates across frames
tracker = BYTETracker(args=args, frame_rate=args.fps)
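# NOTE (assumption): `configs/tracker_cfg.py` is not shown in this write-up.
# BYTETracker expects an argparse-like `args` object; a minimal sketch with
# illustrative values (field names follow the YOLOX BYTETracker convention):
#
#     # configs/tracker_cfg.py
#     from types import SimpleNamespace
#     args = SimpleNamespace(
#         track_thresh=0.5,   # min detection confidence to start/keep a track
#         track_buffer=30,    # frames a lost track stays alive (covers flicker)
#         match_thresh=0.8,   # IoU threshold for matching detections to tracks
#         mot20=False,        # disable MOT20-specific matching tweaks
#         fps=30,             # passed as frame_rate when building the tracker
#     )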
""" try: current_height = y2 - y1 current_width = x2 - x1 # Resize the mask to match the bounding box size resized_mask = cv2.resize( (mask * 255).astype(np.uint8), (current_width, current_height), interpolation=cv2.INTER_NEAREST ) # Apply the mask: set pixels with value > 0 to white frame[y1:y2, x1:x2][resized_mask > 0] = 255 except Exception as e: print(f"Error applying mask: {str(e)}") def segment_plate(cropped_plate): """ Segment the license plate using the SAM2 model. Steps: 1. Convert the image from BGR to RGB. 2. Define a prompt box with a 10% margin to focus on the plate area. 3. Use the center point as a foreground prompt. 4. Run the prediction and return the best mask. Args: cropped_plate (np.ndarray): Cropped image of the license plate. Returns: np.ndarray: Best segmentation mask, or None if segmentation fails. """ try: cropped_plate_rgb = cv2.cvtColor(cropped_plate, cv2.COLOR_BGR2RGB) h, w = cropped_plate_rgb.shape[:2] # Define a prompt box with a 10% margin from each side scale = 0.9 new_w = int(w * scale) new_h = int(h * scale) x1 = (w - new_w) // 2 y1 = (h - new_h) // 2 x2 = x1 + new_w y2 = y1 + new_h prompt_box = np.array([[x1, y1, x2, y2]]) # Define the prompt point: use the center as the foreground center_point = np.array([[w // 2, h // 2]]) point_labels = np.array([1]) # Run the SAM2 predictor with the specified prompt with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16): predictor.set_image(cropped_plate_rgb) masks, scores, _ = predictor.predict( box=prompt_box, point_coords=center_point, point_labels=point_labels, multimask_output=False ) # Return the best mask if available if masks is not None and len(masks) > 0: best_mask = masks[np.argmax(scores)] return best_mask return None except Exception as e: print(f"Error during segmentation: {str(e)}") return None # ===== Step 3: Process Video ===== def process_video(input_path, output_path, temp_crop_folder="temp_crop"): """ Process a video to detect, segment, and track license plates. Steps: 1. Open the input video and initialize the output video writer. 2. For each frame: a. Detect license plates using YOLO. b. Track the plates using BYTETracker. c. For each detected plate: - Crop the plate area. - Save the cropped image (for debugging). - If the track is new, perform segmentation and store the mask. - Apply the stored mask to the frame. - Draw the bounding box and track ID. d. Write the processed frame to the output video. 3. Release all video resources. Args: input_path (str): Path to the input video. output_path (str): Path to save the output video. temp_crop_folder (str): Folder to save cropped images for debugging. 
""" cap = cv2.VideoCapture(input_path) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)) mask_memory = {} # Store segmentation masks for each track_id frame_count = 0 os.makedirs(temp_crop_folder, exist_ok=True) while cap.isOpened(): ret, frame = cap.read() if not ret: break frame_count += 1 # Detect license plates using YOLO results = yolo_model(frame)[0] detections = [box.xyxy[0].cpu().numpy().tolist() + [box.conf.item()] for box in results.boxes] tracks = tracker.update(np.array(detections), (height, width), (height, width)) if detections else [] for track in tracks: track_id = int(track.track_id) x1, y1, x2, y2 = track.tlbr.astype(int) cropped_plate = frame[y1:y2, x1:x2] # Save the cropped plate image for debugging temp_file = os.path.join(temp_crop_folder, f"track_{track_id}_frame_{frame_count}.jpg") cv2.imwrite(temp_file, cropped_plate) # For new tracks, perform segmentation and save the mask if track_id not in mask_memory: best_mask = segment_plate(cropped_plate) if best_mask is not None: mask_memory[track_id] = best_mask log(f"Segmentation mask for track ID {track_id} created", (best_mask * 255).astype(np.uint8), f"mask_{track_id}") else: print(f"Segmentation failed for track ID {track_id} on frame {frame_count}") continue # Apply the stored mask to the frame if track_id in mask_memory: apply_mask(frame, mask_memory[track_id], x1, y1, x2, y2) # Draw the bounding box and tracker ID on the frame cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2) cv2.putText(frame, str(track_id), (x1 + 5, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2) # Write the processed frame to the output video out.write(frame) log("Frame processed", frame, "frame_output") cap.release() out.release() # ===== Step 4: Execution ===== if __name__ == "__main__": process_video( input_path="./Video/Traffic IP Camera video.mp4", output_path="output_final4.mp4", temp_crop_folder="temp_crop" # Folder to save cropped images for debugging ) ``` ## Question ``` Halo ka, aku ada bbrp pertanyaan nih untuk projectnya kalo mau dijawab nanti pas meeting malem nanti juga boleh: 1. untuk datasetnya secara spesifik pakai apa aja ya ka? apakah dari link ini dari repo yg waktu itu?: https://universe.roboflow.com/roboflow-universe-projects/license-plate-recognition-rxg4e/dataset/4 2. apakah ada proses augmentasi untuk resize image, rotate, crop, B&W, noise? dan bagimana dengan hasil testnya? 3. proses training modelnya seperti apa saja tahapannya dan pakai software / hardware apa? dan kalo boleh tau jg trainnya berapa lama dgn spek sistem apa. 4. mengapa model yolo yg digunakan yolov8n? sedangkan ada model yolo lainnya yg lebih baru contohnya v11 5. apakah bisa support video file .mkv? ``` ``` Answer: 1. datasetnya habis ini saya upload ya. saya combine dikit" sm yg lain sama bersihin 2. augmentation detail akan saya share ya 3. trainingnya pakai ultralytics saja.. Tahapan: a. prepare datasetnya sesuai format yolov8 b. prepare env training pakai ultralytics c. hardware kmrin saya pakai T4 cloud compute AWS (km bisa pakai kaggle or google colab kok) d. process training bergantung hardware. tp kurang lebih 1-4 jam sih 4. bisa dipakai yolov11 dsbnya. cmn lebih lambat. jadi bisa kita ganti jika memang pingin ``` https://docs.ultralytics.com/reference/trackers/utils/kalman_filter/