## Models
```
1. ANPR Detection: V3.pt (YOLOv8)
2. Image Segmentation (Segment Anything): SAM2 (sam2.1_hiera_small.pt)
3. Additional algorithm (tracker): ByteTrack
```
## Code Flow
1. Load Video / get frame (cv2)
```
input: video path
output: frame (one per loop iteration)
```
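A minimal sketch of this step with OpenCV (the path is a placeholder):

```python
import cv2

cap = cv2.VideoCapture("input.mp4")  # placeholder video path
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:  # end of stream or read failure
        break
    # detection / tracking / segmentation run here, once per frame
cap.release()
```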

2. YOLO ANPR Detection
```
input: frame
output: bbox (x1, y1, x2, y2) -> used to crop the license-plate region only
```
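A minimal sketch of the detection step with the Ultralytics API, using the `V3.pt` weights listed under Models; `frame` comes from the read loop above:

```python
from ultralytics import YOLO

model = YOLO("V3.pt")  # ANPR detection weights from the Models section
results = model(frame)[0]
for box in results.boxes:
    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
    cropped_plate = frame[y1:y2, x1:x2]  # crop the license-plate region only
```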
2. (additional) Tracker (ByteTrack)
```
input: bbox coordinates (plus confidence scores)
output: a track_id for each object (here, each license plate)
background (problem & solution):
- detection sometimes flickers (giving inconsistent predictions) -> the tracker keeps the bbox alive even when YOLO momentarily misses it (see the sketch below)
```
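A minimal sketch of the tracking step, matching the `tracker.update` call in the full code below; `args` is the project's tracker config and the detection row format is `[x1, y1, x2, y2, confidence]`:

```python
import numpy as np
from yolox.tracker.byte_tracker import BYTETracker
from configs.tracker_cfg import args  # project-specific ByteTrack settings

tracker = BYTETracker(args=args, frame_rate=args.fps)
height, width = 720, 1280  # placeholder frame size

# One row per YOLO detection: [x1, y1, x2, y2, confidence]
detections = np.array([[100.0, 200.0, 260.0, 240.0, 0.91]])
tracks = tracker.update(detections, (height, width), (height, width))
for track in tracks:
    print(track.track_id, track.tlbr)  # persistent ID + current box (x1, y1, x2, y2)
```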

3. SAM
```
input: cropped_plate (cropped license-plate image)
output: binary mask covering the plate only
process: prompt with the center of the crop (the plate usually sits in the middle!)
function: segment_plate(cropped_plate)
```
4. Improvement (prediction stability):
```
Flow:
- run SAM segmentation once and cache the mask, keyed by the tracker's track_id
- the plate's position inside its bbox barely changes, so later frames just redraw the cached mask
- in short, SAM segmentation runs only once per license plate (see the sketch below)
```
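The caching pattern in isolation, a minimal sketch of what `process_video` in the Code section does, reusing `segment_plate` and `apply_mask` as defined there:

```python
mask_memory = {}  # track_id -> cached SAM mask

for track in tracks:  # tracks come from tracker.update() on the current frame
    track_id = int(track.track_id)
    x1, y1, x2, y2 = track.tlbr.astype(int)
    if track_id not in mask_memory:
        # First sighting of this plate: run SAM once and cache the mask
        mask = segment_plate(frame[y1:y2, x1:x2])
        if mask is not None:
            mask_memory[track_id] = mask
    if track_id in mask_memory:
        # Every later frame: just redraw the cached mask at the tracked bbox
        apply_mask(frame, mask_memory[track_id], x1, y1, x2, y2)
```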

## Code
```python
import cv2
import numpy as np
import torch
import os
import warnings
from datetime import datetime
from ultralytics import YOLO
from yolox.tracker.byte_tracker import BYTETracker
from configs.tracker_cfg import args
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Disable warnings for cleaner output
warnings.filterwarnings("ignore")

# ===== Debug Configuration =====
DEBUG = True
DEBUG_DIR = "debug_logs"
os.makedirs(DEBUG_DIR, exist_ok=True)  # Create debug folder if it doesn't exist


def log(message, image=None, prefix=""):
    """
    Log a message and save a debug image with a timestamp.

    Args:
        message (str): Message to log.
        image (np.ndarray, optional): Image to save for debugging.
        prefix (str, optional): Prefix for the debug image file name.
    """
    if DEBUG:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        print(f"[{timestamp}] {message}")
        if image is not None:
            filename = f"{DEBUG_DIR}/{prefix}_{timestamp}.png"
            cv2.imwrite(filename, image)


# ===== Step 1: Initialize Models =====
# Initialize the YOLO model for license plate detection
yolo_model = YOLO("V3.pt")

# Initialize the SAM2 model for segmentation
checkpoint = "./sam2/checkpoints/sam2.1_hiera_small.pt"
model_cfg = "./configs/sam2.1/sam2.1_hiera_s.yaml"
sam2_model = build_sam2(model_cfg, checkpoint)
predictor = SAM2ImagePredictor(sam2_model)

# Initialize the tracker to track license plates across frames
tracker = BYTETracker(args=args, frame_rate=args.fps)


# ===== Step 2: Utility Functions =====
def apply_mask(frame, mask, x1, y1, x2, y2):
    """
    Apply a segmentation mask to the license plate area in the frame.

    Args:
        frame (np.ndarray): Video frame.
        mask (np.ndarray): Segmentation mask.
        x1, y1, x2, y2 (int): Coordinates of the license plate bounding box.
    """
    try:
        current_height = y2 - y1
        current_width = x2 - x1
        # Resize the mask to match the bounding box size
        resized_mask = cv2.resize(
            (mask * 255).astype(np.uint8),
            (current_width, current_height),
            interpolation=cv2.INTER_NEAREST
        )
        # Apply the mask: set pixels with value > 0 to white
        frame[y1:y2, x1:x2][resized_mask > 0] = 255
    except Exception as e:
        print(f"Error applying mask: {str(e)}")


def segment_plate(cropped_plate):
    """
    Segment the license plate using the SAM2 model.

    Steps:
        1. Convert the image from BGR to RGB.
        2. Define a prompt box with a 10% margin to focus on the plate area.
        3. Use the center point as a foreground prompt.
        4. Run the prediction and return the best mask.

    Args:
        cropped_plate (np.ndarray): Cropped image of the license plate.

    Returns:
        np.ndarray: Best segmentation mask, or None if segmentation fails.
    """
    try:
        cropped_plate_rgb = cv2.cvtColor(cropped_plate, cv2.COLOR_BGR2RGB)
        h, w = cropped_plate_rgb.shape[:2]

        # Define a prompt box with a 10% margin from each side
        scale = 0.9
        new_w = int(w * scale)
        new_h = int(h * scale)
        x1 = (w - new_w) // 2
        y1 = (h - new_h) // 2
        x2 = x1 + new_w
        y2 = y1 + new_h
        prompt_box = np.array([[x1, y1, x2, y2]])

        # Define the prompt point: use the center as the foreground
        center_point = np.array([[w // 2, h // 2]])
        point_labels = np.array([1])

        # Run the SAM2 predictor with the specified prompt
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            predictor.set_image(cropped_plate_rgb)
            masks, scores, _ = predictor.predict(
                box=prompt_box,
                point_coords=center_point,
                point_labels=point_labels,
                multimask_output=False
            )

        # Return the best mask if available
        if masks is not None and len(masks) > 0:
            best_mask = masks[np.argmax(scores)]
            return best_mask
        return None
    except Exception as e:
        print(f"Error during segmentation: {str(e)}")
        return None


# ===== Step 3: Process Video =====
def process_video(input_path, output_path, temp_crop_folder="temp_crop"):
    """
    Process a video to detect, segment, and track license plates.

    Steps:
        1. Open the input video and initialize the output video writer.
        2. For each frame:
            a. Detect license plates using YOLO.
            b. Track the plates using BYTETracker.
            c. For each detected plate:
                - Crop the plate area.
                - Save the cropped image (for debugging).
                - If the track is new, perform segmentation and store the mask.
                - Apply the stored mask to the frame.
                - Draw the bounding box and track ID.
            d. Write the processed frame to the output video.
        3. Release all video resources.

    Args:
        input_path (str): Path to the input video.
        output_path (str): Path to save the output video.
        temp_crop_folder (str): Folder to save cropped images for debugging.
    """
    cap = cv2.VideoCapture(input_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    mask_memory = {}  # Store segmentation masks for each track_id
    frame_count = 0
    os.makedirs(temp_crop_folder, exist_ok=True)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1

        # Detect license plates using YOLO
        results = yolo_model(frame)[0]
        detections = [box.xyxy[0].cpu().numpy().tolist() + [box.conf.item()] for box in results.boxes]
        tracks = tracker.update(np.array(detections), (height, width), (height, width)) if detections else []

        for track in tracks:
            track_id = int(track.track_id)
            x1, y1, x2, y2 = track.tlbr.astype(int)
            cropped_plate = frame[y1:y2, x1:x2]

            # Save the cropped plate image for debugging
            temp_file = os.path.join(temp_crop_folder, f"track_{track_id}_frame_{frame_count}.jpg")
            cv2.imwrite(temp_file, cropped_plate)

            # For new tracks, perform segmentation and save the mask
            if track_id not in mask_memory:
                best_mask = segment_plate(cropped_plate)
                if best_mask is not None:
                    mask_memory[track_id] = best_mask
                    log(f"Segmentation mask for track ID {track_id} created",
                        (best_mask * 255).astype(np.uint8), f"mask_{track_id}")
                else:
                    print(f"Segmentation failed for track ID {track_id} on frame {frame_count}")
                    continue

            # Apply the stored mask to the frame
            if track_id in mask_memory:
                apply_mask(frame, mask_memory[track_id], x1, y1, x2, y2)

            # Draw the bounding box and tracker ID on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, str(track_id), (x1 + 5, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

        # Write the processed frame to the output video
        out.write(frame)
        log("Frame processed", frame, "frame_output")

    cap.release()
    out.release()


# ===== Step 4: Execution =====
if __name__ == "__main__":
    process_video(
        input_path="./Video/Traffic IP Camera video.mp4",
        output_path="output_final4.mp4",
        temp_crop_folder="temp_crop"  # Folder to save cropped images for debugging
    )
```
## Question
```
Hi, I have a few questions about the project. It's also fine to answer them during tonight's meeting:
1. Which datasets were used, specifically? Are they from this link, the repo from before? https://universe.roboflow.com/roboflow-universe-projects/license-plate-recognition-rxg4e/dataset/4
2. Was there any augmentation (image resize, rotate, crop, B&W, noise)? And how did the test results turn out?
3. What were the steps in the model training process, and what software/hardware was used? If possible, also how long the training took and on what system specs.
4. Why was YOLOv8n chosen when newer YOLO models exist, e.g. v11?
5. Does it support .mkv video files?
```
```
Answer:
1. I'll upload the dataset after this. I combined it a bit with other data and cleaned it up.
2. I'll share the augmentation details.
3. Training just uses Ultralytics (a sketch follows below).
Steps:
a. prepare the dataset in YOLOv8 format
b. prepare the training environment with Ultralytics
c. for hardware I used a T4 cloud instance on AWS (you can use Kaggle or Google Colab too)
d. training time depends on the hardware, but roughly 1-4 hours
4. YOLOv11 and others can be used, they're just slower, so we can switch if you really want to.
```
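A minimal sketch of the Ultralytics training run described in answer 3; the dataset YAML path, epoch count, and image size are placeholder assumptions, not the settings actually used:

```python
from ultralytics import YOLO

# Fine-tune a YOLOv8n checkpoint on the plate dataset
# (prepared in YOLOv8 format; the YAML path below is a placeholder).
model = YOLO("yolov8n.pt")
model.train(
    data="license_plate.yaml",  # placeholder dataset config
    epochs=100,                 # placeholder, not the actual run settings
    imgsz=640,
    device=0,                   # single GPU, e.g. a T4
)
```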
Reference (Kalman filter used by the tracker): https://docs.ultralytics.com/reference/trackers/utils/kalman_filter/