# Detecting and Cropping Chinese Text in Images

###### tags: playground

## Producing image crops that can be fed to TrOCR

+ The approach uses morphological dilation on a binarized grayscale image; a hedged sketch of actually feeding the resulting crops to TrOCR is given at the end of this note.
+ It relies on the following OpenCV (Python) functions:
    + `getStructuringElement`
    + `dilate`
    + `findContours`
    + `boundingRect`
    + `rectangle`
+ Original test image

![](https://i.imgur.com/TOZIbbu.png)

```python
import cv2

# Set the input image path and read the file
img = cv2.imread(image_path)
# Convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Binarize the grayscale image with Otsu's threshold (inverted: white text on black)
ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
cv2.imshow('thresh1', thresh1)
```

+ At this point the image is white text on a black background:

![](https://i.imgur.com/KJ3Bxm4.png)

```python
# Choose a rectangular structuring kernel (15 px wide, 3 px tall)
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3))
# Dilate so that neighbouring characters merge into text-line blobs
dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)
cv2.imshow('dilation', dilation)
```

+ Image after dilation:

![](https://i.imgur.com/p0FmLEv.png)

```python
# Find the outer contours of the dilated blobs and draw their bounding boxes
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
im2 = img.copy()
for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
cv2.imshow('final', im2)
cv2.waitKey(0)
cv2.destroyAllWindows()
```

+ Text-line contours found from the dilation result:

![](https://i.imgur.com/exx7YkF.png)

+ The bounding boxes of these `contours` can then be used to crop out the text-line images.
+ Cropping a single text line
    + Original image

    ![](https://i.imgur.com/XcrCgys.png)

    + Given `max_size`, the maximum width the model accepts: any line image wider than this must be split, and the split must not cut through a character.

```python
import random
from collections import defaultdict

import cv2


def sub_crop(max_size=384):
    path = "testing1.png"
    img = cv2.imread(path)
    image_x = img.shape[1]
    if image_x < max_size:
        return img

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)

    # Group each character box by the max_size-wide column it falls in;
    # a box that straddles a column boundary is assigned to the next column
    dic = defaultdict(list)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if (x + w) // max_size == x // max_size:
            key_ = x // max_size
        else:
            key_ = (x + w) // max_size
        dic[key_].append([x, y, w, h])
    print(len(dic))

    # Draw each group in its own random colour for visual inspection
    color_list = [tuple(random.randint(0, 255) for _ in range(3)) for _ in range(len(dic))]
    for k, c in zip(dic, color_list):
        for v in dic[k]:
            cv2.rectangle(img, (v[0], v[1]), (v[0] + v[2], v[1] + v[3]), c, 2)
    cv2.imshow('final', img)
    cv2.waitKey(0)
```

+ In the result, each colour marks a different segment to crop. This image is 719 px wide, so it gets split into two crops, as shown:

![](https://i.imgur.com/lRxhnWL.png)

+ But how do we actually perform the crop?
+ Adjust the code above to:

```python
def sub_crop(max_size=384):
    path = "testing1.png"
    img = cv2.imread(path)
    image_x = img.shape[1]
    if image_x < max_size:
        return img

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)

    # Group each character box by the max_size-wide column it falls in
    dic = defaultdict(list)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if (x + w) // max_size == x // max_size:
            key_ = x // max_size
        else:
            key_ = (x + w) // max_size
        dic[key_].append([x, y, w, h])

    # For each column (in left-to-right order), the cut position is the right
    # edge of its right-most box; if a box extends past the current cut, move
    # the cut to that box's left edge so no character is split
    margins = []
    start = 0
    for k in dict(sorted(dic.items())):
        temp_margin = 0
        for v in dic[k]:
            if v[0] + v[2] > temp_margin:
                temp_margin = v[0] + v[2]
            if v[0] < temp_margin and (v[0] + v[2]) > temp_margin:
                temp_margin = v[0]
        margins.append((start, temp_margin))
        start = temp_margin
    print(margins)

    for i, (start, end) in enumerate(margins):
        cv2.imshow(f"{i}_img", img[:, start:end])
    cv2.waitKey(0)
```

+ `margins` stores the x coordinates of the cuts. If a character box overlaps a multiple of `max_size`, that box is moved into the next sub-image. In this example `max_size` is 384 and the character `不` overlaps x = 384, so the actual cut shifts left to 376: `[(0, 376), (376, 711)]`

![](https://i.imgur.com/QiRdZFx.png)
![](https://i.imgur.com/L2sizkQ.png)

+ Putting the above together:

```python
from collections import defaultdict

import cv2


def sub_crop(img, max_size=384):
    image_x = img.shape[1]
    if image_x < max_size:
        return [img]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)

    # Group each character box by the max_size-wide column it falls in;
    # boxes that straddle a column boundary go to the next column
    dic = defaultdict(list)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if (x + w) // max_size == x // max_size:
            key_ = x // max_size
        else:
            key_ = (x + w) // max_size
        dic[key_].append([x, y, w, h])

    # Compute the cut positions column by column
    margins = []
    start = 0
    for k in sorted(dic):
        temp_margin = 0
        for v in dic[k]:
            if v[0] + v[2] > temp_margin:
                temp_margin = v[0] + v[2]
            if v[0] < temp_margin and (v[0] + v[2]) > temp_margin:
                temp_margin = v[0]
        margins.append((start, temp_margin))
        start = temp_margin

    # Slice the line image at the computed cuts
    img_list = []
    for i, (start, end) in enumerate(margins):
        img_list.append(img[:, start:end])
    return img_list


def find_textlines():
    # Set the input image path and read the file
    path = "testing.png"
    img = cv2.imread(path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply Otsu binary threshold (inverted) to the image
    ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    # cv2.imshow('thresh1', thresh1)

    # Dilate with a wide rectangular kernel to merge characters into text lines
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3))
    dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)
    # cv2.imshow('dilation', dilation)

    contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # Crop each text line, sorted top to bottom by the y coordinate of its box
    img_list = []
    boxes = [cv2.boundingRect(c) for c in contours]
    boxes_sorted = sorted(boxes, key=lambda b: b[1])
    for box in boxes_sorted:
        x, y, w, h = box
        img_list.append(img[y:y + h, x:x + w])

    # Split over-wide lines and collect every crop
    text_img_list = []
    for line_img in img_list:
        _img_array = sub_crop(line_img)
        for _img in _img_array:
            text_img_list.append(_img)
            cv2.imshow('Cropped Image', _img)
            cv2.waitKey(0)
    return text_img_list
```

+ Original image

![](https://i.imgur.com/TOZIbbu.png)

+ Results

![](https://i.imgur.com/dBlw1GH.png)
![](https://i.imgur.com/YWlXFW0.png)
![](https://i.imgur.com/S4xyVDl.png)
![](https://i.imgur.com/aradZ0h.png)
![](https://i.imgur.com/zJBms2m.png)

+ Reference
    + https://www.tutorialspoint.com/how-to-find-the-bounding-rectangle-of-an-image-contour-in-opencv-python
    + https://stackoverflow.com/a/50777937
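+ Feeding the crops to TrOCR (a minimal sketch): the code below assumes the crops come from `find_textlines()` above and that Hugging Face `transformers` with PyTorch is installed. The checkpoint name `microsoft/trocr-base-printed` is only a placeholder; recognizing the Chinese text in this note would need a TrOCR model fine-tuned for Chinese, which is not part of the original pipeline.

```python
# Sketch only: run TrOCR over the crops returned by find_textlines().
# The checkpoint name is a placeholder; a Chinese-capable TrOCR model
# would be needed for the images in this note.
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")


def ocr_crops(crops):
    texts = []
    for crop in crops:
        # OpenCV crops are BGR ndarrays; the processor expects RGB images
        rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        pixel_values = processor(images=Image.fromarray(rgb), return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        texts.append(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
    return texts


# Hypothetical usage: print(ocr_crops(find_textlines()))
```

+ The 384 px `max_size` chosen above lines up with the 384 px input resolution commonly used by TrOCR checkpoints, so over-wide lines are already split before the processor resizes them.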