Paddle OCR - HackMD

# Paddle OCR

![image](https://hackmd.io/_uploads/ryXtC4drC.png)

# 偵測文字將其框起來

![image](https://hackmd.io/_uploads/r10DQ4rBA.png)

顯示被框起來的文本、座標

![image](https://hackmd.io/_uploads/Bkvi7ErS0.png)

```python
from paddleocr import PaddleOCR, draw_ocr
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Initialise PaddleOCR (Chinese model, text-angle classifier enabled).
ocr = PaddleOCR(use_angle_cls=True, lang="ch")

# Path of the image to process.
img_path = '15.jpg'
img_name = os.path.basename(img_path)

# Run text detection / recognition on the image.
result_det = ocr.ocr(img_path, det=True)

for res in result_det:
    # An image without any detected text yields None instead of a list.
    if res is None:
        continue
    for line in res:
        print("文本:")
        print(line[1][0])
        print("座標:")
        print(line[0][0:4])

# Visualise the detection result and save it.
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in (result_det[0] or [])]
im_show = draw_ocr(image, boxes, font_path='./fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
os.makedirs('result', exist_ok=True)  # save() fails if the folder is missing
im_show.save('result/{}only_detection.jpg'.format(os.path.splitext(img_name)[0]))

plt.figure(figsize=(15, 8))
plt.imshow(im_show)
plt.show()
# Only detects the text on the image.
```

# 偵測文字將其框起來-PDF版

![image](https://hackmd.io/_uploads/HJjfVNrH0.png)

```python
from paddleocr import PaddleOCR, draw_ocr
import os
import fitz
from PIL import Image
import cv2
import numpy as np

PAGE_NUM = 1  # number of PDF pages to process
pdf_path = 'test.pdf'

# Need to run only once to download and load the model into memory.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM)
# To use the GPU, uncomment this line and comment out the one above.
# ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM,use_gpu=0)

result = ocr.ocr(pdf_path, cls=True)
for res in result:
    # A page without any detected text yields None instead of a list.
    if res is None:
        continue
    for line in res:
        print("文本:")
        print(line[1][0])
        print("座標:")
        print(line[0][0:4])

# Render the PDF pages to images so the boxes can be drawn on them.
imgs = []
with fitz.open(pdf_path) as pdf:
    # Never request more pages than the document actually has.
    for pg in range(min(PAGE_NUM, len(pdf))):
        page = pdf.load_page(pg)
        pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
        # If width or height > 2000 pixels, don't enlarge the image.
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

os.makedirs('result_pdf', exist_ok=True)  # save() fails if the folder is missing
for idx, (res, image) in enumerate(zip(result, imgs)):
    if res is None:
        print(f"[DEBUG] Empty page {idx} detected, skip it.")
        continue
    boxes = [line[0] for line in res]
    # txts = [line[1][0] for line in res]    # recognised text
    # scores = [line[1][1] for line in res]  # confidence score
    im_show = draw_ocr(image, boxes, font_path='simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_pdf/result_pdf_detection_{}.jpg'.format(idx))
# PDF text detection
```

# PDF 全程體驗圖片上顯示含文字及準確度

![image](https://hackmd.io/_uploads/B1OlB4rB0.png)

```python
from paddleocr import PaddleOCR, draw_ocr
import os
import fitz
from PIL import Image
import cv2
import numpy as np

PAGE_NUM = 1  # Set the recognition page number
pdf_path = 'test.pdf'

# Need to run only once to download and load the model into memory.
ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM)
# To use the GPU, uncomment this line and comment out the one above.
# ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM,use_gpu=0)

result = ocr.ocr(pdf_path, cls=True)

# Render the PDF pages to images so the results can be drawn on them.
imgs = []
with fitz.open(pdf_path) as pdf:
    # Page indices are 0-based; never request more pages than the document has.
    for pg in range(min(PAGE_NUM, len(pdf))):
        page = pdf.load_page(pg)
        pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
        # If width or height > 2000 pixels, don't enlarge the image.
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

os.makedirs('result_pdf', exist_ok=True)  # save() fails if the folder is missing
for idx, (res, image) in enumerate(zip(result, imgs)):
    if res is None:
        print(f"[DEBUG] Empty page {idx} detected, skip it.")
        continue
    boxes = [line[0] for line in res]
    txts = [line[1][0] for line in res]
    scores = [line[1][1] for line in res]
    im_show = draw_ocr(image, boxes, txts, scores, font_path='simfang.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result_pdf/result_pdf_{}.jpg'.format(idx))
# Full PDF pipeline (boxes + text + scores)
```

# 只抓取特定文字

![image](https://hackmd.io/_uploads/ByEUINHH0.png)

```python
from paddleocr import PaddleOCR, draw_ocr
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Initialise PaddleOCR.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")

# Path of the image to process.
img_path = '15.jpg'
img_name = os.path.basename(img_path)

# The fixed text to look for.
target_text = "冬阳君>"

# Run OCR and keep only the boxes whose recognised text equals the target.
result_det = ocr.ocr(img_path, det=True)

# result_det[0] is None when nothing was detected on the image.
target_boxes = [
    line[0]
    for line in (result_det[0] or [])
    if line[1][0] == target_text
]

# Visualise the matching boxes and save the result.
image = Image.open(img_path).convert('RGB')
im_show = draw_ocr(image, target_boxes, font_path='./fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
os.makedirs('result', exist_ok=True)  # save() fails if the folder is missing
im_show.save('result/{}only_detection.jpg'.format(os.path.splitext(img_name)[0]))

plt.figure(figsize=(15, 8))
plt.imshow(im_show)
plt.show()
```

# 只抓取特定文字-PDF

![image](https://hackmd.io/_uploads/SkTJPESrC.png)

```python
from paddleocr import PaddleOCR, draw_ocr
import os
import fitz
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt

PAGE_NUM = 1  # number of PDF pages to process
pdf_path = 'test.pdf'

ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM)

# The fixed text to look for.
target_text = "Sharon"

# Run OCR on the PDF.
result = ocr.ocr(pdf_path, cls=True)

# Render the PDF pages to images so the boxes can be drawn on them.
imgs = []
with fitz.open(pdf_path) as pdf:
    # Never request more pages than the document actually has.
    for pg in range(min(PAGE_NUM, len(pdf))):
        page = pdf.load_page(pg)
        pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
        # If width or height > 2000 pixels, don't enlarge the image.
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

os.makedirs('result_pdf', exist_ok=True)  # save() fails if the folder is missing
for idx, (res, image) in enumerate(zip(result, imgs)):
    if res is None:
        print(f"[DEBUG] Empty page {idx} detected, skip it.")
        continue

    # Keep only the boxes whose recognised text equals the target.
    target_boxes = [line[0] for line in res if line[1][0] == target_text]

    # Show and save the page only when the target text was found on it.
    if target_boxes:
        im_show = draw_ocr(image, target_boxes, font_path='simfang.ttf')
        im_show = Image.fromarray(im_show)
        im_show.save('result_pdf/result_pdf_detection_{}.jpg'.format(idx))

        plt.figure(figsize=(15, 8))
        plt.imshow(im_show)
        plt.show()
```

**如果要直接找出某文字的座標，可直接用 `line[1][0]`（辨識出的文字）判斷是否符合；符合就輸出該行的座標 `line[0]`。**

-------------------------

問題

![image](https://hackmd.io/_uploads/r1S8DVBBA.png)

如果要抓取特定文字 "Sharon"，單行只有 Sharon 時沒問題；但如果像是「交給Sharon掃描歸檔」這種整行文字，就不會偵測到。原因是程式用 `text == target_text` 做完全相等的比對，而 PaddleOCR 是以「整行文字」為單位回傳結果；改用 `target_text in text` 就能比對到包含該文字的行（框出來的會是整行的座標）。

-------------------------

# PaddleOCR-參考資料

[官方github](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7?tab=readme-ov-file)

[PaddleOCR 快速开始](https://github.com/PaddlePaddle/PaddleOCR/blob/main/doc/doc_ch/quickstart.md)

[官方PaddlePaddle文檔](https://www.paddlepaddle.org.cn/tutorials/projectdetail/5603475)

[PPOCRv3的det检测模型finetune训练](https://blog.csdn.net/z5z5z5z56/article/details/129377434)

-------------------------

# PP-Structure

PPstructure

PP-structure 图像方向分类+版面分析+表格识别

原圖:

![demo03](https://hackmd.io/_uploads/S1ZiREPfC.png)

structure生成 xlsx

![image](https://hackmd.io/_uploads/S12qAVDGA.png)

```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

# Chinese test image
table_engine = PPStructure(recovery=True)
# English test image
# table_engine = PPStructure(recovery=True, lang='en')

save_folder = 'output'
img_path = 'demo01.jpg'
img_stem = os.path.basename(img_path).split('.')[0]

img = cv2.imread(img_path)
# cv2.imread returns None (no exception) when the file is missing or unreadable.
if img is None:
    raise FileNotFoundError(f"Cannot read image: {img_path}")

result = table_engine(img)
save_structure_res(result, save_folder, img_stem)

for line in result:
    line.pop('img')  # drop the cropped image array so the printout stays readable
    print(line)

h, w, _ = img.shape
res = sorted_layout_boxes(result, w)
convert_info_docx(img, res, save_folder, img_stem)
# Layout recovery
```

原圖:

![demo01](https://hackmd.io/_uploads/H1nfkHDzC.jpg)

structure生成 xlsx

![image](https://hackmd.io/_uploads/H1vbJHDMA.png)

PDF 版面恢復-to word

![image](https://hackmd.io/_uploads/HywcuDBrA.png)

```python
import subprocess


def run_paddleocr(image_dir, output_dir):
    """Run the PaddleOCR layout-recovery command line on a PDF or image.

    Args:
        image_dir: Path of the input PDF/image passed as ``--image_dir``.
        output_dir: Folder passed as ``--output`` for the generated files.

    Returns:
        The exit code of the ``paddleocr`` process.
    """
    command = [
        "paddleocr",
        f"--image_dir={image_dir}",
        "--type=structure",
        "--recovery=true",
        "--use_pdf2docx_api=true",
        f"--output={output_dir}",
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    return subprocess.run(command, check=False).returncode


if __name__ == "__main__":
    image_dir = "demo02.pdf"  # pdf
    run_paddleocr(image_dir, output_dir="output/")
```

# PP-structure-參考資料

[github](https://github.com/PaddlePaddle/PaddleOCR/blob/main/ppstructure/README_ch.md)

[PP-Structure工具包：PDF图片表格一键提取解决方案](https://aistudio.baidu.com/projectdetail/2274897)

--------------------------------------

# PP-structure-关键信息抽取

原圖:

![deom0613_1](https://hackmd.io/_uploads/HJ1p-S_rC.jpg)

![deom0613_1_ser_re](https://hackmd.io/_uploads/BJE-GSdS0.jpg)

[github](https://github.com/PaddlePaddle/PaddleOCR/blob/main/ppstructure/kie/README_ch.md)

根據上面的github下載模型和套件

關鍵信息抽取-RE-cmd指令

```bash
python kie/predict_kie_token_ser_re.py --kie_algorithm=LayoutXLM --re_model_dir=../inference/re_vi_layoutxlm_xfund_infer --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer --use_visual_backbone=False --image_dir=./docs/kie/input/"圖片" --ser_dict_path=../train_data/XFUND/class_list_xfun.txt --vis_font_path=../doc/fonts/simfang.ttf --ocr_order_method="tb-yx"
```