import os
import subprocess
import json
def extract_text_from_pdfs(input_path, output_dir="results/filename", langs=None):
    """
    Extract text from a PDF, or from all PDFs in a directory, using Surya OCR.

    Runs the ``surya_ocr`` command-line tool as a subprocess, then loads and
    returns the ``results.json`` file it wrote.

    Parameters:
        input_path (str): Path to the PDF file or directory containing PDF files.
        output_dir (str): Directory handed to ``surya_ocr --results_dir``; it is
            created if it does not exist. Default is "results/filename".
        langs (list | str | None): Languages for OCR (e.g., ["en", "zh"]). A
            single string such as "en" is treated as one language rather than
            being split into characters. Default is None, which omits the
            ``--langs`` option (auto-detect).

    Returns:
        dict: Parsed contents of results.json, with filenames as keys. Each
        page entry carries:
            text_lines - the detected text and bounding boxes for each line
                text - the text in the line
                confidence - the confidence of the model in the detected text (0-1)
                polygon - the polygon for the text line in (x1, y1), (x2, y2),
                    (x3, y3), (x4, y4) format. The points are in clockwise
                    order from the top left.
                bbox - the axis-aligned rectangle for the text line in
                    (x1, y1, x2, y2) format. (x1, y1) is the top left corner,
                    and (x2, y2) is the bottom right corner.
            languages - the languages specified for the page
            page - the page number in the file
            image_bbox - the bbox for the image in (x1, y1, x2, y2) format.
                (x1, y1) is the top left corner, and (x2, y2) is the bottom
                right corner. All line bboxes will be contained within this
                bbox.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist, if no results.json
            can be found after the run, or (raised by ``subprocess``) if the
            ``surya_ocr`` executable is not on PATH.
        RuntimeError: If ``surya_ocr`` exits with a non-zero status; the
            original ``CalledProcessError`` is attached as ``__cause__``.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input path '{input_path}' does not exist.")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Construct the surya_ocr command (argument list, so no shell quoting issues)
    command = ["surya_ocr", input_path, "--results_dir", output_dir]
    if langs:
        # A bare string would otherwise be joined character by character
        # ("en" -> "e,n"), so wrap it into a one-element list first.
        if isinstance(langs, str):
            langs = [langs]
        command.extend(["--langs", ",".join(langs)])

    # Run the command
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # Chain explicitly so the exit status and command stay inspectable.
        raise RuntimeError(f"Error occurred during OCR processing: {e}") from e

    # Load the results.json file
    results_file = os.path.join(output_dir, "results.json")
    if not os.path.exists(results_file):
        # NOTE(review): Surya appears to nest its output one level deeper, in
        # a folder named after the input (basename up to the first dot) --
        # confirm against the installed Surya version. Checking that location
        # as a fallback is harmless when the layout is flat.
        input_stem = os.path.basename(input_path).split(".")[0]
        nested_file = os.path.join(output_dir, input_stem, "results.json")
        if not os.path.exists(nested_file):
            raise FileNotFoundError(f"Results file not found at '{results_file}'.")
        results_file = nested_file

    with open(results_file, "r", encoding="utf-8") as file:
        extracted_text = json.load(file)
    return extracted_text
# Example usage: run OCR on a placeholder path and print the parsed results.
if __name__ == "__main__":
    demo_langs = ["zh"]  # language codes forwarded to the OCR call
    demo_source = "path/to/your/pdf/or/folder"  # replace with a real file or folder
    print(extract_text_from_pdfs(demo_source, langs=demo_langs))
# See also (Tesseract command-line usage): https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#simplest-invocation-to-ocr-an-image
# 最後更新:2025‑05‑03
# May 3, 2025 — image
# Apr 22, 2025 — 要在 AI 訓練中「把資料用到最有效」,關鍵不在於盲目囤積樣本,而是確保資料本身乾淨、去重、資訊密度高,並搭配策略性取樣(Active Learning)、資料剪枝(Data Pruning)、資料集蒸餾(Dataset Distillation)、自監督/遷移式預訓練及合成資料等技術,配合課程學習將「先易後難」的樣本排序,再透過持續監測資料—效能曲線,動態調整來源與標註流程,就能在相同算力下取得更高泛化能力與更快收斂速度。
# Apr 20, 2025 — 列出三到五個你常用或是你曾經聽過的 AI 工具/系統,e.g., NoteBookLM, Napkin.AI, Perplexity.AI,…
# Apr 10, 2025 — or
# By clicking below, you agree to our terms of service.
# New to HackMD? Sign up