import os
import subprocess
import json
def _candidate_results_files(input_path, output_dir):
    """Return the paths where results.json may have been written, preferred first."""
    base = os.path.basename(os.path.normpath(input_path))
    subdirs = []
    # The OCR tool may nest its output in a folder named after the input; the
    # exact naming (full name, extension stripped, or cut at the first dot) is
    # not visible from here, so every variant is tried.
    for name in (base, os.path.splitext(base)[0], base.split(".")[0]):
        if name and name not in subdirs:
            subdirs.append(name)
    # The flat location comes first so callers that already find the file
    # directly inside output_dir keep getting exactly that file.
    return [os.path.join(output_dir, "results.json")] + [
        os.path.join(output_dir, name, "results.json") for name in subdirs
    ]


def extract_text_from_pdfs(input_path, output_dir="results/filename", langs=None):
    """
    Extract text from a PDF, or from all PDFs in a directory, using Surya OCR.

    Runs the ``surya_ocr`` command-line tool as a subprocess and returns the
    parsed contents of the ``results.json`` file it writes.

    Parameters:
        input_path (str): Path to the PDF file or directory containing PDF files.
        output_dir (str): Directory passed to ``surya_ocr`` as ``--results_dir``;
            created if it does not exist. Default is "results/filename".
        langs (list | str): Languages for OCR (e.g., ["en", "zh"]). A single
            string is passed through as-is (one language, or an already
            comma-separated list). Default is None, in which case no
            ``--langs`` option is passed to the tool.

    Returns:
        dict: The decoded results.json. As described for Surya's output, it is
        keyed by input filename, and each page entry carries:
            text_lines - the detected text and bounding boxes for each line
                text - the text in the line
                confidence - the confidence of the model in the detected text (0-1)
                polygon - the polygon for the text line in (x1, y1), (x2, y2),
                    (x3, y3), (x4, y4) format, clockwise from the top left
                bbox - the axis-aligned rectangle for the text line in
                    (x1, y1, x2, y2) format; (x1, y1) is the top left corner
                    and (x2, y2) the bottom right corner
            languages - the languages specified for the page
            page - the page number in the file
            image_bbox - the bbox for the image in (x1, y1, x2, y2) format;
                all line bboxes are contained within it

    Raises:
        FileNotFoundError: If input_path does not exist, or if no results.json
            can be found under output_dir after the tool has run. The same
            exception type also propagates from subprocess when the
            ``surya_ocr`` executable itself cannot be found.
        RuntimeError: If ``surya_ocr`` exits with a non-zero status; the
            original CalledProcessError is attached as ``__cause__``.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input path '{input_path}' does not exist.")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Construct the surya_ocr command (argument list, so no shell quoting issues)
    command = ["surya_ocr", input_path, "--results_dir", output_dir]
    if langs:
        # Joining a bare string would split it into characters ("en" -> "e,n").
        if isinstance(langs, str):
            langs = [langs]
        command.extend(["--langs", ",".join(langs)])

    # Run the command
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Error occurred during OCR processing: {e}") from e

    # Load the results.json file from the first location where it exists
    candidates = _candidate_results_files(input_path, output_dir)
    results_file = next((path for path in candidates if os.path.exists(path)), None)
    if results_file is None:
        message = f"Results file not found at '{candidates[0]}'."
        if len(candidates) > 1:
            message += f" Also checked: {', '.join(candidates[1:])}."
        raise FileNotFoundError(message)

    with open(results_file, "r", encoding="utf-8") as file:
        return json.load(file)
# Example usage: run OCR on a placeholder path and print the parsed results.
if __name__ == "__main__":
    ocr_output = extract_text_from_pdfs(
        "path/to/your/pdf/or/folder",
        langs=["zh"],  # Specify languages if needed
    )
    print(ocr_output)
https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#simplest-invocation-to-ocr-an-image
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up