OCR function

Surya OCR

import os
import subprocess
import json

def extract_text_from_pdfs(input_path, output_dir="results/filename", langs=None):
    """
    Run the ``surya_ocr`` command-line tool on a PDF (or a directory of PDFs)
    and return the parsed contents of the ``results.json`` file it produces.

    Parameters:
        input_path (str): Path to the PDF file or directory containing PDF files.
        output_dir (str): Directory passed to ``surya_ocr`` as ``--results_dir``.
            It is created if it does not exist. Default is "results/filename".
        langs (list | str | None): Languages for OCR (e.g., ["en", "zh"]).
            A single string such as "en" is treated as a one-element list.
            When None or empty, no ``--langs`` option is passed to the tool.

    Returns:
        dict: The decoded content of results.json, with filenames as keys.
        Each page entry is expected to contain:
            text_lines - the detected text and bounding boxes for each line
                text - the text in the line
                confidence - the confidence of the model in the detected text (0-1)
                polygon - the polygon for the text line in (x1, y1), (x2, y2),
                    (x3, y3), (x4, y4) format. The points are in clockwise
                    order from the top left.
                bbox - the axis-aligned rectangle for the text line in
                    (x1, y1, x2, y2) format. (x1, y1) is the top left corner,
                    and (x2, y2) is the bottom right corner.
            languages - the languages specified for the page
            page - the page number in the file
            image_bbox - the bbox for the image in (x1, y1, x2, y2) format.
                All line bboxes will be contained within this bbox.

    Raises:
        FileNotFoundError: If input_path does not exist, if no results.json
            can be found after the run, or (raised by subprocess) if the
            ``surya_ocr`` executable is not on PATH.
        RuntimeError: If ``surya_ocr`` exits with a non-zero status. The
            original CalledProcessError is attached as ``__cause__``.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input path '{input_path}' does not exist.")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Construct the surya_ocr command (argument list, so no shell quoting issues)
    command = ["surya_ocr", input_path, "--results_dir", output_dir]
    if langs:
        # A bare string would otherwise be joined character by character
        # ("en" -> "e,n"), so wrap it in a list first.
        if isinstance(langs, str):
            langs = [langs]
        command.extend(["--langs", ",".join(langs)])

    # Run the command; keep the original failure as the explicit cause
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Error occurred during OCR processing: {e}") from e

    # Locate the results.json file
    results_file = os.path.join(output_dir, "results.json")
    if not os.path.exists(results_file):
        # NOTE(review): surya_ocr appears to write its output into a
        # sub-folder of the results directory named after the input (file
        # name up to the first dot, or the directory name) -- TODO confirm
        # against the installed Surya version. Fall back to that location.
        base_name = os.path.basename(input_path)
        if not os.path.isdir(input_path):
            base_name = base_name.split(".")[0]
        nested_results_file = os.path.join(output_dir, base_name, "results.json")
        if not os.path.exists(nested_results_file):
            raise FileNotFoundError(f"Results file not found at '{results_file}'.")
        results_file = nested_results_file

    with open(results_file, "r", encoding="utf-8") as file:
        extracted_text = json.load(file)

    return extracted_text

# Demo entry point: OCR a placeholder path and print whatever was parsed.
if __name__ == "__main__":
    print(
        extract_text_from_pdfs(
            "path/to/your/pdf/or/folder",
            langs=["zh"],  # Specify languages if needed
        )
    )

Tesseract OCR

https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#simplest-invocation-to-ocr-an-image

Docling OCR

https://github.com/DS4SD/docling