Last active
April 14, 2025 05:43
-
-
Save hided62/b98f51d6d97f42a2fdb90b1b86da12b1 to your computer and use it in GitHub Desktop.
Add Hidden Text Layer to PDF Using Paddle OCR (MinerU, magic-pdf) JSON
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pymupdf | |
| import json | |
| import argparse | |
def add_hidden_text_layer(ocr_json_path, pdf_input, pdf_output):
    """Overlay invisible OCR text onto an existing PDF and save the result.

    The OCR JSON is read as a mapping whose ``"pdf_info"`` entry is a list
    of page records. Each record's ``"para_blocks"`` are walked; a block is
    used only if it carries a ``"page_num"`` such as ``'page_0'``. Every
    span found under ``block["lines"][*]["spans"]`` that has both a
    ``"bbox"`` and non-empty ``"content"`` is written to the matching PDF
    page with ``render_mode=3``.

    Args:
        ocr_json_path: Path to the OCR result JSON file (read as UTF-8).
        pdf_input: Path to the PDF that receives the text layer.
        pdf_output: Path the modified PDF is saved to.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the JSON file is malformed.
        IndexError: If a block's page number does not exist in the PDF
            (or ``"page_num"`` has no ``_<index>`` part).
        ValueError: If the index part of ``"page_num"`` is not an integer.
    """
    # Load OCR JSON data
    with open(ocr_json_path, "r", encoding="utf-8") as f:
        ocr_data = json.load(f)

    # Open the existing PDF; the context manager closes the document even
    # when a malformed block or a failed save raises part-way through.
    with pymupdf.open(pdf_input) as doc:
        # ``or []`` also covers a key that is present but null.
        for ocr_page in ocr_data.get("pdf_info") or []:
            # Process the "para_blocks" section
            for block in ocr_page.get("para_blocks") or []:
                if "page_num" not in block:
                    continue

                # Page number: e.g., 'page_0'
                page_index = int(block["page_num"].split("_")[1])
                page = doc[page_index]

                # NOTE(review): assumes OCR bbox coordinates are already in
                # the PDF page's coordinate space -- confirm against the
                # OCR JSON's page_size if output looks misplaced.
                tw = pymupdf.TextWriter(page.rect)
                print("page_num:", page_index)

                appended = False
                for line in block.get("lines") or []:
                    for span in line.get("spans") or []:
                        # Spans without text or geometry cannot be placed;
                        # skip them instead of aborting on KeyError.
                        bbox_ocr = span.get("bbox")
                        text = (span.get("content") or "").strip()
                        if not bbox_ocr or not text:
                            continue

                        rect = pymupdf.Rect(bbox_ocr)
                        # Estimate font size based on bbox height
                        # (Adjust if there is a significant difference
                        # from the actual font size)
                        font_size = rect.height * 0.8
                        if font_size <= 0:
                            # Degenerate bbox: no usable font size.
                            continue

                        # The insertion point is the text baseline, so
                        # shift down from the top edge of the bbox.
                        point = pymupdf.Point(
                            bbox_ocr[0], bbox_ocr[1] + font_size
                        )
                        # Trailing space keeps adjacent spans separated
                        # when the text is extracted or searched.
                        tw.append(point, text + " ", fontsize=font_size)
                        appended = True

                # Only touch the page when this block produced text.
                if appended:
                    tw.write_text(page, render_mode=3)

        doc.save(pdf_output)

    print("Hidden text layer added to PDF:", pdf_output)
def main():
    """Parse the command-line paths and add the hidden text layer.

    All three options (``--ocr_json_path``, ``--pdf_input``,
    ``--pdf_output``) are required strings; they are forwarded in that
    order to ``add_hidden_text_layer``.
    """
    cli = argparse.ArgumentParser(
        description="Add a hidden text layer to a PDF using Paddle OCR JSON"
    )

    # (flag, help text) pairs, registered in this order.
    path_options = (
        ("--ocr_json_path", "Path to the OCR result JSON file"),
        ("--pdf_input", "Path to the input PDF file"),
        ("--pdf_output", "Path to the output PDF file"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    opts = cli.parse_args()
    add_hidden_text_layer(opts.ocr_json_path, opts.pdf_input, opts.pdf_output)
# Run the command-line interface only when this file is executed as a
# script, so importing it does not trigger argument parsing.
if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
An example of performing OCR on a Korean document and adding a hidden text layer to a PDF file