Skip to content

Instantly share code, notes, and snippets.

@hided62
Last active April 14, 2025 05:43
Show Gist options
  • Select an option

  • Save hided62/b98f51d6d97f42a2fdb90b1b86da12b1 to your computer and use it in GitHub Desktop.

Select an option

Save hided62/b98f51d6d97f42a2fdb90b1b86da12b1 to your computer and use it in GitHub Desktop.
Add Hidden Text Layer to PDF Using Paddle OCR (MinerU, magic-pdf) JSON
import pymupdf
import json
import argparse
def add_hidden_text_layer(ocr_json_path: str, pdf_input: str, pdf_output: str) -> None:
    """Overlay OCR text on a PDF as a hidden text layer and save the result.

    The OCR JSON is walked as ``pdf_info`` -> ``para_blocks`` -> ``lines`` ->
    ``spans``. For every span that has both a ``bbox`` and non-empty
    ``content``, the text is placed at the bbox position on the page named by
    the enclosing block's ``page_num`` (for example ``"page_0"``) and written
    with text render mode 3, which leaves the glyphs unpainted.

    Blocks without a ``page_num`` key are skipped, as are spans that have no
    text, no bbox, or a bbox with no height.

    Args:
        ocr_json_path: Path to the OCR result JSON file (read as UTF-8).
        pdf_input: Path to the PDF that receives the text layer.
        pdf_output: Path the modified PDF is saved to.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the JSON file is not valid JSON.
        IndexError: If a block's ``page_num`` refers to a page that the PDF
            does not have.
    """
    # Load OCR JSON data
    with open(ocr_json_path, "r", encoding="utf-8") as f:
        ocr_data = json.load(f)

    # Open the existing PDF; try/finally guarantees the handle is released
    # even when a malformed OCR entry raises part-way through.
    doc = pymupdf.open(pdf_input)
    try:
        for ocr_page in ocr_data.get("pdf_info", []):
            for block in ocr_page.get("para_blocks", []):
                if "page_num" not in block:
                    continue

                # Page number: e.g., 'page_0'
                page_index = int(block["page_num"].split("_")[1])
                page = doc[page_index]

                # NOTE(review): bbox values are used unscaled, i.e. they are
                # assumed to already be in the PDF page's coordinate space.
                # Confirm against the OCR JSON's page_size if pages look
                # misaligned.
                tw = pymupdf.TextWriter(page.rect)
                print("page_num:", page_index)

                wrote_text = False
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        bbox_ocr = span.get("bbox")
                        text = span.get("content")
                        # Spans without text or geometry cannot contribute
                        # to the text layer.
                        if not bbox_ocr or not isinstance(text, str):
                            continue
                        text = text.strip()
                        if not text:
                            continue

                        rect = pymupdf.Rect(bbox_ocr)
                        # Estimate font size based on bbox height
                        # (adjust if it differs noticeably from the real
                        # font size).
                        font_size = rect.height * 0.8
                        if font_size <= 0:
                            # Degenerate bbox: no usable font size.
                            continue

                        # The insertion point is the text baseline, so shift
                        # down from the bbox top by the font size.
                        point = pymupdf.Point(bbox_ocr[0], bbox_ocr[1] + font_size)
                        # The trailing space keeps adjacent spans from fusing
                        # into one word.
                        tw.append(point, text + " ", fontsize=font_size)
                        wrote_text = True

                if wrote_text:
                    # render_mode=3: text is neither filled nor stroked.
                    tw.write_text(page, render_mode=3)

        doc.save(pdf_output)
    finally:
        doc.close()

    print("Hidden text layer added to PDF:", pdf_output)
def main():
    """Read the three required paths from the command line and run the conversion."""
    cli = argparse.ArgumentParser(
        description="Add a hidden text layer to a PDF using Paddle OCR JSON"
    )
    # Every option is a required string path; only the flag and help differ.
    required_paths = (
        ("--ocr_json_path", "Path to the OCR result JSON file"),
        ("--pdf_input", "Path to the input PDF file"),
        ("--pdf_output", "Path to the output PDF file"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    options = cli.parse_args()
    add_hidden_text_layer(
        options.ocr_json_path,
        options.pdf_input,
        options.pdf_output,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@hided62
Copy link
Author

hided62 commented Apr 14, 2025

An example of performing OCR on a Korean document and adding a hidden text layer to a PDF file:

magic-pdf -p a.pdf -o output -m ocr -l korean
python inserter.py --ocr_json_path output/a/ocr/a_middle.json --pdf_input a.pdf --pdf_output a_ocr.pdf

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment