hided62 · April 14, 2025 05:43 · hided62 · Apr 14, 2025
diff --git a/inserter.py b/inserter.py
 import pymupdf
 import json
 import argparse
    

 def add_hidden_text_layer(ocr_json_path, pdf_input, pdf_output):
    """Write the OCR text from a JSON result into a PDF as a hidden text layer.

    Every span found under ``pdf_info[*].para_blocks[*].lines[*].spans`` is
    placed at its bounding box using text render mode 3, and the resulting
    document is saved to ``pdf_output``.

    Blocks without a ``page_num`` key are skipped. Spans that have no
    ``bbox``, no non-blank ``content``, or a bounding box of zero height are
    skipped as well, since there is nothing meaningful to place for them.

    Args:
        ocr_json_path: Path to the UTF-8 encoded OCR result JSON file.
        pdf_input: Path of the PDF to read.
        pdf_output: Path the PDF with the added text layer is saved to.
    """
    with open(ocr_json_path, 'r', encoding='utf-8') as f:
        ocr_data = json.load(f)

    doc = pymupdf.open(pdf_input)
    try:
        for ocr_page in ocr_data.get("pdf_info", []):
            for block in ocr_page.get("para_blocks", []):
                if "page_num" not in block:
                    continue
                # "page_num" looks like 'page_0'; the numeric suffix is the page index.
                page_index = int(block["page_num"].split("_")[1])
                page = doc[page_index]
                tw = pymupdf.TextWriter(page.rect)

                print("page_num:", page_index)

                appended = False
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        bbox_ocr = span.get("bbox")
                        text = (span.get("content") or "").strip()
                        if not bbox_ocr or not text:
                            continue
                        rect = pymupdf.Rect(bbox_ocr)
                        # Heuristic: estimate the font size from the bbox height
                        # (adjust if it differs noticeably from the real size).
                        font_size = rect.height * 0.8
                        if font_size <= 0:
                            continue
                        # Anchor point: left edge of the bbox, one font size below its top.
                        point = pymupdf.Point(bbox_ocr[0], bbox_ocr[1] + font_size)
                        # The trailing space keeps consecutive spans separated.
                        tw.append(point, text + " ", fontsize=font_size)
                        appended = True

                # Only touch the page when this block contributed some text.
                if appended:
                    tw.write_text(page, render_mode=3)

        doc.save(pdf_output)
    finally:
        # Release the document even when processing or saving fails.
        doc.close()
    print("Hidden text layer added to PDF:", pdf_output)

 def main():
    """Read the three required paths from the command line and run the insertion."""
    cli = argparse.ArgumentParser(description="Add a hidden text layer to a PDF using Paddle OCR JSON")

    # Each option is a required string; only the flag and help text differ.
    required_options = (
        ("--ocr_json_path", "Path to the OCR result JSON file"),
        ("--pdf_input", "Path to the input PDF file"),
        ("--pdf_output", "Path to the output PDF file"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    opts = cli.parse_args()
    add_hidden_text_layer(opts.ocr_json_path, opts.pdf_input, opts.pdf_output)

 # Run the command-line entry point only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import pymupdf
	import json
	import argparse


	def add_hidden_text_layer(ocr_json_path, pdf_input, pdf_output):
	    """Write the OCR text from a JSON result into a PDF as a hidden text layer.

	    Every span found under ``pdf_info[*].para_blocks[*].lines[*].spans`` is
	    placed at its bounding box using text render mode 3, and the resulting
	    document is saved to ``pdf_output``.

	    Blocks without a ``page_num`` key are skipped. Spans that have no
	    ``bbox``, no non-blank ``content``, or a bounding box of zero height are
	    skipped as well, since there is nothing meaningful to place for them.

	    Args:
	        ocr_json_path: Path to the UTF-8 encoded OCR result JSON file.
	        pdf_input: Path of the PDF to read.
	        pdf_output: Path the PDF with the added text layer is saved to.
	    """
	    with open(ocr_json_path, 'r', encoding='utf-8') as f:
	        ocr_data = json.load(f)

	    doc = pymupdf.open(pdf_input)
	    try:
	        for ocr_page in ocr_data.get("pdf_info", []):
	            for block in ocr_page.get("para_blocks", []):
	                if "page_num" not in block:
	                    continue
	                # "page_num" looks like 'page_0'; the numeric suffix is the page index.
	                page_index = int(block["page_num"].split("_")[1])
	                page = doc[page_index]
	                tw = pymupdf.TextWriter(page.rect)

	                print("page_num:", page_index)

	                appended = False
	                for line in block.get("lines", []):
	                    for span in line.get("spans", []):
	                        bbox_ocr = span.get("bbox")
	                        text = (span.get("content") or "").strip()
	                        if not bbox_ocr or not text:
	                            continue
	                        rect = pymupdf.Rect(bbox_ocr)
	                        # Heuristic: estimate the font size from the bbox height
	                        # (adjust if it differs noticeably from the real size).
	                        font_size = rect.height * 0.8
	                        if font_size <= 0:
	                            continue
	                        # Anchor point: left edge of the bbox, one font size below its top.
	                        point = pymupdf.Point(bbox_ocr[0], bbox_ocr[1] + font_size)
	                        # The trailing space keeps consecutive spans separated.
	                        tw.append(point, text + " ", fontsize=font_size)
	                        appended = True

	                # Only touch the page when this block contributed some text.
	                if appended:
	                    tw.write_text(page, render_mode=3)

	        doc.save(pdf_output)
	    finally:
	        # Release the document even when processing or saving fails.
	        doc.close()
	    print("Hidden text layer added to PDF:", pdf_output)

	def main():
	    """Read the three required paths from the command line and run the insertion."""
	    cli = argparse.ArgumentParser(description="Add a hidden text layer to a PDF using Paddle OCR JSON")

	    # Each option is a required string; only the flag and help text differ.
	    required_options = (
	        ("--ocr_json_path", "Path to the OCR result JSON file"),
	        ("--pdf_input", "Path to the input PDF file"),
	        ("--pdf_output", "Path to the output PDF file"),
	    )
	    for flag, help_text in required_options:
	        cli.add_argument(flag, type=str, required=True, help=help_text)

	    opts = cli.parse_args()
	    add_hidden_text_layer(opts.ocr_json_path, opts.pdf_input, opts.pdf_output)

	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found