Skip to content

Instantly share code, notes, and snippets.

View mswaringen's full-sized avatar

Mark Swaringen mswaringen

  • Remote
  • 04:02 (UTC)
View GitHub Profile
from pathlib import Path
import re
from urllib.parse import urljoin, urlparse
from scrapling.fetchers import Fetcher, StealthyFetcher
from scrapling.parser import Selector
# URL of an SSRN abstract page (abstract_id=2041429).
# NOTE(review): presumably the page the script fetches to locate the paper;
# its use is not visible in this part of the file — confirm.
paper_url = "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2041429"
# need to have Tesseract installed
# pip install pymupdf
# get_textpage attempts hybrid OCR first: if it encounters a non-digital block, it OCRs only that block.
# If the text is unreadable, it reverts to full-page OCR.
def get_textpage(page):
tp = page.get_textpage_ocr(flags=0,language="eng+por", dpi=300, full=False)
page_text = page.get_text(textpage=tp, sort=True)
readable = is_readable_text(page_text)
if not readable: