This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pathlib import Path | |
| import re | |
| from urllib.parse import urljoin, urlparse | |
| from scrapling.fetchers import Fetcher, StealthyFetcher | |
| from scrapling.parser import Selector | |
| paper_url = "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2041429" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # need to have Tesseract installed | |
| # pip install pymupdf | |
| # get_textpage attempts hybrid OCR first: if it encounters a non-digital block, it OCRs only that block | |
| # if the resulting text is unreadable, it reverts to full-page OCR | |
| def get_textpage(page): | |
| tp = page.get_textpage_ocr(flags=0,language="eng+por", dpi=300, full=False) | |
| page_text = page.get_text(textpage=tp, sort=True) | |
| readable = is_readable_text(page_text) | |
| if not readable: |