Created
January 20, 2026 22:07
-
-
Save mswaringen/87e8455d1a256e4658fdac14183fad1e to your computer and use it in GitHub Desktop.
SSRN scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import re
from pathlib import Path
from urllib.parse import urljoin, urlparse

# Third-party: scrapling supplies the HTTP fetchers and the HTML parser.
from scrapling.fetchers import Fetcher, StealthyFetcher
from scrapling.parser import Selector

# SSRN abstract page to scrape; the numeric abstract_id identifies the paper.
paper_url = "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2041429"
def extract_pdf_url(html: str, base_url: str) -> str:
    """Locate the PDF download link inside an SSRN abstract page.

    Tries CSS selectors from most to least specific, then falls back to a
    raw regex scan of the markup. The winning href is resolved against
    *base_url* into an absolute URL.

    Raises:
        ValueError: when no candidate PDF link can be found.
    """
    selector = Selector(html)
    # Most specific first: SSRN's Delivery.cfm anchors, then any .pdf
    # anchor, then the citation_pdf_url meta tag used by scholarly indexers.
    queries = (
        'a[href*="Delivery.cfm"][href$=".pdf"]::attr(href)',
        'a[href$=".pdf"]::attr(href)',
        'meta[name="citation_pdf_url"]::attr(content)',
    )
    href = None
    for query in queries:
        href = selector.css(query).get()
        if href:
            break
    if not href:
        # Last resort: scan the raw HTML for a Delivery.cfm href.
        match = re.search(r'href="([^"]*Delivery\.cfm[^"]*)"', html, re.IGNORECASE)
        if match:
            href = match.group(1)
    if not href:
        raise ValueError("Could not find a PDF download link on the page.")
    return urljoin(base_url, href)
def filename_from_url(pdf_url: str) -> str:
    """Derive a local filename from *pdf_url*, guaranteeing a ``.pdf`` suffix.

    Uses the last path component of the URL; falls back to ``paper.pdf``
    when the URL path has no final component.
    """
    candidate = Path(urlparse(pdf_url).path).name
    if not candidate:
        candidate = "paper.pdf"
    if candidate.lower().endswith(".pdf"):
        return candidate
    return f"{candidate}.pdf"
def response_bytes(response) -> bytes:
    """Extract the raw payload bytes from a fetcher response object.

    Different fetcher backends expose the body under different attribute
    names, so probe the common binary attributes first, then fall back to
    the decoded ``text`` attribute re-encoded as UTF-8.

    Raises:
        TypeError: when no usable payload attribute is present.
    """
    probes = (getattr(response, name, None) for name in ("body", "content", "raw", "data"))
    for candidate in probes:
        if isinstance(candidate, (bytes, bytearray)):
            return bytes(candidate)
    decoded = getattr(response, "text", None)
    if isinstance(decoded, str):
        return decoded.encode("utf-8")
    raise TypeError("Unsupported response payload type for PDF download.")
def cookies_to_dict(cookies) -> dict:
    """Normalise cookies into a plain ``{name: value}`` dict.

    A dict is returned unchanged; a list/tuple of cookie records shaped
    like ``{"name": ..., "value": ...}`` is flattened (entries missing
    either key are skipped). Any other input yields an empty dict.
    """
    if isinstance(cookies, dict):
        return cookies
    if not isinstance(cookies, (list, tuple)):
        return {}
    return {
        entry["name"]: entry["value"]
        for entry in cookies
        if isinstance(entry, dict) and "name" in entry and "value" in entry
    }
def fetch_pdf(url: str, referer: str, cookies) -> bytes:
    """Download the PDF at *url*, retrying with a stealth browser fetch.

    First attempts a plain impersonated GET that carries the paper page
    as the Referer plus its session cookies. If that is rate-limited
    (HTTP 429) or the payload does not start with the ``%PDF`` magic
    bytes, retries with a headless browser fetch that can solve
    Cloudflare challenges. The final payload is returned unvalidated —
    the caller decides what to do with a non-PDF body.
    """
    response = Fetcher.get(
        url,
        impersonate="chrome",
        headers={"Referer": referer},
        cookies=cookies_to_dict(cookies),
    )
    payload = response_bytes(response)
    looks_like_pdf = payload.lstrip().startswith(b"%PDF")
    if response.status == 429 or not looks_like_pdf:
        # Retry with the full browser stack; cookies are passed in their
        # original shape since the stealth fetcher consumes them directly.
        retry = StealthyFetcher.fetch(
            url,
            headless=True,
            network_idle=True,
            cookies=cookies,
            solve_cloudflare=True,
        )
        payload = response_bytes(retry)
    return payload
# --- Script body: render the abstract page, then download its PDF. ---

# Render the paper page with a headless browser (SSRN sits behind bot checks).
page = StealthyFetcher.fetch(paper_url, headless=True, network_idle=True)
print(page.status)

# Persist the rendered HTML, named after the abstract_id when present.
html_dir = Path("html")
html_dir.mkdir(parents=True, exist_ok=True)
id_match = re.search(r"abstract_id=(\d+)", paper_url)
html_name = f"paper_{id_match.group(1)}.html" if id_match else "paper.html"
html_path = html_dir / html_name
html_path.write_text(page.html_content, encoding="utf-8")
print(f"Saved HTML to {html_path}")

# Resolve the PDF link from the page and download it.
pdf_url = extract_pdf_url(page.html_content, paper_url)
pdfs_dir = Path("pdfs")
pdfs_dir.mkdir(parents=True, exist_ok=True)
pdf_name = filename_from_url(pdf_url)
pdf_path = pdfs_dir / pdf_name
pdf_bytes = fetch_pdf(pdf_url, paper_url, page.cookies)

# Sanity-check the %PDF magic bytes before writing; when the server sent an
# HTML error/challenge page instead, keep the body on disk for debugging.
if not pdf_bytes.lstrip().startswith(b"%PDF"):
    pdf_debug_path = html_dir / f"{pdf_name}.html"
    pdf_debug_path.write_bytes(pdf_bytes)
    raise ValueError(f"Download did not look like a PDF. Saved HTML to {pdf_debug_path}")
pdf_path.write_bytes(pdf_bytes)
print(f"Saved PDF to {pdf_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment