Skip to content

Instantly share code, notes, and snippets.

@mswaringen
Created January 20, 2026 22:07
Show Gist options
  • Select an option

  • Save mswaringen/87e8455d1a256e4658fdac14183fad1e to your computer and use it in GitHub Desktop.

Select an option

Save mswaringen/87e8455d1a256e4658fdac14183fad1e to your computer and use it in GitHub Desktop.
SSRN scraper
from pathlib import Path
import re
from urllib.parse import urljoin, urlparse
from scrapling.fetchers import Fetcher, StealthyFetcher
from scrapling.parser import Selector
# SSRN abstract page to scrape; the numeric abstract_id in the query string
# is reused below to name the saved HTML file.
paper_url = "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2041429"
def extract_pdf_url(html: str, base_url: str) -> str:
    """Find the PDF download link on an SSRN paper page and absolutize it.

    Tries progressively looser strategies in order: the Delivery.cfm
    anchor, any ``.pdf`` anchor, the ``citation_pdf_url`` meta tag, and
    finally a raw regex scan of the HTML for a Delivery.cfm href.

    Raises:
        ValueError: if no candidate link is found.
    """
    selector = Selector(html)
    css_candidates = (
        'a[href*="Delivery.cfm"][href$=".pdf"]::attr(href)',
        'a[href$=".pdf"]::attr(href)',
        'meta[name="citation_pdf_url"]::attr(content)',
    )
    href = None
    for query in css_candidates:
        href = selector.css(query).get()
        if href:
            break
    if not href:
        # Last resort: the link may be built by markup the parser missed.
        raw_match = re.search(r'href="([^"]*Delivery\.cfm[^"]*)"', html, re.IGNORECASE)
        if raw_match:
            href = raw_match.group(1)
    if not href:
        raise ValueError("Could not find a PDF download link on the page.")
    return urljoin(base_url, href)
def filename_from_url(pdf_url: str) -> str:
    """Derive a local ``.pdf`` filename from a download URL's path component.

    Query strings and fragments are ignored; an empty basename falls back
    to ``paper.pdf``, and a missing extension is appended.
    """
    basename = Path(urlparse(pdf_url).path).name
    if not basename:
        basename = "paper.pdf"
    return basename if basename.lower().endswith(".pdf") else f"{basename}.pdf"
def response_bytes(response) -> bytes:
    """Extract the raw byte payload from a fetcher response object.

    Checks the common binary-carrying attributes first, then falls back
    to UTF-8-encoding a string ``text`` attribute.

    Raises:
        TypeError: if no usable payload attribute is present.
    """
    for attr_name in ("body", "content", "raw", "data"):
        candidate = getattr(response, attr_name, None)
        if isinstance(candidate, (bytes, bytearray)):
            return bytes(candidate)
    text_payload = getattr(response, "text", None)
    if isinstance(text_payload, str):
        return text_payload.encode("utf-8")
    raise TypeError("Unsupported response payload type for PDF download.")
def cookies_to_dict(cookies) -> dict:
    """Normalize a cookie collection into a plain name->value dict.

    Accepts a dict (returned unchanged), or a list/tuple of
    ``{"name": ..., "value": ...}`` records (malformed entries are
    skipped); anything else yields an empty dict.
    """
    if isinstance(cookies, dict):
        return cookies
    if not isinstance(cookies, (list, tuple)):
        return {}
    return {
        entry["name"]: entry["value"]
        for entry in cookies
        if isinstance(entry, dict) and "name" in entry and "value" in entry
    }
def fetch_pdf(url: str, referer: str, cookies) -> bytes:
    """Download the PDF, retrying with a stealth browser fetch if blocked.

    The first attempt is a plain impersonated HTTP GET carrying the page's
    cookies and a Referer header. If it is rate-limited (429) or the body
    does not look like a PDF, retry once with StealthyFetcher (headless
    browser with Cloudflare solving).
    """
    first_attempt = Fetcher.get(
        url,
        impersonate="chrome",
        headers={"Referer": referer},
        cookies=cookies_to_dict(cookies),
    )
    payload = response_bytes(first_attempt)
    if first_attempt.status != 429 and payload.lstrip().startswith(b"%PDF"):
        return payload
    retry = StealthyFetcher.fetch(
        url,
        headless=True,
        network_idle=True,
        cookies=cookies,
        solve_cloudflare=True,
    )
    return response_bytes(retry)
# --- Script entry: fetch the SSRN page, save its HTML, then download the PDF.
page = StealthyFetcher.fetch(paper_url, headless=True, network_idle=True)
print(page.status)

# Save the rendered HTML for inspection/debugging.
html_dir = Path("html")
html_dir.mkdir(parents=True, exist_ok=True)
id_match = re.search(r"abstract_id=(\d+)", paper_url)
html_name = f"paper_{id_match.group(1)}.html" if id_match else "paper.html"
html_path = html_dir / html_name
html_path.write_text(page.html_content, encoding="utf-8")
print(f"Saved HTML to {html_path}")

# Resolve the PDF link and download it.
pdf_url = extract_pdf_url(page.html_content, paper_url)
pdfs_dir = Path("pdfs")
pdfs_dir.mkdir(parents=True, exist_ok=True)
pdf_filename = filename_from_url(pdf_url)
pdf_bytes = fetch_pdf(pdf_url, paper_url, page.cookies)

# If the payload is not a PDF (e.g. a challenge page), save it for debugging.
if not pdf_bytes.lstrip().startswith(b"%PDF"):
    debug_path = html_dir / f"{pdf_filename}.html"
    debug_path.write_bytes(pdf_bytes)
    raise ValueError(f"Download did not look like a PDF. Saved HTML to {debug_path}")

pdf_path = pdfs_dir / pdf_filename
pdf_path.write_bytes(pdf_bytes)
print(f"Saved PDF to {pdf_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment