Created
January 20, 2026 22:07
-
-
Save mswaringen/87e8455d1a256e4658fdac14183fad1e to your computer and use it in GitHub Desktop.
SSRN scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import re
from pathlib import Path
from urllib.parse import urljoin, urlparse

# Third-party: scrapling supplies the HTTP fetchers and the HTML parser.
from scrapling.fetchers import Fetcher, StealthyFetcher
from scrapling.parser import Selector

# SSRN abstract page to scrape; the numeric abstract_id identifies the paper.
paper_url = "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2041429"
def extract_pdf_url(html: str, base_url: str) -> str:
    """Locate the PDF download link inside an SSRN abstract page.

    Tries CSS selectors from most to least specific, then falls back to a
    raw regex scan of the markup. The winning href is resolved against
    *base_url* into an absolute URL.

    Raises:
        ValueError: when no candidate PDF link can be found.
    """
    selector = Selector(html)
    # Most specific first: SSRN's Delivery.cfm anchors, then any .pdf
    # anchor, then the citation_pdf_url meta tag used by scholarly indexers.
    queries = (
        'a[href*="Delivery.cfm"][href$=".pdf"]::attr(href)',
        'a[href$=".pdf"]::attr(href)',
        'meta[name="citation_pdf_url"]::attr(content)',
    )
    href = None
    for query in queries:
        href = selector.css(query).get()
        if href:
            break
    if not href:
        # Last resort: scan the raw HTML for a Delivery.cfm href.
        match = re.search(r'href="([^"]*Delivery\.cfm[^"]*)"', html, re.IGNORECASE)
        if match:
            href = match.group(1)
    if not href:
        raise ValueError("Could not find a PDF download link on the page.")
    return urljoin(base_url, href)
def filename_from_url(pdf_url: str) -> str:
    """Derive a local filename from *pdf_url*, guaranteeing a ``.pdf`` suffix.

    Uses the last path component of the URL; falls back to ``paper.pdf``
    when the URL path has no final component.
    """
    candidate = Path(urlparse(pdf_url).path).name
    if not candidate:
        candidate = "paper.pdf"
    if candidate.lower().endswith(".pdf"):
        return candidate
    return f"{candidate}.pdf"
def response_bytes(response) -> bytes:
    """Extract the raw payload bytes from a fetcher response object.

    Different fetcher backends expose the body under different attribute
    names, so probe the common binary attributes first, then fall back to
    the decoded ``text`` attribute re-encoded as UTF-8.

    Raises:
        TypeError: when no usable payload attribute is present.
    """
    probes = (getattr(response, name, None) for name in ("body", "content", "raw", "data"))
    for candidate in probes:
        if isinstance(candidate, (bytes, bytearray)):
            return bytes(candidate)
    decoded = getattr(response, "text", None)
    if isinstance(decoded, str):
        return decoded.encode("utf-8")
    raise TypeError("Unsupported response payload type for PDF download.")
def cookies_to_dict(cookies) -> dict:
    """Normalise cookies into a plain ``{name: value}`` dict.

    A dict is returned unchanged; a list/tuple of cookie records shaped
    like ``{"name": ..., "value": ...}`` is flattened (entries missing
    either key are skipped). Any other input yields an empty dict.
    """
    if isinstance(cookies, dict):
        return cookies
    if not isinstance(cookies, (list, tuple)):
        return {}
    return {
        entry["name"]: entry["value"]
        for entry in cookies
        if isinstance(entry, dict) and "name" in entry and "value" in entry
    }
def fetch_pdf(url: str, referer: str, cookies) -> bytes:
    """Download the PDF at *url*, retrying with a stealth browser fetch.

    First attempts a plain impersonated GET that carries the paper page
    as the Referer plus its session cookies. If that is rate-limited
    (HTTP 429) or the payload does not start with the ``%PDF`` magic
    bytes, retries with a headless browser fetch that can solve
    Cloudflare challenges. The final payload is returned unvalidated —
    the caller decides what to do with a non-PDF body.
    """
    response = Fetcher.get(
        url,
        impersonate="chrome",
        headers={"Referer": referer},
        cookies=cookies_to_dict(cookies),
    )
    payload = response_bytes(response)
    looks_like_pdf = payload.lstrip().startswith(b"%PDF")
    if response.status == 429 or not looks_like_pdf:
        # Retry with the full browser stack; cookies are passed in their
        # original shape since the stealth fetcher consumes them directly.
        retry = StealthyFetcher.fetch(
            url,
            headless=True,
            network_idle=True,
            cookies=cookies,
            solve_cloudflare=True,
        )
        payload = response_bytes(retry)
    return payload
# --- Script body: render the abstract page, then download its PDF. ---

# Render the paper page with a headless browser (SSRN sits behind bot checks).
page = StealthyFetcher.fetch(paper_url, headless=True, network_idle=True)
print(page.status)

# Persist the rendered HTML, named after the abstract_id when present.
html_dir = Path("html")
html_dir.mkdir(parents=True, exist_ok=True)
id_match = re.search(r"abstract_id=(\d+)", paper_url)
html_name = f"paper_{id_match.group(1)}.html" if id_match else "paper.html"
html_path = html_dir / html_name
html_path.write_text(page.html_content, encoding="utf-8")
print(f"Saved HTML to {html_path}")

# Resolve the PDF link from the page and download it.
pdf_url = extract_pdf_url(page.html_content, paper_url)
pdfs_dir = Path("pdfs")
pdfs_dir.mkdir(parents=True, exist_ok=True)
pdf_name = filename_from_url(pdf_url)
pdf_path = pdfs_dir / pdf_name
pdf_bytes = fetch_pdf(pdf_url, paper_url, page.cookies)

# Sanity-check the %PDF magic bytes before writing; when the server sent an
# HTML error/challenge page instead, keep the body on disk for debugging.
if not pdf_bytes.lstrip().startswith(b"%PDF"):
    pdf_debug_path = html_dir / f"{pdf_name}.html"
    pdf_debug_path.write_bytes(pdf_bytes)
    raise ValueError(f"Download did not look like a PDF. Saved HTML to {pdf_debug_path}")
pdf_path.write_bytes(pdf_bytes)
print(f"Saved PDF to {pdf_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment