Skip to content

Instantly share code, notes, and snippets.

@abrichr
Last active August 12, 2024 17:22
Show Gist options
  • Select an option

  • Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.

Select an option

Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.

Revisions

  1. abrichr revised this gist Aug 12, 2024. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion get_pdf.py
    Original file line number Diff line number Diff line change
    @@ -14,7 +14,6 @@
    import subprocess
    import re
    import sys
    import re

    def sanitize_filename(doi: str) -> str:
    """
  2. abrichr created this gist Jul 27, 2024.
    89 changes: 89 additions & 0 deletions get_pdf.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,89 @@
    """
    Module to download a PDF from Sci-Hub using a provided DOI.
    This script uses curl to fetch a PDF from Sci-Hub based on the provided DOI and
    saves it to a specified output path.
    Example usage:
    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"
    If no output path is specified, the DOI will be used as the filename with invalid
    characters replaced.
    """

    import subprocess
    import re
    import sys
    import re

    def sanitize_filename(doi: str) -> str:
        """
        Build a PDF filename from a DOI.

        Every backslash, slash, asterisk, question mark, colon, double quote,
        angle bracket and pipe in the DOI is replaced by an underscore, and
        the ".pdf" extension is appended to the result.

        Args:
            doi (str): The DOI to convert.

        Returns:
            str: The DOI with the characters above replaced, ending in ".pdf".
        """
        unsafe_chars = re.compile(r'[\\/*?:"<>|]')
        stem = unsafe_chars.sub('_', doi)
        return stem + ".pdf"

    def get_pdf_by_doi(doi: str, output_path: str) -> None:
        """
        Download a PDF from Sci-Hub using the provided DOI.

        The Sci-Hub page for the DOI is fetched with curl, the first
        ``src="...pdf..."`` attribute found in it is taken as the PDF location,
        and that URL is downloaded with a second curl call into ``output_path``.

        Args:
            doi (str): The DOI of the paper to download.
            output_path (str): The path to save the downloaded PDF.

        Raises:
            RuntimeError: If the page cannot be fetched, no PDF URL is found in
                it, or the PDF download fails (including HTTP error responses).
        """
        # Imported here so this function does not rely on a file-level import.
        from urllib.parse import urljoin

        # Construct the Sci-Hub URL
        sci_hub_url = f"https://sci-hub.se/{doi}"

        # -s hides the progress meter, -S still prints errors to stderr so the
        # failure message below is not empty, -L follows redirects.
        result = subprocess.run(
            ['curl', '-sSL', sci_hub_url], capture_output=True, text=True
        )

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")

        # Extract the PDF URL from the Sci-Hub page
        pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)

        if not pdf_url_match:
            raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

        # Resolve the src against the page URL. Plain concatenation with the
        # site root would turn a protocol-relative "//host/file.pdf" into
        # "https://sci-hub.se//host/file.pdf" and a bare relative path into
        # "https://sci-hub.sefile.pdf"; urljoin handles absolute,
        # root-relative, protocol-relative and relative forms.
        pdf_url = urljoin(sci_hub_url, pdf_url_match.group(1))

        # -f makes curl exit non-zero on HTTP errors instead of saving the
        # error page as the "PDF"; -L follows redirects like the page fetch.
        # Output is not captured so curl's progress meter stays visible.
        download_result = subprocess.run(['curl', '-fL', '-o', output_path, pdf_url])

        if download_result.returncode != 0:
            # stderr is not captured for this call, so report the exit code.
            raise RuntimeError(
                "Failed to download the PDF: "
                f"curl exited with code {download_result.returncode}"
            )

        print(f"Saved to {output_path}")

    def main() -> None:
        """
        Command-line entry point: download a PDF by DOI using Sci-Hub.

        Prints a usage line and exits with status 1 when no DOI is given.

        Command-line Args:
            doi (str): The DOI of the paper to download.
            output_path (str, optional): Where to save the PDF. When omitted,
                the filename is derived from the DOI via sanitize_filename.
        """
        cli_args = sys.argv[1:]
        if not cli_args:
            print("Usage: python get_pdf.py <DOI> [output_path]")
            sys.exit(1)

        doi = cli_args[0]
        # Any arguments after the optional output path are ignored.
        if len(cli_args) >= 2:
            output_path = cli_args[1]
        else:
            output_path = sanitize_filename(doi)

        get_pdf_by_doi(doi, output_path)

    # Run the command-line entry point only when this file is executed as a
    # script, not when it is imported as a module.
    if __name__ == "__main__":
    main()