abrichr · August 12, 2024 17:22 · Aug 12, 2024 · Jul 27, 2024
diff --git a/get_pdf.py b/get_pdf.py
@@ -14,7 +14,6 @@
 import subprocess
 import re
 import sys
-import re
 
 def sanitize_filename(doi: str) -> str:
     """

diff --git a/get_pdf.py b/get_pdf.py
@@ -0,0 +1,89 @@
+"""
+Module to download a PDF from Sci-Hub using a provided DOI.
+
+This script uses curl to fetch a PDF from Sci-Hub based on the provided DOI and
+saves it to a specified output path.
+
+Example usage:
+    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"
+
+If no output path is specified, the filename is the DOI with invalid characters
+replaced by underscores and ".pdf" appended.
+"""
+
+import subprocess
+import re
+import sys
+import re
+
+def sanitize_filename(doi: str) -> str:
+    """
+    Build a filesystem-friendly PDF filename from a DOI.
+
+    Args:
+        doi (str): The DOI to convert.
+
+    Returns:
+        str: The DOI with each backslash, /, *, ?, :, ", <, > and | replaced by "_", followed by ".pdf".
+    """
+    return doi.translate(str.maketrans(dict.fromkeys('\\/*?:"<>|', "_"))) + ".pdf"
+
+def get_pdf_by_doi(doi: str, output_path: str) -> None:
+    """
+    Download a PDF from Sci-Hub using the provided DOI.
+
+    Fetches the Sci-Hub page for the DOI with curl, takes the first src="..."
+    attribute that references a .pdf, and downloads that URL to output_path.
+
+    Args:
+        doi (str): The DOI of the paper to download.
+        output_path (str): The path to save the downloaded PDF.
+
+    Raises:
+        RuntimeError: If the page fetch fails, no PDF URL is found on the
+            page, or the PDF download fails.
+    """
+    from urllib.parse import urljoin  # local import: only needed here
+
+    sci_hub_url = f"https://sci-hub.se/{doi}"
+
+    # -s silences curl's progress meter; -L follows redirects.
+    result = subprocess.run(['curl', '-sL', sci_hub_url], capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")
+
+    pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)
+    if not pdf_url_match:
+        raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")
+
+    # urljoin resolves protocol-relative ("//host/x.pdf"), root-relative and
+    # relative src values against the page URL; absolute URLs pass through.
+    pdf_url = urljoin(sci_hub_url, pdf_url_match.group(1))
+
+    # --fail makes curl exit non-zero on HTTP errors instead of saving the error
+    # page as the "PDF"; stderr is not captured, so curl reports the cause itself.
+    download_result = subprocess.run(['curl', '-fL', '-o', output_path, pdf_url])
+    if download_result.returncode != 0:
+        raise RuntimeError(f"Failed to download the PDF (curl exit code {download_result.returncode})")
+
+    print(f"Saved to {output_path}")
+
+def main() -> None:
+    """
+    Command-line entry point: download the PDF for a DOI via Sci-Hub.
+
+    Command-line Args:
+        doi (str): The DOI of the paper to download.
+        output_path (str, optional): Where to save the PDF. When omitted, the filename is derived from the DOI.
+    """
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        # No DOI supplied: show usage and exit with a failure status.
+        print("Usage: python get_pdf.py <DOI> [output_path]")
+        sys.exit(1)
+
+    doi = cli_args[0]
+    get_pdf_by_doi(doi, cli_args[1] if len(cli_args) > 1 else sanitize_filename(doi))
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
No results found