Last active
August 12, 2024 17:22
-
-
Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.
Revisions
-
abrichr revised this gist
Aug 12, 2024. 1 changed file with 0 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number Diff line number Diff line change @@ -14,7 +14,6 @@ import subprocess import re import sys def sanitize_filename(doi: str) -> str: """ -
abrichr created this gist
Jul 27, 2024. There are no files selected for viewing
"""
Module to download a PDF from Sci-Hub using a provided DOI.

This script uses curl to fetch the Sci-Hub page for the provided DOI, extracts
the embedded PDF link from that page, and saves the PDF to a specified output
path.

Example usage:
    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"

If no output path is specified, the DOI will be used as the filename with
invalid characters replaced.
"""

import re
import subprocess
import sys
from urllib.parse import urljoin

# Base URL of the Sci-Hub mirror used for both the page lookup and for
# resolving relative PDF links found on that page.
SCI_HUB_BASE_URL = "https://sci-hub.se"

# Matches the first src="..." attribute whose value contains ".pdf"; anything
# after ".pdf" (query string, "#view=..." fragment) is kept in the capture.
_PDF_SRC_PATTERN = re.compile(r'src="([^"]+\.pdf[^"]*)"')


def sanitize_filename(doi: str) -> str:
    """
    Sanitize the DOI to create a valid filename.

    Each of the characters \\ / * ? : " < > | is replaced by an underscore and
    a ".pdf" extension is appended.

    Args:
        doi (str): The DOI to sanitize.

    Returns:
        str: A sanitized filename ending in ".pdf".
    """
    return re.sub(r'[\\/*?:"<>|]', '_', doi) + ".pdf"


def extract_pdf_url(page_html: str, base_url: str = SCI_HUB_BASE_URL) -> str:
    """
    Extract the absolute PDF URL from the HTML of a Sci-Hub page.

    Args:
        page_html (str): The HTML text of the Sci-Hub page.
        base_url (str): The site URL used to resolve relative links.

    Returns:
        str: The absolute URL of the PDF.

    Raises:
        RuntimeError: If no src attribute pointing at a PDF is found.
    """
    match = _PDF_SRC_PATTERN.search(page_html)
    if not match:
        raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

    # urljoin leaves absolute URLs untouched, gives protocol-relative links
    # ("//host/file.pdf") the base URL's scheme, and resolves site-relative
    # paths against the base. Plain string concatenation would corrupt the
    # protocol-relative and slash-less relative forms.
    return urljoin(base_url.rstrip("/") + "/", match.group(1))


def get_pdf_by_doi(doi: str, output_path: str) -> None:
    """
    Download a PDF from Sci-Hub using the provided DOI.

    Args:
        doi (str): The DOI of the paper to download.
        output_path (str): The path to save the downloaded PDF.

    Raises:
        RuntimeError: If the Sci-Hub page cannot be fetched, no PDF link is
            found on it, or the PDF download fails.
    """
    page_url = f"{SCI_HUB_BASE_URL}/{doi}"

    # Fetch the Sci-Hub page. Decode explicitly as UTF-8 with replacement so
    # that a stray undecodable byte cannot abort the lookup.
    page = subprocess.run(
        ["curl", "-sL", page_url],
        capture_output=True,
        encoding="utf-8",
        errors="replace",
    )
    if page.returncode != 0:
        raise RuntimeError(f"Failed to fetch the Sci-Hub page: {page.stderr}")

    pdf_url = extract_pdf_url(page.stdout)

    # --fail makes curl return a non-zero status on HTTP errors instead of
    # saving the server's error page as the "PDF"; -L follows redirects.
    # Output is not captured so curl's progress meter stays visible.
    download = subprocess.run(["curl", "-fL", "-o", output_path, pdf_url])
    if download.returncode != 0:
        raise RuntimeError(
            "Failed to download the PDF: "
            f"curl exited with status {download.returncode}"
        )

    print(f"Saved to {output_path}")


def main() -> None:
    """
    Main function to download a PDF by DOI using Sci-Hub.

    Command-line Args:
        doi (str): The DOI of the paper to download.
        output_path (str, optional): The path to save the downloaded PDF.
            Defaults to using the DOI as filename.
    """
    if len(sys.argv) < 2:
        print("Usage: python get_pdf.py <DOI> [output_path]")
        sys.exit(1)

    doi = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else sanitize_filename(doi)
    get_pdf_by_doi(doi, output_path)


if __name__ == "__main__":
    main()