Skip to content

Instantly share code, notes, and snippets.

@ecarreras
Created December 13, 2023 09:35
Show Gist options
  • Select an option

  • Save ecarreras/25146a627024d1015d85f46a05d75833 to your computer and use it in GitHub Desktop.

Select an option

Save ecarreras/25146a627024d1015d85f46a05d75833 to your computer and use it in GitHub Desktop.

Revisions

  1. ecarreras created this gist Dec 13, 2023.
    46 changes: 46 additions & 0 deletions downloader.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,46 @@
    import requests
    from bs4 import BeautifulSoup
    import os
    from PyPDF2 import PdfReader, PdfWriter
    from rich.progress import track

    def merge_pdfs(folder_path, output_filename):
        """Merge every PDF found in a folder into a single PDF file.

        Files are appended in sorted filename order so that the merged
        document is reproducible from one run to the next.

        Args:
            folder_path: Directory scanned (non-recursively) for entries whose
                name ends with ``.pdf``.
            output_filename: Path of the merged PDF to write. It is opened in
                ``'wb'`` mode, so an existing file is overwritten.
        """
        pdf_writer = PdfWriter()
        # os.listdir returns entries in arbitrary order; sort them so the page
        # order of the merged document does not depend on the filesystem.
        pdfs = sorted(item for item in os.listdir(folder_path) if item.endswith('.pdf'))
        for item in track(pdfs, description="Merging..."):
            pdf_reader = PdfReader(os.path.join(folder_path, item))
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)

        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

    def download_pdfs(url, download_folder, timeout=30):
        """Download every PDF linked from a web page into a local folder.

        The page at ``url`` is parsed for ``<a>`` tags whose ``href`` ends with
        ``.pdf``; each target is streamed to ``download_folder`` under the last
        path segment of its ``href``.

        Args:
            url: Address of the HTML page to scan for PDF links.
            download_folder: Directory that receives the files. It is created
                if it does not exist yet.
            timeout: Seconds passed to ``requests.get`` as its ``timeout`` for
                both the page request and each PDF request.

        Raises:
            requests.HTTPError: If the page or any PDF answers with an HTTP
                error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of parsing an error page that contains no links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')

        pdf_links = [link for link in links if link.get('href') and link['href'].endswith('.pdf')]

        # open() below does not create missing directories.
        os.makedirs(download_folder, exist_ok=True)

        for link in track(pdf_links, description="Downloading..."):
            href = link['href']
            # Relative hrefs are resolved against the page URL.
            full_url = requests.compat.urljoin(url, href)
            filename = os.path.join(download_folder, href.split('/')[-1])
            # Stream in chunks so a large PDF is never held fully in memory.
            with requests.get(full_url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

    # --- Script configuration -------------------------------------------------
    # Web page that lists the PDFs to fetch.
    url = "https://www.ree.es/es/actividades/operacion-del-sistema-electrico/procedimientos-de-operacion"
    # Folder where the individual PDFs are stored.
    download_folder = "/tmp/pos"
    # Path of the merged output file.
    output_filename = '/tmp/po_unified.pdf'

    # --- Run --------------------------------------------------------------------
    # The download step is currently disabled; uncomment it to fetch the PDFs
    # into download_folder before merging.
    # download_pdfs(url, download_folder)

    merge_pdfs(folder_path=download_folder, output_filename=output_filename)