Created
April 7, 2021 08:31
-
-
Save BelenCebrian/5aed4d1c0dba712386ae7bc923869e79 to your computer and use it in GitHub Desktop.
Para buscar todos los enlaces de un mismo tipo en páginas con un patrón consecutivo (como los enlaces a YouTube en los cursos de Educoteca)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import urllib.request | |
| from bs4 import BeautifulSoup | |
# Extract the elements of interest from the web page, e.g.
# <iframe src="//www.youtube.com/embed/3AbQPaHY7QE?wmode=opaque" ...>
def getLinks(url):
    """Return the ``src`` of every ``<iframe>`` on the page at *url*.

    The page is downloaded with ``urllib.request.urlopen`` and parsed with
    BeautifulSoup using the ``lxml`` parser.

    Args:
        url: Address of the page to download and scan.

    Returns:
        A list of strings, one per ``<iframe>`` that has a non-empty ``src``
        attribute, in document order. A protocol-relative prefix (``//``) is
        removed, so ``//www.youtube.com/embed/ID`` becomes
        ``www.youtube.com/embed/ID``; any other value is returned unchanged.

    Raises:
        Any error raised by ``urllib.request.urlopen`` (e.g. for an
        unreachable or invalid URL) propagates to the caller.
    """
    # Parse inside the ``with`` so the HTTP response is closed as soon as
    # BeautifulSoup has consumed it.
    with urllib.request.urlopen(url) as html_page:
        soup = BeautifulSoup(html_page, features="lxml")

    links = []
    for frame in soup.find_all('iframe'):
        src = frame.get('src')
        # Frames without a src (e.g. lazy-loaded ones) yield None; calling
        # a string method on it would raise AttributeError.
        if not src:
            continue
        # str.strip('//') treats its argument as a set of characters and
        # would also eat trailing slashes; drop only the leading "//".
        if src.startswith('//'):
            src = src[2:]
        links.append(src)
    return links
# Generate every consecutive page that follows a pattern.
# e.g.:    http://www.educoteca.com/python---tema-1.html
# pattern: http://www.educoteca.com/python---tema-{s}.html
def generate_pages():
    """Interactively build the list of page URLs that follow a pattern.

    Prompts on standard input for a URL pattern containing the placeholder
    ``{s}``, then for the first and last page numbers (both inclusive).
    Every generated URL is echoed to standard output as it is produced, and
    the complete list is printed once at the end.

    Returns:
        A list with one URL per number in the requested range, obtained by
        replacing ``{s}`` in the pattern with that number. The list is empty
        when the last number is smaller than the first.

    Raises:
        ValueError: If either bound typed by the user is not an integer.
    """
    template = input("Patron:")
    print()
    first = int(input("Inicio:"))
    last = int(input("Final:"))
    print(f"Generamos patron desde {first} hasta {last}:")

    pages = []
    for number in range(first, last + 1):
        # Substitute once and reuse the result for both the list and the echo.
        page_url = template.replace("{s}", str(number))
        pages.append(page_url)
        print(f"{number}:", page_url)

    print("\n", pages, "\n")
    return pages
def main():
    """Ask for a page pattern and print the iframe links found on each page.

    The page URLs come from ``generate_pages()``; for each one, the list
    returned by ``getLinks()`` is printed on its own line.
    """
    for page_url in generate_pages():
        print(getLinks(page_url))


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment