Created
August 1, 2024 17:22
-
-
Save tikikun/b2a7640d4b98b52e14226439168e483a to your computer and use it in GitHub Desktop.
Script để crawl thơ việt nam (script to crawl Vietnamese poetry)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
def fetch_poems(author_name, end_page: int):
    """
    Fetch poems from isach.info for a given author.

    Args:
        author_name (str): Author identifier exactly as it appears in the
            site's ``author=`` query parameter.
        end_page (int): Last listing page to crawl; pages 1..end_page
            (inclusive) are fetched.

    Returns:
        pd.DataFrame: One row per poem with 'title', 'link' and 'poem_text'
        columns (an empty DataFrame when no poems are found).
    """
    print("crawling for author:", author_name, " to page:", end_page)
    all_poems = []
    base_url = "https://isach.info/mobile/poem.php"
    # One session shares the connection pool across the many per-poem requests.
    session = requests.Session()
    # Walk every listing page for this author.
    for page_number in range(1, end_page + 1):
        params = {
            "list": "poem",
            "author": author_name,
            "order": "poem_id",
            "page": page_number,
        }
        # Timeout keeps the crawler from hanging forever on a stalled server.
        response = session.get(base_url, params=params, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Poem links are <a> tags that carry a title and point at a poem URL.
        titles_on_page = soup.find_all(attrs={"title": True}, href=lambda href: href and 'poem' in href, name='a')
        print(f"Page {page_number}: Found {len(titles_on_page)} poems")
        for title in titles_on_page:
            poem_info = {
                'title': title['title'],
                'link': title['href'],
            }
            # Fetch the poem page itself.
            poem_url = "https://isach.info/mobile/" + title['href']
            poem_response = session.get(poem_url, timeout=30)
            poem_soup = BeautifulSoup(poem_response.text, 'html.parser')
            # A poem is split across several elements; join them into one text.
            poem_text_elements = poem_soup.find_all(class_="poem_text")
            poem_info['poem_text'] = "\n".join(element.text for element in poem_text_elements)
            all_poems.append(poem_info)
    # Collect everything into a single DataFrame for the caller to persist.
    poems_df = pd.DataFrame(all_poems)
    return poems_df
def get_max_page_number(soup):
    """
    Extract the maximum page number from a parsed paginated listing.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of an isach.info listing page.

    Returns:
        int: The highest page number found among pagination links, or 1 when
        the listing has no pagination links (single-page author).
    """
    # Pagination links are <a> tags whose href mentions 'page'.
    page_links = soup.find_all(name='a', href=lambda href: href and 'page' in href)
    page_numbers = []
    for link in page_links:
        href = link.get('href')
        if 'page=' in href:
            # Take only the digits up to the next '&' — the original
            # int(href.split('page=')[-1]) crashed whenever 'page=' was not
            # the last query parameter (e.g. 'page=3&order=x').
            candidate = href.split('page=')[-1].split('&')[0]
            if candidate.isdigit():
                page_numbers.append(int(candidate))
    # Default to 1: single-page authors have no pagination links at all.
    max_page_number = max(page_numbers) if page_numbers else 1
    return max_page_number
def get_authors(num_pages):
    """
    Get all authors from all author-listing pages.

    Args:
        num_pages (int): The number of listing pages to fetch authors from
            (pages 1..num_pages inclusive).

    Returns:
        list: Author identifiers as they appear in the 'author=' query
        parameter of each link.
    """
    authors = []
    for page in range(1, num_pages + 1):
        url = f"https://isach.info/mobile/poem.php?list=author&page={page}"
        # Timeout keeps the crawl from hanging on a stalled server.
        data = requests.get(url, timeout=30)
        soup = BeautifulSoup(data.text, 'html.parser')
        # 'href and' guards against <a> tags with no href — the original
        # lambda raised TypeError on `"poem" in None` for such tags.
        author_tags = soup.find_all(
            name='a',
            href=lambda href: href and "poem" in href and "author=" in href and 'mobile' not in href,
        )
        # The author id is the value after the second '=' (…&author=<id>).
        authors.extend([author_tag['href'].split('=')[2] for author_tag in author_tags])
    return authors
import os
import pandas as pd

# --- Driver: enumerate every author, then dump each author's poems to CSV ---

# Determine how many author-listing pages exist.
url = "https://isach.info/mobile/poem.php?list=author"
data = requests.get(url, timeout=30)
soup = BeautifulSoup(data.text, 'html.parser')
max_author_pages = get_max_page_number(soup)

# Get all authors from all pages.
authors = get_authors(max_author_pages)

# Output folder holding one CSV per author. exist_ok=True avoids the
# check-then-create race of the original os.path.exists() guard.
folder_name = "poems_by_author"
os.makedirs(folder_name, exist_ok=True)

# Loop through each author and fetch their poems.
for author in authors:
    # Find how many poem-listing pages this author has.
    url = f"https://isach.info/mobile/poem.php?list=poem&author={author}"
    data = requests.get(url, timeout=30)
    soup = BeautifulSoup(data.text, 'html.parser')
    max_poem_pages = get_max_page_number(soup)

    # Crawl this author's poems and persist them.
    poems_df = fetch_poems(author, max_poem_pages)
    file_name = f"{folder_name}/{author}.csv"
    poems_df.to_csv(file_name, index=False)
    print(f"Saved poems for {author} to {file_name}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment