Skip to content

Instantly share code, notes, and snippets.

@tikikun
Created August 1, 2024 17:22
Show Gist options
  • Select an option

  • Save tikikun/b2a7640d4b98b52e14226439168e483a to your computer and use it in GitHub Desktop.

Select an option

Save tikikun/b2a7640d4b98b52e14226439168e483a to your computer and use it in GitHub Desktop.
Script để crawl thơ Việt Nam (script to crawl Vietnamese poetry from isach.info)
import requests
from bs4 import BeautifulSoup
import pandas as pd
def fetch_poems(author_name, end_page: int):
    """
    Fetch poems from isach.info for a given author.

    Args:
        author_name (str): Author identifier as used in the site's
            ``author=`` query parameter.
        end_page (int): Last listing page to crawl (inclusive, starting at 1).

    Returns:
        pd.DataFrame: One row per poem with columns ``title``, ``link``
        and ``poem_text``.
    """
    print("crawling for author:", author_name, " to page:", end_page)
    all_poems = []
    base_url = "https://isach.info/mobile/poem.php"
    # One session reuses the TCP connection across the many requests below.
    session = requests.Session()
    # Walk every listing page for this author.
    for page_number in range(1, end_page + 1):
        params = {
            "list": "poem",
            "author": author_name,
            "order": "poem_id",
            "page": page_number,
        }
        response = session.get(base_url, params=params, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Anchor tags that carry a title attribute and link to a poem page.
        titles_on_page = soup.find_all(
            name='a',
            attrs={"title": True},
            href=lambda href: href and 'poem' in href,
        )
        print(f"Page {page_number}: Found {len(titles_on_page)} poems")
        # Extract the title and link, then fetch the poem body itself.
        for title in titles_on_page:
            poem_info = {
                'title': title['title'],
                'link': title['href'],
            }
            poem_url = "https://isach.info/mobile/" + title['href']
            poem_response = session.get(poem_url, timeout=30)
            poem_soup = BeautifulSoup(poem_response.text, 'html.parser')
            # Concatenate every "poem_text" element into a single string.
            poem_text_elements = poem_soup.find_all(class_="poem_text")
            poem_info['poem_text'] = "\n".join(
                element.text for element in poem_text_elements
            )
            all_poems.append(poem_info)
    return pd.DataFrame(all_poems)
def get_max_page_number(soup):
    """
    Extract the maximum page number from a parsed listing page.

    Parameters:
        soup (BeautifulSoup): Parsed HTML of an isach.info listing page.

    Returns:
        int: The largest page number found among pagination links, or 1
        when no pagination links exist (single-page listings have none).
    """
    # All <a> tags whose href mentions 'page'; guard against hrefless tags.
    page_links = soup.find_all(name='a', href=lambda href: href and 'page' in href)
    page_numbers = []
    for link in page_links:
        href = link.get('href')
        if 'page=' in href:
            # Everything after the last 'page=' should be the number; skip
            # links where it is not a clean integer instead of crashing.
            try:
                page_numbers.append(int(href.split('page=')[-1]))
            except ValueError:
                continue
    # No pagination links -> the listing fits on a single page.
    return max(page_numbers) if page_numbers else 1
def get_authors(num_pages):
    """
    Get all authors from all author-listing pages.

    Args:
        num_pages (int): The number of listing pages to fetch authors from.

    Returns:
        list: Author identifiers (the ``author=`` query value of each link).
    """
    authors = []
    for page in range(1, num_pages + 1):
        url = f"https://isach.info/mobile/poem.php?list=author&page={page}"
        data = requests.get(url, timeout=30)
        soup = BeautifulSoup(data.text, 'html.parser')
        # `href and ...` guards against tags without an href: bs4 passes None
        # to the filter for such tags, and `"poem" in None` raises TypeError.
        author_tags = soup.find_all(
            name='a',
            href=lambda href: href and "poem" in href and "author=" in href and 'mobile' not in href,
        )
        # href looks like poem.php?list=poem&author=<name>, so the author
        # value is the text after the second '='.
        authors.extend(author_tag['href'].split('=')[2] for author_tag in author_tags)
    return authors
import os
import pandas as pd

# --- Crawl driver: enumerate authors, then dump each author's poems to CSV. ---

# Discover how many author-listing pages exist.
url = "https://isach.info/mobile/poem.php?list=author"
data = requests.get(url, timeout=30)
soup = BeautifulSoup(data.text, 'html.parser')
max_author_pages = get_max_page_number(soup)

# Collect every author identifier across all listing pages.
authors = get_authors(max_author_pages)

# Folder for the per-author CSV files; exist_ok avoids the racy
# exists()-then-makedirs() pattern.
folder_name = "poems_by_author"
os.makedirs(folder_name, exist_ok=True)

for author in authors:
    # Each author's poem list is itself paginated; find its last page.
    url = f"https://isach.info/mobile/poem.php?list=poem&author={author}"
    data = requests.get(url, timeout=30)
    soup = BeautifulSoup(data.text, 'html.parser')
    max_poem_pages = get_max_page_number(soup)

    # Fetch every poem for this author and persist them.
    poems_df = fetch_poems(author, max_poem_pages)
    file_name = f"{folder_name}/{author}.csv"
    poems_df.to_csv(file_name, index=False)
    print(f"Saved poems for {author} to {file_name}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment