Created
August 1, 2024 17:22
-
-
Save tikikun/b2a7640d4b98b52e14226439168e483a to your computer and use it in GitHub Desktop.
Script để crawl thơ việt nam (script to crawl Vietnamese poetry)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
def fetch_poems(author_name, end_page: int):
    """
    Fetch poems from isach.info for a given author.

    Args:
        author_name (str): Author identifier exactly as it appears in the
            site's ``author=`` query parameter.
        end_page (int): Last listing page to crawl; pages 1..end_page
            (inclusive) are fetched.

    Returns:
        pd.DataFrame: One row per poem with 'title', 'link' and 'poem_text'
        columns (an empty DataFrame when no poems are found).
    """
    print("crawling for author:", author_name, " to page:", end_page)
    all_poems = []
    base_url = "https://isach.info/mobile/poem.php"
    # One session shares the connection pool across the many per-poem requests.
    session = requests.Session()
    # Walk every listing page for this author.
    for page_number in range(1, end_page + 1):
        params = {
            "list": "poem",
            "author": author_name,
            "order": "poem_id",
            "page": page_number,
        }
        # Timeout keeps the crawler from hanging forever on a stalled server.
        response = session.get(base_url, params=params, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Poem links are <a> tags that carry a title and point at a poem URL.
        titles_on_page = soup.find_all(attrs={"title": True}, href=lambda href: href and 'poem' in href, name='a')
        print(f"Page {page_number}: Found {len(titles_on_page)} poems")
        for title in titles_on_page:
            poem_info = {
                'title': title['title'],
                'link': title['href'],
            }
            # Fetch the poem page itself.
            poem_url = "https://isach.info/mobile/" + title['href']
            poem_response = session.get(poem_url, timeout=30)
            poem_soup = BeautifulSoup(poem_response.text, 'html.parser')
            # A poem is split across several elements; join them into one text.
            poem_text_elements = poem_soup.find_all(class_="poem_text")
            poem_info['poem_text'] = "\n".join(element.text for element in poem_text_elements)
            all_poems.append(poem_info)
    # Collect everything into a single DataFrame for the caller to persist.
    poems_df = pd.DataFrame(all_poems)
    return poems_df
def get_max_page_number(soup):
    """
    Extract the maximum page number from a parsed paginated listing.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of an isach.info listing page.

    Returns:
        int: The highest page number found among pagination links, or 1 when
        the listing has no pagination links (single-page author).
    """
    # Pagination links are <a> tags whose href mentions 'page'.
    page_links = soup.find_all(name='a', href=lambda href: href and 'page' in href)
    page_numbers = []
    for link in page_links:
        href = link.get('href')
        if 'page=' in href:
            # Take only the digits up to the next '&' — the original
            # int(href.split('page=')[-1]) crashed whenever 'page=' was not
            # the last query parameter (e.g. 'page=3&order=x').
            candidate = href.split('page=')[-1].split('&')[0]
            if candidate.isdigit():
                page_numbers.append(int(candidate))
    # Default to 1: single-page authors have no pagination links at all.
    max_page_number = max(page_numbers) if page_numbers else 1
    return max_page_number
def get_authors(num_pages):
    """
    Get all authors from all author-listing pages.

    Args:
        num_pages (int): The number of listing pages to fetch authors from
            (pages 1..num_pages inclusive).

    Returns:
        list: Author identifiers as they appear in the 'author=' query
        parameter of each link.
    """
    authors = []
    for page in range(1, num_pages + 1):
        url = f"https://isach.info/mobile/poem.php?list=author&page={page}"
        # Timeout keeps the crawl from hanging on a stalled server.
        data = requests.get(url, timeout=30)
        soup = BeautifulSoup(data.text, 'html.parser')
        # 'href and' guards against <a> tags with no href — the original
        # lambda raised TypeError on `"poem" in None` for such tags.
        author_tags = soup.find_all(
            name='a',
            href=lambda href: href and "poem" in href and "author=" in href and 'mobile' not in href,
        )
        # The author id is the value after the second '=' (…&author=<id>).
        authors.extend([author_tag['href'].split('=')[2] for author_tag in author_tags])
    return authors
import os
import pandas as pd

# --- Driver: enumerate every author, then dump each author's poems to CSV ---

# Determine how many author-listing pages exist.
url = "https://isach.info/mobile/poem.php?list=author"
data = requests.get(url, timeout=30)
soup = BeautifulSoup(data.text, 'html.parser')
max_author_pages = get_max_page_number(soup)

# Get all authors from all pages.
authors = get_authors(max_author_pages)

# Output folder holding one CSV per author. exist_ok=True avoids the
# check-then-create race of the original os.path.exists() guard.
folder_name = "poems_by_author"
os.makedirs(folder_name, exist_ok=True)

# Loop through each author and fetch their poems.
for author in authors:
    # Find how many poem-listing pages this author has.
    url = f"https://isach.info/mobile/poem.php?list=poem&author={author}"
    data = requests.get(url, timeout=30)
    soup = BeautifulSoup(data.text, 'html.parser')
    max_poem_pages = get_max_page_number(soup)

    # Crawl this author's poems and persist them.
    poems_df = fetch_poems(author, max_poem_pages)
    file_name = f"{folder_name}/{author}.csv"
    poems_df.to_csv(file_name, index=False)
    print(f"Saved poems for {author} to {file_name}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment