Created
August 21, 2025 18:54
-
-
Save antonl-dev/1f1bc392945c16e64f7603086da68793 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import re | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin | |
# --- Configuration ---
# Root of the site being scraped; all relative hrefs are resolved against it.
BASE_URL = "http://www.radcap.ru/"
# Genre index page that lists every genre category.
START_PAGE = urljoin(BASE_URL, "index-d.html")
# Results file; overwritten on each run (opened with mode 'w' in main()).
OUTPUT_FILENAME = "radcap-all-streams-0022-aug22-2025.txt"
# Set a user-agent to mimic a browser, which can help avoid being blocked.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
def log_and_print(message, file_handle):
    """Echo *message* to stdout and append it, newline-terminated, to *file_handle*."""
    file_handle.write(message + '\n')
    print(message)
def get_soup(url, log_file):
    """Fetch *url* and return it parsed as a BeautifulSoup tree.

    On any request failure the error is logged via log_and_print and
    None is returned so callers can skip the page.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        log_and_print(f"Error fetching {url}: {exc}", log_file)
        return None
    # Force UTF-8 before reading .text so decoding is consistent.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
def main():
    """Scrape radcap.ru: for every genre and every station, extract and log
    the stream URLs titled "1" and "3" from the station's Playerjs script.

    All progress and results are echoed to the console and written to
    OUTPUT_FILENAME.
    """
    # Open the output file; 'w' mode overwrites it if it exists.
    # The 'with' statement ensures the file is closed automatically.
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as output_file:
        log_and_print(f"Fetching main genre page: {START_PAGE}", output_file)
        main_soup = get_soup(START_PAGE, output_file)
        if not main_soup:
            log_and_print("Could not fetch the main page. Exiting.", output_file)
            return

        genre_links = main_soup.select('table.genres-d a.genres-big.genres-index-d')
        if not genre_links:
            log_and_print("No genre links found on the main page.", output_file)
            return
        log_and_print(f"Found {len(genre_links)} genres.", output_file)

        # Loop-invariant: compile the stream-entry pattern once instead of
        # once per station page.
        pattern = re.compile(r'\{\s*"?title"?\s*:\s*"([^"]+)"\s*,\s*"?file"?\s*:\s*"([^"]+)"\s*\}')

        # --- 1. Loop through each GENRE ---
        for genre_link in genre_links:
            genre_name = genre_link.contents[0].strip()
            genre_href = genre_link.get('href')
            if not genre_href:
                continue
            genre_url = urljoin(BASE_URL, genre_href)
            log_and_print(f"\n--- GENRE: {genre_name} ---", output_file)

            genre_soup = get_soup(genre_url, output_file)
            if not genre_soup:
                continue
            station_links = genre_soup.select('a.genres220, a.genres')
            if not station_links:
                log_and_print(f" No station links found on {genre_url}", output_file)
                continue

            # --- 2. Loop through each STATION within the genre ---
            for station_link in station_links:
                # BUGFIX: find() returns None when the anchor has no direct
                # text child; the original chained .strip() onto it and
                # crashed the whole scrape with AttributeError.
                name_node = station_link.find(string=True, recursive=False)
                if name_node is None:
                    continue
                station_name = name_node.strip()
                if not station_name:
                    continue
                station_href = station_link.get('href')
                if not station_href:
                    continue
                station_url = urljoin(BASE_URL, station_href)
                log_and_print(f" Station: {station_name}", output_file)

                station_soup = get_soup(station_url, output_file)
                if not station_soup:
                    continue

                # --- 3. Find the script with stream data ---
                script_tag = station_soup.find('script', string=re.compile(r'new Playerjs'))
                if not script_tag or not script_tag.string:
                    log_and_print(f" Could not find player script on {station_url}", output_file)
                    continue
                script_content = script_tag.string

                matches = pattern.findall(script_content)
                if not matches:
                    log_and_print(" Could not extract stream data from script.", output_file)
                    continue

                # --- 4. Loop through the extracted matches and print streams "1" and "3" ---
                found_any = False
                for title, file_url in matches:
                    if title in ('1', '3'):
                        # Protocol-relative URLs need a scheme to be usable.
                        if file_url.startswith('//'):
                            file_url = 'http:' + file_url
                        log_and_print(f" Stream {title}: {file_url}", output_file)
                        found_any = True
                if not found_any:
                    log_and_print(" Streams 1 and 3 not found.", output_file)

        log_and_print(f"\nScraping complete. Output saved to {OUTPUT_FILENAME}", output_file)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment