Created
August 21, 2025 18:54
-
-
Save antonl-dev/1f1bc392945c16e64f7603086da68793 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import re | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin | |
# --- Configuration ---
# Root of the site being scraped; all relative hrefs are resolved against it.
BASE_URL = "http://www.radcap.ru/"
# Genre index page that lists every genre category.
START_PAGE = urljoin(BASE_URL, "index-d.html")
# Results file; overwritten on each run (opened with mode 'w' in main()).
OUTPUT_FILENAME = "radcap-all-streams-0022-aug22-2025.txt"
# Set a user-agent to mimic a browser, which can help avoid being blocked.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
def log_and_print(message, file_handle):
    """Echo *message* to stdout and append it, newline-terminated, to *file_handle*."""
    file_handle.write(message + '\n')
    print(message)
def get_soup(url, log_file):
    """Fetch *url* and return it parsed as a BeautifulSoup tree.

    On any request failure the error is logged via log_and_print and
    None is returned so callers can skip the page.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        log_and_print(f"Error fetching {url}: {exc}", log_file)
        return None
    # Force UTF-8 before reading .text so decoding is consistent.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
def main():
    """Scrape radcap.ru: for every genre and every station, extract and log
    the stream URLs titled "1" and "3" from the station's Playerjs script.

    All progress and results are echoed to the console and written to
    OUTPUT_FILENAME.
    """
    # Open the output file; 'w' mode overwrites it if it exists.
    # The 'with' statement ensures the file is closed automatically.
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as output_file:
        log_and_print(f"Fetching main genre page: {START_PAGE}", output_file)
        main_soup = get_soup(START_PAGE, output_file)
        if not main_soup:
            log_and_print("Could not fetch the main page. Exiting.", output_file)
            return

        genre_links = main_soup.select('table.genres-d a.genres-big.genres-index-d')
        if not genre_links:
            log_and_print("No genre links found on the main page.", output_file)
            return
        log_and_print(f"Found {len(genre_links)} genres.", output_file)

        # Loop-invariant: compile the stream-entry pattern once instead of
        # once per station page.
        pattern = re.compile(r'\{\s*"?title"?\s*:\s*"([^"]+)"\s*,\s*"?file"?\s*:\s*"([^"]+)"\s*\}')

        # --- 1. Loop through each GENRE ---
        for genre_link in genre_links:
            genre_name = genre_link.contents[0].strip()
            genre_href = genre_link.get('href')
            if not genre_href:
                continue
            genre_url = urljoin(BASE_URL, genre_href)
            log_and_print(f"\n--- GENRE: {genre_name} ---", output_file)

            genre_soup = get_soup(genre_url, output_file)
            if not genre_soup:
                continue
            station_links = genre_soup.select('a.genres220, a.genres')
            if not station_links:
                log_and_print(f" No station links found on {genre_url}", output_file)
                continue

            # --- 2. Loop through each STATION within the genre ---
            for station_link in station_links:
                # BUGFIX: find() returns None when the anchor has no direct
                # text child; the original chained .strip() onto it and
                # crashed the whole scrape with AttributeError.
                name_node = station_link.find(string=True, recursive=False)
                if name_node is None:
                    continue
                station_name = name_node.strip()
                if not station_name:
                    continue
                station_href = station_link.get('href')
                if not station_href:
                    continue
                station_url = urljoin(BASE_URL, station_href)
                log_and_print(f" Station: {station_name}", output_file)

                station_soup = get_soup(station_url, output_file)
                if not station_soup:
                    continue

                # --- 3. Find the script with stream data ---
                script_tag = station_soup.find('script', string=re.compile(r'new Playerjs'))
                if not script_tag or not script_tag.string:
                    log_and_print(f" Could not find player script on {station_url}", output_file)
                    continue
                script_content = script_tag.string

                matches = pattern.findall(script_content)
                if not matches:
                    log_and_print(" Could not extract stream data from script.", output_file)
                    continue

                # --- 4. Loop through the extracted matches and print streams "1" and "3" ---
                found_any = False
                for title, file_url in matches:
                    if title in ('1', '3'):
                        # Protocol-relative URLs need a scheme to be usable.
                        if file_url.startswith('//'):
                            file_url = 'http:' + file_url
                        log_and_print(f" Stream {title}: {file_url}", output_file)
                        found_any = True
                if not found_any:
                    log_and_print(" Streams 1 and 3 not found.", output_file)

        log_and_print(f"\nScraping complete. Output saved to {OUTPUT_FILENAME}", output_file)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment