#!/usr/bin/env bash

# ------------------------------------------------------------------------- #
#                                                                           #
#  Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper            #
#                                                                           #
#  Created by Fonic                                                         #
#  Date: 02/18/23 - 02/19/23                                                #
#                                                                           #
#  Why use cinemassacre.com instead of Youtube?:                            #
#  1) cinemassacre.com is THE reference source for AVGN episodes            #
#  2) video titles on cinemassacre.com contain episode numbers              #
#                                                                           #
#  Caveats:                                                                 #
#  Most videos listed on cinemassacre.com are stored on odysee.com.         #
#  Downloads from there are slow and unstable. Thus, one might be           #
#  better off to download videos from Youtube instead and just use          #
#  episode and title information from this scraper.                         #
#                                                                           #
# ------------------------------------------------------------------------- #

# -------------------------------------- #
#  Globals                               #
# -------------------------------------- #

# Options applied to every curl call: fail on HTTP errors, follow redirects,
# stay quiet except for errors, retry twice, 60s connect timeout
CURL_OPTS=("--fail" "--location" "--silent" "--show-error" "--retry" "2" "--connect-timeout" "60")

# printf template for video listing pages; also works for page 1, which
# redirects to the URL without trailing '/page/x/'
LIST_PAGE_URL="https://cinemassacre.com/category/angry-video-game-nerd/page/%d/"
LIST_PAGE_COUNT=6

# NOTE(review): the three HTML-matching constants below lost their original
# values to an HTML-stripping/entity-decoding step that mangled this file
# (only the fragment '([^<]+)' survived). The patterns given here are
# plausible reconstructions and MUST be re-verified against the live page
# markup before use.
VIDEO_PAGE_URL_RE='<a href="([^"]+)"[^>]*>([^<]+)</a>'          # TODO verify — captures: video URL, video title
VIDEO_TITLE_EPISODE_PART_RE="Episode ([0-9]+)( Part ([0-9]+))?" # part is optional (part number will be in BASH_REMATCH[3]!)
VIDEO_PAGE_STOP_MARKER='<div class="related-videos">'           # TODO verify — stop parsing here to skip unrelated videos at bottom of listing page
VIDEO_PAGE_URLS_FILE="video-page-urls.txt"
VIDEO_TITLES_FILE="video-titles.txt"
VIDEO_DOWNLOAD_URL_RE='<iframe[^>]* src="([^"]+)"'              # TODO verify — captures: video download URL
VIDEO_DOWNLOAD_URLS_FILE="video-download-urls.txt"

# -------------------------------------- #
#  Functions                             #
# -------------------------------------- #

# Print normal/hilite/good/warn/error message [$*: message]
function printn() { echo -e "$*"; }
function printh() { echo -e "\e[1m$*\e[0m"; }
function printg() { echo -e "\e[1;32m$*\e[0m"; }
function printw() { echo -e "\e[1;33m$*\e[0m" >&2; }
function printe() { echo -e "\e[1;31m$*\e[0m" >&2; }

# -------------------------------------- #
#  Main                                  #
# -------------------------------------- #

# Set up error handling (abort with message on any unhandled error)
set -ue; trap "printe \"Error: an unhandled error occurred on line \${LINENO}, aborting.\"; exit 1" ERR

# Set up storages (keyed by episode/part id, e.g. 'E115' or 'E115_P2')
declare -A video_page_urls=()
declare -A video_titles=()
declare -A video_download_urls=()

# Process video listing pages and extract video page URLs (both on cinemassacre.com)
for ((list_page_index=1; list_page_index <= LIST_PAGE_COUNT; list_page_index++)); do
	printh "Processing video listing page ${list_page_index}:"
	printf -v list_page_url "${LIST_PAGE_URL}" "${list_page_index}"
	printn "Video listing page '${list_page_url}'..."

	# Parse listing page, extract video page URLs
	url_title_count=0
	while read -r line; do
		if [[ "${line}" =~ ${VIDEO_PAGE_URL_RE} ]]; then
			video_page_url="${BASH_REMATCH[1]}"
			video_title="${BASH_REMATCH[2]}"

			# Determine episode and part (part is optional; its number sits
			# in BASH_REMATCH[3] because group 2 wraps the whole ' Part n')
			if [[ "${video_title}" =~ ${VIDEO_TITLE_EPISODE_PART_RE} ]]; then
				if [[ -z "${BASH_REMATCH[3]}" ]]; then                        # got part?
					episode_part="E${BASH_REMATCH[1]}"                        # episode only
				else
					episode_part="E${BASH_REMATCH[1]}_P${BASH_REMATCH[3]}"    # episode + part
				fi
			else
				printw "No episode/part match for title '${video_title}'"
				episode_part="E???"
			fi

			# Store video URL and video title
			video_page_urls["${episode_part}"]="${video_page_url}"
			video_titles["${episode_part}"]="${video_title}"
			url_title_count=$((url_title_count + 1))
			#printd "[DEBUG] Video URL: '${video_page_url}', video title: '${video_title}'"
		elif [[ "${line}" == "${VIDEO_PAGE_STOP_MARKER}" ]]; then
			break # stop parsing
		#else
		#	printd "[DEBUG] No match for line: '${line}'"
		fi
	done < <(curl "${CURL_OPTS[@]}" "${list_page_url}")
	printn "Got ${url_title_count} video page URLs and video titles from video listing page."
done
printn
printg "Got ${#video_page_urls[@]} video page URLs and video titles from video listing pages in total."
printn

# Save video page URLs and video titles to file (back up existing files first)
printh "Saving video page URLs and video titles to files:"
if [[ -f "${VIDEO_PAGE_URLS_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_PAGE_URLS_FILE}'..."
	cp "${VIDEO_PAGE_URLS_FILE}" "${VIDEO_PAGE_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_PAGE_URLS_FILE}'..."
for episode_part in "${!video_page_urls[@]}"; do
	echo "${episode_part}: ${video_page_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_PAGE_URLS_FILE}"
if [[ -f "${VIDEO_TITLES_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_TITLES_FILE}'..."
	cp "${VIDEO_TITLES_FILE}" "${VIDEO_TITLES_FILE}.old"
fi
printn "Writing '${VIDEO_TITLES_FILE}'..."
for episode_part in "${!video_titles[@]}"; do
	video_title="${video_titles["${episode_part}"]}"
	# Replace HTML entities in titles with plain ASCII equivalents.
	# NOTE(review): the entity strings below were decoded to literal
	# characters by the step that mangled this file (the originals were
	# no-ops like //"&"/"&"); they have been reconstructed from the
	# surviving comments ('&', en dash, quotes) — confirm against real titles.
	video_title="${video_title//"&#038;"/"&"}"   # &#038; == '&' -> '&'
	#video_title="${video_title//"&#8211;"/":"}" # &#8211; == en dash -> ':'
	video_title="${video_title//"&#8211;"/"-"}"  # &#8211; == en dash -> '-' (better)
	video_title="${video_title//"&#8217;"/"'"}"  # &#8217; == right single quote -> "'"
	video_title="${video_title//"&#8220;"/"'"}"  # &#8220; == left double quote -> "'"
	video_title="${video_title//"&#8221;"/"'"}"  # &#8221; == right double quote -> "'"
	#video_title="${video_title//"&#8230;"/""}"  # &#8230; == ellipsis -> '' (TODO: original entity lost)
	echo "${episode_part}: ${video_title}"
done | sort -V | uniq > "${VIDEO_TITLES_FILE}"
printn

# Process video pages (on cinemassacre.com) to extract video download URLs
# (videos are stored on odysee.com, lbry.tv, and Youtube)
printh "Processing video pages to extract video download URLs..."
for episode_part in "${!video_page_urls[@]}"; do
	video_page_url="${video_page_urls["${episode_part}"]}"
	printn "Video page '${video_page_url}'..."
	url_count=0
	while read -r line; do
		if [[ "${line}" =~ ${VIDEO_DOWNLOAD_URL_RE} ]]; then
			video_download_url="${BASH_REMATCH[1]}"
			video_download_urls["${episode_part}"]="${video_download_url}"
			url_count=$((url_count + 1))
			#printd "[DEBUG] Video download URL: '${video_download_url}'"
		#else
		#	printd "[DEBUG] No match for line: '${line}'"
		fi
	done < <(curl "${CURL_OPTS[@]}" "${video_page_url}")
	if (( url_count != 1 )); then # could be 0 or more than 1
		printw "Got ${url_count} video download URLs from video page."
	fi
done
printn
printg "Got ${#video_download_urls[@]} video download URLs from video pages in total."
printn

# Save video download URLs to file (back up existing file first)
printh "Saving video download URLs to file..."
if [[ -f "${VIDEO_DOWNLOAD_URLS_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_DOWNLOAD_URLS_FILE}'..."
	cp "${VIDEO_DOWNLOAD_URLS_FILE}" "${VIDEO_DOWNLOAD_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_DOWNLOAD_URLS_FILE}'..."
for episode_part in "${!video_download_urls[@]}"; do
	echo "${episode_part}: ${video_download_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_DOWNLOAD_URLS_FILE}"