Created
March 11, 2023 17:31
-
-
Save geo-rge-li/5f526a6c65baf8ba6be3eb5746184b57 to your computer and use it in GitHub Desktop.
NT Scraper Selenium
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
| from gooey import Gooey, GooeyParser | |
| # In[2]: | |
| import argparse | |
| import time | |
| import os | |
| import sys | |
| import re | |
| import json | |
| import http.cookiejar | |
| from selenium import webdriver | |
| from random import randint | |
| import html5lib | |
| from bs4 import BeautifulSoup | |
| from html import escape # python 3.x | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.wait import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| import urllib.request | |
| @Gooey( | |
| program_name='Neopian Times Scraper to Make NTWF Review Threads', | |
| menu=[{ | |
| 'name': 'File', | |
| 'items': [{ | |
| 'type': 'AboutDialog', | |
| 'menuTitle': 'About', | |
| 'name': 'Neopian Times Scraper to Make NTWF Review Threads', | |
| 'description': 'Made by RielCZ for the NTWF. Check me out on the NTWF or Neopets!', | |
| 'version': '1.0.1', | |
| 'copyright': '(c) 2021 RielCZ', | |
| 'website': 'https://ntwriters.proboards.com/' | |
| },{ | |
| 'type': 'Link', | |
| 'menuTitle': 'Link to Sample Neo2NTWF UN File', | |
| 'description': 'Made by Twillie at the NTWF, maintained by the NTWF.', | |
| 'url': 'https://docs.google.com/document/d/1EIbsrbgzU7kNKCAmjp4ORGD9z16Jh8helHaERrtCkEE/edit' | |
| }] | |
| }] | |
| ) | |
| def main(): | |
| parser = GooeyParser(description='Create NT reviews template. Vs. 1') | |
| required = parser.add_argument_group( | |
| "Required Information", | |
| "Required information used to generate basic templates." | |
| ) | |
| relevant = parser.add_argument_group( | |
| "Relevant Information", | |
| "Other information used to generate basic templates." | |
| ) | |
| options = parser.add_argument_group( | |
| "Options", | |
| "Options to modify the basic template." | |
| ) | |
| other = parser.add_argument_group( | |
| "Other", | |
| "Things the average user doesn't need." | |
| ) | |
| required.add_argument('Neo2NTWF_Filepath', metavar='Path to Neo2NTWF Username Text File', help='The path to the text file containing Neopets to NTWF usernames in NeoUN = @NTWFUN format, e.g. File > Link.', widget="FileChooser") | |
| relevant.add_argument('-o', '--outputpath', metavar='Path to Output File', type=str, default=None, help='Where to save the output text file; default is current working directory.', widget="FileSaver") | |
| relevant.add_argument('-e', '--edition', metavar='NT Edition Number', type=int, default=None, help='Which edition to scrape (default is most recent).') | |
| options.add_argument('-f', '--show-ntwf-only', metavar='Show NTWF Pieces Only', action='store_true', help='Show only pieces with an NTWF contributor.') | |
| options.add_argument('-a', '--sort-alphabetically', metavar='Sort Alphabetically', action='store_true', help='Sort the pieces in each section alphabetically.') | |
| options.add_argument('-t', '--catalogue-the', metavar='Special Catalogue "The"', action='store_true', help='Piece titles that start with "The" will have it moved to the title end in form ", The".') | |
| options.add_argument('-s', '--show-empty-categories', metavar='Show Empty Categories', action='store_true', help='Show categories without pieces with "None".') | |
| options.add_argument('--title-text', metavar='Custom Title Text', type=str, default=None, help='Text to add after the edition number in the title (e.g. for a special issue).') | |
| options.add_argument('--post-text', type=str, metavar='Post Tempate Text', default="All reviews of all pieces are welcome! However, if you would really like your piece to be reviewed, post and let others know!", help='Text to add after the normal template.') | |
| #other.add_argument('-v', '--verbose', metavar='Verbose', action='store_true', help='Show debugging text in the console.') | |
| other.add_argument('-j', '--export-json', metavar='Export JSON', action='store_true', help='Dump the extracted text contents (dictionary) in a JSON file.') | |
| other.add_argument('--outputjsonpath', metavar='Path to JSON Output', type=str, default=None, help='Where to save the output JSON file; default is current working directory.', widget="FileSaver") | |
| optionsadv = parser.add_argument_group( | |
| "Advanced Options", | |
| "Options for web scraping and/or text analysis for construction of the basic template." | |
| ) | |
| optionsadv.add_argument('--collab-text', metavar='Description Text to Denote Collabs', type=str, default='collab with; also by', help='Text to denote start of collab authors.') | |
| optionsadv.add_argument('--collab-text-delimiter', type=str, metavar='Collab Text Delimiter', default=';', help='Delimiter for Description Collab Text field.') | |
| template = parser.add_argument_group( | |
| "Template", | |
| "Directly modify the basic template." | |
| ) | |
| template.add_argument('--template-title', metavar='Template Title', type=str, default='[font size="6"][a href="{0}"]Issue {1}{2}[/a][/font]', help='The BBS text for formatting the title. {0}: Edition URL. {1}: Edition number. {2}: Custom title.') | |
| template.add_argument('--template-custom-title', metavar='Template Custom Title', type=str, default=' – {0}', help='The BBS text for formatting the custom title. {0}: Title.') | |
| template.add_argument('--template-quote', metavar='Template Quote', type=str, default='[quote]{0}[/quote]—{1}', help='The BBS text for formatting the Quote of the Week. {0}: Quote. {1}: Author.') | |
| template.add_argument('--template-section', metavar='Template Section', type=str, default='[font size="4"]{0}[/font]', help='The BBS text for formatting the Sections. {0}: Section name.') | |
| template.add_argument('--template-pieces', metavar='Template Pieces', type=str, default='[a href="{0}"]{1}[/a]{2} by {3}', help='The BBS text for formatting the Pieces. {0}: Piece URL. {1}: Piece title. {2}: NTWF user piece decoration. {3} Author(s). {4}: Description.') | |
| template.add_argument('--template-ntwf-user-decoration', metavar='Template NTWF User Piece Decoration', type=str, default='', help='The BBS text after Pieces with an NTWF contributor.') | |
| template.add_argument('--max-chars-in-desc', metavar='Maximum Chars. in Description', type=int, default=None, help='The max. number of piece description chars. to provide if using the description. Default is full description.') | |
| template.add_argument('--template-editorial', metavar='Template Editorial', type=str, default='And don\'t forget [a href="{0}"]The Editorial[/a].', help='The BBS text for formatting the Editorial. {0}: Editorial URL.') | |
| args = parser.parse_args() | |
| EDITION = args.edition | |
| if EDITION is None or EDITION < 150 or int(EDITION) < 150: | |
| EDITION = '' | |
| print('***** PROCESSING EDITION = '+(str(EDITION) if EDITION else "[Current]")+' *****') | |
| sys.stdout.flush() | |
| SECTION_NAME_DICT = {'articles' : 'Articles', | |
| 'shorts' : 'Short Stories', | |
| 'comics' : 'Comics', | |
| 'series' : 'New Series', | |
| 'cont' : 'Continued Series' | |
| } | |
| SHOW_NON_NTWF_NAMES = not args.show_ntwf_only | |
| COLLAB_INDICATORS = [e.strip() for e in args.collab_text.split(args.collab_text_delimiter)] | |
| VERBOSE = False #args.verbose | |
| SORT_ALPHABETICALLY = args.sort_alphabetically | |
| CATALOGUE_THE = args.catalogue_the | |
| TITLE_TEXT = args.title_text | |
| POST_TEXT = args.post_text | |
| SHOW_EMPTY_CATEGORIES = args.show_empty_categories | |
| MAX_DESC_CHARS = args.max_chars_in_desc | |
| OUTPATH = args.outputpath | |
| if OUTPATH is None: | |
| OUTPATH = os.path.join(os.getcwd(), 'output.txt') | |
| elif os.path.isdir(OUTPATH): | |
| OUTPATH = os.path.join(OUTPATH, 'output.txt') | |
| EXPORTJSON = args.export_json | |
| OUTPATHJSON = args.outputjsonpath | |
| if OUTPATHJSON is None: | |
| OUTPATHJSON = os.path.join(os.getcwd(), 'output.json') | |
| elif os.path.isdir(OUTPATHJSON): | |
| OUTPATHJSON = os.path.join(OUTPATHJSON, 'output.json') | |
| # In[4]: | |
| nameDict = {} | |
| with open(args.Neo2NTWF_Filepath, 'r') as f: | |
| for i, line in enumerate(f.readlines()): | |
| temp = line.split("=") | |
| if len(temp) != 2: | |
| if temp is not None and temp[0].strip() is not None: | |
| print("[WARNING] Skipping input Neo2NTWF file line "+str(i+1)+" because it is misformatted: '"+str(temp)+"'", file=sys.stderr, flush=True) | |
| continue | |
| neoUN = temp[0].strip().lower() | |
| forumUN = temp[1].strip().lower() | |
| nameDict[neoUN] = forumUN | |
| if VERBOSE: | |
| print(nameDict) | |
| sys.stdout.flush() | |
| # Set the URL to webscrape from | |
| url = 'http://www.neopets.com/ntimes/index.phtml?week='+str(EDITION) | |
| headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} | |
| # In[5]: | |
| def fetchQuote(responseText): | |
| ''' | |
| Assistance from HERE: https://linuxhint.com/find_children_nodes_beautiful_soup/ | |
| ''' | |
| # Parse HTML and save to BeautifulSoup object | |
| soup = BeautifulSoup(responseText, "html.parser") | |
| #print("SOUUUUP: " + soup) | |
| quote = soup.find('div', {'class' : 'quote'}).text.split('--') | |
| quote_text = quote[0].split('\n')[2].strip() | |
| quote_author = quote[1].strip() | |
| return [quote_text, quote_author] | |
| # In[6]: | |
| def fetchTableInfo(responseText): | |
| ''' | |
| Adapted from HERE: https://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table, | |
| https://stackoverflow.com/questions/36538789/beautifulsoup-bs4-how-to-ignore-ending-tag-in-malformed-html, | |
| https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href | |
| ''' | |
| ##responseText = responseText.replace("</P>", "") # F*** the NT and its improper formatting | |
| #print(responseText) # DEBUG | |
| #sys.stdout.flush() # DEBUG | |
| # Parse HTML and save to BeautifulSoup object | |
| soup = BeautifulSoup(responseText, "html5lib") # Gotta use the liberal parser (or do the replace above) | |
| # print(soup) # DEBUG | |
| #sys.stdout.flush() | |
| table = soup.find('td', {'class' : 'content'}).findChild("table").find_all('tr') | |
| listings = [] | |
| for row in table: | |
| col = row.find_all('td')[1] | |
| #print(col, flush=True) # DEBUG | |
| #sys.stdout.flush() # DEBUG | |
| a_tags = col.findAll('a') | |
| listingLink = 'http://www.neopets.com/ntimes/'+a_tags[0]["href"].strip() | |
| listingName = a_tags[0].text.strip() | |
| userName = a_tags[1].text.strip() | |
| listingDesc = 'by'.join(col.get_text()[:-len(userName)].split('by')[:-1]) | |
| listingDesc = listingDesc[len(listingName):].strip() | |
| if CATALOGUE_THE and listingName.split()[0].lower().strip() == "the": | |
| listingName = listingName[3:].strip()+', The' | |
| usernames = [userName] | |
| for indicator in COLLAB_INDICATORS: | |
| listingDescIdx = listingDesc.lower().find(indicator) | |
| if listingDescIdx > 0: | |
| usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[listingDescIdx+len(indicator):].split() if e.lower() != "and"]) | |
| listingDesc = listingDesc[:listingDescIdx].strip() | |
| break | |
| elif listingDescIdx == 0: | |
| posPeriod = listingDesc.find('.') | |
| usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[len(indicator):posPeriod].split() if e.lower() != "and"]) | |
| listingDesc = listingDesc[posPeriod+1:].strip() | |
| break | |
| listings.append({"Link" : listingLink, "Name" : listingName, "UN" : usernames, "Desc" : listingDesc}) | |
| if SORT_ALPHABETICALLY: | |
| listings.sort(key=lambda k: k["Name"]) | |
| return listings | |
| # In[7]: | |
| def savePageContentHelper(level, url, headers=None, sleepTime=0, randSleepTimeAddon=0): | |
| toReturn = [] | |
| if VERBOSE: | |
| print("*** PROCESSING QUOTE OF THE WEEK ***") | |
| sys.stdout.flush() | |
| # Connect to the URL | |
| # Note: will OPEN a browser window!!! | |
| dr = webdriver.Firefox() | |
| dr.get(url) | |
| try: | |
| # looking for the footer since it's the last to load and stackpath page has a div with class content unfortunately | |
| element = WebDriverWait(dr, 5000).until( | |
| EC.presence_of_element_located((By.CLASS_NAME, "footer")) | |
| ) | |
| # cookie_j = http.cookiejar.LWPCookieJar() | |
| # opener = urllib.request.build_opener( urllib.request.HTTPCookieProcessor(cookie_j) ) | |
| # response = opener.open(urllib.request.Request(url, headers=headers)).read().decode() | |
| # print(dr.page_source) # DEBUG | |
| #sys.stdout.flush() # DEBUG | |
| quote = fetchQuote(dr.page_source) | |
| if VERBOSE: | |
| print(quote) | |
| sys.stdout.flush() | |
| if VERBOSE: | |
| print("Finished processing quote of the week.") | |
| sys.stdout.flush() | |
| time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds | |
| finally: | |
| dr.quit() | |
| temp = {} | |
| for key in SECTION_NAME_DICT.keys(): | |
| if VERBOSE: | |
| print("*** PROCESSING "+key+" ***") | |
| sys.stdout.flush() | |
| #print(url+'§ion='+key) # DEBUG | |
| #sys.stdout.flush() # DEBUG | |
| # Connect to the URL | |
| #response = urllib.request.urlopen(urllib.request.Request(url+'§ion='+key, headers=headers)).read().decode() | |
| dr = webdriver.Firefox() | |
| dr.get(url+'§ion='+key) | |
| try: | |
| element = WebDriverWait(dr, 5000).until( | |
| EC.presence_of_element_located((By.CLASS_NAME, "footer")) | |
| ) | |
| #print(response) # DEBUG | |
| #sys.stdout.flush() # DEBUG | |
| temp[key] = fetchTableInfo(dr.page_source) | |
| if VERBOSE: | |
| print(temp[key]) | |
| sys.stdout.flush() | |
| time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds | |
| finally: | |
| dr.quit() | |
| toReturn = [quote, temp] | |
| return toReturn | |
| # In[8]: | |
| def savePageContent(url, headers=None, sleepTime=0, randSleepTimeAddon=3): # With level 1 recursion, which seems to be sufficient for the NT. | |
| return savePageContentHelper(1, url, headers=headers, sleepTime=sleepTime, randSleepTimeAddon=randSleepTimeAddon) | |
| # In[9]: | |
| results = savePageContent(url, headers=headers, sleepTime=1) | |
| # In[10]: | |
| if VERBOSE: | |
| print(results) | |
| sys.stdout.flush() | |
| # In[12]: | |
| def getForumerName(name, returnSuccess=False): | |
| name = name.lower() | |
| if name in nameDict: | |
| if returnSuccess: | |
| return True, nameDict[name] | |
| # else | |
| return nameDict[name] | |
| # else | |
| if returnSuccess: | |
| return False, name | |
| # else | |
| return name | |
| # In[13]: | |
| def returnUNtext(uns): | |
| len_ = len(uns) | |
| if len_ == 1: | |
| return uns[0] | |
| elif len_ == 2: | |
| return uns[0]+' and '+uns[1] | |
| # else | |
| i = 0 | |
| toReturn = '' | |
| while i < len_: | |
| if i == len_-1: | |
| toReturn += 'and '+uns[i] | |
| else: | |
| toReturn += uns[i]+', ' | |
| i += 1 | |
| return toReturn | |
| # In[14]: | |
| NTText = '' | |
| for key in SECTION_NAME_DICT.keys(): | |
| NTText_ = args.template_section.format(SECTION_NAME_DICT[key])+'\n' | |
| validListings = 0 | |
| for listing in results[1][key]: | |
| usernames = listing["UN"] | |
| usernames2 = [getForumerName(username) for username in usernames if username] | |
| hasNTWFer = any([getForumerName(username, returnSuccess=True)[0] for username in usernames]) | |
| if not SHOW_NON_NTWF_NAMES and not hasNTWFer: | |
| continue | |
| tempDesc = listing["Desc"] | |
| if MAX_DESC_CHARS and len(tempDesc) > abs(MAX_DESC_CHARS): | |
| if MAX_DESC_CHARS > 0: | |
| tempDesc = tempDesc[:MAX_DESC_CHARS].strip()+'...' | |
| elif MAX_DESC_CHARS < 0: | |
| tempDesc = '...'+tempDesc[MAX_DESC_CHARS:].strip() | |
| temp = args.template_pieces.format(listing["Link"], escape(listing["Name"]), args.template_ntwf_user_decoration if hasNTWFer else '', returnUNtext(usernames2), tempDesc)+'\n' | |
| if not EDITION: | |
| editionGetterHelper = listing["Link"].rfind('=') | |
| EDITION = listing["Link"][editionGetterHelper+1:] | |
| url += EDITION | |
| NTText_ += temp | |
| validListings += 1 | |
| if validListings > 0: | |
| NTText += NTText_+'\n' | |
| elif SHOW_EMPTY_CATEGORIES: | |
| NTText += NTText_+'None\n\n' | |
| if VERBOSE: | |
| print(NTText) | |
| sys.stdout.flush() | |
| # In[15]: | |
| outputText = '[div align="center"]'+args.template_title.format(url, EDITION, (args.template_custom_title.format(TITLE_TEXT)) if TITLE_TEXT else '')+'\n\n' | |
| outputText += args.template_quote.format(results[0][0], getForumerName(results[0][1]))+'[/div]'+'\n\n' | |
| outputText += NTText | |
| outputText += args.template_editorial.format(url+'§ion=editorial') | |
| outputText += ('\n\n'+POST_TEXT) if POST_TEXT else '' | |
| if VERBOSE: | |
| print(outputText) | |
| sys.stdout.flush() | |
| with open(OUTPATH, 'w+') as f: | |
| for line in outputText: | |
| f.write(line) | |
| if EXPORTJSON: | |
| with open(OUTPATHJSON, 'w') as j: | |
| json.dump({"quote" : {"Text" : results[0][0], "UN" : results[0][1]}, "pieces" : results[1]}, j) | |
| if __name__ == "__main__": | |
| main() | |
| # EOF |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment