Skip to content

Instantly share code, notes, and snippets.

@geo-rge-li
Created March 11, 2023 17:31
Show Gist options
  • Select an option

  • Save geo-rge-li/5f526a6c65baf8ba6be3eb5746184b57 to your computer and use it in GitHub Desktop.

Select an option

Save geo-rge-li/5f526a6c65baf8ba6be3eb5746184b57 to your computer and use it in GitHub Desktop.
NT Scraper Selenium
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from gooey import Gooey, GooeyParser
# In[2]:
import argparse
import time
import os
import sys
import re
import json
import http.cookiejar
from selenium import webdriver
from random import randint
import html5lib
from bs4 import BeautifulSoup
from html import escape # python 3.x
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.request
# Gooey renders main()'s GooeyParser arguments as a desktop GUI; this
# decorator sets the window title plus a File menu containing an About
# dialog and a link to a sample username-mapping document.
@Gooey(
program_name='Neopian Times Scraper to Make NTWF Review Threads',
menu=[{
'name': 'File',
'items': [{
'type': 'AboutDialog',
'menuTitle': 'About',
'name': 'Neopian Times Scraper to Make NTWF Review Threads',
'description': 'Made by RielCZ for the NTWF. Check me out on the NTWF or Neopets!',
'version': '1.0.1',
'copyright': '(c) 2021 RielCZ',
'website': 'https://ntwriters.proboards.com/'
},{
'type': 'Link',
'menuTitle': 'Link to Sample Neo2NTWF UN File',
'description': 'Made by Twillie at the NTWF, maintained by the NTWF.',
'url': 'https://docs.google.com/document/d/1EIbsrbgzU7kNKCAmjp4ORGD9z16Jh8helHaERrtCkEE/edit'
}]
}]
)
def main():
# Entry point. GooeyParser extends argparse, so every group/argument
# below doubles as both a CLI flag and a GUI widget.
parser = GooeyParser(description='Create NT reviews template. Vs. 1')
required = parser.add_argument_group(
"Required Information",
"Required information used to generate basic templates."
)
relevant = parser.add_argument_group(
"Relevant Information",
"Other information used to generate basic templates."
)
options = parser.add_argument_group(
"Options",
"Options to modify the basic template."
)
other = parser.add_argument_group(
"Other",
"Things the average user doesn't need."
)
# The only mandatory input: the Neopets -> NTWF username mapping file.
required.add_argument('Neo2NTWF_Filepath', metavar='Path to Neo2NTWF Username Text File', help='The path to the text file containing Neopets to NTWF usernames in NeoUN = @NTWFUN format, e.g. File > Link.', widget="FileChooser")
relevant.add_argument('-o', '--outputpath', metavar='Path to Output File', type=str, default=None, help='Where to save the output text file; default is current working directory.', widget="FileSaver")
relevant.add_argument('-e', '--edition', metavar='NT Edition Number', type=int, default=None, help='Which edition to scrape (default is most recent).')
# Toggles that shape which pieces appear and how titles are rendered.
options.add_argument('-f', '--show-ntwf-only', metavar='Show NTWF Pieces Only', action='store_true', help='Show only pieces with an NTWF contributor.')
options.add_argument('-a', '--sort-alphabetically', metavar='Sort Alphabetically', action='store_true', help='Sort the pieces in each section alphabetically.')
options.add_argument('-t', '--catalogue-the', metavar='Special Catalogue "The"', action='store_true', help='Piece titles that start with "The" will have it moved to the title end in form ", The".')
options.add_argument('-s', '--show-empty-categories', metavar='Show Empty Categories', action='store_true', help='Show categories without pieces with "None".')
options.add_argument('--title-text', metavar='Custom Title Text', type=str, default=None, help='Text to add after the edition number in the title (e.g. for a special issue).')
options.add_argument('--post-text', type=str, metavar='Post Tempate Text', default="All reviews of all pieces are welcome! However, if you would really like your piece to be reviewed, post and let others know!", help='Text to add after the normal template.')
#other.add_argument('-v', '--verbose', metavar='Verbose', action='store_true', help='Show debugging text in the console.')
other.add_argument('-j', '--export-json', metavar='Export JSON', action='store_true', help='Dump the extracted text contents (dictionary) in a JSON file.')
other.add_argument('--outputjsonpath', metavar='Path to JSON Output', type=str, default=None, help='Where to save the output JSON file; default is current working directory.', widget="FileSaver")
optionsadv = parser.add_argument_group(
"Advanced Options",
"Options for web scraping and/or text analysis for construction of the basic template."
)
# Controls how collaborator names are detected inside piece descriptions.
optionsadv.add_argument('--collab-text', metavar='Description Text to Denote Collabs', type=str, default='collab with; also by', help='Text to denote start of collab authors.')
optionsadv.add_argument('--collab-text-delimiter', type=str, metavar='Collab Text Delimiter', default=';', help='Delimiter for Description Collab Text field.')
template = parser.add_argument_group(
"Template",
"Directly modify the basic template."
)
# BBS-markup template strings; the {N} placeholders are documented in
# each flag's help text.
template.add_argument('--template-title', metavar='Template Title', type=str, default='[font size="6"][a href="{0}"]Issue {1}{2}[/a][/font]', help='The BBS text for formatting the title. {0}: Edition URL. {1}: Edition number. {2}: Custom title.')
template.add_argument('--template-custom-title', metavar='Template Custom Title', type=str, default=' – {0}', help='The BBS text for formatting the custom title. {0}: Title.')
template.add_argument('--template-quote', metavar='Template Quote', type=str, default='[quote]{0}[/quote]—{1}', help='The BBS text for formatting the Quote of the Week. {0}: Quote. {1}: Author.')
template.add_argument('--template-section', metavar='Template Section', type=str, default='[font size="4"]{0}[/font]', help='The BBS text for formatting the Sections. {0}: Section name.')
template.add_argument('--template-pieces', metavar='Template Pieces', type=str, default='[a href="{0}"]{1}[/a]{2} by {3}', help='The BBS text for formatting the Pieces. {0}: Piece URL. {1}: Piece title. {2}: NTWF user piece decoration. {3} Author(s). {4}: Description.')
template.add_argument('--template-ntwf-user-decoration', metavar='Template NTWF User Piece Decoration', type=str, default='', help='The BBS text after Pieces with an NTWF contributor.')
template.add_argument('--max-chars-in-desc', metavar='Maximum Chars. in Description', type=int, default=None, help='The max. number of piece description chars. to provide if using the description. Default is full description.')
template.add_argument('--template-editorial', metavar='Template Editorial', type=str, default='And don\'t forget [a href="{0}"]The Editorial[/a].', help='The BBS text for formatting the Editorial. {0}: Editorial URL.')
args = parser.parse_args()
# ----- Resolve runtime options from the parsed CLI/GUI arguments. -----
EDITION = args.edition
# Editions below 150 are not handled; fall back to the empty string, which
# the NT URL below treats as "most recent edition". args.edition is already
# an int (argparse type=int), so the original's extra `int(EDITION) < 150`
# re-check was redundant and has been dropped.
if EDITION is None or EDITION < 150:
    EDITION = ''
print('***** PROCESSING EDITION = '+(str(EDITION) if EDITION else "[Current]")+' *****')
sys.stdout.flush()
# Maps NT section query-string keys to human-readable section headings.
SECTION_NAME_DICT = {'articles' : 'Articles',
                     'shorts' : 'Short Stories',
                     'comics' : 'Comics',
                     'series' : 'New Series',
                     'cont' : 'Continued Series'
                     }
SHOW_NON_NTWF_NAMES = not args.show_ntwf_only
# Phrases that mark the start of a collaborator list inside a description.
COLLAB_INDICATORS = [e.strip() for e in args.collab_text.split(args.collab_text_delimiter)]
VERBOSE = False  # args.verbose (the CLI flag is currently commented out)
SORT_ALPHABETICALLY = args.sort_alphabetically
CATALOGUE_THE = args.catalogue_the
TITLE_TEXT = args.title_text
POST_TEXT = args.post_text
SHOW_EMPTY_CATEGORIES = args.show_empty_categories
MAX_DESC_CHARS = args.max_chars_in_desc
# Default the text output to ./output.txt; if a directory was supplied,
# write output.txt inside it.
OUTPATH = args.outputpath
if OUTPATH is None:
    OUTPATH = os.path.join(os.getcwd(), 'output.txt')
elif os.path.isdir(OUTPATH):
    OUTPATH = os.path.join(OUTPATH, 'output.txt')
EXPORTJSON = args.export_json
# Same defaulting logic for the optional JSON dump path.
OUTPATHJSON = args.outputjsonpath
if OUTPATHJSON is None:
    OUTPATHJSON = os.path.join(os.getcwd(), 'output.json')
elif os.path.isdir(OUTPATHJSON):
    OUTPATHJSON = os.path.join(OUTPATHJSON, 'output.json')
# In[4]:
# Build the Neopets-UN -> NTWF-UN lookup from the user-supplied mapping
# file: one "neoUN = @ntwfUN" pair per line, compared case-insensitively.
nameDict = {}
with open(args.Neo2NTWF_Filepath, 'r') as f:
    for i, line in enumerate(f):
        temp = line.split("=")
        if len(temp) != 2:
            # Warn about malformed lines but stay silent on blank ones.
            # (The original guard tested `temp[0].strip() is not None`,
            # which is never None for a string, so blank lines triggered
            # the warning too.)
            if temp[0].strip():
                print("[WARNING] Skipping input Neo2NTWF file line "+str(i+1)+" because it is misformatted: '"+str(temp)+"'", file=sys.stderr, flush=True)
            continue
        neoUN = temp[0].strip().lower()
        forumUN = temp[1].strip().lower()
        nameDict[neoUN] = forumUN
if VERBOSE:
    print(nameDict)
    sys.stdout.flush()
# Set the URL to webscrape from
# An empty EDITION yields '...week=' which serves the current issue.
url = 'http://www.neopets.com/ntimes/index.phtml?week='+str(EDITION)
# Desktop-browser User-Agent. NOTE(review): only the commented-out urllib
# path in savePageContentHelper ever consumed these headers; the Selenium
# path ignores them.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# In[5]:
def fetchQuote(responseText):
    '''
    Extract the Quote of the Week from an NT front-page HTML document.

    Returns a two-element list: [quote text, quote author].
    Assistance from HERE: https://linuxhint.com/find_children_nodes_beautiful_soup/
    '''
    doc = BeautifulSoup(responseText, "html.parser")
    # The quote div holds "<text> -- <author>"; split on the separator.
    pieces = doc.find('div', {'class' : 'quote'}).text.split('--')
    # The quote text sits on the third line of the div's text block.
    quote_line = pieces[0].split('\n')[2]
    return [quote_line.strip(), pieces[1].strip()]
# In[6]:
def fetchTableInfo(responseText):
'''
Parse one NT section page and return its piece listings as dicts with
keys "Link", "Name", "UN" (author plus any collaborators), and "Desc".

Adapted from HERE: https://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table,
https://stackoverflow.com/questions/36538789/beautifulsoup-bs4-how-to-ignore-ending-tag-in-malformed-html,
https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href
'''
##responseText = responseText.replace("</P>", "") # F*** the NT and its improper formatting
#print(responseText) # DEBUG
#sys.stdout.flush() # DEBUG
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(responseText, "html5lib") # Gotta use the liberal parser (or do the replace above)
# print(soup) # DEBUG
#sys.stdout.flush()
# Listing rows live in the first table nested under the 'content' cell.
table = soup.find('td', {'class' : 'content'}).findChild("table").find_all('tr')
listings = []
for row in table:
# The second cell of each row carries the piece link, title, and author.
col = row.find_all('td')[1]
#print(col, flush=True) # DEBUG
#sys.stdout.flush() # DEBUG
a_tags = col.findAll('a')
# First anchor: the piece page; second anchor: the author's username.
listingLink = 'http://www.neopets.com/ntimes/'+a_tags[0]["href"].strip()
listingName = a_tags[0].text.strip()
userName = a_tags[1].text.strip()
# Description = cell text minus the trailing "by <author>": drop the
# author's name, split on 'by', discard the last chunk, and rejoin so a
# 'by' occurring inside the description itself survives.
listingDesc = 'by'.join(col.get_text()[:-len(userName)].split('by')[:-1])
listingDesc = listingDesc[len(listingName):].strip()
# Optionally move a leading "The" to the end, catalogue-style.
if CATALOGUE_THE and listingName.split()[0].lower().strip() == "the":
listingName = listingName[3:].strip()+', The'
usernames = [userName]
# Scan the description for a collab marker (e.g. "collab with") and
# harvest the extra usernames, stripping punctuation and "and".
for indicator in COLLAB_INDICATORS:
listingDescIdx = listingDesc.lower().find(indicator)
if listingDescIdx > 0:
# Marker mid-description: everything after it is collaborator names.
usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[listingDescIdx+len(indicator):].split() if e.lower() != "and"])
listingDesc = listingDesc[:listingDescIdx].strip()
break
elif listingDescIdx == 0:
# Marker starts the description: collaborators run up to the first
# period, and the remainder of the text is kept as the blurb.
posPeriod = listingDesc.find('.')
usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[len(indicator):posPeriod].split() if e.lower() != "and"])
listingDesc = listingDesc[posPeriod+1:].strip()
break
listings.append({"Link" : listingLink, "Name" : listingName, "UN" : usernames, "Desc" : listingDesc})
if SORT_ALPHABETICALLY:
listings.sort(key=lambda k: k["Name"])
return listings
# In[7]:
def savePageContentHelper(level, url, headers=None, sleepTime=0, randSleepTimeAddon=0):
'''
Fetch the NT front page plus every section page with Selenium/Firefox
and return [quote, {section_key: listings}].

NOTE(review): `level` and `headers` are currently unused -- headers were
only consumed by the commented-out urllib path below.
'''
toReturn = []
if VERBOSE:
print("*** PROCESSING QUOTE OF THE WEEK ***")
sys.stdout.flush()
# Connect to the URL
# Note: will OPEN a browser window!!!
dr = webdriver.Firefox()
dr.get(url)
try:
# looking for the footer since it's the last to load and stackpath page has a div with class content unfortunately
# NOTE(review): WebDriverWait's timeout argument is in seconds, so 5000
# here is ~83 minutes; presumably 5000 ms was intended -- confirm.
element = WebDriverWait(dr, 5000).until(
EC.presence_of_element_located((By.CLASS_NAME, "footer"))
)
# cookie_j = http.cookiejar.LWPCookieJar()
# opener = urllib.request.build_opener( urllib.request.HTTPCookieProcessor(cookie_j) )
# response = opener.open(urllib.request.Request(url, headers=headers)).read().decode()
# print(dr.page_source) # DEBUG
#sys.stdout.flush() # DEBUG
quote = fetchQuote(dr.page_source)
if VERBOSE:
print(quote)
sys.stdout.flush()
if VERBOSE:
print("Finished processing quote of the week.")
sys.stdout.flush()
time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds
finally:
# Always close the browser, even if the wait or the parse failed.
dr.quit()
temp = {}
# One fresh browser session per NT section page.
for key in SECTION_NAME_DICT.keys():
if VERBOSE:
print("*** PROCESSING "+key+" ***")
sys.stdout.flush()
#print(url+'&section='+key) # DEBUG
#sys.stdout.flush() # DEBUG
# Connect to the URL
#response = urllib.request.urlopen(urllib.request.Request(url+'&section='+key, headers=headers)).read().decode()
dr = webdriver.Firefox()
dr.get(url+'&section='+key)
try:
element = WebDriverWait(dr, 5000).until(
EC.presence_of_element_located((By.CLASS_NAME, "footer"))
)
#print(response) # DEBUG
#sys.stdout.flush() # DEBUG
temp[key] = fetchTableInfo(dr.page_source)
if VERBOSE:
print(temp[key])
sys.stdout.flush()
time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds
finally:
dr.quit()
toReturn = [quote, temp]
return toReturn
# In[8]:
def savePageContent(url, headers=None, sleepTime=0, randSleepTimeAddon=3):
    '''
    Scrape the NT quote and all section listings for `url`.

    Thin wrapper around savePageContentHelper with level 1 recursion,
    which seems to be sufficient for the NT.
    '''
    return savePageContentHelper(
        1, url,
        headers=headers,
        sleepTime=sleepTime,
        randSleepTimeAddon=randSleepTimeAddon)
# In[9]:
# Run the scrape: results == [ [quote_text, quote_author],
#                              {section_key: [listing dicts]} ].
results = savePageContent(url, headers=headers, sleepTime=1)
# In[10]:
if VERBOSE:
print(results)
sys.stdout.flush()
# In[12]:
def getForumerName(name, returnSuccess=False):
    '''
    Map a (case-insensitive) Neopets username to its NTWF handle.

    Unknown names fall through unchanged (lower-cased). When
    returnSuccess is True, returns (found?, name) instead of just name.
    '''
    key = name.lower()
    found = key in nameDict
    mapped = nameDict[key] if found else key
    return (found, mapped) if returnSuccess else mapped
# In[13]:
def returnUNtext(uns):
    '''
    Join usernames into an English list with an Oxford comma:
    'a', 'a and b', 'a, b, and c', ...  An empty list yields ''.
    '''
    if not uns:
        return ''
    if len(uns) == 1:
        return uns[0]
    if len(uns) == 2:
        return uns[0] + ' and ' + uns[1]
    # Three or more: comma-join all but the last, then 'and <last>'.
    return ''.join(name + ', ' for name in uns[:-1]) + 'and ' + uns[-1]
# In[14]:
# Assemble the per-section BBS text for every NT category.
NTText = ''
for key in SECTION_NAME_DICT.keys():
NTText_ = args.template_section.format(SECTION_NAME_DICT[key])+'\n'
validListings = 0
for listing in results[1][key]:
usernames = listing["UN"]
# Map each Neopets UN to an NTWF handle; unknown names pass through.
usernames2 = [getForumerName(username) for username in usernames if username]
hasNTWFer = any([getForumerName(username, returnSuccess=True)[0] for username in usernames])
if not SHOW_NON_NTWF_NAMES and not hasNTWFer:
continue
tempDesc = listing["Desc"]
# Trim the description to MAX_DESC_CHARS; a negative limit keeps the
# tail of the description instead of the head.
if MAX_DESC_CHARS and len(tempDesc) > abs(MAX_DESC_CHARS):
if MAX_DESC_CHARS > 0:
tempDesc = tempDesc[:MAX_DESC_CHARS].strip()+'...'
elif MAX_DESC_CHARS < 0:
tempDesc = '...'+tempDesc[MAX_DESC_CHARS:].strip()
temp = args.template_pieces.format(listing["Link"], escape(listing["Name"]), args.template_ntwf_user_decoration if hasNTWFer else '', returnUNtext(usernames2), tempDesc)+'\n'
# When scraping the current issue (EDITION == ''), recover the edition
# number from the first piece link and append it to the base url so the
# title and editorial links point at this specific issue.
if not EDITION:
editionGetterHelper = listing["Link"].rfind('=')
EDITION = listing["Link"][editionGetterHelper+1:]
url += EDITION
NTText_ += temp
validListings += 1
if validListings > 0:
NTText += NTText_+'\n'
elif SHOW_EMPTY_CATEGORIES:
NTText += NTText_+'None\n\n'
if VERBOSE:
print(NTText)
sys.stdout.flush()
# In[15]:
# Build the final post: centered title + Quote of the Week, then the
# section listings, the editorial link, and any closing post text.
outputText = '[div align="center"]'+args.template_title.format(url, EDITION, (args.template_custom_title.format(TITLE_TEXT)) if TITLE_TEXT else '')+'\n\n'
outputText += args.template_quote.format(results[0][0], getForumerName(results[0][1]))+'[/div]'+'\n\n'
outputText += NTText
outputText += args.template_editorial.format(url+'&section=editorial')
outputText += ('\n\n'+POST_TEXT) if POST_TEXT else ''
if VERBOSE:
print(outputText)
sys.stdout.flush()
# Write the finished template to OUTPATH. outputText is a single string,
# so one write() call suffices — the original looped over it character by
# character, issuing one write per char. Mode 'w' replaces the original
# 'w+', whose read capability was never used.
with open(OUTPATH, 'w') as f:
    f.write(outputText)
# Optionally dump the raw scrape results for downstream tooling.
if EXPORTJSON:
    with open(OUTPATHJSON, 'w') as j:
        json.dump({"quote" : {"Text" : results[0][0], "UN" : results[0][1]}, "pieces" : results[1]}, j)
# Launch the Gooey-wrapped entry point only when run as a script.
if __name__ == "__main__":
main()
# EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment