Created
March 11, 2023 17:31
-
-
Save geo-rge-li/5f526a6c65baf8ba6be3eb5746184b57 to your computer and use it in GitHub Desktop.
NT Scraper Selenium
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
| from gooey import Gooey, GooeyParser | |
| # In[2]: | |
| import argparse | |
| import time | |
| import os | |
| import sys | |
| import re | |
| import json | |
| import http.cookiejar | |
| from selenium import webdriver | |
| from random import randint | |
| import html5lib | |
| from bs4 import BeautifulSoup | |
| from html import escape # python 3.x | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.wait import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| import urllib.request | |
# Gooey renders this argparse-style CLI as a desktop GUI; the decorator sets
# the window title plus a File menu (an About dialog and a link to a sample
# username-mapping document).
@Gooey(
    program_name='Neopian Times Scraper to Make NTWF Review Threads',
    menu=[{
        'name': 'File',
        'items': [{
            'type': 'AboutDialog',
            'menuTitle': 'About',
            'name': 'Neopian Times Scraper to Make NTWF Review Threads',
            'description': 'Made by RielCZ for the NTWF. Check me out on the NTWF or Neopets!',
            'version': '1.0.1',
            'copyright': '(c) 2021 RielCZ',
            'website': 'https://ntwriters.proboards.com/'
        },{
            'type': 'Link',
            'menuTitle': 'Link to Sample Neo2NTWF UN File',
            'description': 'Made by Twillie at the NTWF, maintained by the NTWF.',
            'url': 'https://docs.google.com/document/d/1EIbsrbgzU7kNKCAmjp4ORGD9z16Jh8helHaERrtCkEE/edit'
        }]
    }]
)
def main():
    """Scrape a Neopian Times edition and write a BBS review-thread template."""
    parser = GooeyParser(description='Create NT reviews template. Vs. 1')
    # Argument groups only control how the fields are grouped in the Gooey UI.
    required = parser.add_argument_group(
        "Required Information",
        "Required information used to generate basic templates."
    )
    relevant = parser.add_argument_group(
        "Relevant Information",
        "Other information used to generate basic templates."
    )
    options = parser.add_argument_group(
        "Options",
        "Options to modify the basic template."
    )
    other = parser.add_argument_group(
        "Other",
        "Things the average user doesn't need."
    )
    required.add_argument('Neo2NTWF_Filepath', metavar='Path to Neo2NTWF Username Text File', help='The path to the text file containing Neopets to NTWF usernames in NeoUN = @NTWFUN format, e.g. File > Link.', widget="FileChooser")
    relevant.add_argument('-o', '--outputpath', metavar='Path to Output File', type=str, default=None, help='Where to save the output text file; default is current working directory.', widget="FileSaver")
    relevant.add_argument('-e', '--edition', metavar='NT Edition Number', type=int, default=None, help='Which edition to scrape (default is most recent).')
    options.add_argument('-f', '--show-ntwf-only', metavar='Show NTWF Pieces Only', action='store_true', help='Show only pieces with an NTWF contributor.')
    options.add_argument('-a', '--sort-alphabetically', metavar='Sort Alphabetically', action='store_true', help='Sort the pieces in each section alphabetically.')
    options.add_argument('-t', '--catalogue-the', metavar='Special Catalogue "The"', action='store_true', help='Piece titles that start with "The" will have it moved to the title end in form ", The".')
    options.add_argument('-s', '--show-empty-categories', metavar='Show Empty Categories', action='store_true', help='Show categories without pieces with "None".')
    options.add_argument('--title-text', metavar='Custom Title Text', type=str, default=None, help='Text to add after the edition number in the title (e.g. for a special issue).')
    # NOTE(review): "Tempate" typo below is user-visible metavar text; left as-is
    # here because it is a runtime string.
    options.add_argument('--post-text', type=str, metavar='Post Tempate Text', default="All reviews of all pieces are welcome! However, if you would really like your piece to be reviewed, post and let others know!", help='Text to add after the normal template.')
    #other.add_argument('-v', '--verbose', metavar='Verbose', action='store_true', help='Show debugging text in the console.')
    other.add_argument('-j', '--export-json', metavar='Export JSON', action='store_true', help='Dump the extracted text contents (dictionary) in a JSON file.')
    other.add_argument('--outputjsonpath', metavar='Path to JSON Output', type=str, default=None, help='Where to save the output JSON file; default is current working directory.', widget="FileSaver")
    optionsadv = parser.add_argument_group(
        "Advanced Options",
        "Options for web scraping and/or text analysis for construction of the basic template."
    )
    optionsadv.add_argument('--collab-text', metavar='Description Text to Denote Collabs', type=str, default='collab with; also by', help='Text to denote start of collab authors.')
    optionsadv.add_argument('--collab-text-delimiter', type=str, metavar='Collab Text Delimiter', default=';', help='Delimiter for Description Collab Text field.')
    template = parser.add_argument_group(
        "Template",
        "Directly modify the basic template."
    )
    template.add_argument('--template-title', metavar='Template Title', type=str, default='[font size="6"][a href="{0}"]Issue {1}{2}[/a][/font]', help='The BBS text for formatting the title. {0}: Edition URL. {1}: Edition number. {2}: Custom title.')
    template.add_argument('--template-custom-title', metavar='Template Custom Title', type=str, default=' – {0}', help='The BBS text for formatting the custom title. {0}: Title.')
    template.add_argument('--template-quote', metavar='Template Quote', type=str, default='[quote]{0}[/quote]—{1}', help='The BBS text for formatting the Quote of the Week. {0}: Quote. {1}: Author.')
    template.add_argument('--template-section', metavar='Template Section', type=str, default='[font size="4"]{0}[/font]', help='The BBS text for formatting the Sections. {0}: Section name.')
    template.add_argument('--template-pieces', metavar='Template Pieces', type=str, default='[a href="{0}"]{1}[/a]{2} by {3}', help='The BBS text for formatting the Pieces. {0}: Piece URL. {1}: Piece title. {2}: NTWF user piece decoration. {3} Author(s). {4}: Description.')
    template.add_argument('--template-ntwf-user-decoration', metavar='Template NTWF User Piece Decoration', type=str, default='', help='The BBS text after Pieces with an NTWF contributor.')
    template.add_argument('--max-chars-in-desc', metavar='Maximum Chars. in Description', type=int, default=None, help='The max. number of piece description chars. to provide if using the description. Default is full description.')
    template.add_argument('--template-editorial', metavar='Template Editorial', type=str, default='And don\'t forget [a href="{0}"]The Editorial[/a].', help='The BBS text for formatting the Editorial. {0}: Editorial URL.')
    args = parser.parse_args()
    EDITION = args.edition
    # Editions below 150 are treated like "no edition" (scrape the most recent
    # issue) -- presumably they predate the current page layout; TODO confirm.
    # NOTE(review): the `int(EDITION) < 150` clause is redundant -- EDITION is
    # already an int (argparse type=int) by this point.
    if EDITION is None or EDITION < 150 or int(EDITION) < 150:
        EDITION = ''
    print('***** PROCESSING EDITION = '+(str(EDITION) if EDITION else "[Current]")+' *****')
    sys.stdout.flush()
    # Section slug (URL query value) -> human-readable heading, in output order.
    SECTION_NAME_DICT = {'articles' : 'Articles',
                         'shorts' : 'Short Stories',
                         'comics' : 'Comics',
                         'series' : 'New Series',
                         'cont' : 'Continued Series'
                         }
    SHOW_NON_NTWF_NAMES = not args.show_ntwf_only
    # Phrases that mark the start of a collaborator list in a description,
    # e.g. "collab with" / "also by".
    COLLAB_INDICATORS = [e.strip() for e in args.collab_text.split(args.collab_text_delimiter)]
    VERBOSE = False #args.verbose
    SORT_ALPHABETICALLY = args.sort_alphabetically
    CATALOGUE_THE = args.catalogue_the
    TITLE_TEXT = args.title_text
    POST_TEXT = args.post_text
    SHOW_EMPTY_CATEGORIES = args.show_empty_categories
    MAX_DESC_CHARS = args.max_chars_in_desc
    OUTPATH = args.outputpath
    # Default the text output to ./output.txt; a bare directory gets
    # output.txt appended.
    if OUTPATH is None:
        OUTPATH = os.path.join(os.getcwd(), 'output.txt')
    elif os.path.isdir(OUTPATH):
        OUTPATH = os.path.join(OUTPATH, 'output.txt')
    EXPORTJSON = args.export_json
    OUTPATHJSON = args.outputjsonpath
    if OUTPATHJSON is None:
        OUTPATHJSON = os.path.join(os.getcwd(), 'output.json')
    elif os.path.isdir(OUTPATHJSON):
        OUTPATHJSON = os.path.join(OUTPATHJSON, 'output.json')
    # In[4]:
    # Load the Neopets -> NTWF username map, one "neoUN = @ntwfUN" per line;
    # both sides are lowercased so lookups are case-insensitive.
    nameDict = {}
    with open(args.Neo2NTWF_Filepath, 'r') as f:
        for i, line in enumerate(f.readlines()):
            temp = line.split("=")
            if len(temp) != 2:
                # NOTE(review): `temp` and `temp[0].strip()` can never be None
                # (str.split always returns a list of strings), so this guard
                # always passes and every malformed line is reported.
                if temp is not None and temp[0].strip() is not None:
                    print("[WARNING] Skipping input Neo2NTWF file line "+str(i+1)+" because it is misformatted: '"+str(temp)+"'", file=sys.stderr, flush=True)
                continue
            neoUN = temp[0].strip().lower()
            forumUN = temp[1].strip().lower()
            nameDict[neoUN] = forumUN
    if VERBOSE:
        print(nameDict)
        sys.stdout.flush()
    # Set the URL to webscrape from (empty EDITION = current issue).
    url = 'http://www.neopets.com/ntimes/index.phtml?week='+str(EDITION)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # In[5]:
def fetchQuote(responseText):
    '''
    Extract the Quote of the Week from an NT index page.

    Returns a two-element list: [quote text, attributed author].
    Assistance from HERE: https://linuxhint.com/find_children_nodes_beautiful_soup/
    '''
    # Parse the page and locate the quote container.
    parsed = BeautifulSoup(responseText, "html.parser")
    parts = parsed.find('div', {'class' : 'quote'}).text.split('--')
    # The quote body sits on the third line of the first chunk; the author
    # follows the "--" separator.
    text_part = parts[0].split('\n')[2].strip()
    author_part = parts[1].strip()
    return [text_part, author_part]
| # In[6]: | |
def fetchTableInfo(responseText):
    '''
    Parse one NT section page into a list of piece listings.

    Each listing is a dict with keys "Link" (absolute piece URL), "Name"
    (title, optionally re-catalogued), "UN" (list of author usernames,
    collaborators included) and "Desc" (description text).  Relies on the
    enclosing scope's CATALOGUE_THE, COLLAB_INDICATORS and
    SORT_ALPHABETICALLY settings.

    Adapted from HERE: https://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table,
    https://stackoverflow.com/questions/36538789/beautifulsoup-bs4-how-to-ignore-ending-tag-in-malformed-html,
    https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href
    '''
    # html5lib is the lenient parser; NT pages contain malformed markup
    # (stray closing </P> tags -- see the replace() previously tried below)
    # that stricter parsers choke on.
    ##responseText = responseText.replace("</P>", "")
    # Parse HTML and save to BeautifulSoup object
    soup = BeautifulSoup(responseText, "html5lib") # Gotta use the liberal parser (or do the replace above)
    table = soup.find('td', {'class' : 'content'}).findChild("table").find_all('tr')
    listings = []
    for row in table:
        # Second cell of each row holds the piece; first anchor is the piece
        # link/title, second anchor is the author.
        col = row.find_all('td')[1]
        a_tags = col.findAll('a')
        listingLink = 'http://www.neopets.com/ntimes/'+a_tags[0]["href"].strip()
        listingName = a_tags[0].text.strip()
        userName = a_tags[1].text.strip()
        # Cell text looks like "<title> <description> by <author>": chop the
        # trailing author name, drop everything after the last "by", then
        # strip the leading title to leave just the description.
        listingDesc = 'by'.join(col.get_text()[:-len(userName)].split('by')[:-1])
        listingDesc = listingDesc[len(listingName):].strip()
        # Optional cataloguing: "The Foo" -> "Foo, The".
        if CATALOGUE_THE and listingName.split()[0].lower().strip() == "the":
            listingName = listingName[3:].strip()+', The'
        usernames = [usererName] if False else [userName]  # noqa placeholder -- see below
        for indicator in COLLAB_INDICATORS:
            listingDescIdx = listingDesc.lower().find(indicator)
            if listingDescIdx > 0:
                # Indicator mid-description: collaborators run to the end of
                # the description; "and" and punctuation are stripped.
                usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[listingDescIdx+len(indicator):].split() if e.lower() != "and"])
                listingDesc = listingDesc[:listingDescIdx].strip()
                break
            elif listingDescIdx == 0:
                # Indicator at the very start: collaborators run to the first
                # period; the rest is kept as the description.
                posPeriod = listingDesc.find('.')
                usernames.extend([re.sub(r'\W+', '', e) for e in listingDesc[len(indicator):posPeriod].split() if e.lower() != "and"])
                listingDesc = listingDesc[posPeriod+1:].strip()
                break
        listings.append({"Link" : listingLink, "Name" : listingName, "UN" : usernames, "Desc" : listingDesc})
    if SORT_ALPHABETICALLY:
        listings.sort(key=lambda k: k["Name"])
    return listings
| # In[7]: | |
def savePageContentHelper(level, url, headers=None, sleepTime=0, randSleepTimeAddon=0):
    '''
    Scrape the Quote of the Week plus every section listing for one edition.

    Returns [quote, sections] where quote is [text, author] (see fetchQuote)
    and sections maps each SECTION_NAME_DICT key to its fetchTableInfo()
    listings.  Opens a visible Firefox window per page fetched, sleeping
    sleepTime + random(0..randSleepTimeAddon) seconds between pages.

    `level` is currently unused; savePageContent() passes 1 for a planned
    recursion depth that was never needed.  `headers` is likewise unused now
    that Selenium (rather than urllib) performs the fetching.
    '''
    toReturn = []
    if VERBOSE:
        print("*** PROCESSING QUOTE OF THE WEEK ***")
        sys.stdout.flush()
    # Connect to the URL.  Note: will OPEN a browser window!!!
    dr = webdriver.Firefox()
    dr.get(url)
    try:
        # Wait for the footer, since it's the last element to load and the
        # StackPath interstitial page unfortunately also has a div with class
        # "content".  NOTE(review): WebDriverWait's timeout is in SECONDS, so
        # 5000 here allows up to ~83 minutes; 5000 was probably meant as ms.
        element = WebDriverWait(dr, 5000).until(
            EC.presence_of_element_located((By.CLASS_NAME, "footer"))
        )
        quote = fetchQuote(dr.page_source)
        if VERBOSE:
            print(quote)
            sys.stdout.flush()
        if VERBOSE:
            print("Finished processing quote of the week.")
            sys.stdout.flush()
        time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds
    finally:
        dr.quit()
    temp = {}
    for key in SECTION_NAME_DICT.keys():
        if VERBOSE:
            print("*** PROCESSING "+key+" ***")
            sys.stdout.flush()
        # Connect to the URL -- one fresh browser window per section.
        # BUGFIX: the query-string separator had been HTML-mangled
        # ("&sect" rendered as "§", yielding "§ion="); restore the literal
        # "&section=" parameter so the section pages actually load.
        dr = webdriver.Firefox()
        dr.get(url+'&section='+key)
        try:
            element = WebDriverWait(dr, 5000).until(
                EC.presence_of_element_located((By.CLASS_NAME, "footer"))
            )
            temp[key] = fetchTableInfo(dr.page_source)
            if VERBOSE:
                print(temp[key])
                sys.stdout.flush()
            time.sleep(sleepTime+randint(0,randSleepTimeAddon)) # pause the code for this many seconds
        finally:
            dr.quit()
    toReturn = [quote, temp]
    return toReturn
| # In[8]: | |
# Public entry point for the scrape: thin wrapper fixing the recursion level.
def savePageContent(url, headers=None, sleepTime=0, randSleepTimeAddon=3): # With level 1 recursion, which seems to be sufficient for the NT.
    return savePageContentHelper(1, url, headers=headers, sleepTime=sleepTime, randSleepTimeAddon=randSleepTimeAddon)
# In[9]:
# Kick off the scrape.  This opens Firefox windows and pauses ~1-4 s between
# pages; results is [quote, sections] as returned by savePageContentHelper.
results = savePageContent(url, headers=headers, sleepTime=1)
# In[10]:
if VERBOSE:
    print(results)
    sys.stdout.flush()
# In[12]:
def getForumerName(name, returnSuccess=False):
    '''
    Map a Neopets username to its NTWF handle via the nameDict lookup table.

    Lookup is case-insensitive; unknown names come back lowercased but
    otherwise unchanged.  With returnSuccess=True the result is a
    (found, name) pair instead of just the name.
    '''
    key = name.lower()
    found = key in nameDict
    resolved = nameDict[key] if found else key
    if returnSuccess:
        return found, resolved
    return resolved
| # In[13]: | |
def returnUNtext(uns):
    '''
    Join a list of usernames into natural English.

    One name is returned as-is, two become "a and b", three or more get an
    Oxford-comma list ("a, b, and c").  An empty list yields ''.
    '''
    count = len(uns)
    if count == 0:
        return ''
    if count == 1:
        return uns[0]
    if count == 2:
        return uns[0]+' and '+uns[1]
    # Three or more: comma-join all but the last, then "and <last>".
    return ''.join(u+', ' for u in uns[:-1])+'and '+uns[-1]
# In[14]:
# Build the per-section piece listings as BBS text.
NTText = ''
for key in SECTION_NAME_DICT.keys():
    NTText_ = args.template_section.format(SECTION_NAME_DICT[key])+'\n'
    validListings = 0
    for listing in results[1][key]:
        usernames = listing["UN"]
        usernames2 = [getForumerName(username) for username in usernames if username]
        hasNTWFer = any([getForumerName(username, returnSuccess=True)[0] for username in usernames])
        # In NTWF-only mode, skip pieces with no known forum contributor.
        if not SHOW_NON_NTWF_NAMES and not hasNTWFer:
            continue
        tempDesc = listing["Desc"]
        # Truncate the description to MAX_DESC_CHARS: positive keeps the
        # head ("text..."), negative keeps the tail ("...text").
        if MAX_DESC_CHARS and len(tempDesc) > abs(MAX_DESC_CHARS):
            if MAX_DESC_CHARS > 0:
                tempDesc = tempDesc[:MAX_DESC_CHARS].strip()+'...'
            elif MAX_DESC_CHARS < 0:
                tempDesc = '...'+tempDesc[MAX_DESC_CHARS:].strip()
        temp = args.template_pieces.format(listing["Link"], escape(listing["Name"]), args.template_ntwf_user_decoration if hasNTWFer else '', returnUNtext(usernames2), tempDesc)+'\n'
        if not EDITION:
            # No edition was given: recover it from the first piece's link
            # (everything after the last '=') and pin the page URL to it.
            editionGetterHelper = listing["Link"].rfind('=')
            EDITION = listing["Link"][editionGetterHelper+1:]
            url += EDITION
        NTText_ += temp
        validListings += 1
    if validListings > 0:
        NTText += NTText_+'\n'
    elif SHOW_EMPTY_CATEGORIES:
        NTText += NTText_+'None\n\n'
if VERBOSE:
    print(NTText)
    sys.stdout.flush()
# In[15]:
# Assemble the final post: centered title + quote block, section listings,
# editorial link, optional closing text.
outputText = '[div align="center"]'+args.template_title.format(url, EDITION, (args.template_custom_title.format(TITLE_TEXT)) if TITLE_TEXT else '')+'\n\n'
outputText += args.template_quote.format(results[0][0], getForumerName(results[0][1]))+'[/div]'+'\n\n'
outputText += NTText
# BUGFIX: the query-string separator had been HTML-mangled ("&sect" rendered
# as "§", yielding "§ion=editorial"); restore the literal "&section=".
outputText += args.template_editorial.format(url+'&section=editorial')
outputText += ('\n\n'+POST_TEXT) if POST_TEXT else ''
if VERBOSE:
    print(outputText)
    sys.stdout.flush()
# Write the finished template.  UTF-8 explicitly, because the default
# templates contain non-ASCII dashes that can break on platforms whose
# locale encoding is not UTF-8.  (Was: 'w+' mode with a loop writing the
# string one CHARACTER at a time; a single write is equivalent and sane.)
with open(OUTPATH, 'w', encoding='utf-8') as f:
    f.write(outputText)
if EXPORTJSON:
    # Dump the raw scrape results so a template can be rebuilt without
    # re-scraping: the quote plus the per-section listing dicts.
    with open(OUTPATHJSON, 'w', encoding='utf-8') as j:
        json.dump({"quote" : {"Text" : results[0][0], "UN" : results[0][1]}, "pieces" : results[1]}, j)

if __name__ == "__main__":
    main()
| # EOF |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment