scraper.py, by @rsmahabir (forked from vchahun/scraper.py), created May 22, 2018.
Scrape links from Google News
import sys
import multiprocessing
import sqlite3
from urllib2 import urlopen, URLError
import chardet
import urlparse
import urllib

def fixurl(url):
    # turn string into unicode
    if not isinstance(url, unicode):
        url = url.decode('utf8')
    # parse it
    parsed = urlparse.urlsplit(url)
    # divide the netloc further; rpartition so that URLs without a
    # user:pass@ part still land the hostname in hostport
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')
    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = urllib.quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = urllib.quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        urllib.quote(urllib.unquote(pce).encode('utf8'), '')
        for pce in parsed.path.split('/')
    )
    query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))
    # put it back together
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
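
# A quick illustration (not part of the original gist): the host is
# IDNA-encoded and the path percent-encoded, e.g.
#   fixurl(u'http://b\xfccher.de/m\xfcnchen')
#   -> 'http://xn--bcher-kva.de/m%C3%BCnchen'
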
def getArticle(link):
    (i, _, _, _, url) = link
    print "Requesting #%d [%s]..." % (i, url)
    html = u""
    try:
        article = urlopen(fixurl(url), timeout=10).read()
        if article:
            # guess the page encoding, then decode to unicode
            encoding = chardet.detect(article)['encoding']
            if encoding:
                html = unicode(article, encoding)
    except UnicodeEncodeError as e:
        print "Failed encoding #%d (%s)" % (i, str(e))
    except URLError as e:
        print "Failed download #%d (%s)" % (i, str(e))
    except Exception as e:
        print "Unknown exception #%d (%s)" % (i, str(e))
    return (link, html)
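
# Note on the guard above (added comment, not in the original gist):
# chardet.detect() returns a dict such as {'encoding': 'utf-8',
# 'confidence': 0.99}, and 'encoding' is None when detection fails,
# hence the two checks before decoding.
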
def scrape(fileName):
    # read the tab-separated link list
    links = []
    with open(fileName) as f:
        for l in f:
            # rstrip rather than l[:-1], in case the last line has no newline
            (i, _, title, source, _, date, _, url) = l.rstrip('\n').decode("utf8").split('\t')
            i = int(i)
            links.append((i, title, source, date, url))
    # fetch the articles in parallel
    pool = multiprocessing.Pool(10)
    articles = pool.map(getArticle, links)
    print "##### Inserting into DB..."
    cnx = sqlite3.connect("articles.db")
    cur = cnx.cursor()
    cur.executemany("""insert into articles(id, date, title, source, url, html)
                       values(?, ?, ?, ?, ?, ?)""",
                    [(i, date, title, source, url, html)
                     for ((i, title, source, date, url), html) in articles])
    cnx.commit()
    cur.close()
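
# One-time DB setup (a sketch, not in the original gist): scrape() assumes
# articles.db already contains an `articles` table. The schema below is an
# assumption inferred from the insert statement above.
def initdb(dbName="articles.db"):
    cnx = sqlite3.connect(dbName)
    cnx.execute("""create table if not exists articles(
                       id integer primary key, date text, title text,
                       source text, url text, html text)""")
    cnx.commit()
    cnx.close()
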
if __name__ == '__main__':
    scrape(sys.argv[1])
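
Usage sketch (the input layout is inferred from scrape() above; it is not documented in the original gist): the script takes one argument, a UTF-8, tab-separated file with eight fields per line, three of which are ignored:

id  ?  title  source  ?  date  ?  url

For example, python scraper.py links.tsv fetches every link in parallel with ten workers and stores the decoded pages in articles.db (see the initdb sketch above for the assumed table).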