chengdujin · March 3, 2012 16:39
diff --git a/feed_scraper.py b/feed_scraper.py
 #!/usr/bin/python
 # -*- coding: utf-8 -*-

 ##
 # this script fetches all of a provider's feeds (a limit
 # for the historical feeds would be 5000) and then stores
 # the feeds in mongodb
 #
 # @author Yuan JIN
 # @contact chengudjin@gmail.com
 # @since 2012.03.03
 # @latest 2012.03.03

 '''
 Copyright (c) 2012-2013, Yuan JIN
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 '''

 # Reload the sys module and set the process-wide default string
 # encoding to UTF-8.
 # NOTE(review): reload(sys) is presumably needed so that
 # sys.setdefaultencoding is reachable again after interpreter start-up
 # (Python 2 behaviour) -- confirm before porting or reordering.
 import sys
 reload(sys)
 sys.setdefaultencoding('UTF-8')


 # --- Google OAuth configuration ---------------------------------------
 # Space-separated list of the two Google Reader scopes to request.
 SCOPE = "http://www.google.com/reader/api http://www.google.com/reader/atom"

 # The three OAuth endpoints: request token (with the scope appended as a
 # query parameter), user authorisation, and access token.
 REQUEST_OAUTH_TOKEN_URL = "https://www.google.com/accounts/OAuthGetRequestToken?scope=" + SCOPE
 AUTHORIZE_URL = "https://www.google.com/accounts/OAuthAuthorizeToken"
 ACCESS_TOKEN_URL = "https://www.google.com/accounts/OAuthGetAccessToken"

 # Consumer key / secret handed to oauth2.Consumer.
 # NOTE(review): the values look like redacted placeholders -- confirm.
 CLIENT_ID = "secret.apps.googleusercontent.com"
 CLIENT_SECRET = "secret"


 def store_feeds(feeds):
    """Insert every feed whose title is not stored yet into MongoDB.

    Connects to the MongoDB server at localhost:27017 and works on the
    'songshuhui_articles' collection of the 'songshuhui_articles'
    database.  A feed is skipped when a document with the same title
    already exists; otherwise it is printed and inserted.

    feeds -- iterable of dicts, each of which must carry a 'title' key
    """
    from pymongo import Connection

    database = Connection('localhost', 27017).songshuhui_articles
    articles = database.songshuhui_articles

    for feed in feeds:
        same_title = articles.find({'title': '%s' % feed['title']})
        # only feeds without an already stored title are printed and saved
        if not same_title.count() > 0:
            print(feed)
            articles.insert(feed)

 def parse_feeds(data, limit):
    """Extract one feed dict per item from the XML text in *data*.

    data  -- raw XML text handed to BeautifulStoneSoup
    limit -- number of items to read; converted with int()

    Returns a list of dicts.  Every dict gets 'source' and 'author';
    'title', 'published' and 'category' are filled in when an 'entry'
    node is found for the item.  All positions used below (second child
    of the parsed root, then its children 7, 9, 11, ...) are hard-wired.
    """
    from BeautifulSoup import BeautifulStoneSoup

    feeds = []
    root = BeautifulStoneSoup(data)

    # item number k sits at child index 7 + 2*k; the node carrying its
    # source link and author sits one index later
    for step in xrange(int(limit)):
        item = 7 + 2 * step
        feed = {}

        for node in root.contents[1].contents[item].contents:
            if node.name != 'entry':
                continue

            children = node.contents
            last_position = len(children) - 1
            terms = []
            for position, child in enumerate(children):
                # keep every category term except those starting with 'user/'
                if child.name == 'category' and child['term'][:5] != 'user/':
                    terms.append(child['term'])
                # title and publication date are read from the last child
                if position == last_position:
                    feed['title'] = child.contents[0].string
                    feed['published'] = child.contents[1].string
            feed['category'] = terms

        companion = root.contents[1].contents[item + 1]
        feed['source'] = companion['href']
        feed['author'] = companion.contents[1].contents[0].string

        feeds.append(feed)
    return feeds
    

 def create_oauth_client():
    """Return an oauth2.Client carrying a Google access token.

    The access token and its secret are cached in the files
    'access_token' and 'access_token_secret' in the current working
    directory.  When either file is missing, the interactive flow runs
    (request token -> user authorises in a browser -> access token) and
    the resulting credentials are written to those two files; otherwise
    the cached credentials are read back from disk.

    Raises KeyError when a token response lacks the 'oauth_token' or
    'oauth_token_secret' field.
    """
    import os.path

    import oauth2

    consumer = oauth2.Consumer(CLIENT_ID, CLIENT_SECRET)

    cached = os.path.exists('access_token') and os.path.exists('access_token_secret')
    if not cached:
        import urlparse

        client = oauth2.Client(consumer)

        # step 1: obtain a request token
        response, content = client.request(REQUEST_OAUTH_TOKEN_URL, 'GET')
        request_token = dict(urlparse.parse_qsl(content))

        # step 2: let the user authorise the request token in a browser
        print("Open this link in a browser..:")
        print("%s?oauth_token=%s" % (AUTHORIZE_URL, request_token['oauth_token']))
        print("")
        print("Press ENTER when ready..")
        raw_input()

        # step 3: exchange the authorised request token for an access token.
        # The key is 'oauth_token' (the same one printed above); the former
        # 'oauth2_token' lookup always raised KeyError.
        # NOTE(review): no oauth_verifier is sent here -- confirm the
        # provider accepts the exchange without one.
        token = oauth2.Token(request_token['oauth_token'],
                             request_token['oauth_token_secret'])
        client = oauth2.Client(consumer, token)

        response, content = client.request(ACCESS_TOKEN_URL, 'GET')
        access_token = dict(urlparse.parse_qsl(content))

        # read both values before touching the disk so that an incomplete
        # response cannot leave a half-written cache behind
        oauth_token = access_token['oauth_token']
        oauth_token_secret = access_token['oauth_token_secret']

        # record the token and its secret for later runs
        with open('access_token', 'w') as token_file:
            token_file.write(oauth_token)
        with open('access_token_secret', 'w') as secret_file:
            secret_file.write(oauth_token_secret)
    else:
        # read in the token and secret from local disk; strip() guards
        # against a trailing newline added by a manual edit
        with open('access_token', 'r') as token_file:
            oauth_token = token_file.read().strip()
        with open('access_token_secret', 'r') as secret_file:
            oauth_token_secret = secret_file.read().strip()

    token = oauth2.Token(oauth_token, oauth_token_secret)
    return oauth2.Client(consumer, token)
    
 def retrieve_data(url, limit):
    """Download the atom data of feed *url* from the Google Reader service.

    url   -- address of the provider's feed, inserted into the Reader URL
    limit -- value sent as the 'n' query parameter

    Returns the second element (the content) of the pair produced by the
    OAuth client's request() call.
    """
    # an access token is requested, or picked up from local disk
    oauth_client = create_oauth_client()

    # unlimited access to a provider's historical feeds, courtesy of google
    reader_url = "http://www.google.com/reader/atom/feed/%s?n=%s" % (url, limit)
    _response, content = oauth_client.request(reader_url, 'GET')
    return content

 def main():
    """Run the whole pipeline: download, parse, then store the feeds."""
    raw_xml = retrieve_data('http://secret/feed', 100)
    store_feeds(parse_feeds(raw_xml, 100))

 # start the pipeline only when executed as a script, not on import
 if __name__ == '__main__':
    main()
	#!/usr/bin/python
	# -- coding: utf-8 --

	##
	# this script fetches all of a provider's feeds (a limit
	# for the historical feeds would be 5000) and then stores
	# the feeds in mongodb
	#
	# @author Yuan JIN
	# @contact chengudjin@gmail.com
	# @since 2012.03.03
	# @latest 2012.03.03

	'''
	Copyright (c) 2012-2013, Yuan JIN
	All rights reserved.

	Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

	* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
	* Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	'''

	# Reload the sys module and set the process-wide default string
	# encoding to UTF-8.
	# NOTE(review): reload(sys) is presumably needed so that
	# sys.setdefaultencoding is reachable again after interpreter start-up
	# (Python 2 behaviour) -- confirm before porting or reordering.
	import sys
	reload(sys)
	sys.setdefaultencoding('UTF-8')


	# --- Google OAuth configuration ---------------------------------------
	# Space-separated list of the two Google Reader scopes to request.
	SCOPE = "http://www.google.com/reader/api http://www.google.com/reader/atom"

	# The three OAuth endpoints: request token (with the scope appended as a
	# query parameter), user authorisation, and access token.
	REQUEST_OAUTH_TOKEN_URL = "https://www.google.com/accounts/OAuthGetRequestToken?scope=" + SCOPE
	AUTHORIZE_URL = "https://www.google.com/accounts/OAuthAuthorizeToken"
	ACCESS_TOKEN_URL = "https://www.google.com/accounts/OAuthGetAccessToken"

	# Consumer key / secret handed to oauth2.Consumer.
	# NOTE(review): the values look like redacted placeholders -- confirm.
	CLIENT_ID = "secret.apps.googleusercontent.com"
	CLIENT_SECRET = "secret"


	def store_feeds(feeds):
	    """Insert every feed whose title is not stored yet into MongoDB.

	    Connects to the MongoDB server at localhost:27017 and works on the
	    'songshuhui_articles' collection of the 'songshuhui_articles'
	    database.  A feed is skipped when a document with the same title
	    already exists; otherwise it is printed and inserted.

	    feeds -- iterable of dicts, each of which must carry a 'title' key
	    """
	    from pymongo import Connection

	    database = Connection('localhost', 27017).songshuhui_articles
	    articles = database.songshuhui_articles

	    for feed in feeds:
	        same_title = articles.find({'title': '%s' % feed['title']})
	        # only feeds without an already stored title are printed and saved
	        if not same_title.count() > 0:
	            print(feed)
	            articles.insert(feed)

	def parse_feeds(data, limit):
	    """Extract one feed dict per item from the XML text in *data*.

	    data  -- raw XML text handed to BeautifulStoneSoup
	    limit -- number of items to read; converted with int()

	    Returns a list of dicts.  Every dict gets 'source' and 'author';
	    'title', 'published' and 'category' are filled in when an 'entry'
	    node is found for the item.  All positions used below (second child
	    of the parsed root, then its children 7, 9, 11, ...) are hard-wired.
	    """
	    from BeautifulSoup import BeautifulStoneSoup

	    feeds = []
	    root = BeautifulStoneSoup(data)

	    # item number k sits at child index 7 + 2*k; the node carrying its
	    # source link and author sits one index later
	    for step in xrange(int(limit)):
	        item = 7 + 2 * step
	        feed = {}

	        for node in root.contents[1].contents[item].contents:
	            if node.name != 'entry':
	                continue

	            children = node.contents
	            last_position = len(children) - 1
	            terms = []
	            for position, child in enumerate(children):
	                # keep every category term except those starting with 'user/'
	                if child.name == 'category' and child['term'][:5] != 'user/':
	                    terms.append(child['term'])
	                # title and publication date are read from the last child
	                if position == last_position:
	                    feed['title'] = child.contents[0].string
	                    feed['published'] = child.contents[1].string
	            feed['category'] = terms

	        companion = root.contents[1].contents[item + 1]
	        feed['source'] = companion['href']
	        feed['author'] = companion.contents[1].contents[0].string

	        feeds.append(feed)
	    return feeds


	def create_oauth_client():
	    """Return an oauth2.Client carrying a Google access token.

	    The access token and its secret are cached in the files
	    'access_token' and 'access_token_secret' in the current working
	    directory.  When either file is missing, the interactive flow runs
	    (request token -> user authorises in a browser -> access token) and
	    the resulting credentials are written to those two files; otherwise
	    the cached credentials are read back from disk.

	    Raises KeyError when a token response lacks the 'oauth_token' or
	    'oauth_token_secret' field.
	    """
	    import os.path

	    import oauth2

	    consumer = oauth2.Consumer(CLIENT_ID, CLIENT_SECRET)

	    cached = os.path.exists('access_token') and os.path.exists('access_token_secret')
	    if not cached:
	        import urlparse

	        client = oauth2.Client(consumer)

	        # step 1: obtain a request token
	        response, content = client.request(REQUEST_OAUTH_TOKEN_URL, 'GET')
	        request_token = dict(urlparse.parse_qsl(content))

	        # step 2: let the user authorise the request token in a browser
	        print("Open this link in a browser..:")
	        print("%s?oauth_token=%s" % (AUTHORIZE_URL, request_token['oauth_token']))
	        print("")
	        print("Press ENTER when ready..")
	        raw_input()

	        # step 3: exchange the authorised request token for an access token.
	        # The key is 'oauth_token' (the same one printed above); the former
	        # 'oauth2_token' lookup always raised KeyError.
	        # NOTE(review): no oauth_verifier is sent here -- confirm the
	        # provider accepts the exchange without one.
	        token = oauth2.Token(request_token['oauth_token'],
	                             request_token['oauth_token_secret'])
	        client = oauth2.Client(consumer, token)

	        response, content = client.request(ACCESS_TOKEN_URL, 'GET')
	        access_token = dict(urlparse.parse_qsl(content))

	        # read both values before touching the disk so that an incomplete
	        # response cannot leave a half-written cache behind
	        oauth_token = access_token['oauth_token']
	        oauth_token_secret = access_token['oauth_token_secret']

	        # record the token and its secret for later runs
	        with open('access_token', 'w') as token_file:
	            token_file.write(oauth_token)
	        with open('access_token_secret', 'w') as secret_file:
	            secret_file.write(oauth_token_secret)
	    else:
	        # read in the token and secret from local disk; strip() guards
	        # against a trailing newline added by a manual edit
	        with open('access_token', 'r') as token_file:
	            oauth_token = token_file.read().strip()
	        with open('access_token_secret', 'r') as secret_file:
	            oauth_token_secret = secret_file.read().strip()

	    token = oauth2.Token(oauth_token, oauth_token_secret)
	    return oauth2.Client(consumer, token)

	def retrieve_data(url, limit):
	    """Download the atom data of feed *url* from the Google Reader service.

	    url   -- address of the provider's feed, inserted into the Reader URL
	    limit -- value sent as the 'n' query parameter

	    Returns the second element (the content) of the pair produced by the
	    OAuth client's request() call.
	    """
	    # an access token is requested, or picked up from local disk
	    oauth_client = create_oauth_client()

	    # unlimited access to a provider's historical feeds, courtesy of google
	    reader_url = "http://www.google.com/reader/atom/feed/%s?n=%s" % (url, limit)
	    _response, content = oauth_client.request(reader_url, 'GET')
	    return content

	def main():
	    """Run the whole pipeline: download, parse, then store the feeds."""
	    raw_xml = retrieve_data('http://secret/feed', 100)
	    store_feeds(parse_feeds(raw_xml, 100))

	# start the pipeline only when executed as a script, not on import
	if __name__ == '__main__':
	    main()
No results found