Skip to content

Instantly share code, notes, and snippets.

@chengdujin
Created March 3, 2012 16:39
Show Gist options
  • Select an option

  • Save chengdujin/1966910 to your computer and use it in GitHub Desktop.

Select an option

Save chengdujin/1966910 to your computer and use it in GitHub Desktop.
using google reader's api (google oauth2), fetching a specific number of feeds. the number is theoretically unlimited
#!/usr/bin/python
# -*- coding: utf-8 -*-
##
# this script serves to fetch all of a provider's feeds (a limit
# for the historical feeds would be 5000). and then the feeds are
# stored in mongodb
#
# @author Yuan JIN
# @contact chengudjin@gmail.com
# @since 2012.03.03
# @latest 2012.03.03
'''
Copyright (c) 2012-2013, Yuan JIN
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
# reload sys in order to set the script's default encoding
import sys
reload(sys)
sys.setdefaultencoding('UTF-8')
# Google OAuth endpoints and credentials used by create_oauth_client().
# Space-separated scopes; interpolated into the request-token URL below.
SCOPE = "http://www.google.com/reader/api http://www.google.com/reader/atom"
# Request-token endpoint, carrying SCOPE as its `scope` query parameter.
REQUEST_OAUTH_TOKEN_URL = "https://www.google.com/accounts/OAuthGetRequestToken?scope=%s" % SCOPE
# Page printed for the user to open in a browser (with ?oauth_token=... appended).
AUTHORIZE_URL = "https://www.google.com/accounts/OAuthAuthorizeToken"
# Endpoint requested to obtain the access token and its secret.
ACCESS_TOKEN_URL = "https://www.google.com/accounts/OAuthGetAccessToken"
# Consumer key and secret handed to oauth2.Consumer.
# NOTE(review): both values look like placeholders - presumably they must be
# replaced with real credentials before running; confirm.
CLIENT_ID = "secret.apps.googleusercontent.com"
CLIENT_SECRET = "secret"
def store_feeds(feeds):
    """Insert feeds into MongoDB, skipping titles that are already stored.

    Every feed is looked up by its ``title`` in the ``songshuhui_articles``
    collection of the ``songshuhui_articles`` database on localhost:27017.
    Feeds whose title is not stored yet are printed and inserted.

    Args:
        feeds: sequence of dicts, each carrying at least a ``'title'`` key.
            An empty (or None) value is a no-op and opens no connection.

    Raises:
        KeyError: if a feed has no ``'title'`` key.
    """
    if not feeds:
        # nothing to store, so do not connect to the database at all
        return
    from pymongo import Connection
    con = Connection('localhost', 27017)
    try:
        articles = con.songshuhui_articles.songshuhui_articles
        for feed in feeds:
            # Match on the raw title value. Formatting it through '%s' would
            # turn a None title into the string 'None', which never matches
            # the stored document and lets duplicates through.
            if articles.find_one({'title': feed['title']}) is not None:
                continue
            print(feed)
            articles.insert(feed)
    finally:
        # release the socket even when a lookup or insert fails
        con.disconnect()
def parse_feeds(data, limit):
    """Pull title, date, categories, source and author out of feed XML.

    The markup is parsed with BeautifulStoneSoup and the children of the
    second top-level node are visited at positions 7, 9, 11, ... (one per
    requested item). Inside each visited node every ``entry`` element is
    scanned: ``category`` terms that do not start with ``user/`` are
    collected, and the last child of the entry supplies the title and the
    published date. The node right after the visited one supplies ``href``
    (stored as the source) and the author.

    NOTE(review): the fixed positions presumably mirror how BeautifulStoneSoup
    nests the Google Reader Atom output - confirm against a real response.

    Args:
        data: XML text of the feed.
        limit: number of items to read; converted with ``int``.

    Returns:
        list of dicts with the keys ``title``, ``published``, ``category``
        (set when an ``entry`` is found), ``source`` and ``author``.
    """
    parsed = []
    from BeautifulSoup import BeautifulStoneSoup
    document = BeautifulStoneSoup(data)
    count = int(limit)
    for step in xrange(count):
        # every feed item starts from 7 + 2*step
        item = 7 + 2 * step
        siblings = document.contents[1].contents
        record = {}
        for node in siblings[item].contents:
            if node.name != 'entry':
                continue
            children = node.contents
            last = len(children) - 1
            terms = []
            for position, child in enumerate(children):
                if child.name == 'category':
                    term = child['term']
                    # skip the per-user terms
                    if not term.startswith('user/'):
                        terms.append(term)
                if position == last:
                    # the final child carries the title and the date
                    record['title'] = child.contents[0].string
                    record['published'] = child.contents[1].string
            record['category'] = terms
        neighbour = siblings[item + 1]
        record['source'] = neighbour['href']
        record['author'] = neighbour.contents[1].contents[0].string
        parsed.append(record)
    return parsed
def _request_access_token(consumer):
    """Run the interactive OAuth flow and return (token, token_secret)."""
    import oauth2
    import urlparse

    # request oauth token
    client = oauth2.Client(consumer)
    response, content = client.request(REQUEST_OAUTH_TOKEN_URL, 'GET')
    request_token = dict(urlparse.parse_qsl(content))

    # authorization: the user has to approve the request token in a browser
    print("Open this link in a browser..:")
    print("%s?oauth_token=%s" % (AUTHORIZE_URL, request_token['oauth_token']))
    print("")
    print("Press ENTER when ready..")
    raw_input()

    # get access token; the request token is stored under 'oauth_token'
    # (the same key used for the authorization link above)
    token = oauth2.Token(request_token['oauth_token'],
                         request_token['oauth_token_secret'])
    client = oauth2.Client(consumer, token)
    response, content = client.request(ACCESS_TOKEN_URL, 'GET')
    access_token = dict(urlparse.parse_qsl(content))
    return access_token['oauth_token'], access_token['oauth_token_secret']


def create_oauth_client():
    """Return an ``oauth2.Client`` signed with a Google Reader access token.

    The access token and its secret are cached in the files ``access_token``
    and ``access_token_secret`` in the current working directory. When either
    file is missing, the interactive flow is run (request token, manual
    authorization in a browser, access token) and the result is written to
    both files; otherwise the cached pair is read back from disk.

    Returns:
        oauth2.Client built from CLIENT_ID / CLIENT_SECRET and the token.

    Raises:
        KeyError: if a token response lacks the expected ``oauth_token`` or
            ``oauth_token_secret`` field.
    """
    import os.path
    import oauth2

    consumer = oauth2.Consumer(CLIENT_ID, CLIENT_SECRET)
    if os.path.exists('access_token') and os.path.exists('access_token_secret'):
        # read in the token and secret from local disk; strip stray
        # whitespace so a hand-edited file does not break request signing
        with open('access_token', 'r') as f:
            oauth_token = f.read().strip()
        with open('access_token_secret', 'r') as f:
            oauth_token_secret = f.read().strip()
    else:
        oauth_token, oauth_token_secret = _request_access_token(consumer)
        # record the token
        with open('access_token', 'w') as f:
            f.write(oauth_token)
        with open('access_token_secret', 'w') as f:
            f.write(oauth_token_secret)

    token = oauth2.Token(oauth_token, oauth_token_secret)
    return oauth2.Client(consumer, token)
def retrieve_data(url, limit):
    """Fetch up to ``limit`` items of a feed from the Google Reader Atom API.

    Args:
        url: address of the feed to read.
        limit: number of items requested through the ``n`` query parameter.

    Returns:
        The response body returned by the OAuth client's ``request`` call.
    """
    import urllib

    # request access token or find it locally
    client = create_oauth_client()
    # Percent-encode the feed address: a raw '?', '&' or '#' inside it would
    # otherwise be read as part of this request's own query string.
    if isinstance(url, unicode):
        # urllib.quote expects a byte string
        url = url.encode('utf-8')
    # unlimited access to a provider's historical feeds
    # courtesy of google
    request_url = "http://www.google.com/reader/atom/feed/%s?n=%s" % (
        urllib.quote(url), limit)
    response, feeds = client.request(request_url, 'GET')
    return feeds
def main(url='http://secret/feed', limit=100):
    """Retrieve a feed, parse it and store the parsed items.

    Args:
        url: address of the feed to fetch.
        limit: number of items to fetch; the same value is given to the
            parser so both steps always agree on the item count.
    """
    data = retrieve_data(url, limit)
    feeds = parse_feeds(data, limit)
    store_feeds(feeds)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment