Created
March 3, 2012 16:39
-
-
Save chengdujin/1966910 to your computer and use it in GitHub Desktop.
using google reader's api (google oauth2), fetching a specific number of feeds. the number is theoretically unlimited
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: utf-8 -*- | |
| ## | |
| # this script serves to fetch all of a provider's feeds (a limit | |
| # for the historical feeds would be 5000). and then the feeds are | |
| # stored in mongodb | |
| # | |
| # @author Yuan JIN | |
# @contact chengdujin@gmail.com
| # @since 2012.03.03 | |
| # @latest 2012.03.03 | |
| ''' | |
| Copyright (c) 2012-2013, Yuan JIN | |
| All rights reserved. | |
| Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |
| * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |
| * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |
| * Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| ''' | |
# reload the script encoding
| import sys | |
| reload(sys) | |
| sys.setdefaultencoding('UTF-8') | |
| # Google OAuth | |
| SCOPE = "http://www.google.com/reader/api http://www.google.com/reader/atom" | |
| REQUEST_OAUTH_TOKEN_URL = "https://www.google.com/accounts/OAuthGetRequestToken?scope=%s" % SCOPE | |
| AUTHORIZE_URL = "https://www.google.com/accounts/OAuthAuthorizeToken" | |
| ACCESS_TOKEN_URL = "https://www.google.com/accounts/OAuthGetAccessToken" | |
| CLIENT_ID = "secret.apps.googleusercontent.com" | |
| CLIENT_SECRET = "secret" | |
| def store_feeds(feeds): | |
| 'store the feeds in mongodb ' | |
| from pymongo import Connection | |
| con = Connection('localhost', 27017) | |
| db = con.songshuhui_articles | |
| for feed in feeds: | |
| cursor = db.songshuhui_articles.find({'title':'%s' % feed['title']}) | |
| if cursor.count() > 0: | |
| continue | |
| else: | |
| print feed | |
| db.songshuhui_articles.insert(feed) | |
def parse_feeds(data, limit):
    '''Parse the raw google reader atom response into a list of feed dicts.

    data  -- raw XML string returned by retrieve_data()
    limit -- number of entries requested; drives how many positional
             children of the document root are walked
    Returns a list of dicts with keys: title, published, category,
    source, author.
    '''
    feeds = []
    from BeautifulSoup import BeautifulStoneSoup
    root = BeautifulStoneSoup(data)
    # every feed item starts from 7 + 2*step
    # NOTE(review): the offsets below (first entry at child index 7,
    # entries spaced 2 apart, the link element at item + 1) are tied to
    # the exact node layout BeautifulStoneSoup produces for this feed --
    # presumably whitespace text nodes interleave the elements; verify
    # against a sample response before changing anything here.
    end = int(limit) * 2 - 1
    for item in xrange(7, 7 + end, 2):
        feed = {}
        xml_feeds = root.contents[1].contents[item].contents
        for xml_feed in xml_feeds:
            if xml_feed.name == 'entry':
                info = xml_feed.contents
                category = []
                # `id` here is just the enumeration index (shadows the builtin)
                for id, entry in enumerate(info):
                    if entry.name == 'category':
                        # skip google's internal 'user/...' state labels,
                        # keep the real category terms
                        if not entry['term'][:5] == 'user/':
                            category.append(entry['term'])
                    # the last child of the entry carries title/published
                    # in its first two sub-elements, by position
                    if (id + 1) == len(info):
                        feed['title'] = entry.contents[0].string
                        feed['published'] = entry.contents[1].string
                feed['category'] = category
        # the element right after the entry holds the source link and author
        xml_feeds = root.contents[1].contents[item + 1]
        feed['source'] = xml_feeds['href']
        feed['author'] = xml_feeds.contents[1].contents[0].string
        feeds.append(feed)
    return feeds
| def create_oauth_client(): | |
| 'authorize with google reader - get the access token' | |
| import oauth2 | |
| consumer = oauth2.Consumer(CLIENT_ID, CLIENT_SECRET) | |
| import os.path | |
| if not (os.path.exists('access_token') and os.path.exists('access_token_secret')): | |
| client = oauth2.Client(consumer) | |
| # request oauth token | |
| response, content = client.request(REQUEST_OAUTH_TOKEN_URL, 'GET') | |
| import urlparse | |
| request_token = dict(urlparse.parse_qsl(content)) | |
| # authorization | |
| print "Open this link in a browser..:" | |
| print "%s?oauth_token=%s" % (AUTHORIZE_URL, request_token['oauth_token']) | |
| print "Press ENTER when ready.." | |
| raw_input() | |
| # get access token | |
| token = oauth2.Token(request_token['oauth2_token'], request_token['oauth_token_secret']) | |
| client = oauth2.Client(consumer, token) | |
| response, content = client.request(ACCESS_TOKEN_URL, 'GET') | |
| access_token = dict(urlparse.parse_qsl(content)) | |
| # record the token | |
| f = open ('access_token', 'w') | |
| f.write (access_token['oauth_token']) | |
| f.close () | |
| oauth_token = access_token['oauth_token'] | |
| f = open ('access_token_secret', 'w') | |
| f.write (access_token['oauth_token_secret']) | |
| f.close () | |
| oauth_token_secret = access_token['oauth_token_secret'] | |
| else: | |
| # read in the token and secret from local disk | |
| f = open('access_token', 'r') | |
| oauth_token = f.read() | |
| f.close() | |
| f = open ('access_token_secret', 'r') | |
| oauth_token_secret = f.read() | |
| f.close() | |
| token = oauth2.Token(oauth_token, oauth_token_secret) | |
| client = oauth2.Client(consumer, token) | |
| return client | |
def retrieve_data(url, limit):
    '''Download up to `limit` historical entries of the given feed.

    url   -- the provider's feed url (embedded into reader's atom endpoint)
    limit -- how many entries to request via the ?n= parameter
    Returns the raw response body from google reader.
    '''
    # get an authorized client (cached token, or interactive OAuth dance)
    client = create_oauth_client()
    # google reader's atom endpoint takes an arbitrary entry count via ?n=
    endpoint = "http://www.google.com/reader/atom/feed/%s?n=%s" % (url, limit)
    response, body = client.request(endpoint, 'GET')
    return body
def main():
    '''Entry point: fetch, parse and persist the provider's feed entries.'''
    count = 100
    raw = retrieve_data('http://secret/feed', count)
    store_feeds(parse_feeds(raw, count))
# run only when executed as a script, not on import
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment