Created
October 10, 2012 18:31
-
-
Save marcelcaraciolo/3867521 to your computer and use it in GitHub Desktop.
Revisions
-
marcelcaraciolo created this gist
Oct 10, 2012 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,156 @@ #-*-coding: utf-8 -*- ''' This module represents the recommender system for recommending new friends based on 'mutual friends'. ''' __author__ = 'Marcel Caraciolo <caraciol@gmail.com>' from mrjob.job import MRJob def combinations(iterable, r): """ Implementation of itertools combinations method. Re-implemented here because of import issues in Amazon Elastic MapReduce. Was just easier to do this than bootstrap. More info here: http://docs.python.org/library/itertools.html#itertools.combinations Input/Output: combinations('ABCD', 2) --> AB AC AD BC BD CD combinations(range(4), 3) --> 012 013 023 123 """ pool = tuple(iterable) n = len(pool) if r > n: return indices = range(r) yield tuple(pool[i] for i in indices) while True: for i in reversed(range(r)): if indices[i] != i + n - r: break else: return indices[i] += 1 for j in range(i + 1, r): indices[j] = indices[j - 1] + 1 yield tuple(pool[i] for i in indices) TOP_N = 10 class FriendsRecommender(MRJob): def steps(self): return [self.mr(self.map_input, self.reverse_index), self.mr(None, self.count_number_of_friends), self.mr(None, self.top_recommendations)] def map_input(self, key, line): ''' Compute the reversed index for each connection between the source. Input (source -> {“friend1”, “friend2”, “friend3”}): marcel,jonas,maria,jose,amanda Output {[friend1], source, 1, {“friend1”, “friend2”, “friend3”}}; {[friend2], source, 1, {“friend1”, “friend2”, “friend3”}}; ''' input = line.split(';') user_id, item_ids = input[0], input[1:] for i in range(len(item_ids)): f1 = item_ids[i] yield f1, (user_id, 1, item_ids) def reverse_index(self, key, values): ''' Compute the cartesian product between among all sources that shares common friends. Input (friend1 -> (source, 1, {“friend1”, “friend2”, “friend3”}}; marcel, jonas,maria,jose,amanda Output {(jonas, maria), 1}, {(maria, jose), 1}, {(jonas,amanda), 1}, {(maria, amanda),1}, {(jose, jonas), 1} ''' final_list = [] for value in values: final_list.append(value) for item1, item2 in combinations(final_list, 2): f1, count1, f1_friends = item1 f2, count2, f2_friends = item2 if f1 not in f2_friends: yield (f2, f1), (count1) if f2 not in f1_friends: yield (f1, f2), (count2) def count_number_of_friends(self, key, values): ''' Count the number of mutual friends. Input {(source, recommended), [count1, count2, ... count n]} {(jonas, maria), [1,1,1]}, {(jose, jonas), [1,1]}, {(maria, amanda), [1]}, {(maria,jose), [1]}, {(jonas,amanda), [1, 1, 1]} Output ({friend1, friend2} numberOfMutualFriends): (jonas, marcel ) 1 (fabiola, marcel) 1 (marcel, patricia) 1 (marcel, paula) 1 (carol, marcel) 2 ''' user_id, possible_friend = key yield (user_id), \ (possible_friend, sum(values)) def top_recommendations(self, key, values): ''' Get the TOP N recommendations for user. Input ({user_id, [(numberOfMutualFriends, friend), (numberOfMutualFriends2, friend)]}): [{marcel, [(patricia, 1), (paula, 1)]}, {carol, [(marcel, 2)]}, {fabiola, [(marcel,1)]}, {jonas, [(marcel,1)]}] Output ({user_id, [(numberOfMutualFriends, friend), (numberOfMutualFriends2, friend)]}): Ex: Get the top 3 suggestions. [{marcel, [(patricia, 1), (paula, 1)]}, {carol, [(marcel, 2)]}, {fabiola, [(marcel,1)]}, {jonas, [(marcel,1)]}] ''' recommendations = [] for idx, (item, score) in enumerate(values): recommendations.append((item, score)) yield key, sorted(recommendations, key=lambda k: -k[1])[:TOP_N] if __name__ == '__main__': FriendsRecommender.run()