# ABOUT:
# A script that grabs a list of the friends or followers of a user on Google+,
# grabs a sample of their friends, and generates the resulting social graph
# USAGE:
# Requirements: networkx (see DEPENDENCIES)
# Configuration: see the USER SETTINGS (top) and FILTER SETTINGS (bottom) sections below
# Output: files will be saved to the reports directory
# To run the script:
# 1) Download this file to a new directory somewhere as e.g. gplusESPnet.py
# 2) cd to the directory
# 3) *The first time*, create two new subdirectories (reports and cache); for example,
#    run the following from the command line: mkdir reports; mkdir cache
#    (the script will also create them for you if they don't exist - see checkDir below)
# 4) Call the script by running the following from the command line:
#    python gplusESPnet.py
# DEPENDENCIES
# The script makes use of the networkx library; you should only need to install it once.
# To install networkx, from the command line type: easy_install networkx
# If that doesn't work, follow the instructions on http://networkx.lanl.gov/install.html
# In short: a) download and unzip http://networkx.lanl.gov/download/networkx/networkx-1.5.zip
# b) cd to the networkx-1.5 directory, c) type: python setup.py install
# END DEPENDENCIES

import networkx as nx

#--- the following should already be available (Python 2 standard library)
import urllib2,re
import md5,urllib,os,sys,tempfile,time
import random
import datetime
import StringIO

#USER SETTINGS
#rootID is the Google+ ID of the person whose ESP net you want to map
rootID='100095426689697101649'
#You also need to provide the name of this user
name='Tony Hirst'
#----

# Do some checks... create a directory if it doesn't already exist
def checkDir(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

checkDir('reports')
checkDir('cache')
#---

oidRootNamePairs={rootID:name}

defCache=360000  #cache lifetime, in seconds
typ='fo'   #'fo' = followers of the root user, 'fr' = friends (people they have circled)
typ2='fr'  #relationship to sample at the second level

DG=nx.DiGraph()

#Regular expressions for picking user IDs and names out of the socialgraph response
reobj = re.compile(r'.*([0-9]{21}).*')
reobj2 = re.compile(r',\["([^"]*)".*')
reobj3=re.compile(r'.*[0-9]{21}"\]\n,\[\]\n,\["[^"]*')
#oids = reobj3.findall(data)
#for oid in oids:
#,[[,,"112696985248193005986"]\n,[]\n,["Dawn Wicks-Sutton
reobj4=re.compile(r',\[+,,"([0-9]{21})"]\n,\[\]\n,\["(.*)$')
#ascii(reobj4.match(oid).group(2)) is name, tho check it's not ''; if so use 'U N Owen'; reobj4.match(oid).group(1) is ID

#Strip non-ASCII characters out of a string
def ascii(s):
    return "".join(i for i in s if ord(i)<128)

#Parse a single user record out of the response stream;
#returns a flag of -1 when no more records can be found
def getoidName(i,currIDs,oidNames):
    l=i.next()
    #print l
    oid = reobj.match(l)
    if oid is None:
        print 'at the end???'
        return i,currIDs,oidNames,-1
    else:
        oid=oid.group(1)
    #if we don't get an ID, then return oidNames, i, -1
    if oid not in currIDs:
        #print 'toploop'
        i.next()
        n=i.next()
        n=ascii(reobj2.match(n).group(1))
        if oid not in oidNames:
            oidNames[oid]=n
        currIDs.append(oid)
        #print oid,n
        next=''
        while next!=',[]\n':
            next=i.next()
            #print '...'+next+',,,,'
        next=''
        while next!=']\n':
            next=i.next()
    else:
        print 'bottomloop'
        next=''
        while next!=']\n':
            next=i.next()
    return i,currIDs,oidNames,1

#Fetch the friends ('fr') or followers ('fo') list for a given user ID
def getoidNames(oidNames,oid='',typ='fr'):
    #oidNames = {}
    if oid=='':
        return oidNames,[]
    currIDs=[]
    #???I suspect this only does one page of up to 1000(?) users? Need to check?
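    # A sketch of what the line-by-line parsing in getoidName expects
    # (reconstructed from the reobj4 example comment above; this is an
    # unofficial, reverse-engineered response format, so treat the structure
    # as an assumption that may change without notice): a single user record
    # appears to span three lines, roughly
    #   ,[[,,"112696985248193005986"]
    #   ,[]
    #   ,["Dawn Wicks-Sutton" ...
    # reobj picks the 21-digit ID off the first line and reobj2 the display
    # name off the third; the two scan loops in getoidName then skip ahead
    # to the end of the record.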
    if typ=='fr':
        url='https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&rt=j'
    elif typ=='fo':
        url='https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&n=1000&rt=j'
    else:
        sys.exit(-1)
    print url
    #data = urllib.urlopen(url).read()
    data=getGenericCachedData(url)
    i=StringIO.StringIO(data)
    #skip the three header lines of the response
    i.next()
    i.next()
    i.next()
    #if flag returns <0, we're done
    flag=1
    while flag>0:
        i,currIDs,oidNames,flag=getoidName(i,currIDs,oidNames)
    #print currIDs,oidNames
    return oidNames,currIDs

#friends
#https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&rt=j
#followers
#https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&n=1000&rt=j

#----------------------------------------------------------------

#Yield successive n-sized chunks from l (currently unused)
def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

def report(m, verbose=False):
    if verbose is True:
        print m

#A simple disk cache: fetch() returns the path of a cached copy of a URL's content
class DiskCacheFetcherfname:
    def __init__(self, cache_dir=None):
        # If no cache directory specified, use system temp directory
        if cache_dir is None:
            cache_dir = tempfile.gettempdir()
        self.cache_dir = cache_dir
    def fetch(self, url, max_age=0):
        # Use MD5 hash of the URL as the filename
        filename = md5.new(url).hexdigest()
        filepath = os.path.join(self.cache_dir, filename)
        if os.path.exists(filepath):
            if int(time.time()) - os.path.getmtime(filepath) < max_age:
                #return open(filepath).read()
                report("using "+filename+", cached copy of fetched url: "+url)
                return filepath
        report("fetching fresh copy of fetched url: "+url)
        # Retrieve over HTTP and cache, using rename to avoid collisions
        data = urllib.urlopen(url).read()
        fd, temppath = tempfile.mkstemp()
        fp = os.fdopen(fd, 'w')
        fp.write(data)
        fp.close()
        os.rename(temppath, filepath)
        return filepath

def getGenericCachedData(url, cachetime=defCache):
    fetcher=DiskCacheFetcherfname('cache')
    fn=fetcher.fetch(url, cachetime)
    f=open(fn)
    data=f.read()
    f.close()
    return data

#Add an edge from fromNode to each member of toSet (or the reverse, if flip=True)
def addDirectedEdges(DG,fromNode,toSet,flip=False):
    for toNode in toSet:
        if flip==True:
            DG.add_edge(toNode,fromNode)
        else:
            DG.add_edge(fromNode,toNode)
    #print nx.info(DG)
    return DG

#Attach a 'label' attribute (the user's display name) to each node
def labelNodes(G,names):
    for nodeID in G.node:
        G.node[nodeID]['label']=names[nodeID]
    return G

#--- Main run: fetch the root user's list, then sample each of those users in turn
oidNamePairs={}
for id in oidRootNamePairs:
    oidNamePairs,currIDs=getoidNames(oidNamePairs,id,typ)
    print currIDs
    #flip the edge direction when we grabbed a friends list rather than a followers list
    flip=(typ=='fr')
    DG=addDirectedEdges(DG, id, currIDs,flip=flip)
    n=len(currIDs)
    print str(n)
    c=1
    for cid in currIDs:
        print '\tSub-level run: getting ',typ2,str(c),'of',str(n),typ,cid
        oidNamePairs,ccurrIDs=getoidNames(oidNamePairs,cid,typ2)
        DG=addDirectedEdges(DG, cid, ccurrIDs)
        c=c+1

#Make sure the root user(s) get labelled even if they didn't appear in any fetched list
for id in oidRootNamePairs:
    if id not in oidNamePairs:
        oidNamePairs[id]=oidRootNamePairs[id]

DG=labelNodes(DG,oidNamePairs)
print nx.info(DG)

#Write out the full graph, timestamped, in GraphML and edgelist formats
now = datetime.datetime.now()
ts = now.strftime("_%Y-%m-%d-%H-%M-%S")
fname=name.replace(' ','_')
nx.write_graphml(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".graphml"]))
nx.write_edgelist(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".txt"]),data=False)

#Write out a reduced version of the graph, filtered by node degree
def filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax):
    #need to tweak this to allow filtering by in and out degree?
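    # How the thresholds below combine, as a worked example (using the default
    # settings set further down, indegree=20, outdegree=25, mindegree=None;
    # the numbers are illustrative only): a node followed by 30 others
    # (in-degree 30) is kept even if it follows nobody, and a node following
    # 40 others (out-degree 40) is kept even if nobody follows it back -
    # i.e. a node survives if it passes *either* threshold. Setting mindegree
    # switches to filtering on total degree alone, and outdegreemax always
    # applies as a hard cap.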
    if addUserFriendships==1:
        #NB addFocus is not defined in this script, so leave addUserFriendships=0
        DG=addFocus(DG,user,typ)
    #handle min,in,out degree
    filter=[]
    #filter=[n for n in DG if DG.degree(n)>=mindegree]
    for n in DG:
        if outdegreemax==None or DG.out_degree(n)<=outdegreemax:
            if mindegree!=None:
                if DG.degree(n)>=mindegree:
                    filter.append(n)
            else:
                if indegree!=None:
                    if DG.in_degree(n)>=indegree:
                        filter.append(n)
                if outdegree!=None:
                    if DG.out_degree(n)>=outdegree:
                        filter.append(n)
    #the filter holds the union of the nodes passing the in-degree and out-degree tests
    #indegree and outdegree values are ignored if mindegree is set
    #(indegreemax is accepted as a parameter but not currently used)
    filter=set(filter)
    H=DG.subgraph(filter)
    #Superstitiously, perhaps, make sure we only grab nodes that project edges...
    filter= [n for n in H if H.degree(n)>0]
    L=H.subgraph(filter)
    #print "Filter set:",filter
    print L.order(),L.size()
    #L=labelGraph(L,filter)
    #encode the filter settings into the output filename, using 'X' for unset values
    if mindegree==None: tm='X'
    else: tm=str(mindegree)
    if indegree==None: ti='X'
    else: ti=str(indegree)
    if outdegree==None: to='X'
    else: to=str(outdegree)
    if outdegreemax==None: tom='X'
    else: tom=str(outdegreemax)
    st='/'.join([projname,name+'_google'+typ+typ2+'degree_'+tm+'_'+ti+'_'+to+'_'+tom+"_esp"])
    print nx.info(L)
    nx.write_graphml(L, st+".graphml")
    nx.write_edgelist(L, st+".txt",data=False)

#FILTER SETTINGS: tweak these to control which nodes make it into the reduced graph
mindegree=None
indegree=20
outdegree=25
outdegreemax=None
addUserFriendships=0
user=''
indegreemax=None
projname='reports/'

filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax)
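
#--- A minimal sketch of downstream use, kept commented out so the script's
#    behaviour is unchanged: reload one of the generated GraphML files and
#    rank nodes by in-degree. The filename shown is hypothetical - check the
#    reports directory for the actual timestamped name from your run.
#
#import networkx as nx
#G = nx.read_graphml('reports/Tony_Hirst_googlefoFriends__2011-08-01-12-00-00.graphml')  #hypothetical filename
#print nx.info(G)
#for nid, deg in sorted(G.in_degree().items(), key=lambda x: x[1], reverse=True)[:10]:
#    print G.node[nid].get('label', nid), deg
#
#filterNet can also simply be re-run with different thresholds, without
#re-fetching any data, e.g. (illustrative values only):
#filterNet(DG,None,50,50,None,typ,typ2,addUserFriendships,user,indegreemax)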