"""Filter article vectors read from stdin down to those near tagged articles.

Each input line has the form::

    <ignored> <article_id> <index> <weight> <index> <weight> ...

A line from stdin is echoed to stdout unchanged when its article id appears
in the tagged-articles file, or when ``Glom.point_sim`` between its vector
and at least one tagged vector is >= the requested radius.
"""
import sys

import glom

try:  # Python 2
    from itertools import izip
except ImportError:  # Python 3: the builtin zip is already lazy
    izip = zip


def taketwo(arr):
    """Return an iterator over consecutive, non-overlapping pairs of *arr*.

    ``[a, b, c, d]`` yields ``(a, b)`` then ``(c, d)``.  A trailing item
    without a partner is silently dropped.
    """
    # Both arguments are the *same* iterator, so each pair consumes two items.
    it = iter(arr)
    return izip(it, it)


def _parse_line(line):
    """Split one vector line into ``(article_id, vector)``.

    The first space-separated field is discarded, the second is the article
    id (kept as a string), and the remainder is whitespace-separated
    alternating integer indices and float weights, returned as a list of
    ``(int, float)`` tuples.

    Raises:
        ValueError: if the line has fewer than three space-separated fields
            or an index/weight is not numeric.
    """
    _, article_id, text = line.split(' ', 2)
    vector = [(int(i), float(w)) for i, w in taketwo(text.split())]
    return article_id, vector


def preload(ta_path):
    """Load the tagged-articles file at *ta_path*.

    Returns:
        dict mapping article id (str) to its vector, a list of
        ``(int, float)`` tuples.  If an id occurs more than once the last
        occurrence wins.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a line is malformed (see ``_parse_line``).
    """
    test_articles_set = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open(ta_path) as ta:
        for line in ta:
            said, sv = _parse_line(line)
            test_articles_set[said] = sv
    return test_articles_set


def trim(ta_path, radius):
    """Echo to stdout every stdin line that is "near" a tagged article.

    A line is kept when its article id is one of the tagged ids, or when
    ``g.point_sim(tagged_vector, line_vector) >= radius`` holds for at least
    one tagged vector.  Kept lines are written byte-for-byte as read.

    Args:
        ta_path: path of the tagged-articles file (same line format as stdin).
        radius: similarity threshold; also passed to ``glom.Glom``.
    """
    # NOTE(review): the meaning of the second Glom() argument (50000) is not
    # visible from this file -- confirm against the glom module.
    g = glom.Glom(radius, 50000)
    test_articles_set = preload(ta_path)
    # Materialise once; the tagged set does not change while streaming stdin.
    tagged_vectors = list(test_articles_set.values())

    # readline() rather than iterating sys.stdin keeps line-at-a-time
    # streaming when stdin is a pipe.
    for sl in iter(sys.stdin.readline, ''):
        said, sv = _parse_line(sl)
        # The id lookup is O(1) and short-circuits the similarity scan.
        # NOTE(review): assumes point_sim has no side effects -- confirm.
        if said in test_articles_set or any(
                g.point_sim(test_points, sv) >= radius
                for test_points in tagged_vectors):
            sys.stdout.write(sl)


def main(argv):
    """Command-line entry point: ``argv[1]`` is the tagged file, ``argv[2]`` the radius."""
    if len(argv) < 3:
        sys.stderr.write(
            "usage: %s tagged_vectors radius < full_vectors > trimmed_vectors\n"
            % argv[0])
        sys.stderr.write(
            "example: %s vectors_time_sorted_tagged.txt 0.22"
            " < vectors_time_sorted.txt > vectors_trimmed.txt\n" % argv[0])
        sys.exit(1)
    trim(argv[1], float(argv[2]))


if __name__ == "__main__":
    main(sys.argv)