(ns com.intangible-industries.cv
  "Word-frequency report for a plain-text CV.

  Loading this namespace prints, on one line, every non-stopword that occurs
  more than once in `cv.txt` (resolved against the current working
  directory), most frequent first."
  (:require [clojure.java.io :as io]
            [clojure.string :as str]))

;; Lower-case words that are never counted.
(def stopwords
  (set (str/split "the a an and or of for in on to with by this that these those some other it its we our as but not do does is be are can was were which so from" #"\s")))

;; One or more whitespace, punctuation or Unicode "other" (\p{C}: control,
;; format, unassigned...) characters separate two words.
(def ^{:private true} word-separators
  #"[\s\/\.\&\'\,\(\)\;\:\p{C}]+")

(defn extract-words
  "Reads the text file `filename` (anything `clojure.java.io/reader` accepts)
  and returns a seq of its words, lower-cased, in file order.

  Stopwords and tokens of length 0 or 1 are dropped; the latter also removes
  the empty strings that splitting produces for blank lines or lines that
  start with a separator.

  The result is fully realized before returning so that the underlying
  reader can be closed here rather than whenever a lazy seq happens to be
  exhausted. Throws java.io.FileNotFoundException if the file is missing."
  [filename]
  (letfn [(keep-word? [word]
            (and (> (count word) 1)
                 (not (contains? stopwords word))))]
    (with-open [rdr (io/reader filename)]
      (doall
       (->> (line-seq rdr)
            (mapcat #(str/split % word-separators))
            (map str/lower-case)
            (filter keep-word?))))))

(defn sorted-frequencies
  "Returns a seq of [word count] entries for `words`, ordered by descending
  count; words with equal counts are ordered ascending by `compare`."
  [words]
  ;; Negating the count makes the natural ascending vector comparison yield
  ;; \"highest count first, then word ascending\".
  (sort-by (fn [[word n]] [(- n) word])
           (frequencies words)))

(defn print-words
  "Prints, space-separated on a single line followed by a newline, the word
  of every [word count] pair in `freqs` whose count is greater than 1.
  Prints just the newline when no pair qualifies. Returns nil."
  [freqs]
  (apply println (for [[word n] freqs
                       :when (> n 1)]
                   word)))

;; Script entry point, run when the namespace is loaded. The existence check
;; keeps the namespace loadable (REPL, tests, other callers) from directories
;; that have no cv.txt; the problem is reported on stderr instead of aborting
;; the load with an exception.
(if (.exists (io/file "cv.txt"))
  (print-words (sorted-frequencies (extract-words "cv.txt")))
  (binding [*out* *err*]
    (println "cv.txt not found in the working directory; no word report printed.")))