Skip to content

Instantly share code, notes, and snippets.

@mehitabel
Created August 16, 2013 23:39
Show Gist options
  • Select an option

  • Save mehitabel/6254431 to your computer and use it in GitHub Desktop.

Select an option

Save mehitabel/6254431 to your computer and use it in GitHub Desktop.
very quick sketch of count-min-sketch
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Count-Min-Sketch"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import pyhash\n\nclass CountMinSketch(object):\n def __init__(self, width=2**20, num_hashes=16):\n self.arrays = np.zeros((num_hashes, width))\n self.num_hashes = num_hashes\n self.width = width\n self.hasher = pyhash.murmur3_32()\n \n def _hash(self, x, k):\n return self.hasher(str(self.hasher(x) * k + 17)) % self.width\n \n def count(self, x):\n return min(self.arrays[k, self._hash(x,k)] for k in xrange(self.num_hashes))\n \n def inc(self, x):\n count = self.count(x)\n for k in xrange(self.num_hashes):\n multi_index = k, self._hash(x,k)\n if self.arrays[multi_index] == count:\n self.arrays[multi_index] += 1\n ",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": "from collections import Counter\n\ncms = CountMinSketch()\nc = Counter()",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": "import requests\n\nwords = requests.get(\"http://en.wikipedia.org/wiki/Nicolas_Roeg\").text.split()",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": "for word in words:\n cms.inc(word)\n c[word] += 1\n",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": "max([cms.count(word) / c[word] for word in words])",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 59,
"text": "1.0"
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment