Skip to content

Instantly share code, notes, and snippets.

@PratikDeoghare
Last active June 18, 2023 07:36
Show Gist options
  • Select an option

  • Save PratikDeoghare/fb2c015692b7e017b19510d24b33f973 to your computer and use it in GitHub Desktop.

Select an option

Save PratikDeoghare/fb2c015692b7e017b19510d24b33f973 to your computer and use it in GitHub Desktop.
Approximate Word Counter
package main
import (
"flag"
"fmt"
"io"
"math/rand"
"os"
"regexp"
"sort"
)
// re matches one "word": a maximal run of word characters (\w).
var re = regexp.MustCompile(`\w+`)

// count reports how many non-overlapping word matches occur in s.
// A nil or empty slice contains no words and yields 0.
func count(s []byte) int {
	// Only the number of matches matters, so the matched text itself is
	// discarded immediately.
	return len(re.FindAll(s, -1))
}
// main parses the -f flag, estimates the word count of the named file by
// averaging 100 sampled estimates, and prints the result to stdout.
//
// A missing -f flag prints usage and exits with status 2; any error from the
// estimation is written to stderr and the program exits with status 1.
func main() {
	filename := flag.String("f", "", "-f filename")
	flag.Parse()

	// Without a filename there is nothing to sample; fail with usage help
	// rather than a confusing stat error on the empty path.
	if *filename == "" {
		fmt.Fprintln(os.Stderr, "missing required flag: -f filename")
		flag.Usage()
		os.Exit(2)
	}

	wc, err := avgWordCount(*filename, 100)
	if err != nil {
		// An unreadable or missing file is a user error, not a programmer
		// bug, so report it plainly instead of panicking with a stack trace.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(wc)
}
// avgWordCount calls countWords on filename nSample times and returns the
// integer mean (truncated) of the individual estimates.
//
// nSample must be positive; otherwise an error is returned instead of
// dividing by zero. The first error reported by countWords aborts the run and
// is returned as-is with a zero count.
func avgWordCount(filename string, nSample int) (int, error) {
	if nSample <= 0 {
		return 0, fmt.Errorf("nSample must be positive, got %d", nSample)
	}

	total := 0
	for i := 0; i < nSample; i++ {
		wc, err := countWords(filename)
		if err != nil {
			return 0, err
		}
		total += wc
	}
	return total / nSample, nil
}
// countWords estimates the number of words in the named file by sampling it
// rather than reading it in full.
//
// It reads nIter windows of up to bufSize bytes at random offsets, counts the
// words in the bytes actually read, and scales that word density up to the
// file size. The result is an approximation: a word cut by a window edge is
// still counted as a word, and different calls may return different values.
//
// An empty file yields 0. Errors from os.Stat, os.Open, or ReadAt (other than
// io.EOF on a short read) are returned with a zero count.
func countWords(filename string) (int, error) {
	const (
		bufSize = 128 // bytes per sampled window
		nIter   = 10  // number of windows sampled per estimate
	)

	fileInfo, err := os.Stat(filename)
	if err != nil {
		return 0, err
	}
	size := fileInfo.Size()
	if size <= 0 {
		// rand.Int63n panics for n <= 0, and an empty file has no words.
		return 0, nil
	}

	f, err := os.Open(filename)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	offsets := make([]int64, 0, nIter)
	for i := 0; i < nIter; i++ {
		offsets = append(offsets, rand.Int63n(size))
	}
	// Visit the offsets in ascending order so reads move forward through
	// the file.
	sort.Slice(offsets, func(i, j int) bool {
		return offsets[i] < offsets[j]
	})

	wc := 0
	var sampled int64 // total bytes actually read across all windows
	bs := make([]byte, bufSize)
	for _, offset := range offsets {
		n, err := f.ReadAt(bs, offset)
		if err != nil && err != io.EOF {
			return 0, err
		}
		// A window near the end of the file is short (io.EOF with n <
		// bufSize). Count only the fresh bytes; the rest of the reused
		// buffer still holds data from an earlier read.
		wc += count(bs[:n])
		sampled += int64(n)
	}
	if sampled == 0 {
		// Nothing could be read (e.g. the file shrank after Stat); there is
		// no density to extrapolate from.
		return 0, nil
	}

	// Scale by the bytes really sampled, not bufSize*nIter, so short reads
	// do not drag the estimate down.
	w := float64(size) * float64(wc) / float64(sampled)
	return int(w), nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment