Skip to content

Instantly share code, notes, and snippets.

@ali01
Last active August 29, 2015 14:00
Show Gist options
  • Select an option

  • Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.

Select an option

Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.

Revisions

  1. ali01 revised this gist May 4, 2014. 1 changed file with 13 additions and 8 deletions.
    21 changes: 13 additions & 8 deletions generate_test_file.py
    Original file line number Diff line number Diff line change
    @@ -7,15 +7,20 @@
    NUM_LINES = int(math.pow(2, 21))
    MAX_LINE_LENGTH = int(math.pow(2, 10))

    random.seed()

    def generate_test_file():
    random.seed()
    with open('./data.txt', 'a') as f:
    for i in range(0, NUM_LINES):
    line_length = random.randint(0, MAX_LINE_LENGTH)

    with open('./data.txt', 'a') as f:
    for i in range(0, NUM_LINES):
    line_length = random.randint(0, MAX_LINE_LENGTH)
    line = ''
    for j in range(0, line_length):
    line += str(unichr(random.randint(97, 122)))

    line = ''
    for j in range(0, line_length):
    line += str(unichr(random.randint(97, 122)))
    f.write("%s\n" % line)

    f.write("%s\n" % line)

    if __name__ == '__main__':
    generate_test_file()
    exit(0)
  2. ali01 revised this gist May 4, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion generate_test_file.py
    Original file line number Diff line number Diff line change
    @@ -10,7 +10,7 @@
    random.seed()


    with open('./test.txt', 'a') as f:
    with open('./data.txt', 'a') as f:
    for i in range(0, NUM_LINES):
    line_length = random.randint(0, MAX_LINE_LENGTH)

  3. ali01 revised this gist May 4, 2014. 2 changed files with 29 additions and 0 deletions.
    8 changes: 8 additions & 0 deletions Makefile
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,8 @@
    all:
    g++ -O2 -std=c++11 longest.cpp -o longest

    run: longest
    @./longest

    clean:
    rm -rf longest
    21 changes: 21 additions & 0 deletions generate_test_file.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,21 @@
    #!/usr/bin/env python

    import math
    import random

    # presently set so that the resulting file size is ~1GB on average
    NUM_LINES = int(math.pow(2, 21))
    MAX_LINE_LENGTH = int(math.pow(2, 10))

    random.seed()


    with open('./test.txt', 'a') as f:
    for i in range(0, NUM_LINES):
    line_length = random.randint(0, MAX_LINE_LENGTH)

    line = ''
    for j in range(0, line_length):
    line += str(unichr(random.randint(97, 122)))

    f.write("%s\n" % line)
  4. ali01 created this gist May 4, 2014.
    141 changes: 141 additions & 0 deletions longest.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,141 @@
    #include <cstring>
    #include <deque>
    #include <fcntl.h>
    #include <iostream>
    #include <string>
    #include <sys/stat.h>
    #include <sys/mman.h>
    #include <unistd.h>

    using namespace std;


    // Datastructure used to keep track of the N longest strings inserted.
    class LongestStrQueue {
    public:

    // Constructor:
    // uint32_t cap -- number of strings to keep track of
    LongestStrQueue(uint32_t cap) : cap_(cap), min_length_(0) {}


    // Inserts a string.
    // Runs in linear time with respect to `cap`, However, only strings that are
    // longer than min_length_, which is the length of the smallest string,
    // currently stored, need be inserted. If the being inserted come from a
    // pool of strings whose lengths are uniformly distributed [0, L), then the
    // fraction of strings that need to be inserted decreases with time as
    // min_length_ approaches L.
    void
    insert(string str) {
    if (str.length() > min_length_) {
    deque<string>::iterator it;
    for (it = strings_.begin(); it != strings_.end(); ++it) {
    if ((*it).length() < str.length())
    break;
    }

    strings_.insert(it, str);
    if (strings_.size() > cap_) {
    strings_.pop_back();
    }

    } else if (strings_.size() < cap_) {
    strings_.push_back(str);
    min_length_ = str.length();
    }
    }

    // Returns a const reference to the list of N.
    // longest strings in sorted order (decreasing length).
    const deque<string>&
    strings() {
    return strings_;
    }

    private:
    deque<string> strings_; // list of strings in sorted order (decreasing length)
    uint32_t min_length_; //
    const uint32_t cap_; // maximum number of strings to keep track of
    };


    int
    longest(const string& filepath, LongestStrQueue& longest_lines) {
    int fd = open(filepath.c_str(), O_RDONLY);

    // determine the size of the input file
    struct stat st;
    if (fstat(fd, &st) < 0) {
    perror("fstat");
    return -1;
    }

    // mmap input file
    void *data = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
    if (data == MAP_FAILED) {
    perror("mmap");
    return -1;
    }

    // breakdown file into lines and insert them into LongestStrQueue
    const char *data_str = (const char *)data;
    while (true) {
    int index = strcspn(data_str, "\n");
    if (data_str[index] == '\0')
    break;

    string line(data_str, index + 1);
    longest_lines.insert(line);

    data_str += index + 1;
    }

    return 0;
    }


    void
    print_usage(const char *program_name) {
    printf("usage: %s %s %s\n\n", program_name, "<input_filename>", "<num_lines");
    printf("Options:\n");
    printf(" -h show this help message and exit\n\n");
    }


    int
    main(int argc, char **argv) {
    // parse command line options
    int c_i;
    while ((c_i = getopt(argc, argv, "h")) != -1) {
    switch(c_i) {
    case 'h':
    case '?':
    default:
    print_usage(argv[0]);
    return 0;
    }
    }

    // parse non-option arguments
    // expect two non-option arguments
    if (optind + 2 != argc) {
    print_usage(argv[0]);
    return -1;
    }

    const string filepath = argv[optind];
    const uint32_t num_lines = stoi(argv[++optind]);

    // heavy lifting done by longest()
    LongestStrQueue longest_lines(num_lines);
    longest(filepath, longest_lines);

    // output result to stdout
    deque<string> strings = longest_lines.strings();
    for (deque<string>::iterator it = strings.begin(); it != strings.end(); ++it){
    cout << *it;
    }

    return 0;
    }