Last active
August 29, 2015 14:00
-
-
Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.
Revisions
-
ali01 revised this gist
May 4, 2014 . 1 changed file with 13 additions and 8 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,15 +7,20 @@ NUM_LINES = int(math.pow(2, 21)) MAX_LINE_LENGTH = int(math.pow(2, 10)) def generate_test_file(): random.seed() with open('./data.txt', 'a') as f: for i in range(0, NUM_LINES): line_length = random.randint(0, MAX_LINE_LENGTH) line = '' for j in range(0, line_length): line += str(unichr(random.randint(97, 122))) f.write("%s\n" % line) if __name__ == '__main__': generate_test_file() exit(0) -
ali01 revised this gist
May 4, 2014 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,7 +10,7 @@ random.seed() with open('./data.txt', 'a') as f: for i in range(0, NUM_LINES): line_length = random.randint(0, MAX_LINE_LENGTH) -
ali01 revised this gist
May 4, 2014 . 2 changed files with 29 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,8 @@ all: g++ -O2 -std=c++11 longest.cpp -o longest run: longest @./longest clean: rm -rf longest This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,21 @@ #!/usr/bin/env python import math import random # presently set so that the resulting file size is ~1GB on average NUM_LINES = int(math.pow(2, 21)) MAX_LINE_LENGTH = int(math.pow(2, 10)) random.seed() with open('./test.txt', 'a') as f: for i in range(0, NUM_LINES): line_length = random.randint(0, MAX_LINE_LENGTH) line = '' for j in range(0, line_length): line += str(unichr(random.randint(97, 122))) f.write("%s\n" % line) -
ali01 created this gist
May 4, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,141 @@ #include <cstring> #include <deque> #include <fcntl.h> #include <iostream> #include <string> #include <sys/stat.h> #include <sys/mman.h> #include <unistd.h> using namespace std; // Datastructure used to keep track of the N longest strings inserted. class LongestStrQueue { public: // Constructor: // uint32_t cap -- number of strings to keep track of LongestStrQueue(uint32_t cap) : cap_(cap), min_length_(0) {} // Inserts a string. // Runs in linear time with respect to `cap`, However, only strings that are // longer than min_length_, which is the length of the smallest string, // currently stored, need be inserted. If the being inserted come from a // pool of strings whose lengths are uniformly distributed [0, L), then the // fraction of strings that need to be inserted decreases with time as // min_length_ approaches L. void insert(string str) { if (str.length() > min_length_) { deque<string>::iterator it; for (it = strings_.begin(); it != strings_.end(); ++it) { if ((*it).length() < str.length()) break; } strings_.insert(it, str); if (strings_.size() > cap_) { strings_.pop_back(); } } else if (strings_.size() < cap_) { strings_.push_back(str); min_length_ = str.length(); } } // Returns a const reference to the list of N. // longest strings in sorted order (decreasing length). const deque<string>& strings() { return strings_; } private: deque<string> strings_; // list of strings in sorted order (decreasing length) uint32_t min_length_; // const uint32_t cap_; // maximum number of strings to keep track of }; int longest(const string& filepath, LongestStrQueue& longest_lines) { int fd = open(filepath.c_str(), O_RDONLY); // determine the size of the input file struct stat st; if (fstat(fd, &st) < 0) { perror("fstat"); return -1; } // mmap input file void *data = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0); if (data == MAP_FAILED) { perror("mmap"); return -1; } // breakdown file into lines and insert them into LongestStrQueue const char *data_str = (const char *)data; while (true) { int index = strcspn(data_str, "\n"); if (data_str[index] == '\0') break; string line(data_str, index + 1); longest_lines.insert(line); data_str += index + 1; } return 0; } void print_usage(const char *program_name) { printf("usage: %s %s %s\n\n", program_name, "<input_filename>", "<num_lines"); printf("Options:\n"); printf(" -h show this help message and exit\n\n"); } int main(int argc, char **argv) { // parse command line options int c_i; while ((c_i = getopt(argc, argv, "h")) != -1) { switch(c_i) { case 'h': case '?': default: print_usage(argv[0]); return 0; } } // parse non-option arguments // expect two non-option arguments if (optind + 2 != argc) { print_usage(argv[0]); return -1; } const string filepath = argv[optind]; const uint32_t num_lines = stoi(argv[++optind]); // heavy lifting done by longest() LongestStrQueue longest_lines(num_lines); longest(filepath, longest_lines); // output result to stdout deque<string> strings = longest_lines.strings(); for (deque<string>::iterator it = strings.begin(); it != strings.end(); ++it){ cout << *it; } return 0; }