Last active
August 29, 2015 14:00
-
-
Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.
Program to extract the N longest lines of a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import math | |
| import random | |
| # presently set so that the resulting file size is ~1GB on average | |
| NUM_LINES = int(math.pow(2, 21)) | |
| MAX_LINE_LENGTH = int(math.pow(2, 10)) | |
| def generate_test_file(): | |
| random.seed() | |
| with open('./data.txt', 'a') as f: | |
| for i in range(0, NUM_LINES): | |
| line_length = random.randint(0, MAX_LINE_LENGTH) | |
| line = '' | |
| for j in range(0, line_length): | |
| line += str(unichr(random.randint(97, 122))) | |
| f.write("%s\n" % line) | |
| if __name__ == '__main__': | |
| generate_test_file() | |
| exit(0) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <cstring> | |
| #include <deque> | |
| #include <fcntl.h> | |
| #include <iostream> | |
| #include <string> | |
| #include <sys/stat.h> | |
| #include <sys/mman.h> | |
| #include <unistd.h> | |
| using namespace std; | |
| // Datastructure used to keep track of the N longest strings inserted. | |
| class LongestStrQueue { | |
| public: | |
| // Constructor: | |
| // uint32_t cap -- number of strings to keep track of | |
| LongestStrQueue(uint32_t cap) : cap_(cap), min_length_(0) {} | |
| // Inserts a string. | |
| // Runs in linear time with respect to `cap`, However, only strings that are | |
| // longer than min_length_, which is the length of the smallest string, | |
| // currently stored, need be inserted. If the being inserted come from a | |
| // pool of strings whose lengths are uniformly distributed [0, L), then the | |
| // fraction of strings that need to be inserted decreases with time as | |
| // min_length_ approaches L. | |
| void | |
| insert(string str) { | |
| if (str.length() > min_length_) { | |
| deque<string>::iterator it; | |
| for (it = strings_.begin(); it != strings_.end(); ++it) { | |
| if ((*it).length() < str.length()) | |
| break; | |
| } | |
| strings_.insert(it, str); | |
| if (strings_.size() > cap_) { | |
| strings_.pop_back(); | |
| } | |
| } else if (strings_.size() < cap_) { | |
| strings_.push_back(str); | |
| min_length_ = str.length(); | |
| } | |
| } | |
| // Returns a const reference to the list of N. | |
| // longest strings in sorted order (decreasing length). | |
| const deque<string>& | |
| strings() { | |
| return strings_; | |
| } | |
| private: | |
| deque<string> strings_; // list of strings in sorted order (decreasing length) | |
| uint32_t min_length_; // | |
| const uint32_t cap_; // maximum number of strings to keep track of | |
| }; | |
| int | |
| longest(const string& filepath, LongestStrQueue& longest_lines) { | |
| int fd = open(filepath.c_str(), O_RDONLY); | |
| // determine the size of the input file | |
| struct stat st; | |
| if (fstat(fd, &st) < 0) { | |
| perror("fstat"); | |
| return -1; | |
| } | |
| // mmap input file | |
| void *data = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0); | |
| if (data == MAP_FAILED) { | |
| perror("mmap"); | |
| return -1; | |
| } | |
| // breakdown file into lines and insert them into LongestStrQueue | |
| const char *data_str = (const char *)data; | |
| while (true) { | |
| int index = strcspn(data_str, "\n"); | |
| if (data_str[index] == '\0') | |
| break; | |
| string line(data_str, index + 1); | |
| longest_lines.insert(line); | |
| data_str += index + 1; | |
| } | |
| return 0; | |
| } | |
| void | |
| print_usage(const char *program_name) { | |
| printf("usage: %s %s %s\n\n", program_name, "<input_filename>", "<num_lines"); | |
| printf("Options:\n"); | |
| printf(" -h show this help message and exit\n\n"); | |
| } | |
| int | |
| main(int argc, char **argv) { | |
| // parse command line options | |
| int c_i; | |
| while ((c_i = getopt(argc, argv, "h")) != -1) { | |
| switch(c_i) { | |
| case 'h': | |
| case '?': | |
| default: | |
| print_usage(argv[0]); | |
| return 0; | |
| } | |
| } | |
| // parse non-option arguments | |
| // expect two non-option arguments | |
| if (optind + 2 != argc) { | |
| print_usage(argv[0]); | |
| return -1; | |
| } | |
| const string filepath = argv[optind]; | |
| const uint32_t num_lines = stoi(argv[++optind]); | |
| // heavy lifting done by longest() | |
| LongestStrQueue longest_lines(num_lines); | |
| longest(filepath, longest_lines); | |
| // output result to stdout | |
| deque<string> strings = longest_lines.strings(); | |
| for (deque<string>::iterator it = strings.begin(); it != strings.end(); ++it){ | |
| cout << *it; | |
| } | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| all: | |
| g++ -O2 -std=c++11 longest.cpp -o longest | |
| run: longest | |
| @./longest | |
| clean: | |
| rm -rf longest |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment