Skip to content

Instantly share code, notes, and snippets.

@constverum
Created June 27, 2017 16:32
Show Gist options
  • Select an option

  • Save constverum/eeb5324967631b539b48ea3bcd94b4f8 to your computer and use it in GitHub Desktop.

Select an option

Save constverum/eeb5324967631b539b48ea3bcd94b4f8 to your computer and use it in GitHub Desktop.
External merge sort
import array
import heapq
import sys
import tempfile
class ExternalMergeSort:
MAX_INTS_PARTION = 10000 # reads/writes up to N integers at a time
def __init__(self, input_file, output_file, limit_ram_mb=1024):
if sys.platform == 'darwin':
# on macos may be a problem reading more than 2GB at a time
limit_ram_mb = limit_ram_mb if limit_ram_mb < 2048 else 2047
self.input_file = input_file
self.output_file = output_file
self.buffer_size = int(limit_ram_mb * 1024 * 1024) # to bytes
self.iters = []
def _split_input_file(self):
with open(self.input_file, 'r') as inpf:
while True:
ints = inpf.read(self.buffer_size)
arr = array.array('i', map(int, ints))
if not arr:
break
tf = tempfile.TemporaryFile()
array.array('i', sorted(arr)).tofile(tf)
tf.seek(0)
self.iters.append(self._ints_from_file(tf))
def _write_results(self):
arr = array.array('i')
with open(self.output_file, 'w') as outf:
for x in heapq.merge(*self.iters):
arr.append(x)
if len(arr) >= self.MAX_INTS_PARTION:
outf.write(''.join(map(str, arr.tolist())))
del arr[:]
if arr:
outf.write(''.join(map(str, arr.tolist())))
def _ints_from_file(self, f):
while True:
arr = array.array('i')
ints = f.read(self.MAX_INTS_PARTION * 4)
arr.frombytes(ints)
if not arr:
break
for x in arr:
yield x
def sort(self):
self._split_input_file()
self._write_results()
if __name__ == '__main__':
ExternalMergeSort('input.txt', 'output.txt', limit_ram_mb=4096).sort()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment