Last active
January 25, 2025 22:46
-
-
Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.
Partial/streaming zip downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import io, sys | |
| import datetime | |
| import argparse | |
| import requests | |
| import operator | |
| import struct | |
| import zipfile | |
| from zipfile import ZipFile | |
| from fnmatch import fnmatch | |
| structFileHeader = "<4s2B4HL2L2H" | |
| sizeFileHeader = struct.calcsize(structFileHeader) | |
| _FH_FILENAME_LENGTH = 10 | |
| _FH_EXTRA_FIELD_LENGTH = 11 | |
| BLOCK_SIZE = 1<<24 | |
| __debug = False | |
| def debug(*args, **kwarg): | |
| if __debug: | |
| if 'file' not in kwarg: kwarg['file'] = sys.stderr | |
| print(*args, **kwarg) | |
| class HttpError(IOError): | |
| def __init__(self, response, message=None): | |
| self.response = response | |
| if message is None: self.message = f'http status {response.status_code}' | |
| else: self.message = message | |
| super().__init__(self.message) | |
| # not a full implementation, just enough to use with BufferedReader | |
| class HttpIO(io.RawIOBase): | |
| def __init__(self, url, *, session=None, parent=None): | |
| if not session: | |
| session = parent.session if session else requests.Session() | |
| self.session = session | |
| self._parent = parent | |
| self._url, self._off, self._pos = url, 0, 0 | |
| self._total = 0 | |
| if not parent: | |
| # We assume the zip file doesn't change while this tool is in use | |
| r = self._head() | |
| if 'bytes' not in r.headers.get('Accept-Ranges'): | |
| raise HttpError(r, 'byte ranges not supported by server') | |
| self._len = int(r.headers.get('Content-Length')) | |
| def _head(self): | |
| r = self.session.head(self._url, allow_redirects=True) | |
| if r.status_code != 200: raise HttpError(r) | |
| # Update URL if there was a redirect | |
| if r.url != self._url: self._url = r.url | |
| return r | |
| def _get(self, headers=None): | |
| r = self.session.get(self._url, headers=headers) | |
| if r.status_code not in (200, 206): raise HttpError(r) | |
| return r | |
| def _advance(self, n): | |
| self._pos += n | |
| self._total += n | |
| if self._parent: | |
| self._parent._total += n | |
| def slice(self, offset, length=None): | |
| hio = self.__class__(self._url, session=self.session, parent=self) | |
| hio._off = offset | |
| hio._len = length if length is not None else self._len - self._off | |
| hio.seek(0) | |
| return hio | |
| def read(self, size=-1): | |
| if size == 0: | |
| # any empty byte string will do | |
| return b'' | |
| elif self._off == 0 and self._pos == 0 and (size < 0 or size >= self._len): | |
| # entire file from the begining, no range header needed | |
| end = None | |
| elif size < 0: | |
| # rest of the file | |
| end = self._off + self._len | |
| else: | |
| # requested range or rest of the file, whichever is less | |
| end = min(self._off + self._len, self._pos + size) - 1 | |
| debug('read', size, self._pos - self._off, end - self._off, self._len) | |
| headers = {} | |
| if end is not None: | |
| headers['Range'] = f'bytes={self._pos}-{end}' | |
| r = self._get(headers) | |
| n = int(r.headers.get('Content-Length')) | |
| self._advance(n) | |
| return r.content | |
| def readall(self): | |
| return self.read(-1) | |
| def readinto(self, b): | |
| b = (memoryview(b) if not isinstance(b, memoryview) else b).cast('B') | |
| n = len(b) | |
| data = self.read(n) | |
| n = len(data) | |
| b[:n] = data | |
| return n | |
| def seek(self, pos, whence=0): | |
| if whence != 0 or pos != self._pos - self._off: | |
| debug('seek', pos, whence, self._pos - self._off) | |
| if whence == 0: self._pos = self._off + pos | |
| elif whence == 1: self._pos += pos | |
| elif whence == 2: self._pos = self._off + self._len + pos | |
| else: raise ValueError('invalid whence') | |
| return self._pos - self._off | |
| def __len__(self): return self._len | |
| def tell(self): return self._pos - self._off | |
| def writeable(self): return False | |
| def seekable(self): return True | |
| def readable(self): return True | |
| def __getattr__(self, name): | |
| if name in ('truncate', 'fileno', 'write'): | |
| raise OSError(f'{name} not supported') | |
| return None | |
| closed = property(lambda self: False) | |
| url = property(operator.attrgetter('_url')) | |
| class BufferedHttpIO(io.BufferedReader): | |
| def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE): | |
| super().__init__(httpio, buffer_size) | |
| self._buffer_size = buffer_size | |
| self._block_size = block_size | |
| self._unbuffered = False | |
| self._tail = None | |
| def read(self, size=-1): | |
| if self.raw._len >= self._buffer_size: | |
| pos = self.tell() | |
| from_end = self.raw._len - pos | |
| if from_end <= self._buffer_size: | |
| # ZipFile does several small reads and seeks near the end of the file, | |
| # so it's useful to cache the last buffer worth of data, since seek() | |
| # resets the read buffer | |
| if not self._tail: | |
| debug('caching tail') | |
| self.seek(-self._buffer_size, 2) | |
| self._tail = super().read(self._buffer_size) | |
| start = len(self._tail) - from_end | |
| end = len(self._tail) if size < 0 else min(len(self._tail), start+size) | |
| chunk = self._tail[start:end] | |
| debug('cached tail read', len(chunk)) | |
| self.seek(pos + len(chunk)) | |
| return chunk | |
| # for larger reads, io.BufferedReader just gets in the way, so bypass it | |
| if size < 0 or size > self._buffer_size: | |
| if self._unbuffered: | |
| return self.raw.read(min(size, self._block_size)) | |
| else: | |
| self._unbuffered = True | |
| debug('enter unbuffered mode') | |
| if size < 0: size = self._buffer_size | |
| # return the buffer contents | |
| return self.read1(size) | |
| elif self._unbuffered: | |
| debug('exit unbuffered mode') | |
| # syncronize the file position | |
| self.seek(self.raw.tell()) | |
| self._unbuffered = False | |
| # normal buffered read | |
| return super().read(size) | |
| parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.') | |
| parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file') | |
| parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?', | |
| help='filename within the zip file') | |
| args = parser.parse_args() | |
| h = HttpIO(args.url) | |
| r = BufferedHttpIO(h) | |
| z = ZipFile(r) | |
| def list_zipinfo(zi, prefix=None): | |
| ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S') | |
| name = f'{prefix}:{zi.filename}' if prefix is not None else zi.filename | |
| method_id = zi.compress_type | |
| if method_id == zipfile.ZIP_STORED: method = 'S' | |
| elif method_id == zipfile.ZIP_DEFLATED: method = 'D' | |
| elif method_id == zipfile.ZIP_BZIP2: method = 'B' | |
| elif method_id == zipfile.ZIP_LZMA: method = 'L' | |
| else: method = '?' | |
| print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}') | |
| def zi_slice(r, z, zi): | |
| # save position | |
| p = r.tell() | |
| r.seek(zi.header_offset) | |
| # get the zip member header | |
| fheader = r.read(sizeFileHeader) | |
| fheader = struct.unpack(structFileHeader, fheader) | |
| # restore position | |
| r.seek(p) | |
| # find the start of the actual data | |
| skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] | |
| data_offset = zi.header_offset + sizeFileHeader + skip | |
| return r.raw.slice(data_offset, zi.compress_size) | |
| def open_member(name, r, z): | |
| # try to glob the filename | |
| for c in '[]?*': | |
| if c not in name: | |
| continue | |
| pat1, sep, pat2 = name.partition(':') | |
| for zi in z.infolist(): | |
| if not fnmatch(zi.filename, pat1): | |
| continue | |
| if pat2: | |
| if zi.filename[-4:].lower() != '.zip': | |
| continue | |
| h2 = zi_slice(r, z, zi) | |
| r2 = BufferedHttpIO(h2) | |
| z2 = ZipFile(r2) | |
| for zi2 in z2.infolist(): | |
| if fnmatch(zi2.filename, pat2): | |
| return z2.open(zi2.filename) | |
| else: | |
| return z.open(zi.filename) | |
| break | |
| try: | |
| zi = z.getinfo(name) | |
| return z.open(name) | |
| except KeyError as e: | |
| oname, sep, name = name.partition(':') | |
| if not name or oname[-4:].lower() != '.zip': | |
| raise e | |
| zi = z.getinfo(oname) | |
| if zi.compress_type != zipfile.ZIP_STORED: | |
| raise ValueError(f'Nested zip file uses a method other than STORE!') | |
| h2 = zi_slice(r, z, zi) | |
| r2 = BufferedHttpIO(h2) | |
| z2 = ZipFile(r2) | |
| return z2.open(name) | |
| if args.filename: | |
| f = open_member(args.filename, r, z) | |
| while True: | |
| chunk = f.read1(-1) | |
| if len(chunk) == 0: break | |
| sys.stdout.buffer.write(chunk) | |
| sys.stdout.buffer.flush() | |
| else: | |
| for zi in z.infolist(): | |
| list_zipinfo(zi) | |
| file_ext = zi.filename[-4:].lower() | |
| if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip': | |
| h2 = zi_slice(r, z, zi) | |
| r2 = BufferedHttpIO(h2) | |
| z2 = ZipFile(r2) | |
| for zi2 in z2.infolist(): | |
| list_zipinfo(zi2, zi.filename) | |
| print(f'total bytes read: {h._total}', file=sys.stderr) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a tool to extract individual files from a zip file hosted on a web server without having to download the entire file first.
One level of nesting is supported, provided the inner zip file was included with the
STORE method — use a colon to specify the nested file. Globs are supported in the filename to extract.
Examples: