Skip to content

Instantly share code, notes, and snippets.

@ryancdotorg
Last active January 25, 2025 22:46
Show Gist options
  • Select an option

  • Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.

Select an option

Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.

Revisions

  1. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion wzip.py
    Original file line number Diff line number Diff line change
    @@ -17,7 +17,7 @@
    _FH_FILENAME_LENGTH = 10
    _FH_EXTRA_FIELD_LENGTH = 11

    BLOCK_SIZE = 1<<20
    BLOCK_SIZE = 1<<18

    __debug = False
    def debug(*args, **kwarg):
  2. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -36,9 +36,9 @@ def __init__(self, response, message=None):
    class _HttpIO(io.RawIOBase):
    def __init__(self, url, *, session=None, parent=None):
    if not session:
    session = parent.session if session else requests.Session()
    session = parent._session if parent else requests.Session()

    self.session = session
    self._session = session
    self._parent = parent
    self._url, self._off = url, 0
    self._pos, self._total = 0, 0
    @@ -55,7 +55,7 @@ def __init__(self, url, *, session=None, parent=None):
    def _head(self):
    self._heads += 1
    if self._parent: self._parent._heads += 1
    r = self.session.head(self._url, allow_redirects=True)
    r = self._session.head(self._url, allow_redirects=True)
    if r.status_code != 200: raise HttpError(r)
    # Update URL if there was a redirect
    if r.url != self._url: self._url = r.url
    @@ -64,7 +64,7 @@ def _head(self):
    def _get(self, headers=None, *, stream=False):
    self._gets += 1
    if self._parent: self._parent._gets += 1
    r = self.session.get(self._url, headers=headers, stream=stream)
    r = self._session.get(self._url, headers=headers, stream=stream)
    if r.status_code not in (200, 206): raise HttpError(r)
    return r

    @@ -75,7 +75,7 @@ def _advance(self, n):
    self._parent._total += n

    def slice(self, offset, length=None):
    hio = self.__class__(self._url, session=self.session, parent=self)
    hio = self.__class__(self._url, session=self._session, parent=self)
    hio._off = offset
    hio._len = length if length is not None else self._len - self._off
    hio.seek(0)
  3. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion wzip.py
    Original file line number Diff line number Diff line change
    @@ -42,6 +42,7 @@ def __init__(self, url, *, session=None, parent=None):
    self._parent = parent
    self._url, self._off = url, 0
    self._pos, self._total = 0, 0
    self._gets, self._heads = 0, 0
    self._response, self._iter, self._next = None, None, None

    if not parent:
    @@ -52,13 +53,17 @@ def __init__(self, url, *, session=None, parent=None):
    self._len = int(r.headers.get('Content-Length'))

    def _head(self):
    self._heads += 1
    if self._parent: self._parent._heads += 1
    r = self.session.head(self._url, allow_redirects=True)
    if r.status_code != 200: raise HttpError(r)
    # Update URL if there was a redirect
    if r.url != self._url: self._url = r.url
    return r

    def _get(self, headers=None, *, stream=False):
    self._gets += 1
    if self._parent: self._parent._gets += 1
    r = self.session.get(self._url, headers=headers, stream=stream)
    if r.status_code not in (200, 206): raise HttpError(r)
    return r
    @@ -363,4 +368,4 @@ def list_zipinfo(zi, parents=None):
    for _, zi, parents in z.infolist_nested():
    list_zipinfo(zi, parents)

    print(f'total bytes read: {z.httpio._total}', file=sys.stderr)
    print(f'heads={z.httpio._heads} gets={z.httpio._gets} bytes_read={z.httpio._total}', file=sys.stderr)
  4. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 8 additions and 8 deletions.
    16 changes: 8 additions & 8 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -76,13 +76,6 @@ def slice(self, offset, length=None):
    hio.seek(0)
    return hio

    # set up response streaming and return first chunk
    def _stream_init(self, response, chunk_size=None):
    debug(f'_stream_init chunk_size={chunk_size}')
    self._response = response
    self._iter = response.iter_content(chunk_size)
    self._next = next(self._iter, None)

    def _stream_close(self):
    debug('_stream_close')
    if self._response: self._response.close()
    @@ -138,10 +131,17 @@ def _read(self, size=-1, *, stream=False, chunk_size=BLOCK_SIZE):
    if end is not None:
    headers['Range'] = f'bytes={self._pos}-{end}'

    # reading multiple sequential chunks without streaming would require a request
    # to the server for each chunk, involving potentially slow network round trips
    actually_stream = bool(stream and (size < 0 or size >= chunk_size))
    r = self._get(headers, stream=actually_stream)
    if actually_stream:
    self._stream_init(r, chunk_size)
    # set up response streaming
    debug(f'_stream_init chunk_size={chunk_size}')
    self._response = r
    self._iter = r.iter_content(chunk_size)
    self._next = next(self._iter, None)
    # return the first chunk
    return self._stream_read(size)

    n = int(r.headers.get('Content-Length'))
  5. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 92 additions and 79 deletions.
    171 changes: 92 additions & 79 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -33,7 +33,7 @@ def __init__(self, response, message=None):
    super().__init__(self.message)

    # not a full implementation, just enough to use with BufferedReader
    class HttpIO(io.RawIOBase):
    class _HttpIO(io.RawIOBase):
    def __init__(self, url, *, session=None, parent=None):
    if not session:
    session = parent.session if session else requests.Session()
    @@ -188,8 +188,11 @@ def __getattr__(self, name):
    closed = property(lambda self: False)
    url = property(operator.attrgetter('_url'))

    class BufferedHttpIO(io.BufferedReader):
    # helper class that automatically bypasses the buffer for bulk reads, and tries to
    # cache the end of central directory record from the end of a zip file
    class _BufferedHttpIO(io.BufferedReader):
    def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE):
    assert isinstance(httpio, _HttpIO)
    super().__init__(httpio, buffer_size)
    self._buffer_size = buffer_size
    self._block_size = block_size
    @@ -240,53 +243,90 @@ def read(self, size=-1):
    # normal buffered read
    return super().read(size)

    parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.')
    parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file')
    parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?',
    help='filename within the zip file')

    args = parser.parse_args()
    h = HttpIO(args.url)
    r = BufferedHttpIO(h)
    z = ZipFile(r)

    def zi_slice(r, z, zi):
    assert zi.compress_type == ZIP_STORED

    saved_position = r.tell()

    # get the zip member header
    r.seek(zi.header_offset)
    fheader = r.read(sizeFileHeader)
    fheader = struct.unpack(structFileHeader, fheader)

    r.seek(saved_position)

    # find the start of the actual data
    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    return r.raw.slice(data_offset, zi.compress_size)
    class ZipHttp(ZipFile):
    def __init__(self, httpio, *args, **kwargs):
    assert isinstance(httpio, _HttpIO)
    self._args, self._kwargs = args, kwargs
    bkw = {}
    if 'buffer_size' in kwargs: bkw['buffer_size'] = kwargs.pop('buffer_size')
    if 'block_size' in kwargs: bkw['block_size'] = kwargs.pop('block_size')
    reader = _BufferedHttpIO(httpio, **bkw)
    super().__init__(reader, *args, **kwargs)
    self._httpio, self._reader = httpio, reader

    def sub(self, zi):
    assert zi.compress_type == ZIP_STORED

    saved_position = self._reader.tell()

    # get the zip member header
    self._reader.seek(zi.header_offset)
    fheader = self._reader.read(sizeFileHeader)
    fheader = struct.unpack(structFileHeader, fheader)

    self._reader.seek(saved_position)

    # find the start of the actual data
    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    httpio = self.httpio.slice(data_offset, zi.compress_size)
    return self.__class__(httpio, *self._args, **self._kwargs)
    return self.httpio.slice(data_offset, zi.compress_size)

    def infolist_nested(self, max_depth=1, parents=None, *, pattern=None):
    parents = parents or ()

    for zi in self.infolist():
    yield self, zi, parents

    # recursion limit
    if max_depth < 0: continue

    # only try to recurse STORED .zip files
    if zi.compress_type != ZIP_STORED: continue
    if zi.filename[-4:].lower() != '.zip': continue

    # if a pattern was provided, don't recurse unless it matches
    if pattern is not None and not fnmatch(zi.filename, pattern): continue

    with self.sub(zi) as z2:
    yield from z2.infolist_nested(max_depth - 1, parents + (zi.filename,))

    def open_nested(self, name):
    # try to glob the filename
    for c in '[]?*':
    if c not in name:
    continue

    # if there's no colon in the name, pat1 and sep will be empty strings
    pat1, sep, pat2 = name.rpartition(':')
    for z2, zi, parents in self.infolist_nested(pattern=pat1):
    if not parents:
    if sep or not fnmatch(zi.filename, pat2): continue
    else:
    if not pat1 or not fnmatch(':'.join(parents), pat1): continue
    if not fnmatch(zi.filename, pat2): continue

    def infolist_nested(r, z, max_depth=1, parents=None, *, pattern=None):
    parents = parents or ()
    return z2.open(zi.filename)

    for zi in z.infolist():
    yield z, zi, parents
    try:
    zi = self.getinfo(name)
    return self.open(name)
    except KeyError as e:
    oname, sep, name = name.rpartition(':')
    if sep and oname[-4:].lower() != '.zip':
    raise e

    # recursion limit
    if max_depth < 0: continue
    try: zi = self.getinfo(oname)
    except: raise e

    # only try to recurse STORED .zip files
    if zi.compress_type != ZIP_STORED: continue
    if zi.filename[-4:].lower() != '.zip': continue
    if zi.compress_type != ZIP_STORED:
    raise ValueError(f'Nested zip file uses a method other than STORE!')

    # if a pattern was provided, don't recurse unless it matches
    if pattern is not None and not fnmatch(zi.filename, pattern): continue
    z2 = self.sub(zi)
    return z2.open(name)

    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    with ZipFile(r2) as z2:
    yield from infolist_nested(r2, z2, max_depth - 1, parents + (zi.filename,))
    httpio = property(operator.attrgetter('_httpio'))

    def list_zipinfo(zi, parents=None):
    parents = parents or ()
    @@ -301,43 +341,16 @@ def list_zipinfo(zi, parents=None):
    else: method = '?'
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')

    def open_member(name, r, z):
    # try to glob the filename
    for c in '[]?*':
    if c not in name:
    continue

    pat1, sep, pat2 = name.rpartition(':')
    for z2, zi, parents in infolist_nested(r, z, pattern=pat1):
    if not parents:
    if sep or not fnmatch(zi.filename, pat2): continue
    else:
    if not pat1 or not fnmatch(':'.join(parents), pat1): continue
    if not fnmatch(zi.filename, pat2): continue

    return z2.open(zi.filename)

    try:
    zi = z.getinfo(name)
    return z.open(name)
    except KeyError as e:
    oname, sep, name = name.rpartition(':')
    if sep and oname[-4:].lower() != '.zip':
    raise e

    try: zi = z.getinfo(oname)
    except: raise e

    if zi.compress_type != ZIP_STORED:
    raise ValueError(f'Nested zip file uses a method other than STORE!')
    parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.')
    parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file')
    parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?',
    help='filename within the zip file')

    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)
    return z2.open(name)
    args = parser.parse_args()
    z = ZipHttp(_HttpIO(args.url))

    if args.filename:
    f = open_member(args.filename, r, z)
    f = z.open_nested(args.filename)

    while True:
    chunk = f.read1(-1)
    @@ -347,7 +360,7 @@ def open_member(name, r, z):
    sys.stdout.buffer.flush()
    else:
    # if no filename given, list the contents
    for _, zi, parents in infolist_nested(r, z):
    for _, zi, parents in z.infolist_nested():
    list_zipinfo(zi, parents)

    print(f'total bytes read: {h._total}', file=sys.stderr)
    print(f'total bytes read: {z.httpio._total}', file=sys.stderr)
  6. ryancdotorg revised this gist Feb 18, 2023. 1 changed file with 57 additions and 50 deletions.
    107 changes: 57 additions & 50 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -9,8 +9,7 @@
    import operator
    import struct

    import zipfile
    from zipfile import ZipFile
    from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED, ZIP_BZIP2, ZIP_LZMA
    from fnmatch import fnmatch

    structFileHeader = "<4s2B4HL2L2H"
    @@ -251,69 +250,85 @@ def read(self, size=-1):
    r = BufferedHttpIO(h)
    z = ZipFile(r)

    def list_zipinfo(zi, prefix=None):
    ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
    name = f'{prefix}:{zi.filename}' if prefix is not None else zi.filename
    method_id = zi.compress_type
    if method_id == zipfile.ZIP_STORED: method = 'S'
    elif method_id == zipfile.ZIP_DEFLATED: method = 'D'
    elif method_id == zipfile.ZIP_BZIP2: method = 'B'
    elif method_id == zipfile.ZIP_LZMA: method = 'L'
    else: method = '?'
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')

    def zi_slice(r, z, zi):
    # save position
    p = r.tell()
    r.seek(zi.header_offset)
    assert zi.compress_type == ZIP_STORED

    saved_position = r.tell()

    # get the zip member header
    r.seek(zi.header_offset)
    fheader = r.read(sizeFileHeader)
    fheader = struct.unpack(structFileHeader, fheader)
    # restore position
    r.seek(p)

    r.seek(saved_position)

    # find the start of the actual data
    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    return r.raw.slice(data_offset, zi.compress_size)

    def infolist_nested(r, z, max_depth=1, parents=None, *, pattern=None):
    parents = parents or ()

    for zi in z.infolist():
    yield z, zi, parents

    # recursion limit
    if max_depth < 0: continue

    # only try to recurse STORED .zip files
    if zi.compress_type != ZIP_STORED: continue
    if zi.filename[-4:].lower() != '.zip': continue

    # if a pattern was provided, don't recurse unless it matches
    if pattern is not None and not fnmatch(zi.filename, pattern): continue

    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    with ZipFile(r2) as z2:
    yield from infolist_nested(r2, z2, max_depth - 1, parents + (zi.filename,))

    def list_zipinfo(zi, parents=None):
    parents = parents or ()

    ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
    name = ':'.join(parents + (zi.filename,))
    method_id = zi.compress_type
    if method_id == ZIP_STORED: method = 'S'
    elif method_id == ZIP_DEFLATED: method = 'D'
    elif method_id == ZIP_BZIP2: method = 'B'
    elif method_id == ZIP_LZMA: method = 'L'
    else: method = '?'
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')

    def open_member(name, r, z):
    # try to glob the filename
    for c in '[]?*':
    if c not in name:
    continue

    pat1, sep, pat2 = name.partition(':')
    for zi in z.infolist():
    if not fnmatch(zi.filename, pat1):
    continue

    if pat2:
    if zi.filename[-4:].lower() != '.zip':
    continue

    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)

    for zi2 in z2.infolist():
    if fnmatch(zi2.filename, pat2):
    return z2.open(zi2.filename)
    pat1, sep, pat2 = name.rpartition(':')
    for z2, zi, parents in infolist_nested(r, z, pattern=pat1):
    if not parents:
    if sep or not fnmatch(zi.filename, pat2): continue
    else:
    return z.open(zi.filename)
    if not pat1 or not fnmatch(':'.join(parents), pat1): continue
    if not fnmatch(zi.filename, pat2): continue

    break
    return z2.open(zi.filename)

    try:
    zi = z.getinfo(name)
    return z.open(name)
    except KeyError as e:
    oname, sep, name = name.partition(':')
    if not name or oname[-4:].lower() != '.zip':
    oname, sep, name = name.rpartition(':')
    if sep and oname[-4:].lower() != '.zip':
    raise e

    zi = z.getinfo(oname)
    if zi.compress_type != zipfile.ZIP_STORED:
    try: zi = z.getinfo(oname)
    except: raise e

    if zi.compress_type != ZIP_STORED:
    raise ValueError(f'Nested zip file uses a method other than STORE!')

    h2 = zi_slice(r, z, zi)
    @@ -332,15 +347,7 @@ def open_member(name, r, z):
    sys.stdout.buffer.flush()
    else:
    # if no filename given, list the contents
    for zi in z.infolist():
    list_zipinfo(zi)
    # also list the contents of stored zip files
    file_ext = zi.filename[-4:].lower()
    if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip':
    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)
    for zi2 in z2.infolist():
    list_zipinfo(zi2, zi.filename)
    for _, zi, parents in infolist_nested(r, z):
    list_zipinfo(zi, parents)

    print(f'total bytes read: {h._total}', file=sys.stderr)
  7. ryancdotorg revised this gist Feb 17, 2023. 1 changed file with 77 additions and 14 deletions.
    91 changes: 77 additions & 14 deletions wzip.py
    100644 → 100755
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,6 @@
    #!/usr/bin/env python3
    # SPDX-License-Identifier: 0BSD or CC0-1.0 or MIT-0 or Unlicense
    # Copyright (c) 2023, Ryan Castellucci, No Rights Reserved

    import io, sys
    import datetime
    @@ -16,7 +18,7 @@
    _FH_FILENAME_LENGTH = 10
    _FH_EXTRA_FIELD_LENGTH = 11

    BLOCK_SIZE = 1<<24
    BLOCK_SIZE = 1<<20

    __debug = False
    def debug(*args, **kwarg):
    @@ -39,8 +41,9 @@ def __init__(self, url, *, session=None, parent=None):

    self.session = session
    self._parent = parent
    self._url, self._off, self._pos = url, 0, 0
    self._total = 0
    self._url, self._off = url, 0
    self._pos, self._total = 0, 0
    self._response, self._iter, self._next = None, None, None

    if not parent:
    # We assume the zip file doesn't change while this tool is in use
    @@ -56,8 +59,8 @@ def _head(self):
    if r.url != self._url: self._url = r.url
    return r

    def _get(self, headers=None):
    r = self.session.get(self._url, headers=headers)
    def _get(self, headers=None, *, stream=False):
    r = self.session.get(self._url, headers=headers, stream=stream)
    if r.status_code not in (200, 206): raise HttpError(r)
    return r

    @@ -74,8 +77,50 @@ def slice(self, offset, length=None):
    hio.seek(0)
    return hio

    # set up response streaming and return first chunk
    def _stream_init(self, response, chunk_size=None):
    debug(f'_stream_init chunk_size={chunk_size}')
    self._response = response
    self._iter = response.iter_content(chunk_size)
    self._next = next(self._iter, None)

    def _stream_close(self):
    debug('_stream_close')
    if self._response: self._response.close()
    self._response, self._iter, self._next = None, None, None

    def _stream_read(self, size=-1):
    debug(f'_stream_read avail={len(self._next) if self._next else 0}, requested={size}')
    assert self._iter is not None
    if self._next is None:
    # this implies that the response iterator was exhausted on its first call,
    # which should not happen, but better safe than sorry
    self._stream_close()
    size = BLOCK_SIZE if size < 0 else min(size, BLOCK_SIZE)
    return self.read(size)
    elif size >= 0 and size < len(self._next):
    # reading less than the available amount of data isn't fully handled
    chunk = self._next[:size]
    self._stream_close()
    else:
    chunk = self._next
    # the response iterator only gets marked as exhausted when it is asked
    # to generate a chunk but has no data available - read ahead one chunk
    # since the reader may stop after the last one it expects
    self._next = next(self._iter, None)
    if self._next is None:
    self._stream_close()

    self._advance(len(chunk))
    return chunk

    def read(self, size=-1):
    if size == 0:
    return self._read(size)

    def _read(self, size=-1, *, stream=False, chunk_size=BLOCK_SIZE):
    if self._iter:
    return self._stream_read(size)
    elif size == 0:
    # any empty byte string will do
    return b''
    elif self._off == 0 and self._pos == 0 and (size < 0 or size >= self._len):
    @@ -94,7 +139,12 @@ def read(self, size=-1):
    if end is not None:
    headers['Range'] = f'bytes={self._pos}-{end}'

    r = self._get(headers)
    actually_stream = bool(stream and (size < 0 or size >= chunk_size))
    r = self._get(headers, stream=actually_stream)
    if actually_stream:
    self._stream_init(r, chunk_size)
    return self._stream_read(size)

    n = int(r.headers.get('Content-Length'))
    self._advance(n)
    return r.content
    @@ -113,11 +163,17 @@ def readinto(self, b):
    def seek(self, pos, whence=0):
    if whence != 0 or pos != self._pos - self._off:
    debug('seek', pos, whence, self._pos - self._off)
    if whence == 0: self._pos = self._off + pos
    elif whence == 1: self._pos += pos
    elif whence == 2: self._pos = self._off + self._len + pos

    if whence == 0: newpos = self._off + pos
    elif whence == 1: newpos = self._pos + pos
    elif whence == 2: newpos = self._off + self._len + pos
    else: raise ValueError('invalid whence')
    return self._pos - self._off

    if self._pos != newpos and self._iter:
    self._stream_close()

    self._pos = newpos
    return newpos

    def __len__(self): return self._len
    def tell(self): return self._pos - self._off
    @@ -153,9 +209,13 @@ def read(self, size=-1):
    debug('caching tail')
    self.seek(-self._buffer_size, 2)
    self._tail = super().read(self._buffer_size)
    n = len(self._tail)
    self.raw._advance(n)
    else:
    n = len(self._tail)

    start = len(self._tail) - from_end
    end = len(self._tail) if size < 0 else min(len(self._tail), start+size)
    start = n - from_end
    end = n if size < 0 else min(n, start+size)
    chunk = self._tail[start:end]
    debug('cached tail read', len(chunk))
    self.seek(pos + len(chunk))
    @@ -164,7 +224,8 @@ def read(self, size=-1):
    # for larger reads, io.BufferedReader just gets in the way, so bypass it
    if size < 0 or size > self._buffer_size:
    if self._unbuffered:
    return self.raw.read(min(size, self._block_size))
    return self.raw._read(size, stream=True, chunk_size=self._block_size)
    #return self.raw.read(min(size, self._block_size))
    else:
    self._unbuffered = True
    debug('enter unbuffered mode')
    @@ -270,8 +331,10 @@ def open_member(name, r, z):

    sys.stdout.buffer.flush()
    else:
    # if no filename given, list the contents
    for zi in z.infolist():
    list_zipinfo(zi)
    # also list the contents of stored zip files
    file_ext = zi.filename[-4:].lower()
    if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip':
    h2 = zi_slice(r, z, zi)
  8. ryancdotorg revised this gist Feb 17, 2023. 1 changed file with 139 additions and 32 deletions.
    171 changes: 139 additions & 32 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -9,12 +9,15 @@

    import zipfile
    from zipfile import ZipFile
    from fnmatch import fnmatch

    structFileHeader = "<4s2B4HL2L2H"
    sizeFileHeader = struct.calcsize(structFileHeader)
    _FH_FILENAME_LENGTH = 10
    _FH_EXTRA_FIELD_LENGTH = 11

    BLOCK_SIZE = 1<<24

    __debug = False
    def debug(*args, **kwarg):
    if __debug:
    @@ -30,11 +33,16 @@ def __init__(self, response, message=None):

    # not a full implementation, just enough to use with BufferedReader
    class HttpIO(io.RawIOBase):
    def __init__(self, url, *, session=None, _head=True):
    self.session = requests.Session() if session is None else session
    def __init__(self, url, *, session=None, parent=None):
    if not session:
    session = parent.session if session else requests.Session()

    self.session = session
    self._parent = parent
    self._url, self._off, self._pos = url, 0, 0
    self._total = 0

    if _head:
    if not parent:
    # We assume the zip file doesn't change while this tool is in use
    r = self._head()
    if 'bytes' not in r.headers.get('Accept-Ranges'):
    @@ -53,26 +61,42 @@ def _get(self, headers=None):
    if r.status_code not in (200, 206): raise HttpError(r)
    return r

    def _advance(self, n):
    self._pos += n
    self._total += n
    if self._parent:
    self._parent._total += n

    def slice(self, offset, length=None):
    hio = self.__class__(self._url, session=self.session, _head=False)
    hio = self.__class__(self._url, session=self.session, parent=self)
    hio._off = offset
    hio._len = length if length is not None else self._len - self._off
    hio.seek(0)
    return hio

    def read(self, size=-1):
    if size == 0: return b''
    elif self._pos - self._off == 0 and (size < 0 or size >= self._off + self._len): end = -1
    elif size < 0: end = self._off + self._len
    else: end = min(self._off + self._len, self._pos + size) - 1
    debug('read', size, self._pos, end, self._len)
    if size == 0:
    # any empty byte string will do
    return b''
    elif self._off == 0 and self._pos == 0 and (size < 0 or size >= self._len):
    # entire file from the beginning, no range header needed
    end = None
    elif size < 0:
    # rest of the file
    end = self._off + self._len
    else:
    # requested range or rest of the file, whichever is less
    end = min(self._off + self._len, self._pos + size) - 1

    debug('read', size, self._pos - self._off, end - self._off, self._len)

    headers = {}
    if end >= self._pos: headers['Range'] = f'bytes={self._pos}-{end}'
    if end is not None:
    headers['Range'] = f'bytes={self._pos}-{end}'

    r = self._get(headers)
    self._pos += int(r.headers.get('Content-Length'))
    #debug('->', len(r.content), r.content)
    n = int(r.headers.get('Content-Length'))
    self._advance(n)
    return r.content

    def readall(self):
    @@ -87,8 +111,9 @@ def readinto(self, b):
    return n

    def seek(self, pos, whence=0):
    debug('seek', pos, whence)
    if whence == 0: self._pos = self._off + pos
    if whence != 0 or pos != self._pos - self._off:
    debug('seek', pos, whence, self._pos - self._off)
    if whence == 0: self._pos = self._off + pos
    elif whence == 1: self._pos += pos
    elif whence == 2: self._pos = self._off + self._len + pos
    else: raise ValueError('invalid whence')
    @@ -100,25 +125,69 @@ def writeable(self): return False
    def seekable(self): return True
    def readable(self): return True

    @property
    def closed(self):
    return False

    def __getattr__(self, name):
    if name in ('truncate', 'fileno', 'write'):
    raise OSError(f'{name} not supported')
    return None

    closed = property(lambda self: False)
    url = property(operator.attrgetter('_url'))

    class BufferedHttpIO(io.BufferedReader):
    def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE):
    super().__init__(httpio, buffer_size)
    self._buffer_size = buffer_size
    self._block_size = block_size
    self._unbuffered = False
    self._tail = None

    def read(self, size=-1):
    if self.raw._len >= self._buffer_size:
    pos = self.tell()
    from_end = self.raw._len - pos
    if from_end <= self._buffer_size:
    # ZipFile does several small reads and seeks near the end of the file,
    # so it's useful to cache the last buffer worth of data, since seek()
    # resets the read buffer
    if not self._tail:
    debug('caching tail')
    self.seek(-self._buffer_size, 2)
    self._tail = super().read(self._buffer_size)

    start = len(self._tail) - from_end
    end = len(self._tail) if size < 0 else min(len(self._tail), start+size)
    chunk = self._tail[start:end]
    debug('cached tail read', len(chunk))
    self.seek(pos + len(chunk))
    return chunk

    # for larger reads, io.BufferedReader just gets in the way, so bypass it
    if size < 0 or size > self._buffer_size:
    if self._unbuffered:
    return self.raw.read(min(size, self._block_size))
    else:
    self._unbuffered = True
    debug('enter unbuffered mode')
    if size < 0: size = self._buffer_size
    # return the buffer contents
    return self.read1(size)
    elif self._unbuffered:
    debug('exit unbuffered mode')
    # synchronize the file position
    self.seek(self.raw.tell())
    self._unbuffered = False

    # normal buffered read
    return super().read(size)

    parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.')
    parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file')
    parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?',
    help='filename within the zip file')

    args = parser.parse_args()
    h = HttpIO(args.url)
    r = io.BufferedReader(h, 1<<18)
    r = BufferedHttpIO(h)
    z = ZipFile(r)

    def list_zipinfo(zi, prefix=None):
    @@ -133,35 +202,69 @@ def list_zipinfo(zi, prefix=None):
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')

    def zi_slice(r, z, zi):
    # save position
    p = r.tell()
    r.seek(zi.header_offset)
    # get the zip member header
    fheader = r.read(sizeFileHeader)
    fheader = struct.unpack(structFileHeader, fheader)
    # restore position
    r.seek(p)

    # find the start of the actual data
    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    return r.raw.slice(data_offset, zi.compress_size)

    if args.filename:
    name, sep, iname = args.filename, '', ''
    def open_member(name, r, z):
    # try to glob the filename
    for c in '[]?*':
    if c not in name:
    continue

    pat1, sep, pat2 = name.partition(':')
    for zi in z.infolist():
    if not fnmatch(zi.filename, pat1):
    continue

    if pat2:
    if zi.filename[-4:].lower() != '.zip':
    continue

    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)

    for zi2 in z2.infolist():
    if fnmatch(zi2.filename, pat2):
    return z2.open(zi2.filename)
    else:
    return z.open(zi.filename)

    break

    try:
    zi = z.getinfo(name)
    except KeyError:
    name, sep, iname = name.partition(':')
    zi = z.getinfo(name)
    return z.open(name)
    except KeyError as e:
    oname, sep, name = name.partition(':')
    if not name or oname[-4:].lower() != '.zip':
    raise e

    zi = z.getinfo(oname)
    if zi.compress_type != zipfile.ZIP_STORED:
    raise ValueError(f'Nested zip file uses a method other than STORE!')

    if iname and zi.compress_type == zipfile.ZIP_STORED and name[-4:].lower() == '.zip':
    h2 = zi_slice(r, z, zi)
    r2 = io.BufferedReader(h2, 1<<18)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)
    zi = z2.getinfo(iname)
    f = z2.open(iname)
    else:
    f = z.open(name)
    return z2.open(name)

    if args.filename:
    f = open_member(args.filename, r, z)

    while True:
    chunk = f.read(1<<18)
    chunk = f.read1(-1)
    if len(chunk) == 0: break
    sys.stdout.buffer.write(chunk)

    @@ -171,6 +274,10 @@ def zi_slice(r, z, zi):
    list_zipinfo(zi)
    file_ext = zi.filename[-4:].lower()
    if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip':
    z2 = ZipFile(io.BufferedReader(zi_slice(r, z, zi), 1<<18))
    h2 = zi_slice(r, z, zi)
    r2 = BufferedHttpIO(h2)
    z2 = ZipFile(r2)
    for zi2 in z2.infolist():
    list_zipinfo(zi2, zi.filename)

    print(f'total bytes read: {h._total}', file=sys.stderr)
  9. ryancdotorg revised this gist Feb 17, 2023. 1 changed file with 98 additions and 21 deletions.
    119 changes: 98 additions & 21 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -5,9 +5,16 @@
    import argparse
    import requests
    import operator
    import struct

    import zipfile
    from zipfile import ZipFile

    structFileHeader = "<4s2B4HL2L2H"
    sizeFileHeader = struct.calcsize(structFileHeader)
    _FH_FILENAME_LENGTH = 10
    _FH_EXTRA_FIELD_LENGTH = 11

    __debug = False
    def debug(*args, **kwarg):
    if __debug:
    @@ -21,16 +28,19 @@ def __init__(self, response, message=None):
    else: self.message = message
    super().__init__(self.message)

    class HttpIO(io.IOBase):
    def __init__(self, url, session=None):
    # not a full implementation, just enough to use with BufferedReader
    class HttpIO(io.RawIOBase):
    def __init__(self, url, *, session=None, _head=True):
    self.session = requests.Session() if session is None else session
    self._url, self._pos = url, 0
    # We assume the zip file doesn't change while this tool is in use
    r = self._head()
    if 'bytes' not in r.headers.get('Accept-Ranges'):
    raise HttpError(r, 'byte ranges not supported by server')
    self._len = int(r.headers.get('Content-Length'))

    self._url, self._off, self._pos = url, 0, 0

    if _head:
    # We assume the zip file doesn't change while this tool is in use
    r = self._head()
    if 'bytes' not in r.headers.get('Accept-Ranges'):
    raise HttpError(r, 'byte ranges not supported by server')
    self._len = int(r.headers.get('Content-Length'))

    def _head(self):
    r = self.session.head(self._url, allow_redirects=True)
    if r.status_code != 200: raise HttpError(r)
    @@ -43,11 +53,18 @@ def _get(self, headers=None):
    if r.status_code not in (200, 206): raise HttpError(r)
    return r

    def slice(self, offset, length=None):
    hio = self.__class__(self._url, session=self.session, _head=False)
    hio._off = offset
    hio._len = length if length is not None else self._len - self._off
    hio.seek(0)
    return hio

    def read(self, size=-1):
    if size == 0: return b''
    elif self._pos == 0 and (size < 0 or size >= self._len): end = -1
    elif size < 0: end = self._len
    else: end = min(self._len, self._pos + size) - 1
    elif self._pos - self._off == 0 and (size < 0 or size >= self._off + self._len): end = -1
    elif size < 0: end = self._off + self._len
    else: end = min(self._off + self._len, self._pos + size) - 1
    debug('read', size, self._pos, end, self._len)

    headers = {}
    @@ -58,18 +75,34 @@ def read(self, size=-1):
    #debug('->', len(r.content), r.content)
    return r.content

    def readall(self):
    return self.read(-1)

    def readinto(self, b):
    b = (memoryview(b) if not isinstance(b, memoryview) else b).cast('B')
    n = len(b)
    data = self.read(n)
    n = len(data)
    b[:n] = data
    return n

    def seek(self, pos, whence=0):
    debug('seek', pos, whence)
    if whence == 0: self._pos = pos
    if whence == 0: self._pos = self._off + pos
    elif whence == 1: self._pos += pos
    elif whence == 2: self._pos = self._len + pos
    elif whence == 2: self._pos = self._off + self._len + pos
    else: raise ValueError('invalid whence')
    return self._pos
    return self._pos - self._off

    def __len__(self): return self._len
    def tell(self): return self._pos
    def tell(self): return self._pos - self._off
    def writeable(self): return False
    def seekable(self): return True
    def readable(self): return True

    @property
    def closed(self):
    return False

    def __getattr__(self, name):
    if name in ('truncate', 'fileno', 'write'):
    @@ -84,16 +117,60 @@ def __getattr__(self, name):
    help='filename within the zip file')

    args = parser.parse_args()

    z = ZipFile(HttpIO(args.url))
    h = HttpIO(args.url)
    r = io.BufferedReader(h, 1<<18)
    z = ZipFile(r)

    def list_zipinfo(zi, prefix=None):
    ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
    name = f'{prefix}:{zi.filename}' if prefix is not None else zi.filename
    method_id = zi.compress_type
    if method_id == zipfile.ZIP_STORED: method = 'S'
    elif method_id == zipfile.ZIP_DEFLATED: method = 'D'
    elif method_id == zipfile.ZIP_BZIP2: method = 'B'
    elif method_id == zipfile.ZIP_LZMA: method = 'L'
    else: method = '?'
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')

    def zi_slice(r, z, zi):
    p = r.tell()
    r.seek(zi.header_offset)
    fheader = r.read(sizeFileHeader)
    fheader = struct.unpack(structFileHeader, fheader)
    r.seek(p)

    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    return r.raw.slice(data_offset, zi.compress_size)

    if args.filename:
    f = z.open(args.filename)
    name, sep, iname = args.filename, '', ''
    try:
    zi = z.getinfo(name)
    except KeyError:
    name, sep, iname = name.partition(':')
    zi = z.getinfo(name)

    if iname and zi.compress_type == zipfile.ZIP_STORED and name[-4:].lower() == '.zip':
    h2 = zi_slice(r, z, zi)
    r2 = io.BufferedReader(h2, 1<<18)
    z2 = ZipFile(r2)
    zi = z2.getinfo(iname)
    f = z2.open(iname)
    else:
    f = z.open(name)

    while True:
    chunk = f.read(1<<18)
    if len(chunk) == 0: break
    sys.stdout.buffer.write(chunk)

    sys.stdout.buffer.flush()
    else:
    for zi in z.infolist():
    ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {zi.filename}')
    list_zipinfo(zi)
    file_ext = zi.filename[-4:].lower()
    if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip':
    z2 = ZipFile(io.BufferedReader(zi_slice(r, z, zi), 1<<18))
    for zi2 in z2.infolist():
    list_zipinfo(zi2, zi.filename)
  10. ryancdotorg created this gist Nov 8, 2021.
    99 changes: 99 additions & 0 deletions wzip.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,99 @@
    #!/usr/bin/env python3

    import io, sys
    import datetime
    import argparse
    import requests
    import operator

    from zipfile import ZipFile

    __debug = False
    def debug(*args, **kwarg):
    if __debug:
    if 'file' not in kwarg: kwarg['file'] = sys.stderr
    print(*args, **kwarg)

    class HttpError(IOError):
    def __init__(self, response, message=None):
    self.response = response
    if message is None: self.message = f'http status {response.status_code}'
    else: self.message = message
    super().__init__(self.message)

    class HttpIO(io.IOBase):
    def __init__(self, url, session=None):
    self.session = requests.Session() if session is None else session
    self._url, self._pos = url, 0
    # We assume the zip file doesn't change while this tool is in use
    r = self._head()
    if 'bytes' not in r.headers.get('Accept-Ranges'):
    raise HttpError(r, 'byte ranges not supported by server')
    self._len = int(r.headers.get('Content-Length'))

    def _head(self):
    r = self.session.head(self._url, allow_redirects=True)
    if r.status_code != 200: raise HttpError(r)
    # Update URL if there was a redirect
    if r.url != self._url: self._url = r.url
    return r

    def _get(self, headers=None):
    r = self.session.get(self._url, headers=headers)
    if r.status_code not in (200, 206): raise HttpError(r)
    return r

    def read(self, size=-1):
    if size == 0: return b''
    elif self._pos == 0 and (size < 0 or size >= self._len): end = -1
    elif size < 0: end = self._len
    else: end = min(self._len, self._pos + size) - 1
    debug('read', size, self._pos, end, self._len)

    headers = {}
    if end >= self._pos: headers['Range'] = f'bytes={self._pos}-{end}'

    r = self._get(headers)
    self._pos += int(r.headers.get('Content-Length'))
    #debug('->', len(r.content), r.content)
    return r.content

    def seek(self, pos, whence=0):
    debug('seek', pos, whence)
    if whence == 0: self._pos = pos
    elif whence == 1: self._pos += pos
    elif whence == 2: self._pos = self._len + pos
    else: raise ValueError('invalid whence')
    return self._pos

    def __len__(self): return self._len
    def tell(self): return self._pos
    def writeable(self): return False
    def seekable(self): return True

    def __getattr__(self, name):
    if name in ('truncate', 'fileno', 'write'):
    raise OSError(f'{name} not supported')
    return None

    url = property(operator.attrgetter('_url'))

    parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.')
    parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file')
    parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?',
    help='filename within the zip file')

    args = parser.parse_args()

    z = ZipFile(HttpIO(args.url))

    if args.filename:
    f = z.open(args.filename)
    while True:
    chunk = f.read(1<<18)
    if len(chunk) == 0: break
    sys.stdout.buffer.write(chunk)
    else:
    for zi in z.infolist():
    ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {zi.filename}')