Last active
January 25, 2025 22:46
-
-
Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.
Revisions
-
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -17,7 +17,7 @@ _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 BLOCK_SIZE = 1<<18 __debug = False def debug(*args, **kwarg): -
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 5 additions and 5 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -36,9 +36,9 @@ def __init__(self, response, message=None): class _HttpIO(io.RawIOBase): def __init__(self, url, *, session=None, parent=None): if not session: session = parent._session if parent else requests.Session() self._session = session self._parent = parent self._url, self._off = url, 0 self._pos, self._total = 0, 0 @@ -55,7 +55,7 @@ def __init__(self, url, *, session=None, parent=None): def _head(self): self._heads += 1 if self._parent: self._parent._heads += 1 r = self._session.head(self._url, allow_redirects=True) if r.status_code != 200: raise HttpError(r) # Update URL if there was a redirect if r.url != self._url: self._url = r.url @@ -64,7 +64,7 @@ def _head(self): def _get(self, headers=None, *, stream=False): self._gets += 1 if self._parent: self._parent._gets += 1 r = self._session.get(self._url, headers=headers, stream=stream) if r.status_code not in (200, 206): raise HttpError(r) return r @@ -75,7 +75,7 @@ def _advance(self, n): self._parent._total += n def slice(self, offset, length=None): hio = self.__class__(self._url, session=self._session, parent=self) hio._off = offset hio._len = length if length is not None else self._len - self._off hio.seek(0) -
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 6 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -42,6 +42,7 @@ def __init__(self, url, *, session=None, parent=None): self._parent = parent self._url, self._off = url, 0 self._pos, self._total = 0, 0 self._gets, self._heads = 0, 0 self._response, self._iter, self._next = None, None, None if not parent: @@ -52,13 +53,17 @@ def __init__(self, url, *, session=None, parent=None): self._len = int(r.headers.get('Content-Length')) def _head(self): self._heads += 1 if self._parent: self._parent._heads += 1 r = self.session.head(self._url, allow_redirects=True) if r.status_code != 200: raise HttpError(r) # Update URL if there was a redirect if r.url != self._url: self._url = r.url return r def _get(self, headers=None, *, stream=False): self._gets += 1 if self._parent: self._parent._gets += 1 r = self.session.get(self._url, headers=headers, stream=stream) if r.status_code not in (200, 206): raise HttpError(r) return r @@ -363,4 +368,4 @@ def list_zipinfo(zi, parents=None): for _, zi, parents in z.infolist_nested(): list_zipinfo(zi, parents) print(f'heads={z.httpio._heads} gets={z.httpio._gets} bytes_read={z.httpio._total}', file=sys.stderr) -
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 8 additions and 8 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -76,13 +76,6 @@ def slice(self, offset, length=None): hio.seek(0) return hio def _stream_close(self): debug('_stream_close') if self._response: self._response.close() @@ -138,10 +131,17 @@ def _read(self, size=-1, *, stream=False, chunk_size=BLOCK_SIZE): if end is not None: headers['Range'] = f'bytes={self._pos}-{end}' # reading multiple sequential chunks without streaming would require a request # to the server for each chunk, involving potentially slow network round trips actually_stream = bool(stream and (size < 0 or size >= chunk_size)) r = self._get(headers, stream=actually_stream) if actually_stream: # set up response streaming debug(f'_stream_init chunk_size={chunk_size}') self._response = r self._iter = r.iter_content(chunk_size) self._next = next(self._iter, None) # return the first chunk return self._stream_read(size) n = int(r.headers.get('Content-Length')) -
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 92 additions and 79 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -33,7 +33,7 @@ def __init__(self, response, message=None): super().__init__(self.message) # not a full implementation, just enough to use with BufferedReader class _HttpIO(io.RawIOBase): def __init__(self, url, *, session=None, parent=None): if not session: session = parent.session if session else requests.Session() @@ -188,8 +188,11 @@ def __getattr__(self, name): closed = property(lambda self: False) url = property(operator.attrgetter('_url')) # helper class that automatically bypasses the buffer for bulk reads, and tries to # cache the end of central directory record from the end of a zip file class _BufferedHttpIO(io.BufferedReader): def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE): assert isinstance(httpio, _HttpIO) super().__init__(httpio, buffer_size) self._buffer_size = buffer_size self._block_size = block_size @@ -240,53 +243,90 @@ def read(self, size=-1): # normal buffered read return super().read(size) class ZipHttp(ZipFile): def __init__(self, httpio, *args, **kwargs): assert isinstance(httpio, _HttpIO) self._args, self._kwargs = args, kwargs bkw = {} if 'buffer_size' in kwargs: bkw['buffer_size'] = kwargs.pop('buffer_size') if 'block_size' in kwargs: bkw['block_size'] = kwargs.pop('block_size') reader = _BufferedHttpIO(httpio, **bkw) super().__init__(reader, *args, **kwargs) self._httpio, self._reader = httpio, reader def sub(self, zi): assert zi.compress_type == ZIP_STORED saved_position = self._reader.tell() # get the zip member header self._reader.seek(zi.header_offset) fheader = self._reader.read(sizeFileHeader) fheader = struct.unpack(structFileHeader, fheader) self._reader.seek(saved_position) # find the start of 
the actual data skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] data_offset = zi.header_offset + sizeFileHeader + skip httpio = self.httpio.slice(data_offset, zi.compress_size) return self.__class__(httpio, *self._args, **self._kwargs) return self.httpio.slice(data_offset, zi.compress_size) def infolist_nested(self, max_depth=1, parents=None, *, pattern=None): parents = parents or () for zi in self.infolist(): yield self, zi, parents # recursion limit if max_depth < 0: continue # only try to recurse STORED .zip files if zi.compress_type != ZIP_STORED: continue if zi.filename[-4:].lower() != '.zip': continue # if a pattern was provided, don't recurse unless it matches if pattern is not None and not fnmatch(zi.filename, pattern): continue with self.sub(zi) as z2: yield from z2.infolist_nested(max_depth - 1, parents + (zi.filename,)) def open_nested(self, name): # try to glob the filename for c in '[]?*': if c not in name: continue # if there's no colon in the name, pat1 and sep will be empty strings pat1, sep, pat2 = name.rpartition(':') for z2, zi, parents in self.infolist_nested(pattern=pat1): if not parents: if sep or not fnmatch(zi.filename, pat2): continue else: if not pat1 or not fnmatch(':'.join(parents), pat1): continue if not fnmatch(zi.filename, pat2): continue return z2.open(zi.filename) try: zi = self.getinfo(name) return self.open(name) except KeyError as e: oname, sep, name = name.rpartition(':') if sep and oname[-4:].lower() != '.zip': raise e try: zi = self.getinfo(oname) except: raise e if zi.compress_type != ZIP_STORED: raise ValueError(f'Nested zip file uses a method other than STORE!') z2 = self.sub(zi) return z2.open(name) httpio = property(operator.attrgetter('_httpio')) def list_zipinfo(zi, parents=None): parents = parents or () @@ -301,43 +341,16 @@ def list_zipinfo(zi, parents=None): else: method = '?' 
print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}') parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.') parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file') parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?', help='filename within the zip file') args = parser.parse_args() z = ZipHttp(_HttpIO(args.url)) if args.filename: f = z.open_nested(args.filename) while True: chunk = f.read1(-1) @@ -347,7 +360,7 @@ def open_member(name, r, z): sys.stdout.buffer.flush() else: # if no filename given, list the contents for _, zi, parents in z.infolist_nested(): list_zipinfo(zi, parents) print(f'total bytes read: {z.httpio._total}', file=sys.stderr) -
ryancdotorg revised this gist
Feb 18, 2023. 1 changed file with 57 additions and 50 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,8 +9,7 @@ import operator import struct from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED, ZIP_BZIP2, ZIP_LZMA from fnmatch import fnmatch structFileHeader = "<4s2B4HL2L2H" @@ -251,69 +250,85 @@ def read(self, size=-1): r = BufferedHttpIO(h) z = ZipFile(r) def zi_slice(r, z, zi): assert zi.compress_type == ZIP_STORED saved_position = r.tell() # get the zip member header r.seek(zi.header_offset) fheader = r.read(sizeFileHeader) fheader = struct.unpack(structFileHeader, fheader) r.seek(saved_position) # find the start of the actual data skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] data_offset = zi.header_offset + sizeFileHeader + skip return r.raw.slice(data_offset, zi.compress_size) def infolist_nested(r, z, max_depth=1, parents=None, *, pattern=None): parents = parents or () for zi in z.infolist(): yield z, zi, parents # recursion limit if max_depth < 0: continue # only try to recurse STORED .zip files if zi.compress_type != ZIP_STORED: continue if zi.filename[-4:].lower() != '.zip': continue # if a pattern was provided, don't recurse unless it matches if pattern is not None and not fnmatch(zi.filename, pattern): continue h2 = zi_slice(r, z, zi) r2 = BufferedHttpIO(h2) with ZipFile(r2) as z2: yield from infolist_nested(r2, z2, max_depth - 1, parents + (zi.filename,)) def list_zipinfo(zi, parents=None): parents = parents or () ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S') name = ':'.join(parents + (zi.filename,)) method_id = zi.compress_type if method_id == ZIP_STORED: method = 'S' elif method_id == ZIP_DEFLATED: method = 'D' elif method_id == ZIP_BZIP2: method = 'B' elif method_id == ZIP_LZMA: 
method = 'L' else: method = '?' print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}') def open_member(name, r, z): # try to glob the filename for c in '[]?*': if c not in name: continue pat1, sep, pat2 = name.rpartition(':') for z2, zi, parents in infolist_nested(r, z, pattern=pat1): if not parents: if sep or not fnmatch(zi.filename, pat2): continue else: if not pat1 or not fnmatch(':'.join(parents), pat1): continue if not fnmatch(zi.filename, pat2): continue return z2.open(zi.filename) try: zi = z.getinfo(name) return z.open(name) except KeyError as e: oname, sep, name = name.rpartition(':') if sep and oname[-4:].lower() != '.zip': raise e try: zi = z.getinfo(oname) except: raise e if zi.compress_type != ZIP_STORED: raise ValueError(f'Nested zip file uses a method other than STORE!') h2 = zi_slice(r, z, zi) @@ -332,15 +347,7 @@ def open_member(name, r, z): sys.stdout.buffer.flush() else: # if no filename given, list the contents for _, zi, parents in infolist_nested(r, z): list_zipinfo(zi, parents) print(f'total bytes read: {h._total}', file=sys.stderr) -
ryancdotorg revised this gist
Feb 17, 2023. 1 changed file with 77 additions and 14 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: 0BSD or CC0-1.0 or MIT-0 or Unlicense # Copyright (c) 2023, Ryan Castellucci, No Rights Reserved import io, sys import datetime @@ -16,7 +18,7 @@ _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 BLOCK_SIZE = 1<<20 __debug = False def debug(*args, **kwarg): @@ -39,8 +41,9 @@ def __init__(self, url, *, session=None, parent=None): self.session = session self._parent = parent self._url, self._off = url, 0 self._pos, self._total = 0, 0 self._response, self._iter, self._next = None, None, None if not parent: # We assume the zip file doesn't change while this tool is in use @@ -56,8 +59,8 @@ def _head(self): if r.url != self._url: self._url = r.url return r def _get(self, headers=None, *, stream=False): r = self.session.get(self._url, headers=headers, stream=stream) if r.status_code not in (200, 206): raise HttpError(r) return r @@ -74,8 +77,50 @@ def slice(self, offset, length=None): hio.seek(0) return hio # set up response streaming and return first chunk def _stream_init(self, response, chunk_size=None): debug(f'_stream_init chunk_size={chunk_size}') self._response = response self._iter = response.iter_content(chunk_size) self._next = next(self._iter, None) def _stream_close(self): debug('_stream_close') if self._response: self._response.close() self._response, self._iter, self._next = None, None, None def _stream_read(self, size=-1): debug(f'_stream_read avail={len(self._next) if self._next else 0}, requested={size}') assert self._iter is not None if self._next is None: # this implies that the response iterator was exausted on its first call, # which should not happen, but better safe than 
sorry self._stream_close() size = BLOCK_SIZE if size < 0 else min(size, BLOCK_SIZE) return self.read(size) elif size >= 0 and size < len(self._next): # reading less than the available amount of data isn't fully handled chunk = self._next[:size] self._stream_close() else: chunk = self._next # the response iterator only gets marked as exausted upon when it is asked # to generate a chunk but has no data available - read ahead one chunk # since the reader may stop after the last one it expects self._next = next(self._iter, None) if self._next is None: self._stream_close() self._advance(len(chunk)) return chunk def read(self, size=-1): return self._read(size) def _read(self, size=-1, *, stream=False, chunk_size=BLOCK_SIZE): if self._iter: return self._stream_read(size) elif size == 0: # any empty byte string will do return b'' elif self._off == 0 and self._pos == 0 and (size < 0 or size >= self._len): @@ -94,7 +139,12 @@ def read(self, size=-1): if end is not None: headers['Range'] = f'bytes={self._pos}-{end}' actually_stream = bool(stream and (size < 0 or size >= chunk_size)) r = self._get(headers, stream=actually_stream) if actually_stream: self._stream_init(r, chunk_size) return self._stream_read(size) n = int(r.headers.get('Content-Length')) self._advance(n) return r.content @@ -113,11 +163,17 @@ def readinto(self, b): def seek(self, pos, whence=0): if whence != 0 or pos != self._pos - self._off: debug('seek', pos, whence, self._pos - self._off) if whence == 0: newpos = self._off + pos elif whence == 1: newpos = self._pos + pos elif whence == 2: newpos = self._off + self._len + pos else: raise ValueError('invalid whence') if self._pos != newpos and self._iter: self._stream_close() self._pos = newpos return newpos def __len__(self): return self._len def tell(self): return self._pos - self._off @@ -153,9 +209,13 @@ def read(self, size=-1): debug('caching tail') self.seek(-self._buffer_size, 2) self._tail = super().read(self._buffer_size) n = len(self._tail) 
self.raw._advance(n) else: n = len(self._tail) start = n - from_end end = n if size < 0 else min(n, start+size) chunk = self._tail[start:end] debug('cached tail read', len(chunk)) self.seek(pos + len(chunk)) @@ -164,7 +224,8 @@ def read(self, size=-1): # for larger reads, io.BufferedReader just gets in the way, so bypass it if size < 0 or size > self._buffer_size: if self._unbuffered: return self.raw._read(size, stream=True, chunk_size=self._block_size) #return self.raw.read(min(size, self._block_size)) else: self._unbuffered = True debug('enter unbuffered mode') @@ -270,8 +331,10 @@ def open_member(name, r, z): sys.stdout.buffer.flush() else: # if no filename given, list the contents for zi in z.infolist(): list_zipinfo(zi) # also list the contents of stored zip files file_ext = zi.filename[-4:].lower() if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip': h2 = zi_slice(r, z, zi) -
ryancdotorg revised this gist
Feb 17, 2023. 1 changed file with 139 additions and 32 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,12 +9,15 @@ import zipfile from zipfile import ZipFile from fnmatch import fnmatch structFileHeader = "<4s2B4HL2L2H" sizeFileHeader = struct.calcsize(structFileHeader) _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 BLOCK_SIZE = 1<<24 __debug = False def debug(*args, **kwarg): if __debug: @@ -30,11 +33,16 @@ def __init__(self, response, message=None): # not a full implementation, just enough to use with BufferedReader class HttpIO(io.RawIOBase): def __init__(self, url, *, session=None, parent=None): if not session: session = parent.session if session else requests.Session() self.session = session self._parent = parent self._url, self._off, self._pos = url, 0, 0 self._total = 0 if not parent: # We assume the zip file doesn't change while this tool is in use r = self._head() if 'bytes' not in r.headers.get('Accept-Ranges'): @@ -53,26 +61,42 @@ def _get(self, headers=None): if r.status_code not in (200, 206): raise HttpError(r) return r def _advance(self, n): self._pos += n self._total += n if self._parent: self._parent._total += n def slice(self, offset, length=None): hio = self.__class__(self._url, session=self.session, parent=self) hio._off = offset hio._len = length if length is not None else self._len - self._off hio.seek(0) return hio def read(self, size=-1): if size == 0: # any empty byte string will do return b'' elif self._off == 0 and self._pos == 0 and (size < 0 or size >= self._len): # entire file from the begining, no range header needed end = None elif size < 0: # rest of the file end = self._off + self._len else: # requested range or rest of the file, whichever is less end = min(self._off + self._len, self._pos + size) - 1 
debug('read', size, self._pos - self._off, end - self._off, self._len) headers = {} if end is not None: headers['Range'] = f'bytes={self._pos}-{end}' r = self._get(headers) n = int(r.headers.get('Content-Length')) self._advance(n) return r.content def readall(self): @@ -87,8 +111,9 @@ def readinto(self, b): return n def seek(self, pos, whence=0): if whence != 0 or pos != self._pos - self._off: debug('seek', pos, whence, self._pos - self._off) if whence == 0: self._pos = self._off + pos elif whence == 1: self._pos += pos elif whence == 2: self._pos = self._off + self._len + pos else: raise ValueError('invalid whence') @@ -100,25 +125,69 @@ def writeable(self): return False def seekable(self): return True def readable(self): return True def __getattr__(self, name): if name in ('truncate', 'fileno', 'write'): raise OSError(f'{name} not supported') return None closed = property(lambda self: False) url = property(operator.attrgetter('_url')) class BufferedHttpIO(io.BufferedReader): def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE): super().__init__(httpio, buffer_size) self._buffer_size = buffer_size self._block_size = block_size self._unbuffered = False self._tail = None def read(self, size=-1): if self.raw._len >= self._buffer_size: pos = self.tell() from_end = self.raw._len - pos if from_end <= self._buffer_size: # ZipFile does several small reads and seeks near the end of the file, # so it's useful to cache the last buffer worth of data, since seek() # resets the read buffer if not self._tail: debug('caching tail') self.seek(-self._buffer_size, 2) self._tail = super().read(self._buffer_size) start = len(self._tail) - from_end end = len(self._tail) if size < 0 else min(len(self._tail), start+size) chunk = self._tail[start:end] debug('cached tail read', len(chunk)) self.seek(pos + len(chunk)) return chunk # for larger reads, io.BufferedReader just gets in the way, so bypass it if size < 0 or size > self._buffer_size: if self._unbuffered: return 
self.raw.read(min(size, self._block_size)) else: self._unbuffered = True debug('enter unbuffered mode') if size < 0: size = self._buffer_size # return the buffer contents return self.read1(size) elif self._unbuffered: debug('exit unbuffered mode') # syncronize the file position self.seek(self.raw.tell()) self._unbuffered = False # normal buffered read return super().read(size) parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.') parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file') parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?', help='filename within the zip file') args = parser.parse_args() h = HttpIO(args.url) r = BufferedHttpIO(h) z = ZipFile(r) def list_zipinfo(zi, prefix=None): @@ -133,35 +202,69 @@ def list_zipinfo(zi, prefix=None): print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}') def zi_slice(r, z, zi): # save position p = r.tell() r.seek(zi.header_offset) # get the zip member header fheader = r.read(sizeFileHeader) fheader = struct.unpack(structFileHeader, fheader) # restore position r.seek(p) # find the start of the actual data skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] data_offset = zi.header_offset + sizeFileHeader + skip return r.raw.slice(data_offset, zi.compress_size) def open_member(name, r, z): # try to glob the filename for c in '[]?*': if c not in name: continue pat1, sep, pat2 = name.partition(':') for zi in z.infolist(): if not fnmatch(zi.filename, pat1): continue if pat2: if zi.filename[-4:].lower() != '.zip': continue h2 = zi_slice(r, z, zi) r2 = BufferedHttpIO(h2) z2 = ZipFile(r2) for zi2 in z2.infolist(): if fnmatch(zi2.filename, pat2): return z2.open(zi2.filename) else: return z.open(zi.filename) break try: zi = z.getinfo(name) return z.open(name) except KeyError as e: oname, sep, name = name.partition(':') if not name or oname[-4:].lower() != '.zip': raise e zi = z.getinfo(oname) if zi.compress_type != 
zipfile.ZIP_STORED: raise ValueError(f'Nested zip file uses a method other than STORE!') h2 = zi_slice(r, z, zi) r2 = BufferedHttpIO(h2) z2 = ZipFile(r2) return z2.open(name) if args.filename: f = open_member(args.filename, r, z) while True: chunk = f.read1(-1) if len(chunk) == 0: break sys.stdout.buffer.write(chunk) @@ -171,6 +274,10 @@ def zi_slice(r, z, zi): list_zipinfo(zi) file_ext = zi.filename[-4:].lower() if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip': h2 = zi_slice(r, z, zi) r2 = BufferedHttpIO(h2) z2 = ZipFile(r2) for zi2 in z2.infolist(): list_zipinfo(zi2, zi.filename) print(f'total bytes read: {h._total}', file=sys.stderr) -
ryancdotorg revised this gist
Feb 17, 2023. 1 changed file with 98 additions and 21 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,9 +5,16 @@ import argparse import requests import operator import struct import zipfile from zipfile import ZipFile structFileHeader = "<4s2B4HL2L2H" sizeFileHeader = struct.calcsize(structFileHeader) _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 __debug = False def debug(*args, **kwarg): if __debug: @@ -21,16 +28,19 @@ def __init__(self, response, message=None): else: self.message = message super().__init__(self.message) # not a full implementation, just enough to use with BufferedReader class HttpIO(io.RawIOBase): def __init__(self, url, *, session=None, _head=True): self.session = requests.Session() if session is None else session self._url, self._off, self._pos = url, 0, 0 if _head: # We assume the zip file doesn't change while this tool is in use r = self._head() if 'bytes' not in r.headers.get('Accept-Ranges'): raise HttpError(r, 'byte ranges not supported by server') self._len = int(r.headers.get('Content-Length')) def _head(self): r = self.session.head(self._url, allow_redirects=True) if r.status_code != 200: raise HttpError(r) @@ -43,11 +53,18 @@ def _get(self, headers=None): if r.status_code not in (200, 206): raise HttpError(r) return r def slice(self, offset, length=None): hio = self.__class__(self._url, session=self.session, _head=False) hio._off = offset hio._len = length if length is not None else self._len - self._off hio.seek(0) return hio def read(self, size=-1): if size == 0: return b'' elif self._pos - self._off == 0 and (size < 0 or size >= self._off + self._len): end = -1 elif size < 0: end = self._off + self._len else: end = min(self._off + self._len, self._pos + size) - 1 debug('read', size, self._pos, end, self._len) 
headers = {} @@ -58,18 +75,34 @@ def read(self, size=-1): #debug('->', len(r.content), r.content) return r.content def readall(self): return self.read(-1) def readinto(self, b): b = (memoryview(b) if not isinstance(b, memoryview) else b).cast('B') n = len(b) data = self.read(n) n = len(data) b[:n] = data return n def seek(self, pos, whence=0): debug('seek', pos, whence) if whence == 0: self._pos = self._off + pos elif whence == 1: self._pos += pos elif whence == 2: self._pos = self._off + self._len + pos else: raise ValueError('invalid whence') return self._pos - self._off def __len__(self): return self._len def tell(self): return self._pos - self._off def writeable(self): return False def seekable(self): return True def readable(self): return True @property def closed(self): return False def __getattr__(self, name): if name in ('truncate', 'fileno', 'write'): @@ -84,16 +117,60 @@ def __getattr__(self, name): help='filename within the zip file') args = parser.parse_args() h = HttpIO(args.url) r = io.BufferedReader(h, 1<<18) z = ZipFile(r) def list_zipinfo(zi, prefix=None): ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S') name = f'{prefix}:{zi.filename}' if prefix is not None else zi.filename method_id = zi.compress_type if method_id == zipfile.ZIP_STORED: method = 'S' elif method_id == zipfile.ZIP_DEFLATED: method = 'D' elif method_id == zipfile.ZIP_BZIP2: method = 'B' elif method_id == zipfile.ZIP_LZMA: method = 'L' else: method = '?' 
print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}') def zi_slice(r, z, zi): p = r.tell() r.seek(zi.header_offset) fheader = r.read(sizeFileHeader) fheader = struct.unpack(structFileHeader, fheader) r.seek(p) skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] data_offset = zi.header_offset + sizeFileHeader + skip return r.raw.slice(data_offset, zi.compress_size) if args.filename: name, sep, iname = args.filename, '', '' try: zi = z.getinfo(name) except KeyError: name, sep, iname = name.partition(':') zi = z.getinfo(name) if iname and zi.compress_type == zipfile.ZIP_STORED and name[-4:].lower() == '.zip': h2 = zi_slice(r, z, zi) r2 = io.BufferedReader(h2, 1<<18) z2 = ZipFile(r2) zi = z2.getinfo(iname) f = z2.open(iname) else: f = z.open(name) while True: chunk = f.read(1<<18) if len(chunk) == 0: break sys.stdout.buffer.write(chunk) sys.stdout.buffer.flush() else: for zi in z.infolist(): list_zipinfo(zi) file_ext = zi.filename[-4:].lower() if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip': z2 = ZipFile(io.BufferedReader(zi_slice(r, z, zi), 1<<18)) for zi2 in z2.infolist(): list_zipinfo(zi2, zi.filename) -
ryancdotorg created this gist
Nov 8, 2021. There are no files selected for viewing
#!/usr/bin/env python3
"""List or extract members of a zip file over HTTP without downloading
the whole archive, by issuing HTTP Range requests for only the parts
(central directory, member data) that zipfile actually reads."""
import io, sys
import datetime
import argparse
import requests
import operator
from zipfile import ZipFile

# Module-level switch for diagnostic output; off by default.
__debug = False

def debug(*args, **kwargs):
    """Print diagnostics (to stderr unless `file` is given) when __debug is set."""
    if __debug:
        if 'file' not in kwargs:
            kwargs['file'] = sys.stderr
        print(*args, **kwargs)

class HttpError(IOError):
    """IOError that carries the failed requests.Response object.

    The default message is derived from the HTTP status code.
    """
    def __init__(self, response, message=None):
        self.response = response
        self.message = f'http status {response.status_code}' if message is None else message
        super().__init__(self.message)

class HttpIO(io.IOBase):
    """Read-only, seekable file-like object backed by HTTP Range requests.

    Not a complete io implementation -- just enough for zipfile.ZipFile.
    Assumes the remote file does not change while this object is in use.
    """
    def __init__(self, url, session=None):
        self.session = requests.Session() if session is None else session
        self._url, self._pos = url, 0
        # Probe the server once for length and Range support.
        r = self._head()
        # BUGFIX: headers.get() previously had no default, so a server that
        # omits Accept-Ranges made `in None` raise TypeError rather than
        # the intended HttpError.
        if 'bytes' not in r.headers.get('Accept-Ranges', ''):
            raise HttpError(r, 'byte ranges not supported by server')
        self._len = int(r.headers.get('Content-Length'))

    def _head(self):
        """HEAD the URL, following redirects; raise HttpError unless 200."""
        r = self.session.head(self._url, allow_redirects=True)
        if r.status_code != 200: raise HttpError(r)
        # Update URL if there was a redirect, so later GETs skip the hop.
        if r.url != self._url: self._url = r.url
        return r

    def _get(self, headers=None):
        """GET the URL; accept 200 (whole file) or 206 (partial content)."""
        r = self.session.get(self._url, headers=headers)
        if r.status_code not in (200, 206): raise HttpError(r)
        return r

    def read(self, size=-1):
        """Read up to `size` bytes from the current position (-1 = to EOF)."""
        if size == 0:
            # any empty byte string will do
            return b''
        if self._pos >= self._len:
            # BUGFIX: at or past EOF, return b'' instead of issuing a request
            # that would either 416 or (with no Range header) refetch the
            # entire file and corrupt the position.
            return b''
        if self._pos == 0 and (size < 0 or size >= self._len):
            # Entire file from the beginning: no Range header needed.
            end = None
        elif size < 0:
            # Rest of the file. HTTP Range ends are inclusive, hence len - 1
            # (BUGFIX: was self._len, one byte past the final valid offset).
            end = self._len - 1
        else:
            # Requested amount or rest of the file, whichever is less.
            end = min(self._len, self._pos + size) - 1
        debug('read', size, self._pos, end, self._len)
        headers = {}
        if end is not None:
            headers['Range'] = f'bytes={self._pos}-{end}'
        r = self._get(headers)
        # Advance by what the server actually returned.
        self._pos += int(r.headers.get('Content-Length'))
        return r.content

    def seek(self, pos, whence=0):
        """Set the stream position; supports SEEK_SET/SEEK_CUR/SEEK_END."""
        debug('seek', pos, whence)
        if whence == 0: self._pos = pos
        elif whence == 1: self._pos += pos
        elif whence == 2: self._pos = self._len + pos
        else: raise ValueError('invalid whence')
        return self._pos

    def __len__(self): return self._len
    def tell(self): return self._pos
    # BUGFIX: the io protocol spells this writable(); the original defined
    # only the misspelled writeable(), which nothing ever calls. Keep the
    # misspelling as an alias in case external code referenced it.
    def writable(self): return False
    writeable = writable
    def readable(self): return True
    def seekable(self): return True

    def __getattr__(self, name):
        # Refuse write-side operations explicitly; report every other unknown
        # attribute as None. NOTE(review): returning None instead of raising
        # AttributeError is unusual, but preserved -- callers probing optional
        # attributes treat a falsy value as "unsupported".
        if name in ('truncate', 'fileno', 'write'):
            raise OSError(f'{name} not supported')
        return None

    url = property(operator.attrgetter('_url'))

def main():
    """CLI entry point: list the archive, or stream one member to stdout."""
    parser = argparse.ArgumentParser(description='Operate on zip files over HTTP.')
    parser.add_argument('url', metavar='URL', type=str, help='URL of a zip file')
    parser.add_argument('filename', metavar='FILENAME', type=str, nargs='?',
                        help='filename within the zip file')
    args = parser.parse_args()
    z = ZipFile(HttpIO(args.url))
    if args.filename:
        # Stream the requested member to stdout in 256 KiB chunks.
        f = z.open(args.filename)
        while True:
            chunk = f.read(1<<18)
            if len(chunk) == 0: break
            sys.stdout.buffer.write(chunk)
    else:
        # No filename given: list the contents.
        for zi in z.infolist():
            ts = datetime.datetime(*zi.date_time).strftime('%Y-%m-%d %H:%M:%S')
            print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {zi.filename}')

if __name__ == '__main__':
    main()