Skip to content

Instantly share code, notes, and snippets.

@ryancdotorg
Last active January 25, 2025 22:46
Show Gist options
  • Select an option

  • Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.

Select an option

Save ryancdotorg/fc98c86887d346b658eb51a5fdd831df to your computer and use it in GitHub Desktop.
Partial/streaming zip downloader
#!/usr/bin/env python3
import io, sys
import datetime
import argparse
import requests
import operator
import struct
import zipfile
from zipfile import ZipFile
from fnmatch import fnmatch
# struct format of the fixed-size part of a zip local file header, and its
# size in bytes; zi_slice() unpacks member headers with it.
# NOTE(review): these look like copies of the private constants of the same
# names in the stdlib zipfile module - confirm they stay in sync.
structFileHeader = "<4s2B4HL2L2H"
sizeFileHeader = struct.calcsize(structFileHeader)
# Indices into the unpacked header tuple: the lengths of the variable-size
# filename and extra field that follow the fixed-size header.
_FH_FILENAME_LENGTH = 10
_FH_EXTRA_FIELD_LENGTH = 11
# Default cap (16 MiB) on a single unbuffered read in BufferedHttpIO.
BLOCK_SIZE = 1<<24
# Set to True to have debug() print tracing output.
__debug = False
def debug(*args, **kwarg):
    """Print tracing output when the module-level ``__debug`` flag is set.

    Accepts the same arguments as ``print``; output goes to ``sys.stderr``
    unless the caller supplies an explicit ``file`` keyword.
    """
    if not __debug:
        return
    kwarg.setdefault('file', sys.stderr)
    print(*args, **kwarg)
class HttpError(IOError):
    """I/O error raised for an unusable HTTP response.

    Attributes:
        response: the response object that triggered the error.
        message: the supplied text, or ``'http status <code>'`` built from
            ``response.status_code`` when no message was given.
    """

    def __init__(self, response, message=None):
        self.response = response
        self.message = (
            f'http status {response.status_code}' if message is None else message
        )
        super().__init__(self.message)
# not a full implementation, just enough to use with BufferedReader
class HttpIO(io.RawIOBase):
    """Read-only, seekable raw stream backed by HTTP range requests.

    A root instance (no ``parent``) issues a HEAD request to learn the
    resource length and to check that the server accepts byte ranges.
    ``slice()`` creates child views over a sub-range of the same URL; the
    children share the session and report the bytes they fetch to every
    ancestor's ``_total`` counter.

    Positions exposed through ``seek()``/``tell()`` are relative to the start
    of the view; ``_pos`` and ``_off`` are absolute offsets in the resource.
    """

    def __init__(self, url, *, session=None, parent=None):
        """Create a stream for *url*.

        Args:
            url: location of the remote resource.
            session: session object used for requests; defaults to the
                parent's session, or a new ``requests.Session``.
            parent: the HttpIO this view was sliced from, if any.

        Raises:
            HttpError: for a root stream, when the HEAD request fails, the
                server does not advertise byte ranges, or no length is given.
        """
        # Identity checks on purpose: this class defines __len__, so a
        # zero-length parent would otherwise be treated as "no parent".
        if session is None:
            session = parent.session if parent is not None else requests.Session()
        self.session = session
        self._parent = parent
        self._url, self._off, self._pos = url, 0, 0
        self._total = 0
        if parent is None:
            # We assume the zip file doesn't change while this tool is in use
            r = self._head()
            if 'bytes' not in r.headers.get('Accept-Ranges', ''):
                raise HttpError(r, 'byte ranges not supported by server')
            length = r.headers.get('Content-Length')
            if length is None:
                raise HttpError(r, 'server did not report Content-Length')
            self._len = int(length)

    def _head(self):
        """Issue a HEAD request, following redirects, and return the response."""
        r = self.session.head(self._url, allow_redirects=True)
        if r.status_code != 200: raise HttpError(r)
        # Update URL if there was a redirect
        if r.url != self._url: self._url = r.url
        return r

    def _get(self, headers=None):
        """Issue a GET request; only 200 and 206 responses are accepted."""
        r = self.session.get(self._url, headers=headers)
        if r.status_code not in (200, 206): raise HttpError(r)
        return r

    def _advance(self, n):
        """Move the position forward by *n* and account for the bytes read."""
        self._pos += n
        # Credit the traffic to this view and to every ancestor.
        node = self
        while node is not None:
            node._total += n
            node = node._parent

    def slice(self, offset, length=None):
        """Return a child view starting *offset* bytes into this view.

        Args:
            offset: start of the child, relative to the start of this view.
            length: size of the child; defaults to the rest of this view.
        """
        hio = self.__class__(self._url, session=self.session, parent=self)
        hio._off = self._off + offset
        hio._len = length if length is not None else self._len - offset
        # Start the child at its own beginning (equivalent to seek(0)).
        hio._pos = hio._off
        return hio

    def read(self, size=-1):
        """Read up to *size* bytes, or the rest of the view if negative.

        Returns ``b''`` when *size* is 0 or the position is at/after the end
        of the view, without contacting the server.
        """
        if size is None:
            size = -1
        limit = self._off + self._len  # absolute end of this view, exclusive
        if size == 0 or self._pos >= limit:
            # any empty byte string will do
            return b''
        if (self._parent is None and self._off == 0 and self._pos == 0
                and (size < 0 or size >= self._len)):
            # entire file from the beginning, no range header needed
            end = None
        elif size < 0:
            # rest of the view; HTTP range ends are inclusive
            end = limit - 1
        else:
            # requested range or rest of the view, whichever is less
            end = min(limit, self._pos + size) - 1

        debug('read', size, self._pos - self._off,
              None if end is None else end - self._off, self._len)
        headers = {}
        if end is not None:
            headers['Range'] = f'bytes={self._pos}-{end}'

        data = self._get(headers).content
        # Advance by what was actually delivered rather than by the
        # Content-Length header, which may be absent or describe an
        # encoded body.
        self._advance(len(data))
        return data

    def readall(self):
        """Read everything from the current position to the end of the view."""
        return self.read(-1)

    def readinto(self, b):
        """Fill the writable buffer *b* and return the number of bytes stored."""
        b = (memoryview(b) if not isinstance(b, memoryview) else b).cast('B')
        n = len(b)
        data = self.read(n)
        n = len(data)
        b[:n] = data
        return n

    def seek(self, pos, whence=0):
        """Set the position and return it, relative to the start of the view.

        Raises:
            ValueError: if *whence* is not 0, 1 or 2.
        """
        if whence != 0 or pos != self._pos - self._off:
            debug('seek', pos, whence, self._pos - self._off)
            if whence == 0: self._pos = self._off + pos
            elif whence == 1: self._pos += pos
            elif whence == 2: self._pos = self._off + self._len + pos
            else: raise ValueError('invalid whence')
        return self._pos - self._off

    def __len__(self): return self._len
    def tell(self): return self._pos - self._off
    def writable(self): return False
    # historical misspelling, kept so existing callers keep working
    writeable = writable
    def seekable(self): return True
    def readable(self): return True

    def __getattr__(self, name):
        # Only consulted when normal attribute lookup fails: the listed
        # operations raise, any other unknown attribute reads as None.
        if name in ('truncate', 'fileno', 'write'):
            raise OSError(f'{name} not supported')
        return None

    closed = property(lambda self: False)
    url = property(operator.attrgetter('_url'))
class BufferedHttpIO(io.BufferedReader):
    """BufferedReader over an HttpIO, tuned for how ZipFile reads.

    On top of normal buffering, read() keeps a cached copy of the last
    ``buffer_size`` bytes of the stream and bypasses the buffer for reads
    larger than ``buffer_size``.
    """

    def __init__(self, httpio, buffer_size=1024, block_size=BLOCK_SIZE):
        """Wrap *httpio*.

        Args:
            httpio: the raw stream; read() relies on its ``_len`` attribute.
            buffer_size: BufferedReader buffer size, also the size of the
                cached tail and the threshold for unbuffered reads.
            block_size: value passed to ``min()`` together with the requested
                size for a read made in unbuffered mode.
        """
        super().__init__(httpio, buffer_size)
        self._buffer_size = buffer_size
        self._block_size = block_size
        # True while large reads are being passed straight to the raw stream
        self._unbuffered = False
        # lazily filled copy of the final buffer_size bytes of the stream
        self._tail = None

    def read(self, size=-1):
        """Read up to *size* bytes, choosing between three strategies.

        Reads that start within the last ``buffer_size`` bytes are served
        from the cached tail, reads larger than ``buffer_size`` (or with a
        negative size) go through the unbuffered path, and everything else
        is a normal buffered read.
        """
        if self.raw._len >= self._buffer_size:
            pos = self.tell()
            from_end = self.raw._len - pos
            if from_end <= self._buffer_size:
                # ZipFile does several small reads and seeks near the end of the file,
                # so it's useful to cache the last buffer worth of data, since seek()
                # resets the read buffer
                if not self._tail:
                    debug('caching tail')
                    self.seek(-self._buffer_size, 2)
                    self._tail = super().read(self._buffer_size)
                # translate the stream position into an index into the tail
                start = len(self._tail) - from_end
                end = len(self._tail) if size < 0 else min(len(self._tail), start+size)
                chunk = self._tail[start:end]
                debug('cached tail read', len(chunk))
                # leave the stream positioned just past the returned bytes
                self.seek(pos + len(chunk))
                return chunk

        # for larger reads, io.BufferedReader just gets in the way, so bypass it
        if size < 0 or size > self._buffer_size:
            if self._unbuffered:
                # NOTE(review): for size < 0, min() yields the negative size,
                # which HttpIO.read treats as "rest of the file", so
                # block_size does not cap that case - confirm this is intended.
                return self.raw.read(min(size, self._block_size))
            else:
                self._unbuffered = True
                debug('enter unbuffered mode')
                if size < 0: size = self._buffer_size
                # return the buffer contents
                return self.read1(size)
        elif self._unbuffered:
            debug('exit unbuffered mode')
            # synchronize the file position
            self.seek(self.raw.tell())
            self._unbuffered = False

        # normal buffered read
        return super().read(size)
# Command line: a mandatory archive URL and an optional member to extract.
parser = argparse.ArgumentParser(
    description='Operate on zip files over HTTP.',
)
parser.add_argument(
    'url',
    metavar='URL',
    type=str,
    help='URL of a zip file',
)
parser.add_argument(
    'filename',
    metavar='FILENAME',
    type=str,
    nargs='?',
    help='filename within the zip file',
)
args = parser.parse_args()

# Stack the layers: raw HTTP stream -> buffered reader -> zip archive.
h = HttpIO(args.url)
r = BufferedHttpIO(h)
z = ZipFile(r)
def list_zipinfo(zi, prefix=None):
    """Print a one-line listing entry for the zip member *zi*.

    The line holds the modification time, the uncompressed and compressed
    sizes, a one-letter compression method code ('S', 'D', 'B', 'L', or '?'
    for anything else) and the member name. When *prefix* is given the name
    is shown as ``prefix:name``.
    """
    method_codes = {
        zipfile.ZIP_STORED: 'S',
        zipfile.ZIP_DEFLATED: 'D',
        zipfile.ZIP_BZIP2: 'B',
        zipfile.ZIP_LZMA: 'L',
    }
    stamp = datetime.datetime(*zi.date_time)
    ts = stamp.strftime('%Y-%m-%d %H:%M:%S')
    if prefix is None:
        name = zi.filename
    else:
        name = f'{prefix}:{zi.filename}'
    method = method_codes.get(zi.compress_type, '?')
    print(f'{ts} {zi.file_size:13d} {zi.compress_size:13d} {method} {name}')
def zi_slice(r, z, zi):
    """Return a raw sub-stream covering the stored data of member *zi*.

    Reads the member's local file header through *r* to find where the data
    begins, then slices ``r.raw`` to exactly ``zi.compress_size`` bytes. The
    position of *r* is restored before returning, even if the read fails.

    Args:
        r: buffered reader over the archive; ``r.raw`` must offer ``slice()``.
        z: the open archive (unused, kept for interface compatibility).
        zi: ZipInfo of the member to slice.

    Raises:
        zipfile.BadZipFile: if the local file header is truncated or does not
            start with the local-header signature.
    """
    # save position
    saved = r.tell()
    try:
        r.seek(zi.header_offset)
        # get the zip member header
        raw_header = r.read(sizeFileHeader)
    finally:
        # restore position
        r.seek(saved)
    if len(raw_header) != sizeFileHeader:
        raise zipfile.BadZipFile('Truncated file header')
    fheader = struct.unpack(structFileHeader, raw_header)
    # a wrong offset would otherwise silently produce a garbage slice
    if fheader[0] != b'PK\x03\x04':
        raise zipfile.BadZipFile('Bad magic number for file header')
    # find the start of the actual data: it follows the variable-length
    # filename and extra field
    skip = fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
    data_offset = zi.header_offset + sizeFileHeader + skip
    return r.raw.slice(data_offset, zi.compress_size)
def _open_nested_zip(r, z, zi):
    """Return a ZipFile reading the nested archive held in stored member *zi*."""
    return ZipFile(BufferedHttpIO(zi_slice(r, z, zi)))


def open_member(name, r, z):
    """Open the archive member selected by *name* and return a file object.

    *name* is either a plain member name or ``outer.zip:inner`` to address a
    file inside a nested zip that was added with the STORE method. When
    *name* contains a glob character (``[``, ``]``, ``?`` or ``*``) both
    parts are treated as fnmatch patterns and the first match is opened; if
    nothing matches, *name* is then tried as a literal.

    Args:
        name: member name, nested ``outer:inner`` name, or glob pattern(s).
        r: buffered reader the archive *z* was opened on.
        z: the open outer ZipFile.

    Raises:
        KeyError: if no member matches.
        ValueError: if a literally named nested zip is not stored
            uncompressed.
    """
    # try to glob the filename
    if any(c in name for c in '[]?*'):
        pat1, _, pat2 = name.partition(':')
        for zi in z.infolist():
            if not fnmatch(zi.filename, pat1):
                continue
            if not pat2:
                return z.open(zi.filename)
            if zi.filename[-4:].lower() != '.zip':
                continue
            # A compressed nested zip cannot be read through a byte slice;
            # skip it so another candidate can still match.
            if zi.compress_type != zipfile.ZIP_STORED:
                continue
            z2 = _open_nested_zip(r, z, zi)
            for zi2 in z2.infolist():
                if fnmatch(zi2.filename, pat2):
                    return z2.open(zi2.filename)

    try:
        return z.open(name)
    except KeyError:
        oname, _, inner = name.partition(':')
        if not inner or oname[-4:].lower() != '.zip':
            raise
        zi = z.getinfo(oname)
        if zi.compress_type != zipfile.ZIP_STORED:
            raise ValueError('Nested zip file uses a method other than STORE!')
        return _open_nested_zip(r, z, zi).open(inner)
if not args.filename:
    # Listing mode: show every member, expanding stored nested zips in place.
    for zi in z.infolist():
        list_zipinfo(zi)
        file_ext = zi.filename[-4:].lower()
        if zi.compress_type == zipfile.ZIP_STORED and file_ext == '.zip':
            h2 = zi_slice(r, z, zi)
            r2 = BufferedHttpIO(h2)
            z2 = ZipFile(r2)
            for zi2 in z2.infolist():
                list_zipinfo(zi2, zi.filename)
else:
    # Extraction mode: stream the selected member to stdout until EOF.
    f = open_member(args.filename, r, z)
    out = sys.stdout.buffer
    for chunk in iter(lambda: f.read1(-1), b''):
        out.write(chunk)
    out.flush()

# Report how much data was actually fetched over HTTP.
print(f'total bytes read: {h._total}', file=sys.stderr)
@ryancdotorg
Copy link
Author

This is a tool to extract individual files from a zip file hosted on a web server without having to download the entire file first.

One level of nesting is supported, provided the inner zip file was included with the STORE method - use a colon to specify the nested file.

Globs are supported in the filename to extract.

Examples:

$ wzip https://dl.google.com/dl/android/aosp/barbet-tq1a.230205.002-factory-56361bbb.zip
2023-01-23 16:55:30             0             0 S barbet-tq1a.230205.002/
2023-01-23 16:55:30     153505932      75152746 D barbet-tq1a.230205.002/radio-barbet-g7250-00220-221017-b-9183951.img
2023-01-23 16:55:30           989           576 D barbet-tq1a.230205.002/flash-all.bat
2023-01-23 16:55:30       8972680       4779978 D barbet-tq1a.230205.002/bootloader-barbet-b9-0.5-9150481.img
2023-01-23 16:55:30          1054           624 D barbet-tq1a.230205.002/flash-base.sh
2023-01-23 16:55:30          1106           650 D barbet-tq1a.230205.002/flash-all.sh
2023-01-23 16:55:30    2252444755    2252444755 S barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip
2009-01-01 00:00:00           120            94 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:android-info.txt
2009-01-01 00:00:00     100663296      12089525 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:boot.img
2009-01-01 00:00:00     100663296      24423693 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:vendor_boot.img
2009-01-01 00:00:00     869920768     437992837 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:system.img
2009-01-01 00:00:00     759574528     313172391 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:vendor.img
2009-01-01 00:00:00    2515554304    1295326662 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:product.img
2009-01-01 00:00:00     371236864     155848024 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:system_ext.img
2009-01-01 00:00:00      25825280       9708299 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:system_other.img
2009-01-01 00:00:00      16777216       3877173 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:dtbo.img
2009-01-01 00:00:00          4096          2220 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:vbmeta_system.img
2009-01-01 00:00:00          8192          2278 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:vbmeta.img
2009-01-01 00:00:00          4976           325 D barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:super_empty.img
$ wzip https://dl.google.com/dl/android/aosp/barbet-tq1a.230205.002-factory-56361bbb.zip \
barbet-tq1a.230205.002/image-barbet-tq1a.230205.002.zip:android-info.txt
require board=barbet

require version-bootloader=b9-0.5-9150481


require version-baseband=g7250-00220-221017-B-9183951
total bytes read: 4096
$ wzip https://dl.google.com/dl/android/aosp/barbet-tq1a.230205.002-factory-56361bbb.zip \
*:boot.img > boot-barbet-tq1a.230205.002.img

@whitequark
Copy link

This is a nice tool!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment