Skip to content

Instantly share code, notes, and snippets.

@Jean-Baptiste-Camps
Forked from PonteIneptique/bnfcrawler.py
Created March 2, 2019 15:59
Show Gist options
  • Select an option

  • Save Jean-Baptiste-Camps/0917bdd672ea8a6f3eade4e47bf36f3a to your computer and use it in GitHub Desktop.

Select an option

Save Jean-Baptiste-Camps/0917bdd672ea8a6f3eade4e47bf36f3a to your computer and use it in GitHub Desktop.
BNF Crawler

IIIF-Crawler

A tool to query IIIF servers and download images of manuscripts.

Currently implemented:

  • Gallica;

Usage

python3 bnfcrawler.py ID

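where ID is the identifier of the manuscript, e.g. the ARK identifier btv1b9059486c, or the path to a tab-separated .csv file with the columns ID, source, start and end. Use the optional --start and --end arguments to choose the page range for a single manuscript.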

import requests
import os
import shutil
import sys
from argparse import ArgumentParser
# Command-line interface: one positional argument (a Gallica document ID or
# the path to a .csv file) and an optional page range.
parser = ArgumentParser(
    description="Download Full Quality sets of pages from Gallica",
)
parser.add_argument(
    "text",
    help="either the ID of the text (in http://gallica.bnf.fr/ark:/12148/btv1b53084829z/, this would be btv1b53084829z), or the path to a .csv file containing relevant information",
    type=str,
)
parser.add_argument(
    "--start",
    help="Page to start from",
    default=1,
    type=int,
)
parser.add_argument(
    "--end",
    help="Page to end at",
    default=None,
    type=int,
)
def dl_write(i, bookid):
    """Download page ``i`` of the Gallica document ``bookid`` as a JPEG.

    The image is requested from the Gallica IIIF endpoint and streamed to
    ``<bookid>/p<i>.jpg`` (relative to the current working directory). The
    ``<bookid>`` directory must already exist.

    Args:
        i: page number, used both in the request URL and in the file name.
        bookid: Gallica document identifier (e.g. ``btv1b53084829z``).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    uri = "http://gallica.bnf.fr/iiif/ark:/12148/{0}/f{1}/full/full/0/native.jpg".format(bookid, i)
    # Using the response as a context manager releases the connection even if
    # writing the file fails.
    with requests.get(uri, stream=True) as response:
        # Fail loudly instead of saving an error page under a .jpg name.
        response.raise_for_status()
        # Let urllib3 undo any gzip/deflate transfer encoding while streaming.
        response.raw.decode_content = True
        with open("{0}/p{1}.jpg".format(bookid, i), "wb") as f:
            shutil.copyfileobj(response.raw, f)
def read_csv(file):
    """Read a tab-separated manuscript list and return its data rows.

    The first line must be the header ``ID<TAB>source<TAB>start<TAB>end``.
    If it is not, an error message is written to stderr and the process
    exits with status 1.

    Args:
        file: path to the tab-separated file.

    Returns:
        A list with one entry per non-blank data line; each entry is the list
        of tab-separated string fields of that line (trailing whitespace
        removed).
    """
    expected_header = ['ID', 'source', 'start', 'end']
    with open(file, 'r') as f:
        # Strip the line ending before comparing so that a header without a
        # trailing newline (header-only file) is accepted too.
        if f.readline().rstrip().split('\t') != expected_header:
            sys.stderr.write("Error: csv file must have headings 'ID', 'source', 'start', 'end'\n")
            sys.exit(1)
        # Skip blank lines: they would otherwise become [''] rows that break
        # callers indexing the start/end columns.
        return [line.rstrip().split('\t') for line in f if line.strip()]
def ms_dl(bookid, start, end):
    """Download a range of pages of one document into a directory named ``bookid``.

    The directory is created in the current working directory if it does not
    exist yet. Each page is fetched with ``dl_write``.

    Args:
        bookid: document identifier; also used as the output directory name.
        start: first page number to download.
        end: last page number to download (inclusive), or ``None`` to
            download only page ``start``.
    """
    # exist_ok tolerates re-runs without hiding real failures (e.g. a
    # permission error), which the former blanket ``except: pass`` swallowed.
    os.makedirs(bookid, exist_ok=True)
    if end is None:
        # This branch used to pass an undefined ``i`` and raised NameError.
        # NOTE(review): presumably a single page (``start``) is intended when
        # no end page is given -- confirm.
        dl_write(start, bookid)
    else:
        for i in range(start, end + 1):
            print("Downloading {current}/{end}".format(current=i, end=end))
            # Thanks to @seeksanusername for the native format URL as I was using highres before
            # https://medium.com/@seeksanusername/astuce1-r%C3%A9cup%C3%A9rer-de-la-hd-sur-gallica-bef0a6cc7f89
            dl_write(i, bookid)
if __name__ == "__main__":
    # Script entry point: the positional argument is either a single document
    # ID or the path to a .csv list of documents.
    cli = parser.parse_args()
    target = cli.text
    if not target.endswith(".csv"):
        # Single document: the page range comes from --start / --end.
        ms_dl(target, cli.start, cli.end)
    else:
        # Batch mode: column 0 is the ID, columns 2 and 3 are the first and
        # last page of each row.
        for row in read_csv(target):
            print("Now downloading: {}".format(row[0]) )
            ms_dl(row[0], int(row[2]), int(row[3]))
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
ID source start end
btv1b10515441p gallica 10 11
btv1b52510692j gallica 67 68
btv1b6000370c gallica 22 23
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment