#!/usr/bin/env python3
#
# This program will introspect on an OpenAlex API filter call and try to
# determine what record in the cursored result set is causing a problem.
#
# Quote the URL so the shell does not interpret the "&" characters:
#
# ./openalexbug.py 'https://api.openalex.org/works?filter=author.id:https://openalex.org/A5003671931&cursor=&per-page=200'
# problem record: https://openalex.org/W3200281942
# 121 records
#
# /// script
# dependencies = ["requests"]
# ///
#

import sys
import time
from urllib.parse import urlparse, parse_qs

import requests

API_HOST = 'api.openalex.org'

# Page size used for normal paging; it is halved while hunting a bad record.
MAX_PER_PAGE = 200


def parse_api_url(raw_url):
    """Split an OpenAlex API URL into an endpoint URL and query parameters.

    Exits the program with a message when *raw_url* does not point at
    ``api.openalex.org``. Any ``cursor`` and ``per-page`` values supplied in
    the URL are replaced: paging always starts at cursor ``*`` with
    ``MAX_PER_PAGE`` records per page.

    Returns a ``(url, params)`` tuple where *params* is the dict produced by
    ``parse_qs`` (values are lists) plus the two overridden entries.
    """
    parts = urlparse(raw_url)
    if parts.netloc != API_HOST:
        sys.exit(f"{raw_url} isn't an OpenAlex API URL")

    # parts.path already starts with "/", so don't add another separator.
    url = f"https://{API_HOST}{parts.path}"

    params = parse_qs(parts.query)
    params['cursor'] = '*'
    params['per-page'] = MAX_PER_PAGE
    return url, params


def find_problem_records(url, params):
    """Page through the result set, printing the ID of each failing record.

    Whenever a page request answers with HTTP 500 the page size is halved
    and the same cursor is retried. Once a request for a single record still
    fails, that record is fetched again with ``select=id`` so its ID can be
    printed and the cursor advanced past it. After every page that is
    successfully consumed the page size goes back to ``MAX_PER_PAGE``.

    Returns the total number of records seen (problem records included).
    Raises ``requests.HTTPError`` for any error response other than the 500s
    handled above.
    """
    params = dict(params)  # keep the caller's dict untouched
    record_count = 0

    while True:
        time.sleep(1)
        resp = requests.get(url, params)

        if resp.status_code == 500:
            if params['per-page'] > 1:
                # cut the page size in half to zero in on problematic record
                params['per-page'] //= 2
                continue

            # A page holding one record still fails, so that record is the
            # culprit. Just get the ID instead of the full record, so we can
            # report it and advance the cursor. A temporary dict is used so
            # a "select" given in the original URL is not lost.
            resp = requests.get(url, {**params, 'select': 'id'})
            # this shouldn't fail when just getting the id?
            resp.raise_for_status()
            results = resp.json()
            if results['results']:
                print(f"problem record: {results['results'][0]['id']}")
        else:
            # Surface rate limiting / bad requests as a clear HTTP error
            # rather than a confusing KeyError further down.
            resp.raise_for_status()
            results = resp.json()

        record_count += len(results['results'])

        # Go back to full-size pages now that this page has been consumed.
        params['per-page'] = MAX_PER_PAGE
        params['cursor'] = results['meta'].get('next_cursor')

        # if there's no cursor we're done!
        if not params['cursor']:
            return record_count


def main():
    """Command-line entry point: validate arguments and run the search."""
    if len(sys.argv) != 2:
        sys.exit("usage: openalexbug.py <openalex-api-url>")

    url, params = parse_api_url(sys.argv[1])
    record_count = find_problem_records(url, params)
    print(f"{record_count} records")


if __name__ == "__main__":
    main()