Last active
November 20, 2018 16:17
-
-
Save salmanmaq/3343620a86e0d5b93f9b6f0ffd337e84 to your computer and use it in GitHub Desktop.
Generate Core and non-Core data for INSPIRE classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' | |
| This snippet is supposed to be run from within the inspirehep shell. | |
| ''' | |
| from elasticsearch.helpers import scan | |
| from inspire_utils.record import get_value | |
| from invenio_search import current_search_client as es | |
| import json | |
# Accumulators for the two output sets. Each entry appended further down is a
# {"title": ..., "abstract": ...} dict; records flagged core go in the first
# list, everything else in the second.
core_records = []
noncore_records = []
# Elasticsearch request body.
#
# All four clauses under "must" have to match: the record has an
# "earliest_date", has "titles", has "abstracts", and its "earliest_date" is
# 2016-01-01 or later. "_source" restricts the returned document to the
# fields the rest of the script reads.
query = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "earliest_date"}},
                {"exists": {"field": "titles"}},
                {"exists": {"field": "abstracts"}},
                {
                    "range": {
                        "earliest_date": {
                            "gte": "2016-01-01",
                            # Several date patterns separated by "||":
                            # year only, year-month, or full date.
                            "format": "yyyy||yyyy-MM||yyyy-MM-dd",
                        },
                    },
                },
            ],
        },
    },
    "_source": [
        "earliest_date",
        "control_number",
        "core",
        "titles.title",
        "abstracts.value",
    ],
}
# Walk over every hit returned for the query on the 'records-hep' index and
# file its first title and first abstract under core or non-core.
for hit in scan(es, query=query, index='records-hep', doc_type='hep'):
    coreness = get_value(hit, '_source.core')
    title = get_value(hit, '_source.titles[0].title')
    abstract = get_value(hit, '_source.abstracts[0].value')
    record = {"title": title, "abstract": abstract}
    # Only a literal True counts as core; any other value (False, or whatever
    # get_value yields when the field is absent) is treated as non-core.
    if coreness is True:
        core_records.append(record)
    else:
        noncore_records.append(record)
# Serialize each list to its own JSON file (paths are relative, so the files
# land in the current working directory). No `encoding=` argument is passed
# to open(); json.dump escapes non-ASCII characters by default.
with open('inspire_core_records.json', 'w') as fd:
    json.dump(core_records, fd)

with open('inspire_noncore_records.json', 'w') as fd:
    json.dump(noncore_records, fd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment