Skip to content

Instantly share code, notes, and snippets.

@salmanmaq
Last active November 20, 2018 16:17
Show Gist options
  • Select an option

  • Save salmanmaq/3343620a86e0d5b93f9b6f0ffd337e84 to your computer and use it in GitHub Desktop.

Select an option

Save salmanmaq/3343620a86e0d5b93f9b6f0ffd337e84 to your computer and use it in GitHub Desktop.
Generate Core and non-Core data for INSPIRE classifier
'''
Generate core and non-core record data for the INSPIRE classifier.

This snippet is meant to be run from within the inspirehep shell.
'''
from elasticsearch.helpers import scan
from inspire_utils.record import get_value
from invenio_search import current_search_client as es
import json
# Accumulators filled while iterating over the search hits: one list for
# records flagged as core, one for everything else.
core_records, noncore_records = [], []

# Fields that must be present on a record for it to match the query.
required_fields = ("earliest_date", "titles", "abstracts")

# One "exists" clause per required field, followed by a lower bound on
# "earliest_date" (2016-01-01 onwards; the date may be given at year,
# month or day precision).
must_clauses = [{"exists": {"field": field}} for field in required_fields]
must_clauses.append(
    {
        "range": {
            "earliest_date": {
                "gte": "2016-01-01",
                "format": "yyyy||yyyy-MM||yyyy-MM-dd",
            },
        },
    }
)

# Search body: every clause above must hold, and only the listed source
# fields are requested for each hit.
query = {
    "query": {
        "bool": {
            "must": must_clauses,
        },
    },
    "_source": [
        "earliest_date",
        "control_number",
        "core",
        "titles.title",
        "abstracts.value",
    ],
}
# Walk every hit that the scan helper yields for `query` on the
# 'records-hep' index and file its title/abstract pair under either the
# core or the non-core list.
for hit in scan(es, query=query, index='records-hep', doc_type='hep'):
    coreness = get_value(hit, '_source.core')
    # Only the first title and the first abstract of a record are kept.
    # NOTE(review): presumably get_value returns None for a missing path;
    # confirm against inspire_utils before relying on it.
    record = {
        "title": get_value(hit, '_source.titles[0].title'),
        "abstract": get_value(hit, '_source.abstracts[0].value'),
    }
    # Only an explicit boolean True counts as core; any other value of
    # `coreness` lands in the non-core list.
    if coreness is True:
        core_records.append(record)
    else:
        noncore_records.append(record)
# Write each list of records to its own JSON file, relative to the current
# working directory. An existing file with the same name is overwritten
# ('w' mode).
with open('inspire_core_records.json', 'w') as fd:
    json.dump(core_records, fd)
with open('inspire_noncore_records.json', 'w') as fd:
    json.dump(noncore_records, fd)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment