Skip to content

Instantly share code, notes, and snippets.

@charlesriondet
Last active September 22, 2017 14:05
Show Gist options
  • Select an option

  • Save charlesriondet/26cd1b9e64fac8d4cb7e137c163f729b to your computer and use it in GitHub Desktop.

Select an option

Save charlesriondet/26cd1b9e64fac8d4cb7e137c163f729b to your computer and use it in GitHub Desktop.
A Python script that converts bibliographical references taken from HTML files into TEI <biblStruct> elements using the GROBID service.
#coding:utf-8
import xml.etree.ElementTree as ET
import requests
import re
import codecs
import time
# Timestamp (12-hour clock, no AM/PM marker) that is embedded in the teiHeader below.
currentDate = time.strftime("%A %B %d %Y, %I:%M:%S")

# ---------------------------------------------------------------------------
# INPUT AND OUTPUT CONFIGURATION
# ---------------------------------------------------------------------------
# htmlBibl: input file. Must be an HTML document.
htmlBibl = ''
# outputName: output file. Must be an XML file.
outputName = ''

# ---------------------------------------------------------------------------
# GROBID SERVICE
# ---------------------------------------------------------------------------
# You can either call the hosted GROBID service
#     http://cloud.science-miner.com/grobid/processCitation
# or install GROBID on your own machine
#     http://grobid.readthedocs.io/en/latest/Install-Grobid/
# In both cases, paste the address of the service to call into grobidService.
grobidService = 'http://cloud.science-miner.com/grobid/processCitation'
# Tip: if GROBID is installed locally and fails at start-up with
#     error: invalid model format
# change the locale in your terminal before launching GROBID:
#     export LC_ALL=C
# For more information, see https://github.com/kermitt2/grobid/issues/142

# ---------------------------------------------------------------------------
# TEI HEADER
# ---------------------------------------------------------------------------
# The teiHeader can be edited directly here. The value also opens the
# <text><body> elements that html2TEI closes at the end of the document.
teiHeader = "\r".join([
    "<teiHeader>",
    "<fileDesc>",
    "<titleStmt>",
    "<title>Bibliography</title>",
    "<author>generated by Grobid</author>",
    "<editor/>",
    "</titleStmt>",
    "<publicationStmt>",
    "<authority>authority</authority>",
    "</publicationStmt>",
    "<sourceDesc>",
    "<p>created on " + currentDate + "</p>",
    "</sourceDesc>",
    "</fileDesc>",
    "</teiHeader><text><body>",
])
def _append_url_refs(bibl_xml, urls):
    """Insert one <ref type="url"/> per URL just before the closing </biblStruct>.

    bibl_xml is returned unchanged when there are no URLs or when it does not
    end with a </biblStruct> tag (there is then no safe place to insert the
    refs), so the closing tag is never lost nor duplicated.
    """
    from xml.sax.saxutils import escape

    closing = '</biblStruct>'
    trimmed = bibl_xml.rstrip()
    if not urls or not trimmed.endswith(closing):
        return bibl_xml
    # Escape &, <, > and the double quote so the URL is a valid attribute value.
    refs = ''.join(
        '<ref type="url" target="' + escape(url, {'"': '&quot;'}) + '"/>\r'
        for url in urls
    )
    return trimmed[:-len(closing)] + refs + closing


def html2TEI(inputBib,service,Header,outputBib):
    """Convert the references of an HTML file into a TEI <listBibl> document.

    Each <p> element of inputBib is treated as one bibliographical reference:
    its text is posted to the citation-parsing service and the returned markup
    is collected. The text of every <a> element found inside the <p> is added
    to the returned <biblStruct> as a <ref type="url" target="..."/> element.

    Parameters:
        inputBib:  path of the input file; it is read with ElementTree, so it
                   must be well-formed XML.
        service:   URL the citations are POSTed to.
        Header:    markup written right after the opening <TEI> tag; it is
                   expected to open <text><body>, which this function closes.
        outputBib: path of the XML file to write (UTF-8).
    """
    root = ET.parse(inputBib).getroot()
    result = ['<listBibl>']
    # By default, each reference is assumed to be encoded in its own <p> element.
    # Example:
    # <p>Burlet, Gregory and Ichiro Fujinaga. &#8220;Robotaba Guitar Tablature Transcription
    #    Framework.&#8221; <em>14th International Society for Music Information Retrieval
    #    Conference</em>. Curitiba, Brazil, 2013. 517–522. &lt;<a
    #    href="http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf" target="_blank"
    #    >http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf</a>&gt;.</p>
    for p in root.iter('p'):
        bib = ''.join(p.itertext())
        r = requests.post(service, data={'citations': bib, 'consolidate': 'true'})
        urls = [''.join(a.itertext()) for a in p.iter('a')]
        # Work on text (not encoded bytes) throughout; encoding happens once,
        # when the file is written.
        result.append(_append_url_refs(r.text, urls))
    # The XML declaration announces UTF-8, so write the file as UTF-8 explicitly.
    with codecs.open(outputBib, mode="w", encoding="utf-8") as output:
        output.write(
            '<?xml version="1.0" encoding="UTF-8"?>\r<TEI xmlns="http://www.tei-c.org/ns/1.0">'
            + Header
            + "\r".join(result)
            + "</listBibl>\r</body>\r</text>\r</TEI>"
        )
def main():
    """Run the conversion using the module-level configuration variables."""
    html2TEI(htmlBibl,grobidService,teiHeader,outputName)


# Run the conversion only when the file is executed as a script, so that
# importing the module does not trigger file and network access.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment