Last active
September 22, 2017 14:05
-
-
Save charlesriondet/26cd1b9e64fac8d4cb7e137c163f729b to your computer and use it in GitHub Desktop.
A Python script to convert bibliographical references taken from HTML files into TEI <biblStruct> elements using the GROBID service.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #coding:utf-8 | |
| import xml.etree.ElementTree as ET | |
| import requests | |
| import re | |
| import codecs | |
| import time | |
# Timestamp of this run; it is embedded in the <sourceDesc> of the TEI header.
currentDate = time.strftime("%A %B %d %Y, %I:%M:%S")

# ---------------------------------------------------------------------------
# INPUT AND OUTPUT CONFIGURATION
# ---------------------------------------------------------------------------
# htmlBibl: input file. Must be an HTML document.
htmlBibl = ''

# outputName: output file. Must be an XML file.
outputName = ''

# ---------------------------------------------------------------------------
# GROBID SERVICE
# ---------------------------------------------------------------------------
# You can either call the hosted GROBID service
#   --> http://cloud.science-miner.com/grobid/processCitation
# or install GROBID on your own machine
#   --> http://grobid.readthedocs.io/en/latest/Install-Grobid/
# In both cases, paste the address of the service into grobidService.
grobidService = 'http://cloud.science-miner.com/grobid/processCitation'

# Tip: if GROBID is installed locally and fails at start-up with
#   error: invalid model format
# change the LOCALE of your terminal before launching GROBID:
#   export LC_ALL=C
# For more information, see https://github.com/kermitt2/grobid/issues/142

# ---------------------------------------------------------------------------
# TEI HEADER EDITION
# ---------------------------------------------------------------------------
# The teiHeader can be edited directly here. Note that the string also opens
# the <text> and <body> elements that follow the header.
teiHeader = (
    "<teiHeader>\r"
    "<fileDesc>\r"
    "<titleStmt>\r"
    "<title>Bibliography</title>\r"
    "<author>generated by Grobid</author>\r"
    "<editor/>\r"
    "</titleStmt>\r"
    "<publicationStmt>\r"
    "<authority>authority</authority>\r"
    "</publicationStmt>\r"
    "<sourceDesc>\r"
    "<p>created on " + currentDate + "</p>\r"
    "</sourceDesc>\r"
    "</fileDesc>\r"
    "</teiHeader><text><body>"
)
def html2TEI(inputBib, service, Header, outputBib):
    """Convert the references of an HTML file into a TEI <listBibl> document.

    Each <p> element of the input is treated as one bibliographical
    reference. Its text content is posted to the GROBID citation service and
    the response is collected. The text of every <a> element found inside
    the paragraph is added to the returned <biblStruct> as a
    <ref type="url" target="..."/> element.

    Args:
        inputBib: Input document, parsed with ElementTree (so it must be
            well-formed markup).
        service: URL of the GROBID citation-processing endpoint.
        Header: Markup written right after the opening <TEI> tag. It must
            open the <text> and <body> elements, which are closed here.
        outputBib: Path of the XML file to write, encoded as UTF-8.
    """
    # Imported locally so the block does not depend on a file-level import.
    from xml.sax.saxutils import escape

    closing_tag = '</biblStruct>'
    root = ET.parse(inputBib).getroot()
    result = ['<listBibl>']

    # By default, each reference is expected to be encoded in a <p> element.
    # Example:
    # <p>Burlet, Gregory and Ichiro Fujinaga. "Robotaba Guitar Tablature
    #    Transcription Framework." <em>14th International Society for Music
    #    Information Retrieval Conference</em>. Curitiba, Brazil, 2013.
    #    517-522. <<a href="http://ismir2013.ismir.net/.../217_Paper.pdf"
    #    target="_blank">http://ismir2013.ismir.net/.../217_Paper.pdf</a>>.</p>
    for p in root.iter('p'):
        bib = ''.join(p.itertext())
        if not bib.strip():
            # Nothing to parse: do not call the service for an empty paragraph.
            continue

        r = requests.post(service, data={'citations': bib, 'consolidate': 'true'})
        # Stay in text (unicode) from here on; encoding happens once, on write.
        grobidr = r.text

        # itertext() yields decoded text, so '&', '<', '>' and '"' must be
        # escaped again before being placed inside an XML attribute.
        refs = [
            '<ref type="url" target="'
            + escape(''.join(a.itertext()), {'"': '&quot;'})
            + '"/>'
            for a in p.iter('a')
        ]

        # Insert the links just before the closing tag, which is kept exactly
        # once whether the paragraph holds zero, one or several links.
        stripped = grobidr.rstrip()
        if refs and stripped.endswith(closing_tag):
            grobidr = (
                stripped[:-len(closing_tag)]
                + '\r'.join(refs)
                + '\r'
                + closing_tag
            )
        result.append(grobidr)

    document = (
        '<?xml version="1.0" encoding="UTF-8"?>\r<TEI xmlns="http://www.tei-c.org/ns/1.0">'
        + Header
        + "\r".join(result)
        + "</listBibl>\r</body>\r</text>\r</TEI>"
    )
    # The XML declaration announces UTF-8, so the file is written as UTF-8.
    with codecs.open(outputBib, mode="w", encoding="utf-8") as output:
        output.write(document)
def main():
    """Run the conversion with the module-level configuration values."""
    html2TEI(
        inputBib=htmlBibl,
        service=grobidService,
        Header=teiHeader,
        outputBib=outputName,
    )


# The script runs the conversion as soon as the module is executed.
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment