charlesriondet · September 22, 2017 14:05
diff --git a/HTML2TEI.py b/HTML2TEI.py
 #coding:utf-8

 import xml.etree.ElementTree as ET
 import requests
 import re
 import codecs
 import time

 # Timestamp written into the TEI header below. %H is the 24-hour clock, so
 # the time is unambiguous without an AM/PM marker.
 currentDate=time.strftime("%A %B %d %Y, %H:%M:%S")

 # INPUT AND OUTPUT CONFIGURATION
 # htmlBibl: path of the input file. Must be an HTML document.
 htmlBibl=''
 # outputName: path of the output file. Must be an XML file.
 outputName=''

 # GROBID SERVICE
 # Either call the online GROBID service
 #   --> http://cloud.science-miner.com/grobid/processCitation
 # or install GROBID on your own machine
 #   --> http://grobid.readthedocs.io/en/latest/Install-Grobid/
 # In both cases, paste the address of the service to call into grobidService.
 grobidService='http://cloud.science-miner.com/grobid/processCitation'
 # Tip: if a local GROBID installation fails with "error: invalid model format",
 # change the locale of your terminal before launching GROBID:
 #   export LC_ALL=C
 # For more information, see https://github.com/kermitt2/grobid/issues/142

 # TEI HEADER
 # The teiHeader can be edited directly here. It deliberately ends with the
 # opening <text><body> tags: html2TEI writes the matching closing tags.
 teiHeader=(
 	"<teiHeader>\r"
 	"<fileDesc>\r"
 	"<titleStmt>\r"
 	"<title>Bibliography</title>\r"
 	"<author>generated by Grobid</author>\r"
 	"<editor/>\r"
 	"</titleStmt>\r"
 	"<publicationStmt>\r"
 	"<authority>authority</authority>\r"
 	"</publicationStmt>\r"
 	"<sourceDesc>\r"
 	"<p>created on "+currentDate+"</p>\r"
 	"</sourceDesc>\r"
 	"</fileDesc>\r"
 	"</teiHeader><text><body>"
 )

 def html2TEI(inputBib,service,Header,outputBib):
 	"""Convert the references of an HTML bibliography into a TEI <listBibl> file.

 	Each <p> element of inputBib is treated as one bibliographical
 	reference. Its text is posted to the GROBID citation service, and the
 	text of every <a> element it contains is recorded as a <ref type="url"/>
 	at the end of the <biblStruct> returned by the service.

 	inputBib  -- path of the input document. It is read with ElementTree's
 	             XML parser, so it has to be well-formed.
 	service   -- address of the GROBID processCitation service.
 	Header    -- markup written right after the opening <TEI> tag. It must
 	             leave <text><body> open: this function closes them.
 	outputBib -- path of the XML file to write (UTF-8).

 	Returns nothing; the result is the file written to outputBib.
 	"""
 	# Imported here so that the function does not depend on a new
 	# file-level import.
 	from xml.sax.saxutils import escape, quoteattr

 	closingTag = '</biblStruct>'

 	# Everything is handled as text and encoded once, when the file is
 	# written. A byte-string header is therefore decoded first.
 	if isinstance(Header, bytes):
 		Header = Header.decode('utf-8')

 	root = ET.parse(inputBib).getroot()

 	result = ['<listBibl>']
 	for p in root.iter('p'):
 		# bib: the plain text of one bibliographical reference.
 		# By default, each reference is expected to be encoded in a <p> element.
 		#
 		# Example:
 		# <p>Burlet, Gregory and Ichiro Fujinaga. &#8220;Robotaba Guitar Tablature Transcription
 		#     Framework.&#8221; <em>14th International Society for Music Information Retrieval
 		#     Conference</em>. Curitiba, Brazil, 2013. 517–522. &lt;<a
 		#         href="http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf" target="_blank"
 		#         >http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf</a>&gt;.</p>
 		bib = ''.join(p.itertext())

 		# One <ref> per link found in this paragraph, and none when the
 		# paragraph has no link. quoteattr() supplies the surrounding quotes
 		# and escapes the characters that are not allowed in an attribute.
 		links = [''.join(a.itertext()).strip() for a in p.iter('a')]
 		refs = ''.join('<ref type="url" target='+quoteattr(link)+'/>\r' for link in links if link)

 		r = requests.post(service, data = {'citations':bib,'consolidate':'true'})
 		grobidr = r.text.rstrip()
 		if grobidr.endswith(closingTag):
 			# Re-open the <biblStruct> just long enough to add the links.
 			grobidr = grobidr[:-len(closingTag)]+refs+closingTag
 		else:
 			# The service did not answer with a <biblStruct> (empty or error
 			# response). Keep the raw reference instead of writing broken markup.
 			grobidr = '<bibl>'+escape(bib.strip())+refs+'</bibl>'

 		result.append(grobidr)

 	document = (
 		'<?xml version="1.0" encoding="UTF-8"?>\r<TEI xmlns="http://www.tei-c.org/ns/1.0">'
 		+Header+"\r".join(result)+"</listBibl>\r</body>\r</text>\r</TEI>"
 	)
 	# The encoding has to match the one announced in the XML declaration.
 	with codecs.open(outputBib, mode="w", encoding="utf-8") as output:
 		output.write(document)

 def main():
 	"""Run the conversion with the settings defined at the top of this script."""
 	html2TEI(
 		inputBib=htmlBibl,
 		service=grobidService,
 		Header=teiHeader,
 		outputBib=outputName,
 	)

 # Script entry point: the conversion starts as soon as the file is executed.
 main()
	#coding:utf-8

	import xml.etree.ElementTree as ET
	import requests
	import re
	import codecs
	import time

	# Timestamp written into the TEI header below. %H is the 24-hour clock, so
	# the time is unambiguous without an AM/PM marker.
	currentDate=time.strftime("%A %B %d %Y, %H:%M:%S")

	# INPUT AND OUTPUT CONFIGURATION
	# htmlBibl: path of the input file. Must be an HTML document.
	htmlBibl=''
	# outputName: path of the output file. Must be an XML file.
	outputName=''

	# GROBID SERVICE
	# Either call the online GROBID service
	#   --> http://cloud.science-miner.com/grobid/processCitation
	# or install GROBID on your own machine
	#   --> http://grobid.readthedocs.io/en/latest/Install-Grobid/
	# In both cases, paste the address of the service to call into grobidService.
	grobidService='http://cloud.science-miner.com/grobid/processCitation'
	# Tip: if a local GROBID installation fails with "error: invalid model format",
	# change the locale of your terminal before launching GROBID:
	#   export LC_ALL=C
	# For more information, see https://github.com/kermitt2/grobid/issues/142

	# TEI HEADER
	# The teiHeader can be edited directly here. It deliberately ends with the
	# opening <text><body> tags: html2TEI writes the matching closing tags.
	teiHeader=(
		"<teiHeader>\r"
		"<fileDesc>\r"
		"<titleStmt>\r"
		"<title>Bibliography</title>\r"
		"<author>generated by Grobid</author>\r"
		"<editor/>\r"
		"</titleStmt>\r"
		"<publicationStmt>\r"
		"<authority>authority</authority>\r"
		"</publicationStmt>\r"
		"<sourceDesc>\r"
		"<p>created on "+currentDate+"</p>\r"
		"</sourceDesc>\r"
		"</fileDesc>\r"
		"</teiHeader><text><body>"
	)

	def html2TEI(inputBib,service,Header,outputBib):
		"""Convert the references of an HTML bibliography into a TEI <listBibl> file.

		Each <p> element of inputBib is treated as one bibliographical
		reference. Its text is posted to the GROBID citation service, and the
		text of every <a> element it contains is recorded as a <ref type="url"/>
		at the end of the <biblStruct> returned by the service.

		inputBib  -- path of the input document. It is read with ElementTree's
		             XML parser, so it has to be well-formed.
		service   -- address of the GROBID processCitation service.
		Header    -- markup written right after the opening <TEI> tag. It must
		             leave <text><body> open: this function closes them.
		outputBib -- path of the XML file to write (UTF-8).

		Returns nothing; the result is the file written to outputBib.
		"""
		# Imported here so that the function does not depend on a new
		# file-level import.
		from xml.sax.saxutils import escape, quoteattr

		closingTag = '</biblStruct>'

		# Everything is handled as text and encoded once, when the file is
		# written. A byte-string header is therefore decoded first.
		if isinstance(Header, bytes):
			Header = Header.decode('utf-8')

		root = ET.parse(inputBib).getroot()

		result = ['<listBibl>']
		for p in root.iter('p'):
			# bib: the plain text of one bibliographical reference.
			# By default, each reference is expected to be encoded in a <p> element.
			#
			# Example:
			# <p>Burlet, Gregory and Ichiro Fujinaga. &#8220;Robotaba Guitar Tablature Transcription
			#     Framework.&#8221; <em>14th International Society for Music Information Retrieval
			#     Conference</em>. Curitiba, Brazil, 2013. 517–522. &lt;<a
			#         href="http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf" target="_blank"
			#         >http://ismir2013.ismir.net/wp-content/uploads/2013/09/217_Paper.pdf</a>&gt;.</p>
			bib = ''.join(p.itertext())

			# One <ref> per link found in this paragraph, and none when the
			# paragraph has no link. quoteattr() supplies the surrounding quotes
			# and escapes the characters that are not allowed in an attribute.
			links = [''.join(a.itertext()).strip() for a in p.iter('a')]
			refs = ''.join('<ref type="url" target='+quoteattr(link)+'/>\r' for link in links if link)

			r = requests.post(service, data = {'citations':bib,'consolidate':'true'})
			grobidr = r.text.rstrip()
			if grobidr.endswith(closingTag):
				# Re-open the <biblStruct> just long enough to add the links.
				grobidr = grobidr[:-len(closingTag)]+refs+closingTag
			else:
				# The service did not answer with a <biblStruct> (empty or error
				# response). Keep the raw reference instead of writing broken markup.
				grobidr = '<bibl>'+escape(bib.strip())+refs+'</bibl>'

			result.append(grobidr)

		document = (
			'<?xml version="1.0" encoding="UTF-8"?>\r<TEI xmlns="http://www.tei-c.org/ns/1.0">'
			+Header+"\r".join(result)+"</listBibl>\r</body>\r</text>\r</TEI>"
		)
		# The encoding has to match the one announced in the XML declaration.
		with codecs.open(outputBib, mode="w", encoding="utf-8") as output:
			output.write(document)

	def main():
		"""Run the conversion with the settings defined at the top of this script."""
		html2TEI(
			inputBib=htmlBibl,
			service=grobidService,
			Header=teiHeader,
			outputBib=outputName,
		)

	# Script entry point: the conversion starts as soon as the file is executed.
	main()
No results found