Created November 10, 2017 18:25
Tokenize (with CoreNLP) and make CNN/DailyMail-style tokenized datasets into paragraphs of minimum length 300 words.
| """ | |
| Install CoreNLP | |
| Setup a server: | |
| $ cd /home/somehri/corenlp/stanford-corenlp-full-2017-06-09 | |
| $ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer | |
| Install PyCoreNLP: | |
| https://github.com/smilli/py-corenlp | |
| Download and unzip MS MARCO dataset. | |
| Run: | |
| `python -u to_paragraph.py ../test_public_v1.1.json` | |
| """ | |
| import os, sys | |
| import pickle as pkl | |
| import json | |
| import subprocess | |
| import unicodedata | |
| from pycorenlp import StanfordCoreNLP | |
| nlp = StanfordCoreNLP('http://localhost:9000') | |
| MIN_LEN = 300 | |
| _DIR = sys.argv[1] | |
| def valid_name(s): | |
| return "".join([x if x.isalnum() else "_" for x in s]) | |
| OUT_FILE = valid_name(_DIR+'.OUT') | |
| dm_single_close_quote = u'\u2019' # unicode | |
| dm_double_close_quote = u'\u201d' | |
| #END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence | |
| if os.path.exists(OUT_FILE): | |
| raise ValueError(OUT_FILE+' already exists!') | |
| out_file = open(OUT_FILE, 'w') | |
| print('Output file: {}'.format(OUT_FILE)) | |
| def get_words(text): | |
| #text = text.replace(dm_single_close_quote, "'").replace(dm_double_close_quote, "\"").encode('ascii', 'ignore') | |
| text = unicodedata.normalize('NFKD', text).encode('ascii','ignore') | |
| return [t['word'] for t in nlp.annotate(text, properties={'annotators': 'tokenize', 'outputFormat': 'json'})['tokens']] | |
| with open(_DIR) as data_file: | |
| for i, line in enumerate(data_file): | |
| passage_texts = [pt['passage_text'] for pt in json.loads(line)['passages']] | |
| passage_texts = [u' '.join(get_words(pt)) for pt in passage_texts] | |
| passage_texts = u'\t'.join(passage_texts) + u'\n' | |
| out_file.write(passage_texts.encode('utf-8')) | |
| if i % 50 == 0: # print to file every 50 processed files. | |
| print i, | |
| out_file.close() |
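Before running the script, it can help to confirm that the CoreNLP server is up and that the tokenize annotator returns the expected JSON shape. A minimal sanity check, assuming the server from the docstring is listening on http://localhost:9000:

# Quick sanity check for the CoreNLP tokenize endpoint; assumes the server
# started as in the docstring above is listening on http://localhost:9000.
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
resp = nlp.annotate('Hello, world!',
                    properties={'annotators': 'tokenize', 'outputFormat': 'json'})
# With only the tokenize annotator, the response has a top-level 'tokens' list.
print([t['word'] for t in resp['tokens']])  # e.g. ['Hello', ',', 'world', '!']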
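Note that MIN_LEN = 300 is defined but never applied: the description promises paragraphs of at least 300 words, so the filtering or merging presumably happens in a later step. Below is a minimal sketch of one such step, assuming the tab-separated output produced above; merge_passages and its greedy strategy are illustrative, not part of the original script.

MIN_LEN = 300  # same threshold the script defines but does not use

def merge_passages(passages, min_len=MIN_LEN):
    # Hypothetical helper: greedily concatenate tokenized passages until each
    # chunk reaches at least `min_len` words; a short leftover tail is appended
    # to the last finished chunk so no words are dropped.
    paragraphs, current = [], []
    for p in passages:
        current.extend(p.split())
        if len(current) >= min_len:
            paragraphs.append(' '.join(current))
            current = []
    if current:
        if paragraphs:
            paragraphs[-1] += ' ' + ' '.join(current)
        else:
            paragraphs.append(' '.join(current))
    return paragraphs

# Usage on one line of the script's output (passages are tab-separated):
# with open(OUT_FILE) as f:
#     for line in f:
#         paragraphs = merge_passages(line.rstrip('\n').split('\t'))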