Skip to content

Instantly share code, notes, and snippets.

@soroushmehr
Created November 10, 2017 18:25
Show Gist options
  • Select an option

  • Save soroushmehr/ddf0043737ab60709c377bdd11efd8ed to your computer and use it in GitHub Desktop.

Select an option

Save soroushmehr/ddf0043737ab60709c377bdd11efd8ed to your computer and use it in GitHub Desktop.
Tokenize (with CoreNLP); make CNN/DailyMail (tokenized) datasets into paragraphs of minimum length 300 words.
"""
Install CoreNLP
Setup a server:
$ cd /home/somehri/corenlp/stanford-corenlp-full-2017-06-09
$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
Install PyCoreNLP:
https://github.com/smilli/py-corenlp
Download and unzip MS MARCO dataset.
Run:
`python -u to_paragraph.py ../test_public_v1.1.json`
"""
import os, sys
import pickle as pkl
import json
import subprocess
import unicodedata
from pycorenlp import StanfordCoreNLP
# CoreNLP client; expects a StanfordCoreNLPServer already running locally on
# port 9000 (see setup instructions in the module docstring).
nlp = StanfordCoreNLP('http://localhost:9000')
# Minimum paragraph length in words.
# NOTE(review): unused in the code visible here — the length filtering step
# presumably lives elsewhere; confirm.
MIN_LEN = 300
# Path to the input JSON-lines file, taken from the first CLI argument.
_DIR = sys.argv[1]
def valid_name(s):
    """Return *s* with every non-alphanumeric character replaced by '_'.

    Used to turn an arbitrary input path into a safe output filename.
    """
    return "".join(ch if ch.isalnum() else "_" for ch in s)
# Filesystem-safe output filename derived from the input path.
OUT_FILE = valid_name(_DIR+'.OUT')
# Unicode "smart" closing quotes; referenced only by the commented-out
# normalization code below, kept for reference.
dm_single_close_quote = u'\u2019' # unicode
dm_double_close_quote = u'\u201d'
#END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence
# Refuse to clobber the output of a previous run.
if os.path.exists(OUT_FILE):
    raise ValueError(OUT_FILE+' already exists!')
out_file = open(OUT_FILE, 'w')
print('Output file: {}'.format(OUT_FILE))
def get_words(text):
    """Tokenize *text* with the CoreNLP server and return the token strings.

    Non-ASCII characters are decomposed (NFKD) and dropped before the text
    is sent to the server.
    """
    #text = text.replace(dm_single_close_quote, "'").replace(dm_double_close_quote, "\"").encode('ascii', 'ignore')
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii','ignore')
    annotation = nlp.annotate(
        ascii_text,
        properties={'annotators': 'tokenize', 'outputFormat': 'json'})
    return [token['word'] for token in annotation['tokens']]
with open(_DIR) as data_file:
for i, line in enumerate(data_file):
passage_texts = [pt['passage_text'] for pt in json.loads(line)['passages']]
passage_texts = [u' '.join(get_words(pt)) for pt in passage_texts]
passage_texts = u'\t'.join(passage_texts) + u'\n'
out_file.write(passage_texts.encode('utf-8'))
if i % 50 == 0: # print to file every 50 processed files.
print i,
out_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment