Created November 10, 2017 18:25
Tokenize (with CoreNLP) and make CNN/DailyMail-style tokenized datasets into paragraphs of minimum length 300 words.
| """ | |
| Install CoreNLP | |
| Setup a server: | |
| $ cd /home/somehri/corenlp/stanford-corenlp-full-2017-06-09 | |
| $ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer | |
| Install PyCoreNLP: | |
| https://github.com/smilli/py-corenlp | |
| Download and unzip MS MARCO dataset. | |
| Run: | |
| `python -u to_paragraph.py ../test_public_v1.1.json` | |
| """ | |
| import os, sys | |
| import pickle as pkl | |
| import json | |
| import subprocess | |
| import unicodedata | |
| from pycorenlp import StanfordCoreNLP | |
| nlp = StanfordCoreNLP('http://localhost:9000') | |
| MIN_LEN = 300 | |
| _DIR = sys.argv[1] | |
| def valid_name(s): | |
| return "".join([x if x.isalnum() else "_" for x in s]) | |
| OUT_FILE = valid_name(_DIR+'.OUT') | |
| dm_single_close_quote = u'\u2019' # unicode | |
| dm_double_close_quote = u'\u201d' | |
| #END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence | |
| if os.path.exists(OUT_FILE): | |
| raise ValueError(OUT_FILE+' already exists!') | |
| out_file = open(OUT_FILE, 'w') | |
| print('Output file: {}'.format(OUT_FILE)) | |
| def get_words(text): | |
| #text = text.replace(dm_single_close_quote, "'").replace(dm_double_close_quote, "\"").encode('ascii', 'ignore') | |
| text = unicodedata.normalize('NFKD', text).encode('ascii','ignore') | |
| return [t['word'] for t in nlp.annotate(text, properties={'annotators': 'tokenize', 'outputFormat': 'json'})['tokens']] | |
| with open(_DIR) as data_file: | |
| for i, line in enumerate(data_file): | |
| passage_texts = [pt['passage_text'] for pt in json.loads(line)['passages']] | |
| passage_texts = [u' '.join(get_words(pt)) for pt in passage_texts] | |
| passage_texts = u'\t'.join(passage_texts) + u'\n' | |
| out_file.write(passage_texts.encode('utf-8')) | |
| if i % 50 == 0: # print to file every 50 processed files. | |
| print i, | |
| out_file.close() |
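Before running the script, it can help to confirm that the CoreNLP server is up and that the tokenize annotator returns the expected JSON shape. A minimal sanity check, assuming the server from the docstring is listening on http://localhost:9000:

# Quick sanity check for the CoreNLP tokenize endpoint; assumes the server
# started as in the docstring above is listening on http://localhost:9000.
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
resp = nlp.annotate('Hello, world!',
                    properties={'annotators': 'tokenize', 'outputFormat': 'json'})
# With only the tokenize annotator, the response has a top-level 'tokens' list.
print([t['word'] for t in resp['tokens']])  # e.g. ['Hello', ',', 'world', '!']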
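Note that MIN_LEN = 300 is defined but never applied: the description promises paragraphs of at least 300 words, so the filtering or merging presumably happens in a later step. Below is a minimal sketch of one such step, assuming the tab-separated output produced above; merge_passages and its greedy strategy are illustrative, not part of the original script.

MIN_LEN = 300  # same threshold the script defines but does not use

def merge_passages(passages, min_len=MIN_LEN):
    # Hypothetical helper: greedily concatenate tokenized passages until each
    # chunk reaches at least `min_len` words; a short leftover tail is appended
    # to the last finished chunk so no words are dropped.
    paragraphs, current = [], []
    for p in passages:
        current.extend(p.split())
        if len(current) >= min_len:
            paragraphs.append(' '.join(current))
            current = []
    if current:
        if paragraphs:
            paragraphs[-1] += ' ' + ' '.join(current)
        else:
            paragraphs.append(' '.join(current))
    return paragraphs

# Usage on one line of the script's output (passages are tab-separated):
# with open(OUT_FILE) as f:
#     for line in f:
#         paragraphs = merge_passages(line.rstrip('\n').split('\t'))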