Last active
May 29, 2018 13:44
-
-
Save e-budur/4b69b4287571e91a155032076ea68980 to your computer and use it in GitHub Desktop.
Revisions
-
e-budur revised this gist
May 29, 2018. 1 changed file with 0 additions and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -18,7 +18,6 @@ def main(): for doc in wiki.get_texts(): line = space.join(doc) line = line.decode('utf-8') line = line+ "\n" f.write(line) # python will convert \n to os.linesep count += 1 -
e-budur created this gist
May 29, 2018. There are no files selected for viewing.
"""Dump the article texts of a Wikipedia XML archive to a plain-text file.

Each article yielded by gensim's ``WikiCorpus.get_texts()`` is written as one
line of space-separated tokens, encoded as UTF-8.
"""
import codecs
import logging
import os
import sys

import gensim

# Default locations used when main() is called without arguments.
DUMP_PATH = 'data/wiki_tr/dump/trwiki-20170906-pages-articles.xml.bz2'
OUTPUT_PATH = "data/wiki_tr/dump/trwiki_plain2.txt"

# A progress line is printed every time this many articles have been written.
PROGRESS_EVERY = 1000


def _to_text(token):
    """Return *token* as text, decoding it as UTF-8 if it is a byte string."""
    # Tokens may arrive either as UTF-8 byte strings or as already-decoded
    # text; decoding only the former avoids an implicit ASCII round-trip
    # (Python 2) or an AttributeError on ``str`` (Python 3).
    if isinstance(token, bytes):
        return token.decode('utf-8')
    return token


def main(dump_path=DUMP_PATH, output_path=OUTPUT_PATH):
    """Write every article of the dump at *dump_path* to *output_path*.

    Args:
        dump_path: Path of the ``pages-articles`` archive handed to
            ``gensim.corpora.wikicorpus.WikiCorpus``.
        output_path: Path of the UTF-8 text file to create (overwritten if it
            already exists). One article is written per line.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    wiki = gensim.corpora.wikicorpus.WikiCorpus(dump_path, lemmatize=False)

    # The context manager guarantees the file is closed even if reading the
    # dump or writing a line raises part-way through.
    with codecs.open(output_path, mode="w", encoding="utf8") as f:
        for count, doc in enumerate(wiki.get_texts(), 1):
            line = u" ".join(_to_text(token) for token in doc)
            # codecs.open works on a binary-mode file, so "\n" is written
            # as-is; it is NOT translated to os.linesep.
            f.write(line + u"\n")

            if count % PROGRESS_EVERY == 0:
                # Formatted explicitly so the output is the same whether
                # print is a statement (Python 2) or a function (Python 3).
                print("%d %s" % (count, doc))


if __name__ == '__main__':
    main()