import gensim, logging import os import sys import codecs def main(): program = os.path.basename(sys.argv[0]) logger = logging.getLogger(program) logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') logging.root.setLevel(level=logging.INFO) logger.info("running %s" % ' '.join(sys.argv)) f = codecs.open("data/wiki_tr/dump/trwiki_plain2.txt", mode="w", encoding="utf8") wiki = gensim.corpora.wikicorpus.WikiCorpus('data/wiki_tr/dump/trwiki-20170906-pages-articles.xml.bz2', lemmatize=False) count = 0 space = " " for doc in wiki.get_texts(): line = space.join(doc) line = line.decode('utf-8') #line = utils.normalize_corpus_text(line) line = line+ "\n" f.write(line) # python will convert \n to os.linesep count += 1 if count%1000==0: print count, doc f.close() return if __name__ == '__main__': main()