Skip to content

Instantly share code, notes, and snippets.

@e-budur
Last active May 29, 2018 13:44
Show Gist options
  • Save e-budur/4b69b4287571e91a155032076ea68980 to your computer and use it in GitHub Desktop.
Save e-budur/4b69b4287571e91a155032076ea68980 to your computer and use it in GitHub Desktop.
import gensim, logging
import os
import sys
import codecs
def _as_text(token):
    """Return *token* as a unicode string, decoding UTF-8 bytes if needed."""
    if isinstance(token, bytes):
        return token.decode('utf-8')
    return token


def main(dump_path='data/wiki_tr/dump/trwiki-20170906-pages-articles.xml.bz2',
         output_path="data/wiki_tr/dump/trwiki_plain2.txt",
         report_every=1000):
    """Convert a Wikipedia XML dump into a plain-text file, one article per line.

    Each article yielded by ``WikiCorpus.get_texts()`` is a sequence of
    tokens; the tokens are joined with single spaces and written to
    *output_path* as UTF-8, followed by ``"\\n"``.

    Args:
        dump_path: Path to the ``*-pages-articles.xml.bz2`` dump to read.
        output_path: Path of the text file to create (overwritten if present).
        report_every: Print a progress line after every this many articles.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # Lazy %-args: the message is only formatted if the record is emitted.
    logger.info("running %s", ' '.join(sys.argv))

    # Only get_texts() is used below, so an explicit (empty) dictionary is
    # supplied to skip the vocabulary-building scan WikiCorpus otherwise
    # performs over the whole dump when no dictionary is given.
    # NOTE(review): newer gensim releases may reject the `lemmatize`
    # argument -- confirm against the installed gensim version.
    wiki = gensim.corpora.wikicorpus.WikiCorpus(
        dump_path, lemmatize=False, dictionary={})

    # The context manager guarantees the file is flushed and closed even if
    # parsing the dump raises part-way through.
    with codecs.open(output_path, mode="w", encoding="utf8") as out:
        for count, doc in enumerate(wiki.get_texts(), 1):
            # Tokens may arrive as UTF-8 bytes or as unicode text depending
            # on the gensim/Python version; normalise before joining so the
            # encoded writer always receives unicode.
            line = u" ".join(_as_text(token) for token in doc)
            # codecs.open always uses binary mode, so "\n" is written as-is
            # (no translation to os.linesep).
            out.write(line + u"\n")
            if count % report_every == 0:
                # Single-argument form prints the same text on Python 2 and 3.
                print("%d %s" % (count, doc))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment