Last active
January 25, 2019 19:19
-
-
Save allenanie/f7458f45b103247607c2fa3953c6c58d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import spacy | |
import logging | |
import string | |
import os | |
# Module-level setup shared by the rest of the script.

# Timestamped log lines that also carry the logger name and severity.
log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger(__name__)

# Loaded once at import time; only its ``tokenizer`` attribute is used below.
en_nlp = spacy.load("en_core_web_sm")

# Characters that survive filtering; a set gives O(1) membership tests.
printable = set(string.printable)

# Filled in by ``walk`` with the path of every file found.
file_names = []
def walk(path):
    """Append the path of every file under ``path`` to the global ``file_names``.

    The tree is traversed recursively with ``os.walk`` and each file name is
    joined to the directory it was found in.

    Args:
        path: Root directory to scan.

    Returns:
        ``-1`` if ``path`` does not exist (nothing is collected in that
        case), otherwise ``None``.
    """
    if not os.path.exists(path):
        # Sentinel return value kept for backward compatibility.
        return -1
    for root, _dirs, names in os.walk(path):
        file_names.extend(os.path.join(root, filename) for filename in names)
# Collect every file under the crawl directory into ``file_names``.
# ``walk`` returns -1 when the directory is missing; report it instead of
# silently producing an empty output file.
if walk('./news-crawl/') == -1:
    logger.warning("corpus directory %s does not exist; nothing to process",
                   './news-crawl/')

if __name__ == "__main__":
    # Logging is already configured at import time, so it is not repeated here.

    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # tokenizer receives a text string on either interpreter.
    text_type = type(u'')

    # Running count of input lines across all files (drives progress logs).
    i = 0

    # ``with`` guarantees the output file is flushed and closed even if
    # tokenization raises part-way through the corpus.
    with open('./corpora_sep_flattened.txt', 'w') as fout:
        for path in file_names:
            if not path.endswith('.txt'):
                continue
            with open(path, 'r') as article:
                for l_n, line in enumerate(article):
                    i += 1
                    # Drop every character outside ``string.printable``, then
                    # trim surrounding whitespace. A generator + join works on
                    # both Python 2 and 3, unlike ``filter(...).strip()``.
                    processed = ''.join(
                        ch for ch in line if ch in printable
                    ).strip()
                    tokens = en_nlp.tokenizer(text_type(processed))
                    # Keep token strings, skipping whitespace-only tokens.
                    words = [
                        word
                        for word in (str(token) for token in tokens)
                        if not word.isspace()
                    ]
                    if l_n == 0:
                        # The first line of each file is written as a
                        # delimited header line.
                        fout.write('>' + '=' * 25 + ' '.join(words)
                                   + '=' * 25 + '<' + '\n')
                    else:
                        fout.write(' '.join(words) + "\n")
                    if i % 1000000 == 0:
                        logger.info("processed {} sentences".format(i))
## Python 3 Version
# -*- coding: utf-8 -*- | |
import spacy | |
import logging | |
import string | |
import os | |
# Tokenize a single corpus file line by line and write the space-joined
# tokens, one input line per output line.

# Timestamped log lines that also carry the logger name and severity.
log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger(__name__)

# Loaded once; only its ``tokenizer`` attribute is used below.
en_nlp = spacy.load("en_core_web_sm")

# Characters that survive filtering; a set gives O(1) membership tests.
printable = set(string.printable)

fin = "news.2011.en.shuffled"

# The output is opened first (as before), so it is created/truncated even if
# the input is missing. ``with`` closes both files on any exit path.
# NOTE(review): the input is read with the platform default encoding --
# confirm that matches the corpus.
with open('./news.2011.en.shuffled.toked.txt', 'w') as fout, \
        open(fin, 'r') as f:
    # ``start=1`` makes ``i`` the number of lines processed so far.
    for i, line in enumerate(f, start=1):
        # Trim the line, then drop characters outside ``string.printable``.
        processed = ''.join(ch for ch in line.strip() if ch in printable)
        tokens = en_nlp.tokenizer(processed)
        # Keep token strings, skipping whitespace-only tokens.
        words = [
            word
            for word in (str(token) for token in tokens)
            if not word.isspace()
        ]
        fout.write(' '.join(words) + "\n")
        if i % 100000 == 0:
            logger.info("processed {}".format(i))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment