Skip to content

Instantly share code, notes, and snippets.

@allenanie
Last active January 25, 2019 19:19
Show Gist options
  • Save allenanie/f7458f45b103247607c2fa3953c6c58d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Python 2 script: tokenize crawled news .txt files with spaCy and flatten
# them into one corpus file (see __main__ block below).
import spacy
import logging
import string
import os

# Shared "time - module - level - message" log-line layout.
log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger(__name__)

# Small English pipeline; only its tokenizer is used below (no tagger/parser).
en_nlp = spacy.load("en_core_web_sm")
# ASCII printable characters — used to drop non-ASCII bytes from input lines.
printable = set(string.printable)
# Accumulates every file path discovered under the crawl directory.
file_names = []


def walk(path):
    """Collect all file paths under *path* into the global ``file_names``.

    Returns -1 when *path* does not exist; otherwise returns None after
    appending the full (root-joined) path of every file found by os.walk.
    """
    if not os.path.exists(path):
        return -1
    for dirpath, _subdirs, basenames in os.walk(path):
        file_names.extend(os.path.join(dirpath, base) for base in basenames)


walk('./news-crawl/')
if __name__ == "__main__":
    log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__name__)

    i = 0
    # `with` guarantees the output file is closed even if tokenization raises
    # (the original opened fout manually and leaked it on any exception).
    with open('./corpora_sep_flattened.txt', 'w') as fout:
        for path in file_names:
            # Only the crawled .txt articles belong to the corpus.
            if path[-4:] != '.txt':
                continue
            # Distinct names for file path vs. handle (original shadowed `f`).
            with open(path, 'r') as article:
                for l_n, line in enumerate(article):
                    i += 1
                    # Keep ASCII-printable chars only (Python 2: filter() on a
                    # str returns a str) and strip surrounding whitespace.
                    processed = filter(lambda x: x in printable, line).strip()
                    # Tokenize with spaCy's tokenizer component only.
                    tokens = en_nlp.tokenizer(unicode(processed))
                    words = [str(token) for token in tokens if not str(token).isspace()]
                    if l_n == 0:
                        # First line of an article marks the article boundary.
                        fout.write('>' + '=' * 25 + ' '.join(words) + '=' * 25 + '<' + '\n')
                    else:
                        fout.write(' '.join(words) + "\n")
                    if i % 1000000 == 0:
                        logger.info("processed {} sentences".format(i))
## Python 3 Version
# -*- coding: utf-8 -*-
# Python 3 script: tokenize one shuffled news corpus file with spaCy and
# write the space-joined tokens to a .toked.txt file.
import spacy
import logging
import string
import os

# Shared "time - module - level - message" log-line layout.
log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger(__name__)

# Small English pipeline; only its tokenizer is used below (no tagger/parser).
en_nlp = spacy.load("en_core_web_sm")
# ASCII printable characters — used to drop non-ASCII chars from input lines.
printable = set(string.printable)
fin = "news.2011.en.shuffled"

i = 0
# `with` guarantees both files are closed even if tokenization raises
# (the original opened fout manually and leaked it on any exception).
# The duplicate logging/basicConfig/getLogger re-setup was dropped: logging
# is already configured once at the top of this script.
with open('./news.2011.en.shuffled.toked.txt', 'w') as fout, open(fin, 'r') as f:
    for line in f:
        i += 1
        # Keep ASCII-printable chars only; in Python 3 filter() returns an
        # iterator, so join it straight back into a string (no list() needed).
        processed = ''.join(filter(lambda x: x in printable, line.strip()))
        # Tokenize with spaCy's tokenizer component only.
        tokens = en_nlp.tokenizer(processed)
        words = [str(token) for token in tokens if not str(token).isspace()]
        fout.write(' '.join(words) + "\n")
        if i % 100000 == 0:
            logger.info("processed {}".format(i))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment