Skip to content

Instantly share code, notes, and snippets.

@pertschuk
Created December 4, 2019 01:30
Show Gist options
  • Save pertschuk/66dd8ade5d10752e385b3b3922f02dc5 to your computer and use it in GitHub Desktop.
Save pertschuk/66dd8ade5d10752e385b3b3922f02dc5 to your computer and use it in GitHub Desktop.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
ps = PorterStemmer()
collection_file = './collectionandqueries/collection.tsv'
categories_file = './categories.tsv'

# Build a lookup of doc_id -> (label, confidence) from the categories TSV.
# Every non-blank row must have exactly three tab-separated fields:
#   <doc_id> \t <category> \t <confidence>
categories_dict = {}
with open(categories_file, encoding='utf-8') as categories:
    for line in categories:
        if not line.strip():
            # Tolerate blank rows (e.g. a trailing newline at end of file)
            # instead of failing on the three-way unpack.
            continue
        doc_id, category, confidence = line.rstrip('\n').split('\t')
        # Store the confidence together with the label: the code that writes
        # input.vw unpacks each entry as a (label, confidence) pair, so
        # keeping only the label would break (or silently mis-unpack) there.
        # NOTE(review): label_from_category is not defined in the visible
        # part of this file; this assumes it returns a single label value
        # for a category string -- confirm against its definition.
        categories_dict[doc_id] = (label_from_category(category), confidence)
# input.vw has format <label> <weight> |n <lowercased, stemmed text>
# Both ':' (feature:value separator) and '|' (namespace marker) are reserved
# in the Vowpal Wabbit text format, so they are replaced with spaces in the
# feature text. Compiled once, outside the per-document loop.
_vw_reserved = re.compile(r'[:|]')

with open('input.vw', 'w', encoding='utf-8') as output, \
        open(collection_file, encoding='utf-8') as collection:
    for line in collection:
        # Split on the first tab only: a passage that itself contains tabs
        # (or a line with no tab at all) must not raise during unpacking.
        doc_id, _, text = line.partition('\t')
        if doc_id not in categories_dict:
            # Only documents that have a category entry are written out.
            continue
        label, confidence = categories_dict[doc_id]
        # Lowercase and stem every token produced by the NLTK tokenizer.
        tokens = [ps.stem(word.lower()) for word in word_tokenize(text)]
        cleaned = _vw_reserved.sub(' ', ' '.join(tokens))
        # strip() drops surrounding whitespace (e.g. a trailing newline carried
        # over from the TSV) so the weight stays on the same output line.
        output.write('{} {} |n {} \n'.format(
            label, str(confidence).strip(), cleaned))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment