pertschuk · December 4, 2019 01:30
diff --git a/build_training_set.py b/build_training_set.py
 from nltk.stem import PorterStemmer
 from nltk.tokenize import word_tokenize
 import re

 ps = PorterStemmer()

 collection_file = './collectionandqueries/collection.tsv'
 categories_file = './categories.tsv'

 # Characters with structural meaning in Vowpal Wabbit's text input format:
 # ':' separates a feature name from its value and '|' opens a namespace, so
 # both must be removed from free text before it is written as features.
 vw_reserved = re.compile(r'[:|]')

 # Map every categorised doc_id to whatever label_from_category returns.
 # NOTE(review): the confidence column read below is never stored, yet the
 # second loop unpacks `label, confidence` from the stored value. Confirm that
 # label_from_category returns a (label, weight) pair; if it returns a bare
 # label, store (label_from_category(category), confidence) here instead.
 categories_dict = {}
 with open(categories_file) as categories:
     for line in categories:
         doc_id, category, confidence = line.split('\t')
         categories_dict[doc_id] = label_from_category(category)

 # input.vw has format <label> <weight> |n <lowercased, stemmed text>
 with open('input.vw', 'w') as output, open(collection_file) as collection:
     for line in collection:
         # Split on the first tab only, so a tab inside the passage text
         # cannot break the two-way unpacking.
         doc_id, text = line.split('\t', 1)
         if doc_id not in categories_dict:
             # Documents without a category are not training examples.
             continue
         label, confidence = categories_dict[doc_id]
         tokens = [ps.stem(word.lower()) for word in word_tokenize(text)]
         cleaned = vw_reserved.sub(' ', ' '.join(tokens))
         output.write('{} {} |n {} \n'.format(
             label, str(confidence).strip(), cleaned))
	from nltk.stem import PorterStemmer
	from nltk.tokenize import word_tokenize
	import re

	ps = PorterStemmer()

	collection_file = './collectionandqueries/collection.tsv'
	categories_file = './categories.tsv'

	# Characters with structural meaning in Vowpal Wabbit's text input format:
	# ':' separates a feature name from its value and '|' opens a namespace, so
	# both must be removed from free text before it is written as features.
	vw_reserved = re.compile(r'[:|]')

	# Map every categorised doc_id to whatever label_from_category returns.
	# NOTE(review): the confidence column read below is never stored, yet the
	# second loop unpacks `label, confidence` from the stored value. Confirm that
	# label_from_category returns a (label, weight) pair; if it returns a bare
	# label, store (label_from_category(category), confidence) here instead.
	categories_dict = {}
	with open(categories_file) as categories:
	    for line in categories:
	        doc_id, category, confidence = line.split('\t')
	        categories_dict[doc_id] = label_from_category(category)

	# input.vw has format <label> <weight> |n <lowercased, stemmed text>
	with open('input.vw', 'w') as output, open(collection_file) as collection:
	    for line in collection:
	        # Split on the first tab only, so a tab inside the passage text
	        # cannot break the two-way unpacking.
	        doc_id, text = line.split('\t', 1)
	        if doc_id not in categories_dict:
	            # Documents without a category are not training examples.
	            continue
	        label, confidence = categories_dict[doc_id]
	        tokens = [ps.stem(word.lower()) for word in word_tokenize(text)]
	        cleaned = vw_reserved.sub(' ', ' '.join(tokens))
	        # The namespace marker is a bare '|n'; a backslash before the pipe
	        # would be written into the file verbatim.
	        output.write('{} {} |n {} \n'.format(
	            label, str(confidence).strip(), cleaned))