Skip to content

Instantly share code, notes, and snippets.

View pertschuk's full-sized avatar

Jack Pertschuk pertschuk

  • Ithaca, NY
View GitHub Profile
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
# Cap on how many passages from the collection get classified.
SUBSET_SIZE = 10000 # the number of passages to classify
# Client object from the google.cloud.language package imported above.
# NOTE(review): presumably needs Google Cloud credentials configured in the
# environment before it can be used - confirm.
client = language.LanguageServiceClient()
with open('./categories.tsv', 'w+') as outfile:
with open('./collectionandqueries/collection.tsv') as collection:
def label_from_category(category, confidence):
    """Map a category to a binary label and pass the confidence through.

    Args:
        category: Value searched with ``in`` for 'Health' and 'Science'
            (presumably a category-name string - confirm with the caller).
        confidence: Returned unchanged as the second tuple element.

    Returns:
        A ``(label, confidence)`` tuple, where ``label`` is 1 when either
        'Health' or 'Science' is found in ``category`` and 0 otherwise.
    """
    is_target = 'Health' in category or 'Science' in category
    return (1 if is_target else 0, confidence)
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
# Porter stemmer instance from NLTK.
# NOTE(review): where it is applied is not visible in this excerpt.
ps = PorterStemmer()
# Paths to the passage collection and to the categories TSV.
collection_file = './collectionandqueries/collection.tsv'
categories_file = './categories.tsv'
with open(categories_file) as categories:
categories_dict = dict()
# Normalize every line of $DATA_DIR/collection.tsv and pipe it to `vw` for
# prediction with the saved model `bio_model`. Stages, in order:
#   1. Replace the first space on each line with "|n ".
#      NOTE(review): presumably this marks the start of a feature namespace
#      `n`, and the pattern may originally have matched a tab - confirm.
#   2. Replace every ":", "," and "." with a space.
#   3. Lowercase all text.
#   4. Pass the text through `stmr` (presumably an external stemmer - confirm).
#   5. Run `vw` with the model given by -i and the listed ngram/skip options;
#      predictions are written to $DATA_DIR/preds.
sed 's/ /|n /' $DATA_DIR/collection.tsv | \
sed "s/:/ /g" | sed "s/,/ /g" | sed "s/\./ /g" | \
tr '[:upper:]' '[:lower:]' | stmr | \
vw -i bio_model --ngram n2 --skips n1 --predictions $DATA_DIR/preds