# essay pipelines
# Feature-extraction pipeline definitions (pipeline_1 .. pipeline_6) for datasets DATASET_1 .. DATASET_6.
from textblob import TextBlob, Word
from features.skipgrams import EssaySkipgram
from features.essay_feature import EssayFeature, FunctionalTextEssayFeature, EssayTextConversion
from features.text_features import * # star import used below for: n_words, text_length, n_words_longer_than, words_length_mean, words_length_variance, unique_words_norm, n_sentences_longer_than
from features.word2vec_word_clusters import EssayTextToW2VClusters
from features.wiki_ngram_coverage import check_1gram_coverage, check_2gram_coverage, check_3gram_coverage
from features.convert_text_to_definitions import convert_text_to_definitions
#from features.word2vec_features import EssayWord2Vec, EssayWord2VecFirstWords
from lib.math.text_to_math import text_to_math
from lib.math.math_helpers import simplify_math
from lib.clean_text import safe_clean_text
from lib.spellcheck.spell_corrector import correct_text
from lib.porter import stemmer
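# The pipelines below appear to rely on a simple contract: EssayTextConversion
# derives a new named text variant from an existing one, while the feature steps
# read a variant and emit numeric features. The Essay container and run_pipeline
# helper sketched here are hypothetical, for readability only; the real classes
# live in features.*:
#
#   class Essay:
#       def __init__(self, raw_text):
#           self.texts = {"raw": raw_text}   # variant name -> text
#           self.features = {}               # feature name -> value
#
#   def run_pipeline(essay, pipeline):
#       for step in pipeline["steps"]:
#           step.apply(essay)                # assumed method name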
pipeline_1 = {
"name":"DATASET_1",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
#,EssayTextConversion(source="clean",dest="clean_spell",fun=correct_text)
#,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/Downloads/GoogleNews-vectors-negative300.bin.gz")
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
#,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
#,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
#,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
,EssaySkipgram(name="LETTER",source="clean",base=lambda text: text, nskip=0, ngram=3)
#,EssaySkipgram(name="WORD",source="w2v",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="WORD",source="clean",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
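# A sketch of the counting behind the EssaySkipgram steps above, assuming nskip
# and ngram carry their usual skip-gram meaning (this helper is illustrative
# only, not the gist's EssaySkipgram):
from itertools import combinations

def _skipgrams(tokens, ngram, nskip=0):
    # every ngram-length tuple drawn left-to-right from a window of
    # ngram + nskip tokens; nskip=0 reduces to plain contiguous n-grams
    grams = []
    for start in range(len(tokens) - ngram + 1):
        window = tokens[start:start + ngram + nskip]
        for idx in combinations(range(len(window)), ngram):
            if idx[0] == 0:  # anchor each gram at its window start
                grams.append(tuple(window[i] for i in idx))
    return grams
# _skipgrams("cat", ngram=3)           -> [('c','a','t')]          # LETTER, ngram=3
# _skipgrams("a b c".split(), ngram=1) -> [('a',), ('b',), ('c',)] # WORD, ngram=1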
pipeline_2 = {
"name":"DATASET_2",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
#,EssayTextConversion(source="clean",dest="clean_spell",fun=correct_text)
#,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/McGraw/v2/features/GoogleNews-vectors-negative300.bin")
,EssayFeature(fun=lambda essay: simplify_math(essay.texts["clean"]))
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
# sentiment
,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
# ngrams
,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1) # count of each character
,EssaySkipgram(name="LETTER2",source="clean",base=lambda text: text, nskip=0, ngram=3)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="POS1",source="pos",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
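# pipeline_2 additions in brief: the "pos" variant keeps only the tag from each
# (word, tag) pair that TextBlob yields, and the sentiment features read
# TextBlob's polarity in [-1, 1] and subjectivity in [0, 1]. For example
# (TextBlob needs its NLTK corpora downloaded on first use; values come from
# its pattern lexicon):
#
#   TextBlob("The cat sat").tags       # -> [('The','DT'), ('cat','NN'), ('sat','VBD')]
#   " ".join(k[1] for k in _)          # -> "DT NN VBD", the "pos" text
#   TextBlob("great essay").sentiment  # -> Sentiment(polarity=0.8, subjectivity=0.75)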
pipeline_3 = {
"name":"DATASET_3",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/McGraw/v2/features/GoogleNews-vectors-negative300.bin")
,EssayFeature(fun=lambda essay: simplify_math(essay.texts["clean"]))
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
# sentiment
,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
# ngrams
,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1) # count of each character
,EssaySkipgram(name="LETTER2",source="clean",base=lambda text: text, nskip=0, ngram=3)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="WORD",source="w2v",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="POS1",source="pos",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
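# pipeline_3 is the only pipeline with EssayTextToW2VClusters enabled. A sketch
# of the assumed behaviour (an assumption, not the gist's implementation): embed
# each in-vocabulary word with the GoogleNews word2vec model, k-means the
# vectors into int(vocab_size/4) clusters, and rewrite the text as cluster ids
# so the WORD skipgrams over "w2v" can generalize across near-synonyms.
#
#   from gensim.models import KeyedVectors
#   from sklearn.cluster import KMeans
#
#   # w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
#   def _w2v_cluster_text(text, w2v, n_clusters=lambda d: int(d / 4)):
#       words = sorted({w for w in text.split() if w in w2v})
#       if not words:
#           return ""
#       k = max(1, n_clusters(len(words)))
#       labels = KMeans(n_clusters=k).fit_predict([w2v[w] for w in words])
#       cluster = dict(zip(words, labels))
#       return " ".join("C%d" % cluster[w] for w in text.split() if w in cluster)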
# pipeline_4 = pipeline_2
# + word 2-grams (stemmed text)
# + word 3-grams (stemmed text)
pipeline_4 = {
"name":"DATASET_4",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
#,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/McGraw/v2/features/GoogleNews-vectors-negative300.bin")
,EssayFeature(fun=lambda essay: simplify_math(essay.texts["clean"]))
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
# sentiment
,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
# ngrams
,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1) # count of each character
,EssaySkipgram(name="LETTER2",source="clean",base=lambda text: text, nskip=0, ngram=3)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=2)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=3)
#,EssaySkipgram(name="WORD",source="w2v",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="POS1",source="pos",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
# pipeline_5 = pipeline_4
# + spell checking (correct_text rewrites "clean" in place, so the stem and pos
#   variants below are built from spell-corrected text)
pipeline_5 = {
"name":"DATASET_5",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
,EssayTextConversion(source="clean",dest="clean",fun=correct_text)
,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
#,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/McGraw/v2/features/GoogleNews-vectors-negative300.bin")
,EssayFeature(fun=lambda essay: simplify_math(essay.texts["clean"]))
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
# sentiment
,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
# ngrams
,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1) # count of each character
,EssaySkipgram(name="LETTER2",source="clean",base=lambda text: text, nskip=0, ngram=3)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=2)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=3)
#,EssaySkipgram(name="WORD",source="w2v",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="POS1",source="pos",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
# pipeline_6 = pipeline_2
# + word-definition expansion (WORD_DEF unigrams over the "clean_def" variant)
pipeline_6 = {
"name":"DATASET_6",
"steps":[
EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
,EssayTextConversion(source="clean",dest="clean_def",fun=convert_text_to_definitions)
,EssayTextConversion(source="clean_def",dest="clean_def",fun=safe_clean_text)
#,EssayTextConversion(source="clean",dest="clean_spell",fun=correct_text)
#,EssayTextToW2VClusters(source="clean",dest="w2v",n_clusters=lambda d: int(d/4),w2v_path="/home/pawel/McGraw/v2/features/GoogleNews-vectors-negative300.bin")
,EssayFeature(fun=lambda essay: simplify_math(essay.texts["clean"]))
,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
,FunctionalTextEssayFeature(feature_name="wiki_1gram_coverage", fun=lambda essay: check_1gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_2gram_coverage", fun=lambda essay: check_2gram_coverage(essay.texts["raw"]))
,FunctionalTextEssayFeature(feature_name="wiki_3gram_coverage", fun=lambda essay: check_3gram_coverage(essay.texts["raw"]))
# sentiment
,FunctionalTextEssayFeature(feature_name="sentiment_polarity", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.polarity)
,FunctionalTextEssayFeature(feature_name="sentiment_subj", fun=lambda essay: TextBlob(essay.texts["clean"]).sentiment.subjectivity)
# ngrams
,EssaySkipgram(name="LETTER1",source="raw",base=lambda text: text, nskip=0, ngram=1) # count of each character
,EssaySkipgram(name="LETTER2",source="clean",base=lambda text: text, nskip=0, ngram=3)
,EssaySkipgram(name="WORD",source="stem",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="POS1",source="pos",base=lambda text: text.split(), nskip=0, ngram=1)
,EssaySkipgram(name="WORD_DEF",source="clean_def",base=lambda text: text.split(), nskip=0, ngram=1)
]
}
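# pipeline_6's WORD_DEF unigrams run over a definition-expanded copy of the
# text. A sketch of what convert_text_to_definitions is assumed to do (an
# assumption; the real code is in features.convert_text_to_definitions), using
# the Word class imported above, whose .definitions property returns WordNet
# glosses:
#
#   def _expand_definitions(text):
#       out = []
#       for token in text.split():
#           defs = Word(token).definitions   # [] when WordNet has no entry
#           out.append(" ".join(defs) if defs else token)
#       return " ".join(out)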