Created
July 21, 2013 04:51
-
-
Save EpiphanyMachine/6047524 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build one numeric feature vector per training row and collect them in
# `features`.  Each vector is, in order:
#   [followers, question token count, anonymous flag,
#    nouns shared with the context topic, other words shared with the
#    context topic, verb ratio, adjective ratio, has-proper-noun flag]
#
# NOTE(review): assumes every row of `train` exposes the question text at
# index 0, the context-topic follower count at index 1, the context-topic
# name at index 2 and the anonymity flag at index 5 -- confirm against the
# code that loads `train`.
features = []
for row in train:
    # ---- question text ----------------------------------------------------
    question_tokens = nltk.word_tokenize(row[0])   # words, punctuation, etc.
    question_tags = nltk.pos_tag(question_tokens)  # (token, POS tag) pairs

    # Sets of distinct tokens per part of speech.
    question_nouns = set()
    question_verbs = set()
    question_words = set()       # neither noun, verb nor "." punctuation
    question_adjectives = set()
    hasPNoun = False
    for token, pos in question_tags:
        if pos in ("NNP", "NNPS"):
            hasPNoun = True
        if pos.startswith("N"):
            question_nouns.add(token)
        elif pos.startswith("V"):
            question_verbs.add(token)
        elif pos != ".":
            question_words.add(token)
            # Adjectives used to sit in an `elif` *after* the catch-all
            # branch above and were therefore never collected.  They are
            # still recorded as plain words so the word-overlap feature
            # keeps its meaning.
            if pos.startswith("J"):
                question_adjectives.add(token)

    # ---- context topic ----------------------------------------------------
    context_topic_tokens = nltk.word_tokenize(row[2])
    # Tag the context-topic tokens (the question tokens were tagged here by
    # mistake, so the topic text was never actually analysed).
    context_topic_tags = nltk.pos_tag(context_topic_tokens)

    num_ctnoun_match_qnoun = 0    # topic nouns that also occur as question nouns
    num_ctwords_match_qwords = 0  # other topic words that also occur in the question
    for token, pos in context_topic_tags:
        if pos.startswith("N"):
            if token in question_nouns:
                num_ctnoun_match_qnoun += 1
        elif pos != ".":
            if token in question_words:
                num_ctwords_match_qwords += 1

    # ---- ratios -----------------------------------------------------------
    # Force true division (plain `/` truncates ints on Python 2) and avoid
    # dividing by zero when the question yields no tokens.
    tag_count = len(question_tags)
    if tag_count:
        verb_ratio = len(question_verbs) / float(tag_count)
        adjective_ratio = len(question_adjectives) / float(tag_count)
    else:
        verb_ratio = 0.0
        adjective_ratio = 0.0

    # ---- assemble the feature vector ---------------------------------------
    features.append([
        row[1],                    # followers of the context topic (this row only)
        len(question_tokens),      # number of tokens in the question
        1 if row[5] else 0,        # 1 for anonymous, 0 for non-anonymous
        num_ctnoun_match_qnoun,    # nouns common to question and topic
        num_ctwords_match_qwords,  # other words common to question and topic
        verb_ratio,                # distinct verbs / total tokens
        adjective_ratio,           # distinct adjectives / total tokens
        1 if hasPNoun else 0,      # question contains a proper noun
    ])

    # TODO: feature ideas not implemented yet:
    #   - number of topics / sum of followers across topics
    #   - does the first word match (Is, Will, Can, Do, Does, Are, ...)
    #   - question type (Who? What? Where? When? Why? How?)
    #   - no additional topics
    #   - question text length > 50
    #   - number of sentences
    #   - ends with a question mark
    #   - correct capitalization after a period
    #   - contains a name / a famous person's name (list of celebrities)
    #   - is related to technology

print(features)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment