EpiphanyMachine · July 21, 2013 04:51
diff --git a/feature create b/feature create
 # Build one feature vector per row of `train` from NLTK part-of-speech tags.
 # Columns read below: row[0] is tokenized as the question text, row[1] is
 # used as the follower count of the context topic, row[2] is tokenized as the
 # context topic, and row[5] is the anonymity flag.
 features = []  # one list of feature values per row
 for row in train:
    # PANDAS USAGE
    question_tokens = nltk.word_tokenize(row[0])  # all tokens (words, punc, etc)
    question_tags = nltk.pos_tag(question_tokens)  # (token, part-of-speech tag) pairs
    question_nouns = set()
    question_verbs = set()
    question_words = set()  # tokens that are neither nouns, verbs, nor "."
    question_adjectives = set()
    hasPNoun = False
    for token, pos in question_tags:  # separate parts of speech
        if pos in ("NNP", "NNPS"):
            hasPNoun = True
        if pos.startswith("N"):
            question_nouns.add(token)  # all noun types
        elif pos.startswith("V"):
            question_verbs.add(token)  # all verb types
        elif pos != ".":
            question_words.add(token)  # all non-punctuation types
        # Tested on its own: as a trailing `elif` this branch could never run,
        # because every "J*" tag already satisfies `pos != "."` above.
        if pos.startswith("J"):
            question_adjectives.add(token)  # all adjectives

    # items needed to test questions against the context topic
    num_ctwords_match_qwords = 0
    num_ctnoun_match_qnoun = 0
    context_topic_tokens = nltk.word_tokenize(row[2])  # all tokens (words, punc, etc)
    # Tag the context topic's own tokens; tagging question_tokens here would
    # just compare the question with itself.
    context_topic_tags = nltk.pos_tag(context_topic_tokens)
    for token, pos in context_topic_tags:  # separate parts of speech
        if pos.startswith("N"):
            # number of nouns common between context_topic and question
            if token in question_nouns:
                num_ctnoun_match_qnoun += 1
        elif pos != ".":
            # number of words common between context_topic and question
            if token in question_words:
                num_ctwords_match_qwords += 1

    # TODO: does first word match (Is..will..can..do..does..are..)

    # TODO: check if question uses proper capitalization
    #    question_correct_capitalization = 0
    #    for i in range(len(question_tokens)):
    #        if question_tokens[i] == ".":
    #            if question_tokens[i + 1][0] and question_tokens[i + 1][0].isupper():
    #                question_correct_capitalization += 1

    # Ratios are relative to the number of tagged tokens. float() forces true
    # division (plain `/` on two ints floors under Python 2), and an empty
    # question yields 0.0 instead of raising ZeroDivisionError.
    tag_count = len(question_tags)
    if tag_count:
        verb_ratio = float(len(question_verbs)) / tag_count
        adjective_ratio = float(len(question_adjectives)) / tag_count
    else:
        verb_ratio = adjective_ratio = 0.0

    # create features
    features.append([
        # PANDAS USAGE
        row[1],  # of followers in context topic (this row's value, not the whole column)
        len(question_tokens),  # of words in the question
        # TODO: # of topics
        # TODO: sum of followers in topics
        # PANDAS USAGE
        1 if row[5] else 0,  # 1 for anon 0 for non-anon
        num_ctnoun_match_qnoun,  # of common nouns between question text and topics
        num_ctwords_match_qwords,  # of common words between question text and topics
        # TODO: does first word match (Is..will..can..do..does..are..)
        # TODO: What kind of question is it? (Who? What? Where? When? Why? How?)
        # TODO: no additional topics
        # TODO: question text count > 50
        # TODO: # of sentences
        # TODO: ends with a question mark
        verb_ratio,  # ratio of verbs
        adjective_ratio,  # ratio of adjectives
        # TODO: question_correct_capitalization (words capitalized after a period)
        1 if hasPNoun else 0,  # Does the question have a proper noun in it?
        # TODO: Does the question have a name in it?
        # TODO: Does the question have a name of someone famous in it? (list of celebrities)
        # TODO: Is the question related to technology?
    ])
 print(features)
	# Build one feature vector per row of `train` from NLTK part-of-speech tags.
	# Columns read below: row[0] is tokenized as the question text, row[1] is
	# used as the follower count of the context topic, row[2] is tokenized as the
	# context topic, and row[5] is the anonymity flag.
	features = []  # one list of feature values per row
	for row in train:
		# PANDAS USAGE
		question_tokens = nltk.word_tokenize(row[0])  # all tokens (words, punc, etc)
		question_tags = nltk.pos_tag(question_tokens)  # (token, part-of-speech tag) pairs
		question_nouns = set()
		question_verbs = set()
		question_words = set()  # tokens that are neither nouns, verbs, nor "."
		question_adjectives = set()
		hasPNoun = False
		for token, pos in question_tags:  # separate parts of speech
			if pos in ("NNP", "NNPS"):
				hasPNoun = True
			if pos.startswith("N"):
				question_nouns.add(token)  # all noun types
			elif pos.startswith("V"):
				question_verbs.add(token)  # all verb types
			elif pos != ".":
				question_words.add(token)  # all non-punctuation types
			# Tested on its own: as a trailing `elif` this branch could never run,
			# because every "J*" tag already satisfies `pos != "."` above.
			if pos.startswith("J"):
				question_adjectives.add(token)  # all adjectives

		# items needed to test questions against the context topic
		num_ctwords_match_qwords = 0
		num_ctnoun_match_qnoun = 0
		context_topic_tokens = nltk.word_tokenize(row[2])  # all tokens (words, punc, etc)
		# Tag the context topic's own tokens; tagging question_tokens here would
		# just compare the question with itself.
		context_topic_tags = nltk.pos_tag(context_topic_tokens)
		for token, pos in context_topic_tags:  # separate parts of speech
			if pos.startswith("N"):
				# number of nouns common between context_topic and question
				if token in question_nouns:
					num_ctnoun_match_qnoun += 1
			elif pos != ".":
				# number of words common between context_topic and question
				if token in question_words:
					num_ctwords_match_qwords += 1

		# TODO: does first word match (Is..will..can..do..does..are..)

		# TODO: check if question uses proper capitalization
		# question_correct_capitalization = 0
		# for i in range(len(question_tokens)):
		#     if question_tokens[i] == ".":
		#         if question_tokens[i + 1][0] and question_tokens[i + 1][0].isupper():
		#             question_correct_capitalization += 1

		# Ratios are relative to the number of tagged tokens. float() forces true
		# division (plain `/` on two ints floors under Python 2), and an empty
		# question yields 0.0 instead of raising ZeroDivisionError.
		tag_count = len(question_tags)
		if tag_count:
			verb_ratio = float(len(question_verbs)) / tag_count
			adjective_ratio = float(len(question_adjectives)) / tag_count
		else:
			verb_ratio = adjective_ratio = 0.0

		# create features
		features.append([
			# PANDAS USAGE
			row[1],  # of followers in context topic (this row's value, not the whole column)
			len(question_tokens),  # of words in the question
			# TODO: # of topics
			# TODO: sum of followers in topics
			# PANDAS USAGE
			1 if row[5] else 0,  # 1 for anon 0 for non-anon
			num_ctnoun_match_qnoun,  # of common nouns between question text and topics
			num_ctwords_match_qwords,  # of common words between question text and topics
			# TODO: does first word match (Is..will..can..do..does..are..)
			# TODO: What kind of question is it? (Who? What? Where? When? Why? How?)
			# TODO: no additional topics
			# TODO: question text count > 50
			# TODO: # of sentences
			# TODO: ends with a question mark
			verb_ratio,  # ratio of verbs
			adjective_ratio,  # ratio of adjectives
			# TODO: question_correct_capitalization (words capitalized after a period)
			1 if hasPNoun else 0,  # Does the question have a proper noun in it?
			# TODO: Does the question have a name in it?
			# TODO: Does the question have a name of someone famous in it? (list of celebrities)
			# TODO: Is the question related to technology?
		])
	print(features)