This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def rule2_mod(text,index): | |
doc = nlp(text) | |
phrase = '' | |
for token in doc: | |
if token.i == index: | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function for rule 2 | |
def rule2(text): | |
doc = nlp(text) | |
pat = [] | |
# iterate over tokens | |
for token in doc: | |
phrase = '' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create a df containing sentence and its output for rule 1 | |
row_list = [] | |
# df2 contains all the sentences from all the speeches | |
for i in range(len(df2)): | |
sent = df2.loc[i,'Sent'] | |
year = df2.loc[i,'Year'] | |
output = rule1(sent) | |
dict1 = {'Year':year,'Sent':sent,'Output':output} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run rule 1 over every sentence in df3 and collect one record per row,
# keeping the year and the sentence text alongside the rule's output.
row_list = []
for i in range(len(df3)):
    sent, year = df3.loc[i, 'Sent'], df3.loc[i, 'Year']
    output = rule1(sent)
    dict1 = {'Year': year, 'Sent': sent, 'Output': output}
    row_list.append(dict1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function for rule 1: noun(subject), verb, noun(object) | |
def rule1(text): | |
doc = nlp(text) | |
sent = [] | |
for token in doc: | |
# if the token is a verb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Take the sentence stored at row label 9 of df3, echo it, then parse it
# and draw its dependency tree with displacy.
text = df3.loc[9, 'Sent']
print(text)
doc = nlp(text)
displacy.render(doc, jupyter=True, style='dep')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to check output percentage for a rule | |
def output_per(df,out_col): | |
result = 0 | |
for out in df[out_col]: | |
if len(out)!=0: | |
result+=1 | |
per = result/len(df) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import randint | |
def rand_sent(df):
    """Render the dependency parse of a randomly chosen sentence from *df*.

    A row label is drawn uniformly from ``0 .. len(df) - 1`` and printed; the
    text in that row's ``'Sent'`` column, minus its first character, is then
    parsed with ``nlp`` and drawn with ``displacy.render``.

    Args:
        df: Table with a ``'Sent'`` column, read via ``df.loc[label, 'Sent']``.
            Assumes the row labels are the integers 0..len(df)-1 -- TODO confirm
            against the caller.

    Returns:
        The integer row label that was picked.

    Raises:
        ValueError: If *df* is empty, since there is no row to pick.
    """
    # randint is inclusive at BOTH ends, so the upper bound has to be
    # len(df) - 1; passing len(df) would occasionally select a label one past
    # the last row and make the .loc lookup below fail.
    index = randint(0, len(df) - 1)
    print('Index = ', index)
    # The first character of the stored sentence is dropped before parsing.
    doc = nlp(df.loc[index, 'Sent'][1:])
    displacy.render(doc, style='dep', jupyter=True)
    return index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
row_list = [] | |
# df2 contains all sentences from all speeches | |
for i in range(len(df2)): | |
sent = df2.loc[i,'Sent'] | |
if (',' not in sent) and (len(sent.split()) <= 15): | |
year = df2.loc[i,'Year'] | |
length = len(sent.split()) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# rule to extract initiative name | |
def sent_subtree(text): | |
# pattern match for schemes or initiatives | |
patterns = [r'\b(?i)'+'plan'+r'\b', | |
r'\b(?i)'+'programme'+r'\b', | |
r'\b(?i)'+'scheme'+r'\b', | |
r'\b(?i)'+'campaign'+r'\b', | |
r'\b(?i)'+'initiative'+r'\b', | |
r'\b(?i)'+'conference'+r'\b', |