This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define a pipeline combining a text feature extractor with a multi-label
# classifier: TF-IDF features feed a MultinomialNB wrapped in
# OneVsRestClassifier.
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])
#Train the model and get the prediction | |
score = [] | |
for code in label: | |
# train the model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create pipeline for GradientBoostingClassifier.
# Hyper-parameters are kept in a dict so they can be unpacked into the
# estimator below.
params = {
    'n_estimators': 1200,
    'max_depth': 3,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'min_samples_leaf': 1,
    'random_state': 3,
}
GBC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(ensemble.GradientBoostingClassifier(**params), n_jobs=1)),
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an instance of the TfidfVectorizer.
vect = TfidfVectorizer()
# Fit the vectorizer to the corpus, creating the vocabulary. The transformed
# matrix was never used here, so fit() is called instead of fit_transform().
vect.fit(data_df['Your_freetext_column'])
# Show the size of the fitted vocabulary and the vocabulary itself
# (term -> column index). In a notebook only the last expression is displayed.
len(vect.vocabulary_)
vect.vocabulary_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get the TF-IDF score of each document in the corpus as a dense array and
# wrap it in a pandas DataFrame (one row per document, one column per term).
vect_data = pd.DataFrame(vect_uncommon.transform(uncommon_df['Fault Desc_Rect']).toarray())
# Append a new column containing the label of each document.
# NOTE(review): features come from `uncommon_df` but labels from `data_df`;
# the assignment requires both to have the same number of rows - confirm they
# refer to the same documents in the same order.
vect_data['Fault Code'] = list(data_df['Your_Label'])
vect_data.shape
vect_data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_top_words(label, n):
    """Return the ``n`` highest-scoring words for one row of ``vect_data``.

    Reads the module-level ``vect_data`` (row selected with ``.loc[label]``)
    and the module-level fitted vectorizer ``vect`` (its ``vocabulary_``
    mapping of word -> column index).
    Assumes ``vect_data`` is indexed by class label and that its column
    positions line up with ``vect.vocabulary_`` indices - TODO confirm.

    Args:
        label: Index key of the row in ``vect_data`` to inspect. (The
            original parameter was named ``class``, a reserved keyword.)
        n: Number of words to return. Values larger than the row length are
            clamped; zero or negative values yield an empty list.

    Returns:
        A list of ``[word, score]`` pairs for the ``n`` largest scores.
        The pairs are not sorted by score (``np.argpartition`` order).
    """
    scores = list(vect_data.loc[label])
    n = min(n, len(scores))
    if n <= 0:
        # Without this guard, ``[-0:]`` would select every word.
        return []

    top_indices = np.argpartition(scores, -n)[-n:]
    # Invert the vocabulary once instead of scanning it for every index.
    index_to_word = {index: word for word, index in vect.vocabulary_.items()}
    return [[index_to_word[int(i)], scores[int(i)]] for i in top_indices]
# Test the function on class 'ACON': grab its top 6 words.
find_top_words('ACON', 6)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
top_words = []
# Loop to find the top 5 words of each class in the dataset.
# A plain loop (not a comprehension) is kept so that `code` stays bound at
# module level, as in the original snippet.
for code in vect_data.index:
    top_words.append([code, find_top_words(code, 5)])
# Show the list of [class, top words] entries.
top_words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fig = go.Figure(data=[edge_trace, node_trace], | |
layout=go.Layout( | |
title='<br>Network Graph of '+str(num_nodes)+' rules', | |
titlefont=dict(size=16), | |
showlegend=False, | |
hovermode='closest', | |
margin=dict(b=20,l=5,r=5,t=40), | |
annotations=[ dict( | |
showarrow=False, | |
xref="paper", yref="paper", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
app.layout = html.Div([ | |
html.Div(dcc.Graph(id='Graph',figure=fig)), | |
html.Div(className='row', children=[ | |
html.Div([html.H2('Overall Data'), | |
html.P('Num of nodes: ' + str(len(G.nodes))), | |
html.P('Num of edges: ' + str(len(G.edges)))], | |
className='three columns'), | |
html.Div([ | |
html.H2('Selected Data'), | |
html.Div(id='selected-data'), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'): | |
# maximum of 6 value cols -> 6 colors | |
colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464'] | |
labelList = [] | |
colorNumList = [] | |
for catCol in cat_cols: | |
labelListTemp = list(set(df[catCol].values)) | |
colorNumList.append(len(labelListTemp)) | |
labelList = labelList + labelListTemp | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import plotly
# NOTE(review): `plotly.plotly` appears to have moved to the separate
# chart-studio package in plotly 4+, and `py` is unused in this snippet -
# confirm the installed plotly version before relying on this import.
import plotly.plotly as py

# Build the Sankey figure from the four level columns, weighted by 'count',
# then render it with plotly's offline mode.
fig = genSankey(
    df,
    cat_cols=['lvl1', 'lvl2', 'lvl3', 'lvl4'],
    value_cols='count',
    title='Word Etymology',
)
plotly.offline.plot(fig, validate=False)
OlderNewer