This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#the dataset consists of baby product reviews on Amazon.com | |
#link for data: https://d18ky98rnyall9.cloudfront.net/_35bdebdff61378878ea2247780005e52_amazon_baby.gl.zip?Expires=1482278400&Signature=blPJv6YQNFgcZh~dULuDECzZlA6eGL1x9lzQKzHknqVHSdudmfjq0XPaokFjv-~Qy8nGADiBBdx4ar0BWgeboW1eTkYHOZzoUIMBfSPQGqA4Q9H8X8vwFyr9R-TC0LE4h4CsTRFH56BtbqpKtjKeJKxVv5E5LfZZiyhZEr6We5M_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A | |
import sframe

# Load the Amazon baby-product reviews from the local SFrame directory.
products = sframe.SFrame('amazon_baby.gl/')

# Clean the original data: remove punctuation, fill in N/A, remove neutral
# sentiment, perform a train/test split, produce the word count matrix.
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters stripped are exactly those in ``string.punctuation``.
    Works for Python 2 byte strings as well as Python 2 ``unicode`` and
    Python 3 ``str`` objects.
    """
    import string
    try:
        # Python 2 byte strings accept (table, deletechars).
        return text.translate(None, string.punctuation)
    except TypeError:
        # Text strings (Python 3 str / Python 2 unicode) take a single
        # mapping of code point -> replacement; None deletes the character.
        return text.translate({ord(ch): None for ch in string.punctuation})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Implement logistic regression from scratch.
import math
import pandas as pd
import numpy as np
# The dataset consists of a subset of baby product reviews on Amazon.com.
import sframe

products = sframe.SFrame('amazon_baby_subset.gl/')
# Convert the SFrame to a pandas DataFrame.
products = products.to_dataframe()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(sum(products['sentiment'] == 1))   # num of positive sentiment 26579
print(sum(products['sentiment'] == -1))  # num of negative sentiment 26493
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Logistic regression with L2 regularization.
import math
import pandas as pd
import numpy as np
# The dataset consists of a subset of baby product reviews on Amazon.com.
import sframe

products = sframe.SFrame('amazon_baby_subset.gl/')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(sum(products['sentiment'] == 1))   # num of positive sentiment 26579
print(sum(products['sentiment'] == -1))  # num of negative sentiment 26493
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Identifying safe loans with decision trees.
import math
import pandas as pd
import numpy as np
# The dataset consists of data from the LendingClub, used to predict whether a
# loan will be paid off in full or will be charged off and possibly go into
# default.
import sframe

loans = sframe.SFrame('lending-club-data.gl/')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build decision trees where the data contain only binary features.
import math
import pandas as pd
import numpy as np
# The dataset consists of data from the LendingClub, used to predict whether a
# loan will be paid off in full or will be charged off and possibly go into
# default.
import sframe

loans = sframe.SFrame('lending-club-data.gl/')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Explore various techniques for preventing overfitting in decision trees.
import math
import pandas as pd
import numpy as np
# The dataset consists of data from the LendingClub, used to predict whether a
# loan will be paid off in full or will be charged off and possibly go into
# default.
import sframe

loans = sframe.SFrame('lending-club-data.gl/')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use the pre-implemented gradient boosted trees.
import pandas as pd
import numpy as np
# The dataset consists of data from the LendingClub, used to predict whether a
# loan will be paid off in full or will be charged off and possibly go into
# default.
import sframe

loans = sframe.SFrame('lending-club-data.gl/')
# Target column 'safe_loans': +1 means a safe loan and -1 means a risky loan.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Boosting a decision stump from scratch.
import pandas as pd
import numpy as np
# The dataset consists of data from the LendingClub, used to predict whether a
# loan will be paid off in full or will be charged off and possibly go into
# default.
import sframe

loans = sframe.SFrame('lending-club-data.gl/')
# Target column 'safe_loans': +1 means a safe loan and -1 means a risky loan.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Explore precision and recall.
import pandas as pd
import numpy as np
# The dataset consists of baby product reviews on Amazon.com.
import sframe

products = sframe.SFrame('amazon_baby.gl/')
# Clean the original data: remove punctuation, fill in N/A, remove neutral
# sentiment, perform a train/test split, produce the word count matrix.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training logistic regression via stochastic gradient ascent.
import math
import pandas as pd
import numpy as np
# The dataset consists of a subset of baby product reviews on Amazon.com.
import sframe

products = sframe.SFrame('amazon_baby_subset.gl/')
# Convert the SFrame to a pandas DataFrame.
products = products.to_dataframe()
OlderNewer