This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def anova_machine(Cat_col, target_col, df):
    """Pivot the target by a categorical column as the first step of an ANOVA.

    Args:
        Cat_col: Name of the categorical column; each distinct value becomes
            one column of the pivot table.
        target_col: Name of the target (y) column whose values fill the table.
        df: The main data set containing both columns.

    The author's description says an ANOVA is then performed on the pivot
    table to see whether the columns are significantly different from each
    other, currently at 95% confidence.
    """
    # No ``index`` is given, so pandas keeps the frame's existing index as rows.
    p_table = df.pivot(columns=Cat_col, values=target_col)
    total_columns = len(p_table.columns)
    total_rows = len(p_table)
    # NOTE(review): this excerpt ends here. The ANOVA computation that would
    # use p_table / total_columns / total_rows is not visible -- restore it
    # from the full source rather than reconstructing it.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sig_num_columns(X_train, y_train, p_thres=0.05):
    """Run a linear regression of the target on each training column.

    Args:
        X_train: Training features. Pass numerical columns only; the author
            notes that other column types produce a shape error.
        y_train: Target values for the same rows.
        p_thres: p-value threshold; the default 0.05 corresponds to a 95%
            confidence level.

    The author's description says the function returns a dataframe holding
    the name and p value of the significant columns only.
    """
    from scipy.stats import linregress

    # Results live in a module-level dict that is reset on every call.
    global sig_num
    sig_num = {}
    for col in X_train:
        slope, intercept, rvalue, pvalue, stderr = linregress(X_train[col], y_train)
        # NOTE(review): this excerpt ends here. The comparison of ``pvalue``
        # against ``p_thres``, the filling of ``sig_num`` and the returned
        # dataframe are not visible -- restore them from the full source.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def replace_all_NaN(df):
    """Fill missing values across the whole data frame, in place.

    Use only when you are confident that missing numbers can become 0 and
    missing objects can become ``No_<column name>``.

    Args:
        df: Data frame to clean. Columns with dtype ``object`` get
            ``'No_' + column name``; columns with dtype ``float64`` get
            ``0.0``. Columns of any other dtype are left unchanged.
    """
    for col in df:
        column = df[col]
        # Skip columns with nothing missing so they are not reassigned.
        if not column.isna().any():
            continue
        if column.dtype == 'object':
            # str() keeps this working for non-string column labels.
            df[col] = column.fillna('No_' + str(col))
        elif column.dtype == 'float64':
            df[col] = column.fillna(0.0)
    # NOTE(review): the author's description says further dtypes will be added
    # and that ``info()`` is printed when finished; neither is visible in this
    # excerpt -- restore from the full source.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def first_n_pairs(dict_to_see, n):
    """Print the first ``n`` key/value pairs of a dictionary.

    Useful with large dictionaries to see what the data looks like.

    Args:
        dict_to_see: Dictionary to preview.
        n: Number of leading pairs (in iteration order) to print.
    """
    preview = {key: dict_to_see[key] for key in list(dict_to_see)[:n]}
    print(preview)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that writes each received status to ``tweets.txt``.

    Every status is stored as one JSON document per line, and the number of
    statuses written so far is kept in ``num_tweets``.
    """

    def __init__(self, api=None):
        # Forward ``api`` so a caller-supplied API object is not silently dropped.
        super(MyStreamListener, self).__init__(api=api)
        self.num_tweets = 0
        # Mode "w" starts a fresh file each time a listener is created.
        # NOTE(review): no close() is visible in this excerpt -- confirm the
        # full source closes the file when streaming stops.
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        """Write the raw JSON of ``status`` as one line and count it."""
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1
        # NOTE(review): this excerpt ends here; any stop condition or return
        # value that follows is not visible -- restore from the full source.
NewerOlder