Last active
March 10, 2020 09:32
-
-
Save muokicaleb/c867bb6cc058d5f0d4fe82e7fe4e7530 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collection of independent pandas snippets. They assume that `df`, `df1`,
# `df2`, `df_` and `name_entity_recognition` already exist in the session.

# Count the distinct values in each column (a NaN counts as one value).
df.apply(lambda column: len(column.unique()))

# Count the null values in each column.
df.isnull().sum()

# Build a new column by applying a function to one field of every row.
# The lambda argument is named `row` so it no longer shadows the frame `df`.
df['Name_entity'] = df.apply(
    lambda row: name_entity_recognition(row['clean_tweet']), axis=1
)

# New df holding only a subset of the columns.
df2 = df[['id', 'review', 'features_importance', 'intent_sentences_raw_review']]

# Generate an id column: one integer code per distinct title/review pair.
df['id'] = (df['title'] + '_' + df['review']).astype('category').cat.codes

# Move the 'id' column to the first position, keeping the others in order.
cols = df.columns.tolist()
cols.insert(0, cols.pop(cols.index('id')))
df = df[cols]

# Concatenate dfs vertically (stack the rows).
df = pd.concat([df1, df2], axis=0)

# Concatenate dfs horizontally (place the columns side by side).
df = pd.concat([df1, df2], axis=1)

# Reset the index and discard the old one.
df = df.reset_index(drop=True)

# Drop rows that contain NaN.
df = df.dropna()

# View all entries: raise the display limit above the number of rows.
pd.set_option('display.max_rows', df.shape[0] + 1)

# Save as csv without the index column.
df.to_csv('saved.csv', index=False, encoding='utf-8')

# Rename columns.
df_ = df_.rename(columns={"sw_books": "book", "sw_chapter_num": "chapter_num"})

# Expand a comma-separated string column into one row per value.
# The split lists must be stored under the same name that is exploded;
# writing them to a different column leaves the frame unexpanded.
df2 = df.assign(score=df['score'].str.split(',')).explode('score')

# Split df into chunks and write each chunk to its own csv file.
# Disabled snippet, kept as a string for reference. As written,
# NUMBER_OF_SPLITS = len(df) produces one chunk per row; `file_path`,
# `filenames` and `file_mover` are not defined in this file.
"""NUMBER_OF_SPLITS = len(df)
for i, new_df in enumerate(np.array_split(df,NUMBER_OF_SPLITS)):
    print(f"creating file {i} of {len(df)}")
    with open(f"{file_path}/tmp/tw_{i}_{str(uuid.uuid4())}.csv","w") as fo:
        fo.write(new_df.to_csv())
file_mover(filenames)"""
print("csv files created")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def hypo_plots(col, pkind='bar'):
    """Print and plot the value counts of one column of the global ``df``.

    Args:
        col: Label of the column in ``df`` to summarise.
        pkind: Plot kind forwarded to the pandas ``plot`` call
            (defaults to ``'bar'``).

    Returns:
        None. The counts are printed and the figure is shown with
        ``plt.show()``.
    """
    # Compute the counts once and reuse them for both the report and the plot.
    counts = df[col].value_counts()
    print('Value counts of {} \n {}'.format(
        col, counts.sort_values(ascending=False)))
    # A space separates the column label from the caption; the original
    # concatenation ran the two together.
    counts.plot(kind=pkind, title=str(col) + " Value count plots")
    plt.show()
    # Disabled follow-up analysis, kept as a string for reference. It relies
    # on a `train` frame and an 'Item_Outlet_Sales' column that are not
    # defined in this snippet.
    """
    print('\n\n Sales from {} values' .format(col))
    print('\n {}' .format(df.groupby(train[col]).mean()['Item_Outlet_Sales'].sort_values(ascending=False)))
    print("\n")
    df.groupby(train[col]).mean()['Item_Outlet_Sales'].plot(kind=pkind,
    title= ("Item sales vs "+str(col)))
    """
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment