Skip to content

Instantly share code, notes, and snippets.

@muokicaleb
Last active March 10, 2020 09:32
Show Gist options
  • Save muokicaleb/c867bb6cc058d5f0d4fe82e7fe4e7530 to your computer and use it in GitHub Desktop.
Save muokicaleb/c867bb6cc058d5f0d4fe82e7fe4e7530 to your computer and use it in GitHub Desktop.
# Count the distinct values in each column (NaN is counted as a value).
df.nunique(dropna=False)
# Count the missing values in each column.
df.isna().sum()
# Derive a new column by applying a function to each value of another column.
# Series.apply hands every 'clean_tweet' value straight to the function; the
# row-wise DataFrame.apply(axis=1) form built a Series per row only to read one
# field, and its lambda parameter shadowed the outer `df`.
df['Name_entity'] = df['clean_tweet'].apply(name_entity_recognition)
# Build a new DataFrame that holds only the selected columns.
df2 = df.loc[:, ['id', 'review', 'features_importance', 'intent_sentences_raw_review']]
# Generate an integer id column: rows whose combined "title_review" string is
# identical receive the same category code.
df['id'] = df['title'].add('_').add(df['review']).astype('category').cat.codes
# Move the 'id' column to the front, keeping the other columns in their order.
cols = df.columns.tolist()
n = cols.index('id')
cols.insert(0, cols.pop(n))
df = df[cols]
# Concatenate DataFrames vertically (rows of df2 stacked below df1).
df = pd.concat([df1, df2], axis='index')
# Concatenate DataFrames horizontally (columns of df2 placed beside df1).
df = pd.concat([df1, df2], axis='columns')
# Reset the index to the default 0..n-1 range, discarding the old index.
df = df.reset_index(drop=True)
# Drop every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')
# Raise the display limit so that every row is shown.
pd.set_option('display.max_rows', len(df) + 1)
# Save to CSV as UTF-8, without writing the index column.
df.to_csv('saved.csv', encoding='utf-8', index=False)
# Rename columns using an old-name -> new-name mapping.
df_ = df_.rename({"sw_books": "book", "sw_chapter_num": "chapter_num"}, axis='columns')
# Expand a comma-separated string column into one row per item.
# The split result has to overwrite the very column that is exploded: storing
# it in a separate 'scores' column left 'score' as plain strings, so
# explode('score') had nothing to expand and returned the rows unchanged.
df2 = df.assign(score=df['score'].str.split(',')).explode('score')
# Split df into chunks and write each chunk to its own CSV file.
# NOTE(review): the splitting code below is disabled -- it sits inside a bare
# string literal, so no files are written; only the print() after it executes.
# As written, NUMBER_OF_SPLITS = len(df) asks np.array_split for as many chunks
# as there are rows, i.e. one row per file.
# file_path, uuid, file_mover and filenames are not defined in this snippet --
# presumably provided elsewhere; confirm before re-enabling.
"""NUMBER_OF_SPLITS = len(df)
for i, new_df in enumerate(np.array_split(df,NUMBER_OF_SPLITS)):
print(f"creating file {i} of {len(df)}")
with open(f"{file_path}/tmp/tw_{i}_{str(uuid.uuid4())}.csv","w") as fo:
fo.write(new_df.to_csv())
file_mover(filenames)"""
# Runs unconditionally, even while the file-writing code above is disabled.
print("csv files created")
def hypo_plots(col, pkind='bar'):
    """Print and plot the value counts of one column of the module-level ``df``.

    Args:
        col: Label of the column in ``df`` to summarise.
        pkind: Plot kind forwarded to ``Series.plot`` (default ``'bar'``).

    Returns:
        None. The counts are printed and the figure is shown via ``plt.show()``.
    """
    # Count once and reuse the result for both the printout and the plot.
    counts = df[col].value_counts()
    print('Value counts of {} \n {}'.format(col, counts.sort_values(ascending=False)))
    # The leading space keeps the column name from running into the title text.
    counts.plot(kind=pkind, title=str(col) + " Value count plots")
    plt.show()
    # Disabled example, kept for reference: mean 'Item_Outlet_Sales' per value
    # of `col`. It depends on a `train` frame that is not defined in this snippet.
    #
    # print('\n\n Sales from {} values' .format(col))
    # print('\n {}' .format(df.groupby(train[col]).mean()['Item_Outlet_Sales'].sort_values(ascending=False)))
    # print("\n")
    # df.groupby(train[col]).mean()['Item_Outlet_Sales'].plot(kind=pkind,
    # title= ("Item sales vs "+str(col)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment