muokicaleb · March 10, 2020 09:32
diff --git a/common_df.functions b/common_df.functions
 # count the distinct values in each column
 df.apply(lambda column: len(column.unique()))

 # count null values in each column
 df.isnull().sum()

 # build a new column by calling a function on one field of every row
 df['Name_entity'] = df.apply(lambda row: name_entity_recognition(row['clean_tweet']), axis=1)

 # new df with a subset of columns
 df2 = df[['id', 'review', 'features_importance', 'intent_sentences_raw_review']]

 # generate id column: integer category code of the "<title>_<review>" string
 df['id'] = (df['title'] + '_' + df['review']).astype('category').cat.codes

 # move the 'id' column to the first position
 cols = df.columns.tolist()
 n = cols.index('id')
 cols = [cols[n]] + cols[:n] + cols[n+1:]
 df = df[cols]

 # concatenate dfs vertically
 df = pd.concat([df1, df2], axis=0)

 # concatenate dfs horizontally
 df = pd.concat([df1, df2], axis=1)

 # reset index (drop=True discards the old index instead of keeping it as a column)
 df = df.reset_index(drop=True)

 # drop rows containing NaN
 df = df.dropna()

 # view all entries
 pd.set_option('display.max_rows', df.shape[0]+1)

 # save csv
 df.to_csv('saved.csv', index=False, encoding='utf-8')

 # rename columns
 df_ = df_.rename(columns={"sw_books": "book", "sw_chapter_num": "chapter_num"})

 # expand a comma-separated column into one row per item.
 # The split lists are stored in 'scores', so 'scores' is the column to explode;
 # exploding the unsplit 'score' column would leave the frame unchanged.
 df2 = df.assign(scores=df.score.str.split(',')).explode('scores')

 # split df into pieces and write each piece to its own csv (disabled snippet)
 # NOTE(review): NUMBER_OF_SPLITS = len(df) asks for one piece per row -- confirm intended.
 """NUMBER_OF_SPLITS = len(df)
        for i, new_df in enumerate(np.array_split(df,NUMBER_OF_SPLITS)):
            print(f"creating file {i} of {len(df)}")
            with open(f"{file_path}/tmp/tw_{i}_{str(uuid.uuid4())}.csv","w") as fo:
                    fo.write(new_df.to_csv())
        file_mover(filenames)"""
 print("csv files created")
diff --git a/valcounts b/valcounts
 def hypo_plots(col, pkind='bar'):
    """Print and plot the value counts of one column of the global ``df``.

    Args:
        col: label of the column in ``df`` to summarise.
        pkind: plot kind passed to ``Series.plot`` (``'bar'`` by default).
    """
    # Compute the counts once and reuse them for both the printout and the plot.
    counts = df[col].value_counts()
    print('Value counts of {} \n {}' .format(col, counts.sort_values(ascending=False)))
    # The space before "Value" keeps the column name from running into the title text.
    counts.plot(kind=pkind, title=(str(col) + " Value count plots"))
    plt.show()
    # Disabled snippet kept for reference: sales figures grouped by the column.
    """
    print('\n\n Sales from {} values' .format(col))
    print('\n {}' .format(df.groupby(train[col]).mean()['Item_Outlet_Sales'].sort_values(ascending=False)))
    
    print("\n")
    df.groupby(train[col]).mean()['Item_Outlet_Sales'].plot(kind=pkind,
                                                                     title= ("Item sales vs "+str(col)))
    """
	# count the distinct values in each column
	df.apply(lambda column: len(column.unique()))

	# count null values in each column
	df.isnull().sum()

	# build a new column by calling a function on one field of every row
	df['Name_entity'] = df.apply(lambda row: name_entity_recognition(row['clean_tweet']), axis=1)

	# new df with a subset of columns
	df2 = df[['id', 'review', 'features_importance', 'intent_sentences_raw_review']]

	# generate id column: integer category code of the "<title>_<review>" string
	df['id'] = (df['title'] + '_' + df['review']).astype('category').cat.codes

	# move the 'id' column to the first position
	cols = df.columns.tolist()
	n = cols.index('id')
	cols = [cols[n]] + cols[:n] + cols[n+1:]
	df = df[cols]

	# concatenate dfs vertically
	df = pd.concat([df1, df2], axis=0)

	# concatenate dfs horizontally
	df = pd.concat([df1, df2], axis=1)

	# reset index (drop=True discards the old index instead of keeping it as a column)
	df = df.reset_index(drop=True)

	# drop rows containing NaN
	df = df.dropna()

	# view all entries
	pd.set_option('display.max_rows', df.shape[0]+1)

	# save csv
	df.to_csv('saved.csv', index=False, encoding='utf-8')

	# rename columns
	df_ = df_.rename(columns={"sw_books": "book", "sw_chapter_num": "chapter_num"})

	# expand a comma-separated column into one row per item.
	# The split lists are stored in 'scores', so 'scores' is the column to explode;
	# exploding the unsplit 'score' column would leave the frame unchanged.
	df2 = df.assign(scores=df.score.str.split(',')).explode('scores')

	# split df into pieces and write each piece to its own csv (disabled snippet)
	# NOTE(review): NUMBER_OF_SPLITS = len(df) asks for one piece per row -- confirm intended.
	"""NUMBER_OF_SPLITS = len(df)
	for i, new_df in enumerate(np.array_split(df,NUMBER_OF_SPLITS)):
	print(f"creating file {i} of {len(df)}")
	with open(f"{file_path}/tmp/tw_{i}_{str(uuid.uuid4())}.csv","w") as fo:
	fo.write(new_df.to_csv())
	file_mover(filenames)"""
	print("csv files created")
	def hypo_plots(col, pkind='bar'):
		"""Print and plot the value counts of one column of the global ``df``.

		Args:
			col: label of the column in ``df`` to summarise.
			pkind: plot kind passed to ``Series.plot`` (``'bar'`` by default).
		"""
		# Compute the counts once and reuse them for both the printout and the plot.
		counts = df[col].value_counts()
		print('Value counts of {} \n {}' .format(col, counts.sort_values(ascending=False)))
		# The space before "Value" keeps the column name from running into the title text.
		counts.plot(kind=pkind, title=(str(col) + " Value count plots"))
		plt.show()
		# Disabled snippet kept for reference: sales figures grouped by the column.
		"""
		print('\n\n Sales from {} values' .format(col))
		print('\n {}' .format(df.groupby(train[col]).mean()['Item_Outlet_Sales'].sort_values(ascending=False)))

		print("\n")
		df.groupby(train[col]).mean()['Item_Outlet_Sales'].plot(kind=pkind,
		title= ("Item sales vs "+str(col)))
		"""