stanlee321 · July 10, 2020 16:10
diff --git a/pandas_tricks.py b/pandas_tricks.py
 # Split a string column on spaces and expand the pieces into new columns.
 # With expand=True, str.split returns a DataFrame (one column per token), so
 # the pieces are joined back as name_0, name_1, ... instead of being assigned
 # to the single "name" column, which fails once a value has several tokens.
 name_parts = df["name"].str.split(" ", expand=True).add_prefix("name_")
 df = df.join(name_parts)

 # Group by two columns

 gkk = df.groupby(['Team', 'Position'])


 # Merge two DataFrames on a shared key column

 # Outer join on "Ministerio" keeps keys found in either frame; the NaN left
 # where one side has no match is replaced by 0, then the two count columns
 # get their display names.
 df_values = (
     pd.merge(df_com_count, max_work_com, how='outer', on="Ministerio")
     .fillna(0)
     .rename(columns={
         "TotalWork_COM": "Acciones: Comunicacion",
         "TotalWork": "Acciones: Matriz de Acciones",
     })
 )

 df_values.info()


 # SNS BAR PLOT WITH ROTATIONS

 def plot_bar(df_col, title='Call-center, top 10 coductas', top_n=10,
              xlabel='Tipo de Conducta', ylabel='Number of Calls'):
     """Show a bar chart of the most frequent values in ``df_col``.

     Args:
         df_col: Series (e.g. one DataFrame column) whose values are counted.
         title: Title drawn above the chart.
         top_n: How many of the most frequent values to plot.
         xlabel: Label of the x axis (the categories).
         ylabel: Label of the y axis (the counts).

     Returns:
         None. The figure is displayed with ``plt.show()``.
     """
     # value_counts() sorts by descending frequency, so head() is the top N.
     top_counts = df_col.value_counts().head(top_n)

     plt.figure(figsize=(10, 5))

     # x=/y= must be keywords: seaborn >= 0.12 rejects positional data vectors.
     bar_axes = sns.barplot(x=top_counts.index, y=top_counts.values, alpha=0.8)

     # Rotate the category labels so long names do not overlap.
     plt.setp(bar_axes.get_xticklabels(), rotation=90)

     plt.title(title)
     plt.ylabel(ylabel, fontsize=12)
     plt.xlabel(xlabel, fontsize=12)
     plt.show()
    

 def create_timestamp_indexes(self, df, column="", format=""):
     """Index ``df`` by timestamps parsed from one of its columns.

     Adds a ``time`` column holding ``pd.to_datetime(df[column])`` and makes it
     the index. ``df`` is modified in place and is also returned.

     Args:
         self: Unused; kept so existing call sites keep working.
         df: DataFrame to re-index.
         column: Name of the column holding the date/time values.
         format: ``strftime``-style format of those values. The default empty
             string lets pandas infer the format.

     Returns:
         The same ``df`` object, now indexed by ``time``.

     Raises:
         KeyError: If ``column`` is not a column of ``df``.
     """
     # An empty format string only matches empty input, so treat it as
     # "no format given" and let pandas infer one.
     df['time'] = pd.to_datetime(df[column], format=format or None)
     df.set_index('time', inplace=True)

     return df


 # Create excel sheets

 # Write each dataframe to its own worksheet of one workbook, using XlsxWriter
 # as the engine. The context manager saves and closes the file on exit, also
 # when a write raises; ExcelWriter.save() no longer exists in pandas 2.0+.
 with pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter') as writer:
     df1.to_excel(writer, sheet_name='Sheet1')
     df2.to_excel(writer, sheet_name='Sheet2')
     df3.to_excel(writer, sheet_name='Sheet3')
  
 # Group by and take the maximum

 # Largest "Tareas" value per "Ministerio", sorted ascending, as a frame with
 # columns "Ministerio" and "TotalWork". The built-in .max() reduction is used
 # because passing np.max to agg() raises a FutureWarning in recent pandas.
 max_work = (
     remote_df.groupby("Ministerio")["Tareas"]
     .max()
     .sort_values(ascending=True)
     .reset_index(name="TotalWork")
 )


 # REGEX: extract the value of the "id" query parameter

 import re

 # Match everything after "?id=" up to the next "&" (or the end of the string).
 # A class such as [^&comment_id] would exclude each listed character on its
 # own, cutting the id short at the first c, o, m, e, n, t, _, i or d.
 pattern = re.compile(r'(?<=\?id=)[^&]*')

 # Search the text for the first occurrence of the pattern.

 match = pattern.search(text)

 if match:
     name = match.group(0)
     print(name)
    
    
 #############################


 # Copy df_comments with an extra 0/1 column "topN00": 1 where the row's
 # commenter also appears in df_comments_topN00_c["commenter"], else 0.
 df_comments_top_N00 = df_comments.assign(
     topN00=lambda frame: frame["commenter"]
     .isin(df_comments_topN00_c["commenter"])
     .astype(int)
 )


 ###############################
 # Parse the datetime strings

 from dateutil import parser

 # parser.parse is applied to every "comment_timestamp" value and the results
 # are stored in a new "timestamp" column.
 df_comments["timestamp"] = df_comments["comment_timestamp"].apply(parser.parse)

 print(df_comments.shape)
 df_comments.head()






 # Horizontal bars: number of posts per scraped page

 df_post_counts = df_posts_raw["real_page_name"].value_counts()

 ax = df_post_counts.plot.barh(figsize=(13, 8))

 # Print each count next to its bar: x just past the bar end (v + 1), y at the
 # bar's position shifted by -0.15.
 for i, v in enumerate(df_post_counts):
     ax.text(v + 1, i - .15, str(v), color='blue', fontweight='bold')
	# Split a string column on spaces and expand the pieces into new columns.
	# With expand=True, str.split returns a DataFrame (one column per token), so
	# the pieces are joined back as name_0, name_1, ... instead of being assigned
	# to the single "name" column, which fails once a value has several tokens.
	name_parts = df["name"].str.split(" ", expand=True).add_prefix("name_")
	df = df.join(name_parts)

	# Group by two columns

	gkk = df.groupby(['Team', 'Position'])


	# Merge two DataFrames on a shared key column

	# Outer join on "Ministerio" keeps keys found in either frame; the NaN left
	# where one side has no match is replaced by 0, then the two count columns
	# get their display names.
	df_values = (
		pd.merge(df_com_count, max_work_com, how='outer', on="Ministerio")
		.fillna(0)
		.rename(columns={
			"TotalWork_COM": "Acciones: Comunicacion",
			"TotalWork": "Acciones: Matriz de Acciones",
		})
	)

	df_values.info()


	# SNS BAR PLOT WITH ROTATIONS

	def plot_bar(df_col, title='Call-center, top 10 coductas', top_n=10,
			xlabel='Tipo de Conducta', ylabel='Number of Calls'):
		"""Show a bar chart of the most frequent values in ``df_col``.

		Args:
			df_col: Series (e.g. one DataFrame column) whose values are counted.
			title: Title drawn above the chart.
			top_n: How many of the most frequent values to plot.
			xlabel: Label of the x axis (the categories).
			ylabel: Label of the y axis (the counts).

		Returns:
			None. The figure is displayed with ``plt.show()``.
		"""
		# value_counts() sorts by descending frequency, so head() is the top N.
		top_counts = df_col.value_counts().head(top_n)

		plt.figure(figsize=(10, 5))

		# x=/y= must be keywords: seaborn >= 0.12 rejects positional data vectors.
		bar_axes = sns.barplot(x=top_counts.index, y=top_counts.values, alpha=0.8)

		# Rotate the category labels so long names do not overlap.
		plt.setp(bar_axes.get_xticklabels(), rotation=90)

		plt.title(title)
		plt.ylabel(ylabel, fontsize=12)
		plt.xlabel(xlabel, fontsize=12)
		plt.show()


	def create_timestamp_indexes(self, df, column="", format=""):
		"""Index ``df`` by timestamps parsed from one of its columns.

		Adds a ``time`` column holding ``pd.to_datetime(df[column])`` and makes
		it the index. ``df`` is modified in place and is also returned.

		Args:
			self: Unused; kept so existing call sites keep working.
			df: DataFrame to re-index.
			column: Name of the column holding the date/time values.
			format: ``strftime``-style format of those values. The default
				empty string lets pandas infer the format.

		Returns:
			The same ``df`` object, now indexed by ``time``.

		Raises:
			KeyError: If ``column`` is not a column of ``df``.
		"""
		# An empty format string only matches empty input, so treat it as
		# "no format given" and let pandas infer one.
		df['time'] = pd.to_datetime(df[column], format=format or None)
		df.set_index('time', inplace=True)

		return df


	# Create excel sheets

	# Write each dataframe to its own worksheet of one workbook, using XlsxWriter
	# as the engine. The context manager saves and closes the file on exit, also
	# when a write raises; ExcelWriter.save() no longer exists in pandas 2.0+.
	with pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter') as writer:
		df1.to_excel(writer, sheet_name='Sheet1')
		df2.to_excel(writer, sheet_name='Sheet2')
		df3.to_excel(writer, sheet_name='Sheet3')

	# Group by and take the maximum

	# Largest "Tareas" value per "Ministerio", sorted ascending, as a frame with
	# columns "Ministerio" and "TotalWork". The built-in .max() reduction is used
	# because passing np.max to agg() raises a FutureWarning in recent pandas.
	max_work = (
		remote_df.groupby("Ministerio")["Tareas"]
		.max()
		.sort_values(ascending=True)
		.reset_index(name="TotalWork")
	)


	# REGEX: extract the value of the "id" query parameter

	import re

	# Match everything after "?id=" up to the next "&" (or the end of the string).
	# A class such as [^&comment_id] would exclude each listed character on its
	# own, cutting the id short at the first c, o, m, e, n, t, _, i or d.
	pattern = re.compile(r'(?<=\?id=)[^&]*')

	# Search the text for the first occurrence of the pattern.

	match = pattern.search(text)

	if match:
		name = match.group(0)
		print(name)


	#############################


	# Copy df_comments with an extra 0/1 column "topN00": 1 where the row's
	# commenter also appears in df_comments_topN00_c["commenter"], else 0.
	df_comments_top_N00 = df_comments.assign(
		topN00=lambda frame: frame["commenter"]
		.isin(df_comments_topN00_c["commenter"])
		.astype(int)
	)


	###############################
	# Parse the datetime strings

	from dateutil import parser

	# parser.parse is applied to every "comment_timestamp" value and the results
	# are stored in a new "timestamp" column.
	df_comments["timestamp"] = df_comments["comment_timestamp"].apply(parser.parse)

	print(df_comments.shape)
	df_comments.head()






	# Horizontal bars: number of posts per scraped page

	df_post_counts = df_posts_raw["real_page_name"].value_counts()

	ax = df_post_counts.plot(kind="barh", figsize=(13, 8))

	# Print each count next to its bar: x just past the bar end (v + 1), y at the
	# bar's position shifted by -0.15. The call must be indented under the
	# loop header to be its body.
	for i, v in enumerate(df_post_counts):
		ax.text(v + 1, i - .15, str(v), color='blue', fontweight='bold')