Last active
March 4, 2026 12:14
-
-
Save mariogarcia-ar/19e0e83b26d9b4968e05a8113686927e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd

# load dataset
df = pd.read_csv("data.csv")

# axis 0 -> row -> i
# axis 1 -> col -> j

# get columns / get index
df.columns
df.index

# data types
# float, int, Int64, object, category, bool, boolean, datetime64[ns]
df.dtypes
df.dtypes.value_counts()  # FIX: was .values_count(), which does not exist

# summary: dtype, count, non-null, memory
df.info()

# select column(s) / row(s)
df.loc[:, "col1"]
df.loc[:, ["col1", "col2"]]  # FIX: closing bracket was missing
df.iloc[:, [3, 4]]           # FIX: closing bracket was missing
# Series
sr = df["col1"]
len(sr)

# sample rows reproducibly
df.sample(n=5, random_state=42)

# count occurrences of each value in a Series
sr.value_counts()

# unique values: DataFrame has no .unique() — use .nunique() per column
df.nunique()  # FIX: was df.unique(), which raises AttributeError
sr.unique()

# statistics
sr.min(); sr.max(); sr.mean(); sr.median(); sr.std(); sr.quantile(0.2)

# all stats at once
sr.describe()

# missing values
sr.isna()     # boolean mask of NaN
sr.fillna(0)  # fill all null values with 0
sr.dropna()   # remove null values
sr.hasnans    # True if the Series contains any NaN
sr.notna()    # boolean mask of non-NaN

# arithmetic methods
sr.add(1); sr.sub(1); sr.mul(1); sr.div(1); sr.floordiv(1); sr.mod(1); sr.pow(1)

# comparison methods:  <  >  <=  >=  ==  !=
sr.lt(7); sr.gt(7); sr.le(7); sr.ge(7); sr.eq(7); sr.ne(7)

# method chaining
sr.isna().sum()
df.isna().sum()
# chain a custom function with .pipe
def myprint(a):
    """Print *a* in the form ``value is <a>``; returns None."""
    print(f"value is {a}")  # FIX: f-string instead of dated %-formatting (same output)
# feed a cleaned Series into a function with .pipe
sr.fillna(0).pipe(myprint)  # FIX: redundant parentheses removed

# rename columns / index labels
col_map = {"col1": "col_one"}
df.rename(columns=col_map)
idx_map = {"row1": "row_one"}
df.rename(index=idx_map)

# add a column (in place vs. returning a new frame)
df["has_seen"] = 0     # mutates df
df.assign(has_seen=0)  # returns a new DataFrame

# row-wise totals
total = df.col1 + df.col2 + df.col3
cols = ["col1", "col2", "col3"]
sum_total = df.loc[:, cols].sum(axis="columns")
df.assign(total=sum_total)
def sum_likes(df):
    """Row-wise sum over every column whose name contains 'like'
    and also mentions either 'actor' or 'director'."""
    like_cols = [
        name
        for name in df.columns
        if "like" in name and ("actor" in name or "director" in name)
    ]
    return df[like_cols].sum(axis=1)
# insert a new column at a specific position
df.insert(
    loc=df.columns.get_loc("col3"),
    column="newcol",
    value=df.col2 - df.col18,
)

# remove a column
del df["col3"]

# -------------------------------------------------------------------------------------------------
# 2 - Essential DataFrame Operations
# -------------------------------------------------------------------------------------------------
type(df[["col1"]])          # DataFrame
type(df["col1"])            # Series
type(df.loc[:, ["col1"]])   # DataFrame
type(df.loc[:, "col1"])     # Series  (FIX: was df.log, a typo for df.loc)

cols = ["col1", "col2", "col3"]
df_col = df[cols]

# filter / select columns
df.select_dtypes(include="int")       # FIX: was select_types
df.select_dtypes(exclude="category")  # FIX: was select_types
df.filter(like="fb")                  # column name contains "fb"
df.filter(regex=r"\d")                # column name contains a digit

# best practices
# - classify each column as categorical / continuous
# - group columns by categorical / continuous
# - order the most important group first

# summarizing a DataFrame
df.shape; df.size; df.ndim; len(df); df.count()

# summary statistics
df.describe().T
df.describe(percentiles=[0.1, 0.2, 0.3, 0.99]).T

# propagate NaN instead of skipping it (skipna=True is the default)
df.min(skipna=False)

# total number of nulls / any null at all
df.isna().sum().sum()
df.isna().any().any()

# compare two DataFrames element-wise (NaNs in the same location count as equal)
df2.equals(df)  # FIX: was .equal(), which does not exist

# counts / cumulative sums across columns
df.count(axis=1)
df.cumsum(axis=1)
# -------------------------------------------------------------------------------------------------
# 3 - Creating and Persisting DataFrames
# -------------------------------------------------------------------------------------------------
df = pd.read_csv(".csv", dtype={"col1": np.float32, "col2": "category"})  # FIX: was np.flota32

# export to Excel (modern pandas writes .xlsx; the legacy .xls writer was removed)
df.to_excel("file.xlsx")
# FIX: xl_writer was never created and ExcelWriter.save() was removed in pandas 2.0 —
#      the context manager saves on exit
with pd.ExcelWriter("file.xlsx") as xl_writer:
    df[df.col1 > 54].to_excel(xl_writer, sheet_name="file")

# convert to datetime
pd.to_datetime(df.col1)

# read a csv inside a zip archive
import zipfile
with zipfile.ZipFile("file.zip") as z:
    print("\n".join(z.namelist()))
    df = pd.read_csv(z.open("file.csv"))

# databases: raw sqlite3
import sqlite3
con = sqlite3.connect("file.db")  # FIX: was connect("file.db"= — a syntax error
with con:
    cur = con.cursor()
    cur.execute("")
    _ = con.commit()

# databases: SQLAlchemy engine + pandas
import sqlalchemy as sa
engine = sa.create_engine("sqlite:///file.db", echo=True)
sa_connection = engine.connect()
df = pd.read_sql("table", sa_connection, index_col="id")

# read JSON
import json
encoded = json.dumps(people)  # NOTE(review): `people` must be defined by the caller
json.loads(encoded)
df = pd.read_json(encoded)

# read HTML tables
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
df = pd.read_html(url, match="List of studio", na_values="_")
# -------------------------------------------------------------------------------------------------
# 4 - Beginning Data Analysis
# -------------------------------------------------------------------------------------------------
# Exploratory Data Analysis (EDA): metadata, descriptive statistics, data dictionaries

df.describe(include=[object]).T  # FIX: np.object was removed in NumPy 1.24 — use builtin object
df.describe(include=[np.number], percentiles=[0.01, 0.5, 0.099]).T

# reduce memory usage with a smaller dtype
df["col1"] = df["col1"].astype(np.int8)

# categories save even more memory when cardinality is low
df.select_dtypes(include=["object"]).nunique()
df["col1"] = df["col1"].astype("category")
df.assign(col1=df["col1"].astype("float16"))

# largest / smallest 100 by col1
df.nlargest(100, "col1")
df.nsmallest(100, "col1")

# top 3 rows per group
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(
        lambda grp: grp.sort_values("col3", ascending=False).head(3)
    )
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# same, but sorting columns in different directions
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(
        lambda grp: grp.sort_values(["col3", "col2"], ascending=[False, True]).head(3)
    )
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# smallest 5 by col1 among the top 100 by col3
(
    df
    .nlargest(100, "col3")
    .nsmallest(5, "col1")  # FIX: was .smallest(), which does not exist
)
# stock data (responses cached on disk via requests_cache)
import datetime
import pandas_datareader.data as web
import requests_cache

session = requests_cache.CachedSession(
    cache_name="cache",
    backend="sqlite",
    expire_after=datetime.timedelta(days=90),
)
tsla = web.DataReader("TSLA", data_source="yahoo", start="2020-01-01",
                      session=session)  # FIX: ticker was misspelled "tlsa"

tsla.plot()
tsla.Close.plot()
# running maximum: jumps only when a new all-time high is reached
tsla.Close.cummax().plot()
# visual alert line for when the stock drops 10% below its all-time high
tsla.Close.cummax().mul(.9).plot()
tsla.Close.plot()
# -------------------------------------------------------------------------------------------------
# 5 - Exploratory Data Analysis
# -------------------------------------------------------------------------------------------------
# value range of an integer dtype
np.iinfo(np.int8)

# bar plot of the top categories, lumping everything else into "Other"
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
top_n = df.Col1.value_counts().index[:6]
(
    df.assign(
        newcol=df.Col1.where(
            df.Col1.isin(top_n), "Other"
        )
    )
    .newcol.value_counts()  # FIX: was .newCol — must match the column assigned above
    .plot.bar(ax=ax)
)
fig.savefig("file.png", dpi=300)

# decile bins of a cleaned numeric column
(
    df.Col1.fillna("0")
    .str.replace("-", "/")
    .str.split("/", expand=True)
    .astype(float)
    .mean(axis=1)
    .pipe(lambda ser_: pd.qcut(ser_, 10))
    .value_counts()
)
# Kolmogorov-Smirnov normality test
from scipy import stats
stats.kstest(df.Col1, cdf="norm")

# plot the empirical CDF (probability plot against the normal distribution)
fig, ax = plt.subplots(figsize=(10, 8))
stats.probplot(df.Col1, plot=ax)
fig.savefig("file.png", dpi=300)

# stats across several categories with agg
mask = df.Col1.isin(["val1", "val2", "val3"])
df[mask].groupby("Col1").Col3.agg(["mean", "std"])

# box plot per category
import seaborn as sns
g = sns.catplot(x="Col1", y="Col3", data=df[mask], kind="box")
g.ax.figure.savefig("file.png", dpi=300)

# swarm plot layered on top of a box plot
g = sns.catplot(x="", y="", data="", kind="box")
sns.swarmplot(x="", y="", data="", ax=g.ax)  # FIX: swarmplot has no `kind` parameter
g.ax.figure.savefig("file.png", dpi=300)

# split by hue
g = sns.catplot(x="", y="", data="", kind="box", hue="year")

# heat-styled summary table
df.groupby("Col1").agg(["mean", "std"]).style.background_gradient(cmap="RdBu", axis=0)

# regression line
res = sns.lmplot(x="col1", y="col2", data=df)

# pearson correlation  -> measures linearity
# spearman correlation -> measures whether the relationship is monotonic
# -------------------------------------------------------------------------------------------------
# 6 - Selecting Subsets of Data
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
# 7 - Filtering Rows
# -------------------------------------------------------------------------------------------------
# query: @name inside the query string refers to this Python variable
name = ["name1", "name2"]  # FIX: was `@name = ...` — @ is query-string syntax, not valid Python
qs = (
    "ENTERPRISE in @name"
    " and PRICE > 2000"
)
df.query(qs)

# clamp values to a range
df.col1.clip(lower=300, upper=20000)

# .mask is the complement of .where
criteria1 = df.Col1 >= 2000
criteria2 = df.Col1.isna()  # FIX: was misspelled `creteria2`, leaving `criteria2` undefined
criteria = criteria1 | criteria2
df.mask(criteria)

# -------------------------------------------------------------------------------------------------
# 8 - Index Alignment
# -------------------------------------------------------------------------------------------------
cols = df.columns
cols.values
cols + "_A"
cols > "G"
c1 = cols[2:6]
c2 = cols[4:8]  # FIX: c2 was used below but never defined
c1.union(c2)                 # same as c1 | c2
c1.symmetric_difference(c2)  # FIX: was misspelled `symetric_difference`; same as c1 ^ c2
# important topic, left short here (original note: "no time")
# -------------------------------------------------------------------------------------------------
# 9 - Grouping for Aggregation, Filtration and Transformation
# -------------------------------------------------------------------------------------------------
(
    df.groupby(["col1", "col2"])[["col4", "col7"]]
    .agg(['sum', 'mean'])
)
(
    df.groupby(["col1", "col2"])
    .agg({
        'col4': ['sum', 'mean', 'size'],
        'col7': ['mean', 'var']
    })
)

# info about groups
df.groupby("Col2").ngroups
# FIX: a tuple key only makes sense when grouping by two columns;
#      the original grouped by one column and passed ('FL', 1)
df.groupby(["Col2", "Col3"]).get_group(('FL', 1))
from IPython.display import display  # FIX: was IPython.displah
for name, group in df.groupby("Col3"):
    print(name)
    display(group.head(3))
# use filter on groups
def check_condition(df, val):
    """Predicate for GroupBy.filter: keep a group when its maximum is below ``val``.

    NOTE: GroupBy.filter calls this once per subgroup, so ``df`` here is one
    subgroup, not the full frame.
    """
    return df.max() < val
| df.groupby("Col3").filter(check_condition, val=800) | |
| # Transform | |
def percent_loss(s):
    """Percent change of each element relative to the first element of *s*.

    Negative values mean a loss compared with ``s.iloc[0]``.
    """
    # FIX: the original line had unbalanced parentheses (a syntax error)
    return ((s - s.iloc[0]) / s.iloc[0]) * 100
# for a single person/month
(
    df.query("Name=='Bob' and Month=='Jan'")["Weight"]
    .pipe(percent_loss)
)

# for all groups
(
    df.groupby(["Name", "Month"])["Weight"]
    .transform(percent_loss)
)

# pivot (use pivot_table when there are duplicate index/column pairs)
# FIX: .transform returns a plain Series, which has no Name/Month/percent_loss
#      columns to pivot on — assign the result back to the frame first
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
)

# assign to see the winner per month
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
)

# count how often each person wins
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
    .winner.value_counts()
)

# groupby has 4 main methods:
# .agg       --> scalar                   --> single column
# .filter    --> boolean                  -->
# .transform --> series / dataframe       --> single column
# .apply     --> scalar / series / frame  --> multiple columns

# analysing continuous values with bins
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(df["col1"], bins=bins)
cuts.value_counts().plot.bar()  # FIX: was .plot.hist() — the counts are already aggregated
cuts.value_counts(normalize=True)

# quantiles per bin
df.groupby(cuts)["col2"].quantile(q=[.25, .5, .75])
# -------------------------------------------------------------------------------------------------
# 10 - Restructuring Data into a Tidy Form
# -------------------------------------------------------------------------------------------------
# tidy tools:   .pivot  .melt  .stack  .unstack
# advanced:     .rename  .rename_axis  .reset_index  .set_index
# wide to long: pd.wide_to_long(df, ...)
(
    df.groupby(["col1", "col2", "col3"])[["col5", "col6"]]
    .agg(["size", "min", "max"])
    .rename_axis(["col10", "col11"], axis="columns")
    .stack("col10")
    .swaplevel("col10", "col1", axis="index")  # FIX: level name was misspelled "co1"
    .sort_index(level="col2", axis="index")
    .sort_index(level="col10", axis="columns")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment