Skip to content

Instantly share code, notes, and snippets.

@mariogarcia-ar
Last active March 4, 2026 12:14
Show Gist options
  • Select an option

  • Save mariogarcia-ar/19e0e83b26d9b4968e05a8113686927e to your computer and use it in GitHub Desktop.

Select an option

Save mariogarcia-ar/19e0e83b26d9b4968e05a8113686927e to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd

# Load dataset
df = pd.read_csv("data.csv")

# axis 0 -> row -> i
# axis 1 -> col -> j

# Column labels
df.columns
# Row index
df.index

# Data types per column
# float, int, Int64, object, category, bool, boolean, datetime64[ns]
df.dtypes
df.dtypes.value_counts()  # FIX: was .values_count() (no such method)

# Summary: dtype, non-null count, memory usage
df.info()

# Select columns / rows
df.loc[:, "col1"]
df.loc[:, ["col1", "col2"]]  # FIX: missing closing bracket (SyntaxError)
df.iloc[:, [3, 4]]           # FIX: missing closing bracket (SyntaxError)
# Series
sr = df["col1"]
len(sr)

# Sample rows
df.sample(n=5, random_state=42)

# Count occurrences of each value in a Series
sr.value_counts()

# Unique values
df.nunique()  # FIX: DataFrame has no .unique(); .nunique() counts uniques per column
sr.unique()

# Statistics
sr.min(); sr.max(); sr.mean(); sr.median(); sr.std(); sr.quantile(0.2)
# All summary stats at once
sr.describe()

# Null checks
sr.isna()
# Fill all null values with 0
sr.fillna(0)
# Drop null values
sr.dropna()
# True if the Series contains any NaN
sr.hasnans
# Element-wise "not NaN" mask
sr.notna()

# Arithmetic methods (equivalent to + - * / // % **)
sr.add(1); sr.sub(1); sr.mul(1); sr.div(1); sr.floordiv(1); sr.mod(1); sr.pow(1)

# Comparison methods
# < > <= >= == !=
sr.lt(7); sr.gt(7); sr.le(7); sr.ge(7); sr.eq(7); sr.ne(7)

# Chaining
sr.isna().sum()
df.isna().sum()
# Chain a custom function with .pipe
def myprint(a):
    # FIX: body was not indented in the original (SyntaxError)
    print("value is %s" % (a))

(sr.fillna(0)).pipe(myprint)

# Rename columns / index (returns a new frame)
col_map = {"col1": "col_one"}
df.rename(columns=col_map)
idx_map = {"row1": "row_one"}
df.rename(index=idx_map)

# Add a column: in-place assignment vs. .assign (returns a copy)
df["has_seen"] = 0
df.assign(has_seen=0)

total = (df.col1 + df.col2 + df.col3)
cols = ["col1", "col2", "col3"]
sum_total = df.loc[:, cols].sum(axis="columns")
df.assign(total=sum_total)

def sum_likes(df):
    # FIX: body was not indented in the original (SyntaxError)
    # Sum every "like" column that refers to an actor or a director.
    return df[
        [
            c
            for c in df.columns
            if "like" in c and ("actor" in c or "director" in c)
        ]
    ].sum(axis=1)

# Insert a new column at a specific position
df.insert(
    loc=df.columns.get_loc("col3"),
    column="newcol",
    value=df.col2 - df.col18,
)

# Remove a column
del df["col3"]
# -------------------------------------------------------------------------------------------------
# 2 - Essential DataFrame Operations
# -------------------------------------------------------------------------------------------------
type(df[["col1"]])          # DataFrame
type(df["col1"])            # Series
type(df.loc[:, ["col1"]])   # DataFrame
type(df.loc[:, "col1"])     # Series  # FIX: was df.log (typo for .loc)

cols = ["col1", "col2", "col3"]
df_col = df[cols]

# Filter / select columns
df.select_dtypes(include="int")       # FIX: was select_types (no such method)
df.select_dtypes(exclude="category")  # FIX: was select_types (no such method)
df.filter(like="fb")     # column name contains "fb" (%fb%)
df.filter(regex=r"\d")   # column name contains a digit

# Best practices
# - Classify each column as Categorical/Continuous
# - Group columns by Categorical/Continuous
# - Put the most important group first

# Summarizing a DataFrame
df.shape; df.size; df.ndim; len(df); df.count()

# Summarizing stats
df.describe().T
df.describe(percentiles=[0.1, 0.2, 0.3, 0.99]).T

# skipna=False PROPAGATES NaN (result is NaN if any value is missing)
df.min(skipna=False)

# Count the nulls across the whole frame
df.isna().sum().sum()
df.isna().any().any()
df2.equals(df)  # FIX: was .equal() (no such method)

# Counting / cumulative sums across columns
df.count(axis=1)
df.cumsum(axis=1)
# -------------------------------------------------------------------------------------------------
# 3 - Creating and Persisting DataFrame
# -------------------------------------------------------------------------------------------------
df = pd.read_csv(".csv", dtype={"col1": np.float32, "col2": "category"})  # FIX: np.flota32 typo

# Export to Excel (.xlsx: pandas no longer supports writing legacy .xls)
df.to_excel("file.xlsx")
xl_writer = pd.ExcelWriter("file.xlsx")  # FIX: xl_writer was never defined
df[df.col1 > 54].to_excel(xl_writer, sheet_name="file")
xl_writer.close()  # FIX: ExcelWriter.save() was removed; use close()

# Convert to datetime
pd.to_datetime(df.col1)

# Read from a zip archive
import zipfile
with zipfile.ZipFile("file.zip") as z:
    # FIX: with-block body was not indented in the original (SyntaxError)
    print("\n".join(z.namelist()))
    df = pd.read_csv(z.open("file.csv"))

# Database (sqlite3)
import sqlite3
con = sqlite3.connect("file.db")  # FIX: original had connect("file.db"= (SyntaxError)
with con:
    # FIX: with-block body was not indented in the original (SyntaxError)
    cur = con.cursor()
    cur.execute("")
    _ = con.commit()

# Database (SQLAlchemy)
import sqlalchemy as sa
engine = sa.create_engine("sqlite:///file.db", echo=True)
sa_connection = engine.connect()
df = pd.read_sql("table", sa_connection, index_col="id")

# Read JSON
import json
encoded = json.dumps(people)
json.loads(encoded)
df = pd.read_json(encoded)

# Read HTML tables
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
df = pd.read_html(url, match="List of studio", na_values="_")
# -------------------------------------------------------------------------------------------------
# 4 - Beginning Data Analysis
# -------------------------------------------------------------------------------------------------
# Exploratory Data Analysis (EDA)
# Metadata
# Descriptive statistics
df.describe(include=[object]).T  # FIX: np.object was removed from NumPy; use builtin object
df.describe(include=[np.number], percentiles=[0.01, 0.5, 0.99]).T  # FIX: 0.099 was presumably 0.99

# Data dictionaries
# Reduce memory usage with smaller dtypes
df["col1"] = df["col1"].astype(np.int8)
# To reduce memory further, use "category" for low-cardinality object columns
df.select_dtypes(include=["object"]).nunique()
df["col1"] = df["col1"].astype("category")
df.assign(col1=df["col1"].astype("float16"))

# Select top 100
df.nlargest(100, "col1")
# Select smallest 100
df.nsmallest(100, "col1")

# Top 3 rows per group
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(lambda ddf: ddf.sort_values("col3", ascending=False).head(3))
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# Sort with a different direction per column
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(lambda ddf: ddf.sort_values(["col3", "col2"], ascending=[False, True]).head(3))
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# Smallest 5 by col1 out of the top 100 by col3
(
    df
    .nlargest(100, "col3")
    .nsmallest(5, "col1")  # FIX: was .smallest (no such method)
)

# Stocks
import datetime
import pandas_datareader.data as web
import requests_cache

session = requests_cache.CachedSession(
    cache_name="cache",
    backend="sqlite",
    expire_after=datetime.timedelta(days=90),
)
tsla = web.DataReader("TSLA", data_source="yahoo", start="2020-01-01", session=session)  # FIX: ticker was misspelled "tlsa"
tsla.plot()
tsla.Close.plot()
tsla.Close.cummax().plot()  # running maximum: only rises when there is a new all-time high
# Plot an alert level at 10% below the historical maximum
tsla.Close.cummax().mul(.9).plot()
tsla.Close.plot()
# -------------------------------------------------------------------------------------------------
# 5 - Exploratory Data Analysis
# -------------------------------------------------------------------------------------------------
# Limits of an integer data type
np.iinfo(np.int8)

# Subplot: top-6 categories, everything else bucketed as "Other"
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
top_n = df.Col1.value_counts().index[:6]
(
    df.assign(
        newcol=df.Col1.where(df.Col1.isin(top_n), "Other")
    )
    .newcol.value_counts()  # FIX: was .newCol (column was created as "newcol")
    .plot.bar(ax=ax)
)
fig.savefig("file.png", dpi=300)

# Deciles via qcut
(
    df.Col1.fillna("0")
    .str.replace("-", "/")
    .str.split("/", expand=True)
    .astype(float)
    .mean(axis=1)
    .pipe(lambda ser_: pd.qcut(ser_, 10))
    .value_counts()
)

# Kolmogorov-Smirnov -> normality test
from scipy import stats
stats.kstest(df.Col1, cdf="norm")

# Probability plot against the normal distribution
from scipy import stats
fig, ax = plt.subplots(figsize=(10, 8))
stats.probplot(df.Col1, plot=ax)
fig.savefig("file.png", dpi=300)

# Stats across several categories with agg
mask = df.Col1.isin(["val1", "val2", "val3"])
df[mask].groupby("Col1").Col3.agg(["mean", "std"])

# Plot
import seaborn as sns
g = sns.catplot(x="Col1", y="Col3", data=df[mask], kind="box")
g.ax.figure.savefig("file.png", dpi=300)

# Swarm plot layered on a box plot
g = sns.catplot(x="", y="", data="", kind="box")
sns.swarmplot(x="", y="", data="", ax=g.ax)  # FIX: swarmplot has no kind= argument
g.ax.figure.savefig("file.png", dpi=300)

# Hue plot
g = sns.catplot(x="", y="", data="", kind="box", hue="year")

# Summary with gradient styling
df.groupby("Col1").agg(["mean", "std"]).style.background_gradient(cmap="RdBu", axis=0)

# Regression line
res = sns.lmplot(x="col1", y="col2", data=df)

# Pearson correlation  -> linearity
# Spearman correlation -> monotonic relationship
# -------------------------------------------------------------------------------------------------
# 6 - Selecting Subset of Data
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
# 7 - Filtering Rows
# -------------------------------------------------------------------------------------------------
# Query: the @name inside the query string refers to the local variable `name`
name = ["name1", "name2"]  # FIX: "@name = ..." is not valid Python; @ belongs only inside the query
qs = (
    "ENTERPRISE in @name"
    " and PRICE > 2000"
)
df.query(qs)

# Clamp values to a range
df.col1.clip(lower=300, upper=20000)

# mask is the complement of where: hides rows matching the criteria
criteria1 = df.Col1 >= 2000
criteria2 = df.Col1.isna()  # FIX: was misspelled "creteria2" but used as criteria2 below (NameError)
criteria = criteria1 | criteria2
df.mask(criteria)

# -------------------------------------------------------------------------------------------------
# 8 - Index Alignment
# -------------------------------------------------------------------------------------------------
cols = df.columns
cols.values
cols + "_A"
cols > "G"
c1 = cols[2:6]
c2 = cols[4:8]  # FIX: c2 was never defined (NameError)
c1.union(c2)                  # c1 | c2
c1.symmetric_difference(c2)   # c1 ^ c2  # FIX: was misspelled symetric_difference
# Important topic, but ran out of time for it
# -------------------------------------------------------------------------------------------------
# 9 - Grouping for Aggregation, Filtration and Transformation
# -------------------------------------------------------------------------------------------------
(
    df.groupby(["col1", "col2"])[["col4", "col7"]]
    .agg(["sum", "mean"])
)
(
    df.groupby(["col1", "col2"])
    .agg({
        "col4": ["sum", "mean", "size"],
        "col7": ["mean", "var"],
    })
)

# Info about the groups
df.groupby("Col2").ngroups
# FIX: a tuple key only makes sense when grouping by multiple columns
df.groupby(["Col2", "Col3"]).get_group(("FL", 1))
from IPython.display import display  # FIX: was IPython.displah (typo)
for name, group in df.groupby("Col3"):
    # FIX: loop body was not indented in the original (SyntaxError)
    print(name)
    display(group.head(3))

# Filter groups: the function receives each sub-group and returns a boolean
def check_condition(df, val):
    # FIX: body was not indented in the original (SyntaxError)
    return df.max() < val

df.groupby("Col3").filter(check_condition, val=800)

# Transform
def percent_loss(s):
    """Percent change of each element relative to the first element."""
    # FIX: the original return had unbalanced parentheses (SyntaxError)
    return ((s - s.iloc[0]) / s.iloc[0]) * 100

# For a single group
(
    df.query("Name=='Bob' and Month=='Jan'")["Weight"]
    .pipe(percent_loss)
)
# For all groups
(
    df.groupby(["Name", "Month"])["Weight"]
    .transform(percent_loss)
)
# Pivot (use pivot_table when there are duplicate index/column pairs)
# FIX: .transform returns a Series, which has no .pivot; assign it back to the frame first
(
    df.assign(percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss))
    .pivot(index="Month", columns="Name", values="percent_loss")
)
# assign to see the winner
(
    df.assign(percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss))
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
)
# count how many times each one wins
(
    df.assign(percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss))
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
    .winner.value_counts()
)

# groupby has 4 main methods
# .agg       --> scalar                       --> single column
# .filter    --> boolean                      -->
# .transform --> Series / DataFrame           --> single column
# .apply     --> scalar / Series / DataFrame  --> multiple columns

# Analyze continuous values via binning
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(df["col1"], bins=bins)
cuts.value_counts().plot.bar()  # FIX: per-bin counts are categorical, so bar (not hist)
cuts.value_counts(normalize=True)
# Quantiles per bin
df.groupby(cuts)["col2"].quantile(q=[.25, .5, .75])
# -------------------------------------------------------------------------------------------------
# 10 - Restructuring Data into a Tidy Form
# -------------------------------------------------------------------------------------------------
# Tidy tools
# .pivot .melt .stack .unstack
# Advanced
# .rename .rename_axis .reset_index .set_index
# wide_to_long
# pd.wide_to_long(df, ...)
(
    df.groupby(["col1", "col2", "col3"])[["col5", "col6"]]
    .agg(["size", "min", "max"])
    .rename_axis(["col10", "col11"], axis="columns")
    .stack("col10")
    .swaplevel("col10", "col1", axis="index")  # FIX: level name was misspelled "co1"
    .sort_index(level="col2", axis="index")
    # NOTE(review): after stack("col10") the column axis keeps only level "col11";
    # sorting columns by "col10" may raise — verify against the real data
    .sort_index(level="col10", axis="columns")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment