Last active
March 4, 2026 12:14
-
-
Save mariogarcia-ar/19e0e83b26d9b4968e05a8113686927e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd

# load dataset
df = pd.read_csv("data.csv")

# axis 0 -> row -> i
# axis 1 -> col -> j

# get columns / get index
df.columns
df.index

# data types
# float, int, Int64, object, category, bool, boolean, datetime64[ns]
df.dtypes
df.dtypes.value_counts()  # FIX: was .values_count(), which does not exist

# summary: dtype, count, non-null, memory
df.info()

# select column(s) / row(s)
df.loc[:, "col1"]
df.loc[:, ["col1", "col2"]]  # FIX: closing bracket was missing
df.iloc[:, [3, 4]]           # FIX: closing bracket was missing
# Series
sr = df["col1"]
len(sr)

# sample rows reproducibly
df.sample(n=5, random_state=42)

# count occurrences of each value in a Series
sr.value_counts()

# unique values: DataFrame has no .unique() — use .nunique() per column
df.nunique()  # FIX: was df.unique(), which raises AttributeError
sr.unique()

# statistics
sr.min(); sr.max(); sr.mean(); sr.median(); sr.std(); sr.quantile(0.2)

# all stats at once
sr.describe()

# missing values
sr.isna()     # boolean mask of NaN
sr.fillna(0)  # fill all null values with 0
sr.dropna()   # remove null values
sr.hasnans    # True if the Series contains any NaN
sr.notna()    # boolean mask of non-NaN

# arithmetic methods
sr.add(1); sr.sub(1); sr.mul(1); sr.div(1); sr.floordiv(1); sr.mod(1); sr.pow(1)

# comparison methods:  <  >  <=  >=  ==  !=
sr.lt(7); sr.gt(7); sr.le(7); sr.ge(7); sr.eq(7); sr.ne(7)

# method chaining
sr.isna().sum()
df.isna().sum()
# chain a custom function with .pipe
def myprint(a):
    """Print *a* in the form ``value is <a>``; returns None."""
    print(f"value is {a}")  # FIX: f-string instead of dated %-formatting (same output)
# feed a cleaned Series into a function with .pipe
sr.fillna(0).pipe(myprint)  # FIX: redundant parentheses removed

# rename columns / index labels
col_map = {"col1": "col_one"}
df.rename(columns=col_map)
idx_map = {"row1": "row_one"}
df.rename(index=idx_map)

# add a column (in place vs. returning a new frame)
df["has_seen"] = 0     # mutates df
df.assign(has_seen=0)  # returns a new DataFrame

# row-wise totals
total = df.col1 + df.col2 + df.col3
cols = ["col1", "col2", "col3"]
sum_total = df.loc[:, cols].sum(axis="columns")
df.assign(total=sum_total)
def sum_likes(df):
    """Row-wise sum over every column whose name contains 'like'
    and also mentions either 'actor' or 'director'."""
    like_cols = [
        name
        for name in df.columns
        if "like" in name and ("actor" in name or "director" in name)
    ]
    return df[like_cols].sum(axis=1)
# insert a new column at a specific position
df.insert(
    loc=df.columns.get_loc("col3"),
    column="newcol",
    value=df.col2 - df.col18,
)

# remove a column
del df["col3"]

# -------------------------------------------------------------------------------------------------
# 2 - Essential DataFrame Operations
# -------------------------------------------------------------------------------------------------
type(df[["col1"]])          # DataFrame
type(df["col1"])            # Series
type(df.loc[:, ["col1"]])   # DataFrame
type(df.loc[:, "col1"])     # Series  (FIX: was df.log, a typo for df.loc)

cols = ["col1", "col2", "col3"]
df_col = df[cols]

# filter / select columns
df.select_dtypes(include="int")       # FIX: was select_types
df.select_dtypes(exclude="category")  # FIX: was select_types
df.filter(like="fb")                  # column name contains "fb"
df.filter(regex=r"\d")                # column name contains a digit

# best practices
# - classify each column as categorical / continuous
# - group columns by categorical / continuous
# - order the most important group first

# summarizing a DataFrame
df.shape; df.size; df.ndim; len(df); df.count()

# summary statistics
df.describe().T
df.describe(percentiles=[0.1, 0.2, 0.3, 0.99]).T

# propagate NaN instead of skipping it (skipna=True is the default)
df.min(skipna=False)

# total number of nulls / any null at all
df.isna().sum().sum()
df.isna().any().any()

# compare two DataFrames element-wise (NaNs in the same location count as equal)
df2.equals(df)  # FIX: was .equal(), which does not exist

# counts / cumulative sums across columns
df.count(axis=1)
df.cumsum(axis=1)
# -------------------------------------------------------------------------------------------------
# 3 - Creating and Persisting DataFrames
# -------------------------------------------------------------------------------------------------
df = pd.read_csv(".csv", dtype={"col1": np.float32, "col2": "category"})  # FIX: was np.flota32

# export to Excel (modern pandas writes .xlsx; the legacy .xls writer was removed)
df.to_excel("file.xlsx")
# FIX: xl_writer was never created and ExcelWriter.save() was removed in pandas 2.0 —
#      the context manager saves on exit
with pd.ExcelWriter("file.xlsx") as xl_writer:
    df[df.col1 > 54].to_excel(xl_writer, sheet_name="file")

# convert to datetime
pd.to_datetime(df.col1)

# read a csv inside a zip archive
import zipfile
with zipfile.ZipFile("file.zip") as z:
    print("\n".join(z.namelist()))
    df = pd.read_csv(z.open("file.csv"))

# databases: raw sqlite3
import sqlite3
con = sqlite3.connect("file.db")  # FIX: was connect("file.db"= — a syntax error
with con:
    cur = con.cursor()
    cur.execute("")
    _ = con.commit()

# databases: SQLAlchemy engine + pandas
import sqlalchemy as sa
engine = sa.create_engine("sqlite:///file.db", echo=True)
sa_connection = engine.connect()
df = pd.read_sql("table", sa_connection, index_col="id")

# read JSON
import json
encoded = json.dumps(people)  # NOTE(review): `people` must be defined by the caller
json.loads(encoded)
df = pd.read_json(encoded)

# read HTML tables
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
df = pd.read_html(url, match="List of studio", na_values="_")
# -------------------------------------------------------------------------------------------------
# 4 - Beginning Data Analysis
# -------------------------------------------------------------------------------------------------
# Exploratory Data Analysis (EDA): metadata, descriptive statistics, data dictionaries

df.describe(include=[object]).T  # FIX: np.object was removed in NumPy 1.24 — use builtin object
df.describe(include=[np.number], percentiles=[0.01, 0.5, 0.099]).T

# reduce memory usage with a smaller dtype
df["col1"] = df["col1"].astype(np.int8)

# categories save even more memory when cardinality is low
df.select_dtypes(include=["object"]).nunique()
df["col1"] = df["col1"].astype("category")
df.assign(col1=df["col1"].astype("float16"))

# largest / smallest 100 by col1
df.nlargest(100, "col1")
df.nsmallest(100, "col1")

# top 3 rows per group
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(
        lambda grp: grp.sort_values("col3", ascending=False).head(3)
    )
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# same, but sorting columns in different directions
(
    df[["col1", "col2", "col3"]]
    .groupby("col2", as_index=False)
    .apply(
        lambda grp: grp.sort_values(["col3", "col2"], ascending=[False, True]).head(3)
    )
    .droplevel(0)
    .sort_values("col3", ascending=False)
)

# smallest 5 by col1 among the top 100 by col3
(
    df
    .nlargest(100, "col3")
    .nsmallest(5, "col1")  # FIX: was .smallest(), which does not exist
)
# stock data (responses cached on disk via requests_cache)
import datetime
import pandas_datareader.data as web
import requests_cache

session = requests_cache.CachedSession(
    cache_name="cache",
    backend="sqlite",
    expire_after=datetime.timedelta(days=90),
)
tsla = web.DataReader("TSLA", data_source="yahoo", start="2020-01-01",
                      session=session)  # FIX: ticker was misspelled "tlsa"

tsla.plot()
tsla.Close.plot()
# running maximum: jumps only when a new all-time high is reached
tsla.Close.cummax().plot()
# visual alert line for when the stock drops 10% below its all-time high
tsla.Close.cummax().mul(.9).plot()
tsla.Close.plot()
# -------------------------------------------------------------------------------------------------
# 5 - Exploratory Data Analysis
# -------------------------------------------------------------------------------------------------
# value range of an integer dtype
np.iinfo(np.int8)

# bar plot of the top categories, lumping everything else into "Other"
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
top_n = df.Col1.value_counts().index[:6]
(
    df.assign(
        newcol=df.Col1.where(
            df.Col1.isin(top_n), "Other"
        )
    )
    .newcol.value_counts()  # FIX: was .newCol — must match the column assigned above
    .plot.bar(ax=ax)
)
fig.savefig("file.png", dpi=300)

# decile bins of a cleaned numeric column
(
    df.Col1.fillna("0")
    .str.replace("-", "/")
    .str.split("/", expand=True)
    .astype(float)
    .mean(axis=1)
    .pipe(lambda ser_: pd.qcut(ser_, 10))
    .value_counts()
)
# Kolmogorov-Smirnov normality test
from scipy import stats
stats.kstest(df.Col1, cdf="norm")

# plot the empirical CDF (probability plot against the normal distribution)
fig, ax = plt.subplots(figsize=(10, 8))
stats.probplot(df.Col1, plot=ax)
fig.savefig("file.png", dpi=300)

# stats across several categories with agg
mask = df.Col1.isin(["val1", "val2", "val3"])
df[mask].groupby("Col1").Col3.agg(["mean", "std"])

# box plot per category
import seaborn as sns
g = sns.catplot(x="Col1", y="Col3", data=df[mask], kind="box")
g.ax.figure.savefig("file.png", dpi=300)

# swarm plot layered on top of a box plot
g = sns.catplot(x="", y="", data="", kind="box")
sns.swarmplot(x="", y="", data="", ax=g.ax)  # FIX: swarmplot has no `kind` parameter
g.ax.figure.savefig("file.png", dpi=300)

# split by hue
g = sns.catplot(x="", y="", data="", kind="box", hue="year")

# heat-styled summary table
df.groupby("Col1").agg(["mean", "std"]).style.background_gradient(cmap="RdBu", axis=0)

# regression line
res = sns.lmplot(x="col1", y="col2", data=df)

# pearson correlation  -> measures linearity
# spearman correlation -> measures whether the relationship is monotonic
# -------------------------------------------------------------------------------------------------
# 6 - Selecting Subsets of Data
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
# 7 - Filtering Rows
# -------------------------------------------------------------------------------------------------
# query: @name inside the query string refers to this Python variable
name = ["name1", "name2"]  # FIX: was `@name = ...` — @ is query-string syntax, not valid Python
qs = (
    "ENTERPRISE in @name"
    " and PRICE > 2000"
)
df.query(qs)

# clamp values to a range
df.col1.clip(lower=300, upper=20000)

# .mask is the complement of .where
criteria1 = df.Col1 >= 2000
criteria2 = df.Col1.isna()  # FIX: was misspelled `creteria2`, leaving `criteria2` undefined
criteria = criteria1 | criteria2
df.mask(criteria)

# -------------------------------------------------------------------------------------------------
# 8 - Index Alignment
# -------------------------------------------------------------------------------------------------
cols = df.columns
cols.values
cols + "_A"
cols > "G"
c1 = cols[2:6]
c2 = cols[4:8]  # FIX: c2 was used below but never defined
c1.union(c2)                 # same as c1 | c2
c1.symmetric_difference(c2)  # FIX: was misspelled `symetric_difference`; same as c1 ^ c2
# important topic, left short here (original note: "no time")
# -------------------------------------------------------------------------------------------------
# 9 - Grouping for Aggregation, Filtration and Transformation
# -------------------------------------------------------------------------------------------------
(
    df.groupby(["col1", "col2"])[["col4", "col7"]]
    .agg(['sum', 'mean'])
)
(
    df.groupby(["col1", "col2"])
    .agg({
        'col4': ['sum', 'mean', 'size'],
        'col7': ['mean', 'var']
    })
)

# info about groups
df.groupby("Col2").ngroups
# FIX: a tuple key only makes sense when grouping by two columns;
#      the original grouped by one column and passed ('FL', 1)
df.groupby(["Col2", "Col3"]).get_group(('FL', 1))
from IPython.display import display  # FIX: was IPython.displah
for name, group in df.groupby("Col3"):
    print(name)
    display(group.head(3))
# use filter on groups
def check_condition(df, val):
    """Predicate for GroupBy.filter: keep a group when its maximum is below ``val``.

    NOTE: GroupBy.filter calls this once per subgroup, so ``df`` here is one
    subgroup, not the full frame.
    """
    return df.max() < val
| df.groupby("Col3").filter(check_condition, val=800) | |
| # Transform | |
def percent_loss(s):
    """Percent change of each element relative to the first element of *s*.

    Negative values mean a loss compared with ``s.iloc[0]``.
    """
    # FIX: the original line had unbalanced parentheses (a syntax error)
    return ((s - s.iloc[0]) / s.iloc[0]) * 100
# for a single person/month
(
    df.query("Name=='Bob' and Month=='Jan'")["Weight"]
    .pipe(percent_loss)
)

# for all groups
(
    df.groupby(["Name", "Month"])["Weight"]
    .transform(percent_loss)
)

# pivot (use pivot_table when there are duplicate index/column pairs)
# FIX: .transform returns a plain Series, which has no Name/Month/percent_loss
#      columns to pivot on — assign the result back to the frame first
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
)

# assign to see the winner per month
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
)

# count how often each person wins
(
    df.assign(
        percent_loss=df.groupby(["Name", "Month"])["Weight"].transform(percent_loss)
    )
    .pivot(index="Month", columns="Name", values="percent_loss")
    .assign(winner=lambda df_: np.where(df_.Amy < df_.Bob, "Amy", "Bob"))
    .winner.value_counts()
)

# groupby has 4 main methods:
# .agg       --> scalar                   --> single column
# .filter    --> boolean                  -->
# .transform --> series / dataframe       --> single column
# .apply     --> scalar / series / frame  --> multiple columns

# analysing continuous values with bins
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(df["col1"], bins=bins)
cuts.value_counts().plot.bar()  # FIX: was .plot.hist() — the counts are already aggregated
cuts.value_counts(normalize=True)

# quantiles per bin
df.groupby(cuts)["col2"].quantile(q=[.25, .5, .75])
# -------------------------------------------------------------------------------------------------
# 10 - Restructuring Data into a Tidy Form
# -------------------------------------------------------------------------------------------------
# tidy tools:   .pivot  .melt  .stack  .unstack
# advanced:     .rename  .rename_axis  .reset_index  .set_index
# wide to long: pd.wide_to_long(df, ...)
(
    df.groupby(["col1", "col2", "col3"])[["col5", "col6"]]
    .agg(["size", "min", "max"])
    .rename_axis(["col10", "col11"], axis="columns")
    .stack("col10")
    .swaplevel("col10", "col1", axis="index")  # FIX: level name was misspelled "co1"
    .sort_index(level="col2", axis="index")
    .sort_index(level="col10", axis="columns")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment