Skip to content

Instantly share code, notes, and snippets.

@datavudeja
Forked from emherrer/PIPE.py
Created February 9, 2026 15:27
Show Gist options
  • Select an option

  • Save datavudeja/7ae8bba737ac2b80d7e59eb89705ee4e to your computer and use it in GitHub Desktop.

Select an option

Save datavudeja/7ae8bba737ac2b80d7e59eb89705ee4e to your computer and use it in GitHub Desktop.
[Functions] Algunas funciones utiles #python #fun #funciones #def #pipe #words #keywords
# These functions classify free-text descriptions into predefined categories.
import pandas as pd
import re
# Map each category name to the list of regex patterns that identify it.
category_keywords = {
    "SAG": [
        # A single alternation, split over adjacent raw-string literals that
        # Python joins into one pattern; \b anchors it on word boundaries.
        r"\b(sag mill nr1|sag mill 1|grinding line nr1|sag mill nr2|sag mill 2|grinding line nr2|"
        r"sag mill nr3|sag mill 3|grinding line nr3|sag mill nr4|sag mill nr|sag mill 4|grinding line nr4|"
        r"sag mill nr5|sag mill 5|grinding line nr5)\b",
    ],
}
# Classifies a single description string into one of the categories predefined in "category_keywords"
def categorize_desc(desc: str, category_keywords: dict) -> str:
    """Return the first category whose patterns match ``desc``.

    Categories are tried in the dictionary's iteration order and, within a
    category, patterns are tried in list order. The first ``re.search`` hit
    decides the result.

    Args:
        desc: Text to classify.
        category_keywords: Mapping of category name to an iterable of regex
            patterns.

    Returns:
        The matching category name. ``desc`` itself is returned unchanged
        when no pattern matches or when ``desc`` is not a string.
    """
    # Missing values (None / NaN) are common in DataFrame text columns and
    # re.search raises TypeError on them, so pass them through untouched.
    if not isinstance(desc, str):
        return desc

    for category, patterns in category_keywords.items():
        if any(re.search(pattern, desc) for pattern in patterns):
            return category
    return desc
# Creates column "cat_column" by applying "categorize_desc" to every value of column "description_name"
def categoriza_words(df: pd.DataFrame, description_name: str, category_keywords: dict, cat_column: str) -> pd.DataFrame:
    """Add a category column derived from a description column.

    Each value of ``df[description_name]`` is passed to ``categorize_desc``
    together with ``category_keywords``; the results are stored in
    ``df[cat_column]``. The frame is modified in place and also returned.
    """

    def _classify(text):
        return categorize_desc(text, category_keywords)

    labels = df[description_name].apply(_classify)
    df[cat_column] = labels
    return df
import pandas as pd
import unicodedata
import numpy as np
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column labels of ``df`` to plain ASCII snake-style names.

    Each label is converted with ``str()``, stripped, lower-cased and reduced
    to ASCII (accents are dropped through NFKD decomposition). Then:

    * ``.``, ``(``, ``)``, ``"`` and ``/`` are removed;
    * spaces, ``-`` and newlines become ``_``;
    * ``%`` becomes ``pct``;
    * every run of underscores is collapsed to a single ``_``.

    Args:
        df: DataFrame whose columns are renamed in place.

    Returns:
        The same DataFrame object, with the cleaned column labels.
    """
    # One-pass character table. None of the replacement texts contains a
    # character that is itself replaced, so a single pass gives the same
    # result as applying the substitutions one after another.
    char_table = str.maketrans(
        {
            ".": None,
            "(": None,
            ")": None,
            '"': None,
            "/": None,
            " ": "_",
            "-": "_",
            "\n": "_",
            "%": "pct",
        }
    )

    def _clean(label) -> str:
        # str() lets non-string labels (e.g. integer column names) through.
        text = str(label).strip().lower()
        text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
        text = text.translate(char_table)
        # Collapse underscores last and repeatedly, so that runs created by
        # any substitution (including newlines) or longer than two are
        # reduced to a single underscore.
        while "__" in text:
            text = text.replace("__", "_")
        return text

    df.columns = df.columns.map(_clean)
    return df
# Replace with REGEX: remove every "-", "/", "#" and "." character from the values of "column".
# NOTE(review): the result is not assigned here; presumably it should be stored back, e.g. in df["column"].
df["column"].str.replace(r"[-/#.]", "", regex=True)
import pandas as pd
import numpy as np
def cleaning_data(df: pd.DataFrame, na_limit: int) -> pd.DataFrame:
    """Return a cleaned copy of ``df``.

    Three steps are applied in order:

    1. Negative values in numeric columns are replaced with NaN.
    2. Columns whose percentage of missing values exceeds ``na_limit`` are
       dropped.
    3. Remaining missing values in numeric columns are filled with the
       column median.

    Args:
        df: Input data. It is left untouched.
        na_limit: Maximum tolerated percentage (0-100) of missing values per
            column; columns above it are removed.

    Returns:
        A new DataFrame with the cleaning steps applied.
    """
    # Work on a copy: assigning columns directly on ``df`` would overwrite
    # the caller's negative values even though a different object is returned.
    cleaned = df.copy()

    # Negative values -> NaN
    for col in cleaned.columns:
        if pd.api.types.is_numeric_dtype(cleaned[col]):
            cleaned[col] = np.where(cleaned[col].values < 0, np.nan, cleaned[col])

    # Drop columns with a high percentage of missing values
    na_pct = cleaned.isna().mean() * 100
    high_na_cols = cleaned.columns[(na_pct > na_limit).to_numpy()]
    cleaned = cleaned.drop(columns=high_na_cols)

    # Impute the remaining missing numeric values with the column median
    for col in cleaned.columns:
        if pd.api.types.is_numeric_dtype(cleaned[col]):
            cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    return cleaned
# The function receives the name of a date column, for example: "fecha"
def define_fiscal_year(df: pd.DataFrame, date_column: str, start_month: int = 7) -> pd.DataFrame:
    """Add a ``fiscal_year`` column computed from a date column.

    A fiscal year is labelled by the calendar year in which it ends: dates
    whose month is ``start_month`` or later belong to the following year's
    fiscal year, earlier dates to the current one. With the default
    ``start_month=7`` a date in July 2023 gets fiscal year 2024.

    Args:
        df: DataFrame to extend; it is modified in place.
        date_column: Name of the column holding values that expose ``.year``
            and ``.month``.
        start_month: First month (1-12) of the fiscal year. ``1`` makes the
            fiscal year equal to the calendar year.

    Returns:
        The same DataFrame object, with the ``fiscal_year`` column added.

    Raises:
        ValueError: If ``start_month`` is not between 1 and 12.
    """
    if not 1 <= start_month <= 12:
        raise ValueError(f"start_month must be between 1 and 12, got {start_month!r}")

    # A fiscal year that starts in January never spills into the next year.
    rolls_over = start_month > 1

    def _fiscal_year(date):
        if rolls_over and date.month >= start_month:
            return date.year + 1
        return date.year

    df["fiscal_year"] = df[date_column].apply(_fiscal_year)
    return df
from functools import wraps
import datetime as dt
import pandas as pd
def log_start(func):
    """Decorate a pipeline step so that each call prints a one-line summary.

    After the wrapped function returns, its name, the ``shape`` of its result
    and the elapsed wall-clock time in seconds are printed. The result is
    returned unchanged.

    Args:
        func: The step function to wrap.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = dt.datetime.now()
        result = func(*args, **kwargs)
        # Report plain seconds: the message ends in "s", which does not fit
        # the H:MM:SS.ffffff text produced by str(timedelta).
        elapsed = (dt.datetime.now() - started).total_seconds()
        # A step may return something without a shape; logging must not turn
        # a successful call into an AttributeError.
        shape = getattr(result, "shape", "n/a")
        name = getattr(func, "__name__", repr(func))
        print(f"Just ran step: {name}, shape={shape}, took {elapsed:.6f}s")
        return result
    return wrapper
@log_start
def start_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Start a pipeline by handing back a copy of ``df``.

    Later steps operate on the returned copy instead of the frame that was
    passed in.
    """
    working_copy = df.copy()
    return working_copy
# Example usage: run the steps as a DataFrame.pipe chain. `df` must already be
# defined as the input DataFrame.
df_run = (
    df
    .pipe(start_pipeline)
    # ... chain the rest of the step functions here with further .pipe(...) calls
)
import pandas as pd
from pathlib import Path
# Folder holding the Excel workbooks, resolved against the current working directory.
FILEPATH = Path.cwd().joinpath("data/BBDDPM")
def read_and_concatenate_excel_files(sheetname: str, directory: "Path | str | None" = None) -> pd.DataFrame:
    """Read one sheet from every ``.xlsx`` file in a folder and stack the rows.

    Args:
        sheetname: Name of the sheet to read from each workbook.
        directory: Folder to scan. Defaults to the module-level ``FILEPATH``.

    Returns:
        A single DataFrame with the rows of all workbooks, in file-name
        order, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.xlsx`` workbook.
    """
    folder = FILEPATH if directory is None else Path(directory)

    # Sort so the row order is reproducible (glob order depends on the file
    # system), and skip the "~$..." lock files Excel writes next to an open
    # workbook: they match "*.xlsx" but are not readable workbooks.
    workbooks = sorted(
        path for path in folder.glob("*.xlsx") if not path.name.startswith("~$")
    )
    if not workbooks:
        # pd.concat([]) would also raise ValueError, but without saying where
        # the files were expected.
        raise ValueError(f"No .xlsx files found in {folder}")

    frames = [pd.read_excel(path, sheet_name=sheetname) for path in workbooks]
    return pd.concat(frames, ignore_index=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment