-
-
Save datavudeja/7ae8bba737ac2b80d7e59eb89705ee4e to your computer and use it in GitHub Desktop.
[Functions] Algunas funciones útiles #python #fun #funciones #def #pipe #words #keywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This function is used to classify words into different categories | |
| import pandas as pd | |
| import re | |
# Mapping of category name -> list of regex patterns whose match assigns that category.
# Adjacent raw-string literals are concatenated into a single alternation,
# laid out here with one grinding line per row.
category_keywords = {
    "SAG": [
        r"\b("
        r"sag mill nr1|sag mill 1|grinding line nr1|"
        r"sag mill nr2|sag mill 2|grinding line nr2|"
        r"sag mill nr3|sag mill 3|grinding line nr3|"
        r"sag mill nr4|sag mill nr|sag mill 4|grinding line nr4|"
        r"sag mill nr5|sag mill 5|grinding line nr5"
        r")\b",
    ],
}
# Classifies one description string into one of the categories predefined in "category_keywords".
def categorize_desc(desc: str, category_keywords: dict) -> str:
    """Return the first category whose patterns match *desc*.

    Args:
        desc: Text to classify. Non-string values (e.g. ``None`` or NaN from
            a DataFrame column) are returned unchanged instead of raising.
        category_keywords: Mapping of category name to an iterable of regex
            patterns; categories are tried in the mapping's iteration order.

    Returns:
        The name of the first category with a pattern found in *desc* by
        ``re.search`` (case-sensitive), or *desc* itself when nothing matches.
    """
    # Missing or non-text cells cannot be searched; pass them through untouched
    # so one empty cell does not abort a whole column classification.
    if not isinstance(desc, str):
        return desc

    for category, patterns in category_keywords.items():
        if any(re.search(pattern, desc) for pattern in patterns):
            return category
    return desc
# Creates column "cat_column" by running "categorize_desc" over column "description_name".
def categoriza_words(df: pd.DataFrame, description_name: str, category_keywords: dict, cat_column: str) -> pd.DataFrame:
    """Add a category column derived from a description column.

    Args:
        df: DataFrame to annotate; the new column is written into it in place.
        description_name: Name of the column holding the descriptions.
        category_keywords: Mapping passed through to ``categorize_desc``.
        cat_column: Name of the column that receives the results.

    Returns:
        The same DataFrame object, with ``cat_column`` set.
    """
    descriptions = df[description_name]
    # `args` forwards the keyword mapping as the second positional argument.
    df[cat_column] = descriptions.apply(categorize_desc, args=(category_keywords,))
    return df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import unicodedata | |
| import numpy as np | |
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the column labels of *df* in place and return it.

    Each label is stripped, lower-cased and reduced to ASCII (accents are
    dropped via NFKD decomposition). Then:

    * ``.``, ``(``, ``)``, ``"`` and ``/`` are removed,
    * spaces, newlines and ``-`` become ``_``,
    * ``%`` becomes ``pct``,
    * any run of underscores is collapsed to a single ``_``.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object with its ``columns`` replaced.
    """

    def _to_ascii(name: str) -> str:
        # NFKD splits accented letters into base letter + combining mark;
        # encoding to ASCII with "ignore" then drops the marks.
        return unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")

    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .map(_to_ascii)
        .str.replace(r'[.()"/]', "", regex=True)
        .str.replace(r"[ \n-]", "_", regex=True)
        .str.replace("%", "pct", regex=False)
        # Collapse last, after every underscore-producing replacement, and
        # with a regex so runs of any length (e.g. from "a - b") shrink to one.
        .str.replace(r"_+", "_", regex=True)
    )
    return df
# Replace using a REGEX: removes every "-", "/", "#" and "." character from the values of "column".
# NOTE(review): the result of this expression is not assigned to anything here.
df["column"].str.replace(r"[-/#.]", "", regex=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
def cleaning_data(df: pd.DataFrame, na_limit: int) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input DataFrame is not modified.

    Steps, in order:

    1. Negative values in numeric columns are replaced with NaN.
    2. Columns whose percentage of missing values exceeds *na_limit* are dropped.
    3. Remaining missing values in numeric columns are filled with the
       column median.

    Args:
        df: Input data.
        na_limit: Maximum tolerated share of missing values per column,
            expressed as a percentage (compared against ``NA count / rows * 100``).

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Work on a copy so the negative-value masking below does not leak into
    # the caller's DataFrame.
    df = df.copy()

    # Negative values
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].mask(df[col] < 0)

    # Columns with a high percentage of NAs
    na_pct = df.isna().sum() / len(df) * 100
    high_na_cols = df.columns[na_pct > na_limit]
    df = df.drop(columns=high_na_cols)

    # Impute NAs with the median
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_numeric_dtype(column) and column.isna().any():
            df[col] = column.fillna(column.median())
    return df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The function receives, for example, the column name "fecha".
def define_fiscal_year(df: pd.DataFrame, date_column: str) -> pd.DataFrame:
    """Add a ``fiscal_year`` column computed from a date column.

    A value whose month is July or later is assigned to the following
    calendar year; any other value keeps its own year.

    Args:
        df: DataFrame to annotate; ``fiscal_year`` is written into it in place.
        date_column: Name of the column whose values expose ``.year`` and
            ``.month``.

    Returns:
        The same DataFrame object, with ``fiscal_year`` set.
    """

    def _fiscal_year(date):
        if date.month >= 7:
            return date.year + 1
        return date.year

    df["fiscal_year"] = df[date_column].apply(_fiscal_year)
    return df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from functools import wraps | |
| import datetime as dt | |
| import pandas as pd | |
def log_start(func):
    """Decorator that prints the name, result shape and duration of a step.

    After the wrapped callable returns, one line is printed with the
    callable's ``__name__``, the ``shape`` attribute of its result (``None``
    when the result has no such attribute) and the elapsed time in seconds.
    The result is returned unchanged.

    Args:
        func: Callable to wrap; its metadata is preserved via ``functools.wraps``.

    Returns:
        The wrapping function.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # Report plain seconds so the trailing "s" unit in the message is accurate.
        elapsed = (dt.datetime.now() - tic).total_seconds()
        # Logging must not fail a step that already ran: tolerate results
        # that do not expose `.shape`.
        shape = getattr(result, "shape", None)
        print(f"Just ran step: {func.__name__}, shape={shape}, took {elapsed:.6f}s")
        return result

    return wrapper
@log_start
def start_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* to serve as the first step of a ``.pipe`` chain.

    Later steps that write into the DataFrame then operate on the copy
    rather than on the object passed in.
    """
    working_copy = df.copy()
    return working_copy
# Example usage: run the pipeline on an existing DataFrame `df`.
df_run = (
    df
    .pipe(start_pipeline)
    # ... chain the remaining pipeline functions here with further .pipe(...) calls
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| from pathlib import Path | |
# Directory scanned for Excel workbooks, resolved against the current working directory.
FILEPATH = Path.cwd().joinpath("data/BBDDPM")
def read_and_concatenate_excel_files(sheetname: str) -> pd.DataFrame:
    """Read one sheet from every ``.xlsx`` file in ``FILEPATH`` and stack them.

    Files are processed in sorted path order so the row order of the result
    is reproducible. Files whose name starts with ``~$`` (the lock files
    Excel creates next to an open workbook) are skipped.

    Args:
        sheetname: Name of the sheet to read from each workbook.

    Returns:
        A single DataFrame with the rows of all workbooks and a fresh index.

    Raises:
        ValueError: If ``FILEPATH`` contains no readable ``.xlsx`` workbook.
    """
    workbooks = sorted(
        path for path in FILEPATH.glob("*.xlsx") if not path.name.startswith("~$")
    )
    if not workbooks:
        # pd.concat would raise a bare "No objects to concatenate"; name the
        # directory so a wrong working directory is easy to spot.
        raise ValueError(f"No .xlsx files found in {FILEPATH}")

    frames = [pd.read_excel(path, sheet_name=sheetname) for path in workbooks]
    return pd.concat(frames, ignore_index=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment