Skip to content

Instantly share code, notes, and snippets.

import numpy as np
import pandas as pd
# Load the dataset from data.csv into a DataFrame.
df = pd.read_csv("data.csv")
# pandas axis convention:
#   axis 0 -> rows    (index i)
#   axis 1 -> columns (index j)
# get the columns
@datavudeja
datavudeja / pandas_cheetsheet.py
Created March 4, 2026 12:11 — forked from Ezhvsalate/pandas_cheetsheet.py
Pandas cheatsheet: some useful commands for data preprocessing
# Read a comma-separated file, using the 'Number' column as the row index.
data = pd.read_csv('data.csv', sep=',', index_col='Number')
# Write the frame as semicolon-separated UTF-8 text.
# NOTE: index=False omits the index from the output, so the 'Number' column
# that was promoted to the index above is not written to the file.
data.to_csv("data_wo_sensitive_lemmatized.csv", index=False, encoding='utf-8', sep=';')
# Read every *.csv in the current working directory and stack the frames
# row-wise into one DataFrame.
# NOTE(review): `glob` and `columns` are not defined in this snippet; they
# must already be in scope (`import glob` and a list of column names).
# NOTE: passing names= without header= makes read_csv treat every line as
# data, so a header row inside a file would be read as an ordinary record.
# NOTE(review): the '*.csv' pattern also matches the two files used above if
# they live in the same directory -- confirm that is intended.
files = glob.glob('*.csv')
small_dfs = [pd.read_csv(fp, names=columns) for fp in files]
# Each frame keeps its own index, so index labels can repeat in the result.
df = pd.concat(small_dfs)
@datavudeja
datavudeja / pandas.py
Created March 4, 2026 12:09 — forked from stiles/pandas.py
Pandas cheat sheet
# List the unique values in a DataFrame column.
df['Column Name'].unique()
# To extract a single column (subset the DataFrame), use either bracket
# notation or attribute notation.
# NOTE: attribute notation only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute or
# method; bracket notation always works.
df.height
df['height']
# Both expressions above select the same column.
# Sources: http://www.stephaniehicks.com/learnPython/pages/pandas.html
# and
# http://www.datacarpentry.org/python-ecology-lesson/02-index-slice-subset/
@datavudeja
datavudeja / data_quality_checks.py
Created February 18, 2026 17:32 — forked from LeGi0N09/data_quality_checks.py
Python: Automated data quality validation framework
import pandas as pd
from typing import Dict, List
class DataQualityValidator:
def __init__(self, df: pd.DataFrame):
self.df = df
self.issues = []
def check_nulls(self, columns: List[str], threshold: float = 0.05):
"""Check if null percentage exceeds threshold"""
@datavudeja
datavudeja / PIPE.py
Created February 9, 2026 15:27 — forked from emherrer/PIPE.py
[Functions] Algunas funciones utiles #python #fun #funciones #def #pipe #words #keywords
from functools import wraps
import datetime as dt
import pandas as pd
def log_start(func):
@wraps(func)
def wrapper(*args, **kwargs):
tic = dt.datetime.now()
result = func(*args, **kwargs)
@datavudeja
datavudeja / nonprint-char_remover.py
Created February 6, 2026 14:41 — forked from GDBSD/nonprint-char_remover.py
Remove non-printing characters from a Pandas dataframe
def remove_non_printing_chars(df):
"""Clean a dataframe column to remove any non-printing characters.
We've encountered values like tabs in some of the data.
:param df: Pandas dataframe
:return: Pandas dataframe
"""
clean_df = df.copy(deep=True)
clean_df = clean_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
for col in list(clean_df.columns):
# Applying lambda functions to a pandas DataFrame.
# When the new column is computed from other columns: the lambda receives the
# whole DataFrame and returns the values for the new 'Product' column
# (the element-wise product of Field_1, Field_2 and Field_3).
df = df.assign(Product=lambda x: (x['Field_1'] * x['Field_2'] * x['Field_3']))
# When selected entries should be transformed using only their own values:
# with axis=1 the lambda is called once per ROW and x.name is that row's
# index label, so rows labelled 'a', 'e' or 'g' are squared and every other
# row is returned unchanged (use axis=0 to select by column name instead).
# NOTE: apply returns a new DataFrame; nothing is modified in place -- the
# assignment simply rebinds `df` to the result.
df = df.apply(lambda x: np.square(x) if x.name in ['a', 'e', 'g'] else x, axis=1)
# To compare each element with the previous element of a column, use shift.
@datavudeja
datavudeja / pandas_utils.py
Created February 4, 2026 16:03 — forked from adrianonvls/pandas_utils.py
Pandas util functions
import pandas as pd
from IPython.display import display
def compare_dfs_cols_and_types(df_x, df_y, df_x_name="x", df_y_name="y"):
"""Function to compare two DataFrames, checking column
names and types, prints the differences (if they exists)
and return a DataFrame with the NaNs sinalizing the mismatches.
:param df_x: First Dataframe
:type df_x: pd.DataFrame
:param df_y: Second DataFrame
@datavudeja
datavudeja / time_unit.py
Created February 4, 2026 16:02 — forked from christophertubbs/time_unit.py
A Python enumeration for defining a unit of time
"""
Describes an enum that may be used to describe a unit of time (not a duration)
NOTE: Remove the numpy logic if your application does not support numpy
"""
import enum
from datetime import datetime
from datetime import timedelta
@datavudeja
datavudeja / convert_enums.py
Created February 4, 2026 15:59 — forked from eugeneko/convert_enums.py
Script for enum modernization in Urho
import sys
import os
import re
# Matches an enum declaration: optional leading whitespace, the keyword
# `enum`, the enum name (group 1) and an optional `: ...` tail (group 2).
# NOTE(review): the whitespace after `enum` is optional (`\s*`), so text such
# as `enumFoo` also matches, and in `enum class X` group 1 captures `class`
# -- confirm the inputs never contain these forms.
re_enum = re.compile(r'\s*enum\s*(\w+)\s*(:.*)?\s*')
# Matches one enumerator line: the name (group 1), an optional `= value`
# (group 2), an optional trailing comma and an optional `// comment`.
# NOTE: `(.+)` is greedy and everything after it is optional, so when a value
# is present group 2 also swallows any trailing comma and `//` comment on the
# same line; callers need to strip those themselves.
re_enum_value = re.compile(r'\s*(\w+)(?:\s*=\s*(.+))?,?(?:\s*\/\/.*)?\s*')
folders_blacklist = [
# 'Urho3D/Audio',
# 'Urho3D/Container',
# 'Urho3D/Core',