Skip to content

Instantly share code, notes, and snippets.

@datavudeja
datavudeja / pd_merge.py
Created October 6, 2025 13:20 — forked from fuhoi/pd_merge.py
pd_merge
import pandas as pd
import hashlib
from datetime import datetime
def detect_dataframe_changes(source_df: pd.DataFrame, target_df: pd.DataFrame, primary_key: str, ignored_columns: list):
"""
Detects inserted, updated, and deleted rows between two DataFrames.
Args:
source_df (pd.DataFrame): The source DataFrame.
@datavudeja
datavudeja / feature_aggregations.py
Created October 6, 2025 13:18 — forked from gvspraveen/feature_aggregations.py
Dataset stats using aggregators
import pandas as pd
import ray
from ray.data import Dataset
from ray.data.context import DataContext, ShuffleStrategy
from typing import List
import time
from ray.data.aggregate import Count, Mean, Min, Max, Quantile, Std, Unique, AggregateFnV2
from ray.data.block import BlockAccessor, Block
from typing import List, Tuple, Optional
from ray.data import Dataset
@datavudeja
datavudeja / hashing.py
Created October 6, 2025 13:17 — forked from knu2xs/hashing.py
Add an MD5 hash column to a Pandas data frame for change analysis.
from hashlib import md5
import pandas as pd
from typing import Optional, Iterable
def get_md5_from_series(input_iterable: Iterable) -> str:
"""
Create an MD5 hash from an Iterable, typically a row from a Pandas ``DataFrame``, but can be any
Iterable object instance such as a list, tuple or Pandas ``Series``.
Args:
@datavudeja
datavudeja / TuningBERTopic.py
Created October 6, 2025 13:16 — forked from Jong-Sig/TuningBERTopic.py
Fine-Tuning Parameters of BERTopic Using Pseudo Grid-Search and Mini-Batch
# conda activate bert-github
#%%
import pandas as pd
import numpy as np
import os
import glob
import pathlib
from tqdm import tqdm
from bs4 import BeautifulSoup
@datavudeja
datavudeja / one_hot_encoding.py
Created October 6, 2025 13:14 — forked from simonespa/one_hot_encoding.py
Pandas and HashingEncoder
# Splits the values and expands them in multiple numbered columns
temp_df = df[column].str.split("|", expand=True).fillna('')
# One-Hot encodes all the values for each column
temp_df = pd.get_dummies(temp_df).astype('uint8')
# Removes the "N_" prefix from each column to expose duplicates
temp_df = remove_prefixes(temp_df)
# Merges the duplicate columns
#! /usr/bin/env python3
# Usage:
# $ python3 this.py > output.csv
# $ nkf --overwrite --oc=UTF-8-BOM output.csv
import glob
import hashlib
import os
import sys
@datavudeja
datavudeja / compare_dataframes.py
Created October 6, 2025 13:12 — forked from harshvardhaniimi/compare_dataframes.py
A function to compare large data frames by comparing their hashes instead of values for efficiency
import pandas as pd
import hashlib
def hash_dataframe(df):
"""
Generate a hash for a DataFrame using the SHA-256 algorithm.
This function creates a hash for each row of the DataFrame using pandas' `hash_pandas_object`
and then hashes the resulting array of row hashes using `hashlib.sha256`.
@datavudeja
datavudeja / json-to-excel.py
Created October 6, 2025 13:11 — forked from mahimairaja/json-to-excel.py
To write an Excel file using the data from JSON.
import pandas as pd
import json
def readJson(filename,) -> dict:
"""
Reads a json file and returns a dictionary
"""
file = open(filename, 'r')
data = file.read()
file.close()
@datavudeja
datavudeja / python_snippets.py
Created October 6, 2025 13:11 — forked from aiventures/python_snippets.py
Python Snippets
#2022-07-25 Argparse Parse from Command Line
"C:\<Entwicklung>\WORK_JUPYTER\root\tools\argparse_template.py"
""" template code for argparse """
@datavudeja
datavudeja / data_importer_with_field_checks.py
Created September 30, 2025 13:47 — forked from adgedenkers/data_importer_with_field_checks.py
Data File Importer That Checks Field Names
import pandas as pd
import pyodbc
class DataImporter:
"""
This Python class takes in a pandas DataFrame from a CSV file and a SQL Server connection string, as
well as a name for the SQL table where the data will be imported. The class compares the columns of
the DataFrame to those in the SQL table and ensures that at least 50% of the fields match. If there
are missing fields, the class adds them to the SQL table using the same naming convention as existing
fields. All new fields are created as varchar fields by default. Once the SQL table has all the fields