Skip to content

Instantly share code, notes, and snippets.

@datavudeja
datavudeja / TuningBERTopic.py
Created October 6, 2025 13:16 — forked from Jong-Sig/TuningBERTopic.py
Fine-Tuning Parameters of BERTopic Using Pseudo Grid-Search and Mini-Batch
# conda activate bert-github
#%%
import pandas as pd
import numpy as np
import os
import glob
import pathlib
from tqdm import tqdm
from bs4 import BeautifulSoup
@datavudeja
datavudeja / one_hot_encoding.py
Created October 6, 2025 13:14 — forked from simonespa/one_hot_encoding.py
Pandas and HashingEncoder
# Splits the values and expands them in multiple numbered columns
temp_df = df[column].str.split("|", expand=True).fillna('')
# One-Hot encodes all the values for each column
temp_df = pd.get_dummies(temp_df).astype('uint8')
# Removes the "N_" prefix from each column name to expose duplicates
temp_df = remove_prefixes(temp_df)
# Merges the duplicate columns
#! /usr/bin/env python3
# Usage:
# $ python3 this.py > output.csv
# $ nkf --overwrite --oc=UTF-8-BOM output.csv
import glob
import hashlib
import os
import sys
@datavudeja
datavudeja / compare_dataframes.py
Created October 6, 2025 13:12 — forked from harshvardhaniimi/compare_dataframes.py
A function to compare large data frames by comparing their hashes instead of values for efficiency
import pandas as pd
import hashlib
def hash_dataframe(df):
"""
Generate a hash for a DataFrame using the SHA-256 algorithm.
This function creates a hash for each row of the DataFrame using pandas' `hash_pandas_object`
and then hashes the resulting array of row hashes using `hashlib.sha256`.
@datavudeja
datavudeja / json-to-excel.py
Created October 6, 2025 13:11 — forked from mahimairaja/json-to-excel.py
Write an Excel file using the data from a JSON file.
import pandas as pd
import json
def readJson(filename,) -> dict:
"""
Reads a json file and returns a dictionary
"""
file = open(filename, 'r')
data = file.read()
file.close()
@datavudeja
datavudeja / python_snippets.py
Created October 6, 2025 13:11 — forked from aiventures/python_snippets.py
Python Snippets
#2022-07-25 Argparse Parse from Command Line
"C:\<Entwicklung>\WORK_JUPYTER\root\tools\argparse_template.py"
""" template code for argparse """
@datavudeja
datavudeja / data_importer_with_field_checks.py
Created September 30, 2025 13:47 — forked from adgedenkers/data_importer_with_field_checks.py
Data File Importer That Checks Field Names
import pandas as pd
import pyodbc
class DataImporter:
"""
This Python class takes in a pandas DataFrame from a CSV file and a SQL Server connection string, as
well as a name for the SQL table where the data will be imported. The class compares the columns of
the DataFrame to those in the SQL table and ensures that at least 50% of the fields match. If there
are missing fields, the class adds them to the SQL table using the same naming convention as existing
fields. All new fields are created as varchar fields by default. Once the SQL table has all the fields
@datavudeja
datavudeja / nullsafe.py
Created September 30, 2025 13:47 — forked from fzhem/nullsafe.py
Null-safe comparison accessor for Pandas
import pandas as pd
@pd.api.extensions.register_series_accessor("nullsafe")
class NullSafeSeriesAccessor:
"""
Null-safe comparison accessor for Pandas Series.
This is equivalent to a null-safe equal operator in SQL (<=>) where
@datavudeja
datavudeja / main.py
Created September 17, 2025 20:20 — forked from mypy-play/main.py
Shared via mypy Playground
# This example is adapted from one of the codebases I contribute to.
# Unfortunately, I can't copy the original code, so I tried to extract the
# scenario as it was, as an example that I could share publicly.
#
# I'm sure the example could be simplified, but I thought it would be better to
# show it as it is, and leave any conclusions to whomever is going to read it.
#
from typing import Union, TypeVar, Callable
from typing_extensions import TypeAlias, Literal, overload
@datavudeja
datavudeja / units.py
Created September 17, 2025 20:16 — forked from markusand/units.py
A simple unit converter system with Measure value object
"""Unit conversion"""
from dataclasses import dataclass
from functools import total_ordering
from enum import Enum
from typing import Callable, NamedTuple
class UnitDesc(NamedTuple):
"""Unit description"""
scale: float