This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import hashlib | |
| from datetime import datetime | |
| def detect_dataframe_changes(source_df: pd.DataFrame, target_df: pd.DataFrame, primary_key: str, ignored_columns: list): | |
| """ | |
| Detects inserted, updated, and deleted rows between two DataFrames. | |
| Args: | |
| source_df (pd.DataFrame): The source DataFrame. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import ray | |
| from ray.data import Dataset | |
| from ray.data.context import DataContext, ShuffleStrategy | |
| from typing import List | |
| import time | |
| from ray.data.aggregate import Count, Mean, Min, Max, Quantile, Std, Unique, AggregateFnV2 | |
| from ray.data.block import BlockAccessor, Block | |
| from typing import List, Tuple, Optional | |
| from ray.data import Dataset |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from hashlib import md5 | |
| import pandas as pd | |
| from typing import Optional, Iterable | |
| def get_md5_from_series(input_iterable: Iterable) -> str: | |
| """ | |
| Create an MD5 hash from an Iterable, typically a row from a Pandas ``DataFrame``, but can be any | |
| Iterable object instance such as a list, tuple or Pandas ``Series``. | |
| Args: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # conda activate bert-github | |
| #%% | |
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import glob | |
| import pathlib | |
| from tqdm import tqdm | |
| from bs4 import BeautifulSoup |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Splits the values and expands them in multiple numbered columns | |
| temp_df = df[column].str.split("|", expand=True).fillna('') | |
| # One-Hot encodes all the values for each column | |
| temp_df = pd.get_dummies(temp_df).astype('uint8') | |
| # Removes the "N_" prefix from each column to expose duplicates | |
| temp_df = remove_prefixes(temp_df) | |
| # Merges the duplicate columns |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| # Usage: | |
| # $ python3 this.py > output.csv | |
| # $ nkf --overwrite --oc=UTF-8-BOM output.csv | |
| import glob | |
| import hashlib | |
| import os | |
| import sys |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import hashlib | |
| def hash_dataframe(df): | |
| """ | |
| Generate a hash for a DataFrame using the SHA-256 algorithm. | |
| This function creates a hash for each row of the DataFrame using pandas' `hash_pandas_object` | |
| and then hashes the resulting array of row hashes using `hashlib.sha256`. | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import json | |
| def readJson(filename,) -> dict: | |
| """ | |
| Reads a json file and returns a dictionary | |
| """ | |
| file = open(filename, 'r') | |
| data = file.read() | |
| file.close() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #2022-07-25 Argparse Parse from Command Line | |
| "C:\<Entwicklung>\WORK_JUPYTER\root\tools\argparse_template.py" | |
| """ template code for argparse """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import pyodbc | |
| class DataImporter: | |
| """ | |
| This Python class takes in a pandas DataFrame from a CSV file and a SQL Server connection string, as | |
| well as a name for the SQL table where the data will be imported. The class compares the columns of | |
| the DataFrame to those in the SQL table and ensures that at least 50% of the fields match. If there | |
| are missing fields, the class adds them to the SQL table using the same naming convention as existing | |
| fields. All new fields are created as varchar fields by default. Once the SQL table has all the fields |