This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| #load dataset | |
| df = pd.read_csv("data.csv") | |
| # axis 0 -> row -> i | |
| # axis 1 -> col -> j | |
| # get cols |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Read data from csv | |
| data = pd.read_csv('data.csv', sep=',', index_col='Number') | |
| # Write data to csv | |
| data.to_csv("data_wo_sensitive_lemmatized.csv", index=False, encoding='utf-8', sep=';') | |
| # Read and concat several files in one dataframe | |
| files = glob.glob('*.csv') | |
| small_dfs = [pd.read_csv(fp, names=columns) for fp in files] | |
| df = pd.concat(small_dfs) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # List unique values in a DataFrame column | |
| df['Column Name'].unique() | |
| # To extract a specific column (subset the dataframe), you can use [ ] (brackets) or attribute notation. | |
| df.height | |
| df['height'] | |
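| # are the same thing, provided the column name is a valid Python identifier and does not clash with an existing DataFrame attribute or method (from http://www.stephaniehicks.com/learnPython/pages/pandas.html | |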
| # -or- | |
| # http://www.datacarpentry.org/python-ecology-lesson/02-index-slice-subset/) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| from typing import Dict, List | |
| class DataQualityValidator: | |
| def __init__(self, df: pd.DataFrame): | |
| self.df = df | |
| self.issues = [] | |
| def check_nulls(self, columns: List[str], threshold: float = 0.05): | |
| """Check if null percentage exceeds threshold""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from functools import wraps | |
| import datetime as dt | |
| import pandas as pd | |
| def log_start(func): | |
| @wraps(func) | |
| def wrapper(*args, **kwargs): | |
| tic = dt.datetime.now() | |
| result = func(*args, **kwargs) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def remove_non_printing_chars(df): | |
| """Clean a dataframe column to remove any non-printing characters. | |
| We've encountered values like tabs in some of the data. | |
| :param df: Pandas dataframe | |
| :return: Pandas dataframe | |
| """ | |
| clean_df = df.copy(deep=True) | |
| clean_df = clean_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) | |
| for col in list(clean_df.columns): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #Apply Lambda function to pandas | |
| # if the new column is computed from other columns | |
| df = df.assign(Product=lambda x: (x['Field_1'] * x['Field_2'] * x['Field_3'])) | |
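| # if we need to modify all the elements of a selected entity based only on that entity | |
| # apply returns a new DataFrame (assigned back to df here, not updated in place); note that with axis=1 the lambda receives each row, so x.name is the row's index label | |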
| df = df.apply(lambda x: np.square(x) if x.name in ['a', 'e', 'g'] else x, axis=1) | |
| # to compare with the previous element of a column, use shift |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| from IPython.display import display | |
| def compare_dfs_cols_and_types(df_x, df_y, df_x_name="x", df_y_name="y"): | |
| """Function to compare two DataFrames, checking column | |
| names and types, print the differences (if any exist) | |
| and return a DataFrame with NaNs signaling the mismatches. | |
| :param df_x: First Dataframe | |
| :type df_x: pd.DataFrame | |
| :param df_y: Second DataFrame |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Describes an enum that may be used to describe a unit of time (not a duration) | |
| NOTE: Remove the numpy logic if your application does not support numpy | |
| """ | |
| import enum | |
| from datetime import datetime | |
| from datetime import timedelta |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import os | |
| import re | |
| re_enum = re.compile(r'\s*enum\s*(\w+)\s*(:.*)?\s*') | |
| re_enum_value = re.compile(r'\s*(\w+)(?:\s*=\s*(.+))?,?(?:\s*\/\/.*)?\s*') | |
| folders_blacklist = [ | |
| # 'Urho3D/Audio', | |
| # 'Urho3D/Container', | |
| # 'Urho3D/Core', |
NewerOlder