Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Eligijus112 / nrow-time-xgboost.py
Created March 21, 2021 16:28
Speed comparison between CPU and GPU xgboost
# Parameter dictionaries for the two runs being compared.
# The CPU variant only sets the objective and leaves tree_method unset.
cpu_dict = {'objective': 'reg:squarederror'}
# The GPU variant is a separate dict with the same objective plus the
# 'gpu_hist' tree method.
gpu_dict = dict(cpu_dict, tree_method='gpu_hist')
@Eligijus112
Eligijus112 / hp-speed-xgoobst.py
Created March 22, 2021 18:21
Gist for evaluating xgboost speeds for different hyperparameters
# Number of rows to test on (fixed across the whole grid)
nrows = 30_000
# Grid of parameters: candidate values tried for each setting
max_depth = [4, 6, 8, 12, 14, 16]
n_estimators = [30, 100, 200, 300, 600, 900]
# Creating a dictionary
gpu_dict = {
'objective': ['reg:squarederror'],
@Eligijus112
Eligijus112 / Node.py
Created April 12, 2021 06:14
Node for a decision tree
# Data wrangling
import pandas as pd
# Array math
import numpy as np
# Quick value count calculator
from collections import Counter
@Eligijus112
Eligijus112 / RegressionTree.py
Created April 25, 2021 15:00
Regression tree implementation
# Data wrangling
import pandas as pd
# Array math
import numpy as np
# Quick value count calculator
from collections import Counter
@Eligijus112
Eligijus112 / bootsrap_df.py
Created June 14, 2021 16:16
Bootstrapping of a pandas dataframe
# self - an instance of the RandomForestClassifier class
# self.X - pandas dataframe containing feature information
# self.Y - a list of binary response variable values
# self.X_obs_fraction - a float in range [0, 1]
def bootstrap_sample(self):
"""
Function that creates a bootstraped sample with the class instance parameters
"""
# Sampling the number of rows with repetition
@Eligijus112
Eligijus112 / best_split_rf.py
Created June 14, 2021 19:29
Function for finding the best split for a random forest decision tree
def best_split(self) -> tuple:
"""
Given the X features and Y targets calculates the best split
for a decision tree
"""
# Creating a dataset for spliting
df = self.X.copy()
df['Y'] = self.Y
# Getting the GINI impurity for the base input
@Eligijus112
Eligijus112 / rf_growth.py
Created June 15, 2021 06:20
A snippet on how to use the custom RandomForest class
# Data wrangling
import pandas as pd
# Accuracy metrics
from sklearn.metrics import precision_score, recall_score
# Reading data for classification
d = pd.read_csv("data/random_forest/telecom_churn.csv")
# Setting the features used
@Eligijus112
Eligijus112 / read_json.py
Created February 27, 2022 09:06
Read a JSON file of Amazon reviews
# Data wrangling
import pandas as pd
# JSON functionalities
import json
# Reading a json in the .gz format
def read_json(file_path:str) -> pd.DataFrame:
"""
Reads the provided special JSON file format
@Eligijus112
Eligijus112 / pipeline.py
Last active March 4, 2022 07:21
CLF creation pipeline
# Importing all the methods for the pipeline
from pipeline.modules.read_data import read_json
from pipeline.modules.clean_data import clean_text
from pipeline.modules.model_input_preparation import create_X_Y, apply_train_test_split
from pipeline.modules.model_fitting import TextCLF
from pipeline.modules.evaluate_model import eval_model
# Directory traversal
import os
@Eligijus112
Eligijus112 / test_read_clean.py
Last active March 1, 2022 09:52
Integration test
# Main frameworks to test out
from pipeline.modules.read_data import read_json
from pipeline.modules.clean_data import clean_text
# Directory traversals
import os
def test_reading_cleaning():
# Arrange
_path_to_data = os.path.join(