George Vyshnya (gvyshnya)
@gvyshnya
gvyshnya / Parallel Audio Feature Extraction with Dask.py
Created September 2, 2020 19:29
Parallel Audio Feature Extraction with Dask
for ebird in final_data:
    print("Starting to process a new species: ", ebird)
    ebird_data = train_csv[train_csv['species'] == ebird]
    short_file_name = ebird_data['ebird_code'].unique()[0]
    print("Short file name: ", short_file_name)
    result = []
    for index, row in ebird_data.iterrows():
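        # Preview truncated here; a plausible continuation wraps each file in a
        # lazy dask.delayed task (the 'full_path' column name and an earlier
        # 'import dask' are assumptions, not the gist's verbatim code):
        result.append(dask.delayed(extract_feature_means)(row['full_path']))
    # materialize all lazy tasks for this species in parallel
    species_dfs = dask.compute(*result)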
@gvyshnya
gvyshnya / Parallel Audio Feature Extraction with Ray.py
Created September 2, 2020 19:25
Parallel Audio Feature Extraction with Ray
@ray.remote
def extract_features(trial_audio_file_path):
    # process one audio file into a feature data frame
    function_start_time = dt.datetime.now()
    print("Started a file processing at ", function_start_time)
    df0 = u.extract_feature_means(trial_audio_file_path)
    function_finish_time = dt.datetime.now()
    print("Finished the file processing at ", function_finish_time)
    return df0  # the preview cuts off here; returning the computed frame is assumed

for ebird in final_data:
    print("Starting to process a new species: ", ebird)
    ebird_data = train_csv[train_csv['species'] == ebird]
    short_file_name = ebird_data['ebird_code'].unique()[0]
    print("Short file name: ", short_file_name)
    pool = mp.Pool(c.NUMBER_OF_CPU_IN_POOL)  # use the number of parallel processes as configured
    funclist = []
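    # Preview truncated here. The mp.Pool above hints at a multiprocessing variant,
    # but given the @ray.remote function, a Ray-style completion would look like
    # this (the 'full_path' column name is an assumption):
    for index, row in ebird_data.iterrows():
        funclist.append(extract_features.remote(row['full_path']))
    # block until all per-file feature frames for this species are ready
    species_dfs = ray.get(funclist)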
@gvyshnya
gvyshnya / Audio Feature Extraction.py
Created September 2, 2020 19:16
Audio Feature Extraction from Audio Files using Librosa
def extract_feature_means(audio_file_path: str) -> pd.DataFrame:
    # config settings
    number_of_mfcc = c.NUMBER_OF_MFCC

    # 1. Importing 1 file
    y, sr = librosa.load(audio_file_path)

    # Trim leading and trailing silence from an audio signal (silence before and after the actual audio)
    signal, _ = librosa.effects.trim(y)
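    # Preview truncated here. A plausible continuation, assuming the function
    # returns the per-file means of the MFCC coefficients as a one-row DataFrame:
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=number_of_mfcc)
    features = {f'mfcc_{i}_mean': float(mfcc[i].mean()) for i in range(number_of_mfcc)}
    return pd.DataFrame([features])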
import datetime
from pyspark.sql.functions import year, month, dayofmonth
elevDF = sc.parallelize([
    (datetime.datetime(1994, 1, 1, 0, 0), 1, 638.55),
    (datetime.datetime(1994, 1, 1, 0, 0), 2, 638.55),
    (datetime.datetime(1994, 1, 1, 0, 0), 3, 638.55),
    (datetime.datetime(1994, 1, 1, 0, 0), 4, 638.55),
    (datetime.datetime(1994, 1, 1, 0, 0), 5, 638.55)
]).toDF(["date", "hour", "value"])
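The year, month, and dayofmonth imports suggest the truncated snippet goes on to split the timestamp into date parts; a minimal sketch of that step:

elevDF.select(
    year('date').alias('year'),
    month('date').alias('month'),
    dayofmonth('date').alias('day'),
    'hour', 'value'
).show()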
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Missing value summary
nan_columns = []
nan_values = []
for column in df.columns:
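    # Preview truncated here; a plausible loop body tallies the missing values
    # per column, with a summary frame built afterwards (assumes pandas as pd):
    nan_columns.append(column)
    nan_values.append(df[column].isna().sum())

missing_df = pd.DataFrame({'column': nan_columns, 'n_missing': nan_values})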
from sklearn.base import clone

def drop_col_feat_imp(model, X_train, y_train, random_state=42):
    # clone the model to have the exact same specification as the one initially trained
    model_clone = clone(model)
    # set random_state for comparability
    model_clone.random_state = random_state
    # training and scoring the benchmark model
    model_clone.fit(X_train, y_train)
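    benchmark_score = model_clone.score(X_train, y_train)
    # Preview truncated here; the usual drop-column pattern retrains without one
    # column at a time and reports the score drop as that feature's importance
    # (this completion is an assumption, not the gist's verbatim code; pandas as pd):
    importances = []
    for col in X_train.columns:
        model_clone = clone(model)
        model_clone.random_state = random_state
        X_dropped = X_train.drop(columns=col)
        model_clone.fit(X_dropped, y_train)
        importances.append(benchmark_score - model_clone.score(X_dropped, y_train))
    return pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})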
from sklearn.inspection import permutation_importance

# Here's how you use permutation importance
def get_permutation_importance(X, y, model) -> pd.DataFrame:
    result = permutation_importance(model, X, y, n_repeats=1,
                                    random_state=0)

    # permutation importance results, one mean score per feature
    result_df = pd.DataFrame(X.columns, columns=['Feature'])
    result_df['permutation_importance'] = result.importances_mean
    return result_df
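A brief usage sketch (the fitted model and the X, y frames here are hypothetical stand-ins):

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0).fit(X, y)
imp_df = get_permutation_importance(X, y, rf)
print(imp_df.sort_values('permutation_importance', ascending=False).head())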
from sklearn.feature_selection import SelectFromModel

# 'modeller' is assumed to be an estimator defined earlier in the gist (e.g. a random forest)
embedded_rf_selector = SelectFromModel(modeller, max_features=200)
embedded_rf_selector.fit(X, y)
embedded_rf_support = embedded_rf_selector.get_support()
embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()
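A quick follow-up check on how many features the selector kept (a usage sketch, not part of the preview):

print(str(len(embedded_rf_feature)), 'selected features')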
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler  # needed by ranking() below
import numpy as np

# Define dictionary to store our rankings
ranks = {}

# Create our function which stores the feature rankings to the ranks dictionary
def ranking(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks))
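RFE is imported but never used in the truncated preview; a plausible next step feeds its ranking_ into the ranking() helper (the LinearRegression estimator is an assumption):

from sklearn.linear_model import LinearRegression
rfe = RFE(estimator=LinearRegression(), n_features_to_select=1)
rfe.fit(X, y)
# order=-1 inverts the scale because RFE gives the best feature rank 1
ranks['RFE'] = ranking(list(map(float, rfe.ranking_)), X.columns, order=-1)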