library(caret)

# Class sizes in the example dataset (target assumed to be a 0/1 label)
len_pos <- nrow(example_dataset[example_dataset$target == 1, ])
len_neg <- nrow(example_dataset[example_dataset$target == 0, ])

# Train a model with repeated 10-fold cross-validation (custom_summary_function is defined elsewhere)
train_model <- function(training_data, labels, model_type, ...) {
  experiment_control <- trainControl(method = "repeatedcv",
                                     number = 10,
                                     repeats = 2,
                                     classProbs = TRUE,
                                     summaryFunction = custom_summary_function)
  # Extra arguments (e.g. tuneGrid, metric) are passed straight through to train()
  train(x = training_data,
        y = labels,
        method = model_type,
        trControl = experiment_control,
        ...)
}

import unittest
import os
from zipfile import ZipFile
from mock import MagicMock, patch, Mock, mock_open

# The functions that are tested:
def function_to_test_zipfile(example_arg):
    # Collect the name of every entry in the archive
    # (this loop body is a placeholder; the original snippet is truncated here)
    names = []
    with ZipFile(example_arg, 'r') as zip_in:
        for input_file in zip_in.infolist():
            names.append(input_file.filename)
    return names
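
The gist stops before the tests themselves. As a minimal sketch (not the original test code), the function above could be unit-tested by patching ZipFile so no real archive is touched; the test class name, the fake entry, and the expected values below are illustrative, and the sketch reuses the unittest and mock imports already shown.

class TestZipfileFunction(unittest.TestCase):
    @patch(f"{__name__}.ZipFile")  # patch ZipFile where the function under test looks it up
    def test_function_to_test_zipfile(self, mock_zipfile):
        # The context manager returned by ZipFile(...) should yield a fake archive
        fake_entry = MagicMock()
        fake_entry.filename = "inner_file.txt"
        fake_archive = MagicMock()
        fake_archive.infolist.return_value = [fake_entry]
        mock_zipfile.return_value.__enter__.return_value = fake_archive

        result = function_to_test_zipfile("archive.zip")

        mock_zipfile.assert_called_once_with("archive.zip", "r")
        fake_archive.infolist.assert_called_once_with()
        self.assertEqual(result, ["inner_file.txt"])

if __name__ == "__main__":
    unittest.main()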

-- First create matches using a UDF; here I am using a combination of Jaro-Winkler and (a normalized version of) Levenshtein.
--
-- Input:  cleaned_table: a table with "cleaned" names
-- Output: tmp_groups: a table with uid - group_id tuples. Each group_id contains all uid's that belong to names that match.
DROP TABLE IF EXISTS #matches;

-- Pair up every two distinct names whose similarity score clears a threshold
-- (name_similarity is a placeholder name for the Jaro-Winkler / Levenshtein UDF; 0.9 is an illustrative threshold)
SELECT a.clean_Name,
       a.uid,
       b.clean_Name AS clean_name_2,
       b.uid        AS uid_2
INTO   #matches
FROM   cleaned_table a
JOIN   cleaned_table b
  ON   a.uid < b.uid
WHERE  dbo.name_similarity(a.clean_Name, b.clean_Name) >= 0.9;
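
The comment above describes the score as a blend of Jaro-Winkler and a normalized Levenshtein distance. As a rough illustration of that idea (not the original UDF), here is a small Python sketch; it assumes the third-party jellyfish package (function names as in recent releases), and the function name, 50/50 weighting, and example strings are made up for the example.

import jellyfish

def combined_similarity(name_a, name_b, jw_weight=0.5):
    """Blend Jaro-Winkler similarity with a length-normalized Levenshtein similarity, both in [0, 1]."""
    jw = jellyfish.jaro_winkler_similarity(name_a, name_b)
    # Normalize the edit distance by the longer string's length, then flip it into a similarity
    max_len = max(len(name_a), len(name_b)) or 1
    lev = 1.0 - jellyfish.levenshtein_distance(name_a, name_b) / max_len
    return jw_weight * jw + (1.0 - jw_weight) * lev

# Near-identical names score close to 1, unrelated names much lower
print(combined_similarity("Jonathan Smith", "Jonathon Smyth"))
print(combined_similarity("Jonathan Smith", "Acme Holdings BV"))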

import findspark
findspark.init("[spark install location]")  # point findspark at the local Spark installation

import pyspark
import string
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
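
The imports above set up a text feature-extraction pipeline. As a minimal sketch of how those pieces fit together (the column names and the tiny two-row DataFrame are illustrative, not from the original code), one could tokenize, drop stop words, and fit a CountVectorizer like this:

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Tiny illustrative DataFrame with an id and a free-text column
df = sqlContext.createDataFrame(
    [(0, "The quick brown fox"), (1, "jumps over the lazy dog")],
    ["id", "text"])

# Split on non-word characters (RegexTokenizer also lowercases by default)
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")
tokens = tokenizer.transform(df)

# Remove common English stop words
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
filtered = remover.transform(tokens)

# Learn a vocabulary and turn each row into a sparse term-count vector
cv = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=1000, minDF=1.0)
cv_model = cv.fit(filtered)
cv_model.transform(filtered).select("id", "features").show(truncate=False)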