This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"i": 0, | |
"tree_undefined": -2, | |
"features": [ | |
3, | |
3, | |
2, | |
3, | |
-2, | |
-2, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.tree import _tree | |
tree_template = ''' | |
def tree{i}(inputs): | |
tree_undefined = {tree_undefined} | |
features = {features} | |
thresholds = {thresholds} | |
children_left = {children_left} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from json import dumps | |
def rfr_to_json(rfr_object, feature_list, json_filepath=None): | |
''' | |
Function to convert a scikit-learn RandomForestRegressor object to JSON. | |
''' | |
output_dict = dict() | |
output_dict['name'] = 'rf_regression_pipeline' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import rapture.json.jsonBackends.jawn._ | |
import rapture.json.Json | |
import scala.annotation.tailrec | |
case class RandomForestTree( | |
treeId: Int, | |
undefinedIndex: Int, | |
features: Array[Int], | |
thresholds: Array[Double], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyspark.sql.types as t | |
import pyspark.sql.functions as f | |
def spacy_word2vec_grouped(cat_list, id_col, string_col): | |
""" | |
Example usage: | |
vec_sdf = ( | |
sdf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from spacy import load as spacy_load | |
# This loads the largest English corpus, which must be downloaded | |
# separate from package installation. Other choices are available. | |
nlp = spacy_load('en_core_web_lg') | |
def doc_to_spans(list_of_texts, join_string=' ||| '): | |
all_docs = nlp(' ||| '.join(list_of_texts)) | |
split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numpy.random import RandomState | |
import pyspark.sql.functions as f | |
from pyspark import StorageLevel | |
def hashmin_jaccard_spark( | |
sdf, node_col, edge_basis_col, suffixes=('A', 'B'), | |
n_draws=100, storage_level=None, seed=42, verbose=False): | |
""" | |
Calculate a sparse Jaccard similarity matrix using MinHash. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import DataFrame | |
from pyspark.sql import types as t, functions as f | |
df = DataFrame({'ids': [1, 2, 3], 'words': ['abracadabra', 'hocuspocus', 'shazam']}) | |
sdf = sparkSession.createDataFrame(df) | |
normalize_word_udf = f.udf(normalize_word, t.StringType()) | |
stops = f.array([f.lit(c) for c in STOPCHARS]) | |
results = sdf.select('ids', normalize_word_udf(f.col('words'), stops).alias('norms')) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
outcome_sdf = ( | |
sdf | |
.select( | |
f.create_map( | |
f.col('unique_id'), | |
f.col('feature_list') | |
).alias('feature_map'), | |
) | |
.groupby( | |
f.floor(f.rand() * nparts).alias('grouper') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
logger = logging.getLogger(__name__) | |
logger.setLevel(logging.INFO) | |
# create a file handler | |
handler = logging.FileHandler('error.log') | |
handler.setLevel(logging.ERROR) | |
# create a logging format |
NewerOlder