nkthiebaut’s gists

nkthiebaut / pyspark_dataframe_register.py

Created June 28, 2019 18:44

Instantiate a Spark Session, register a DataFrame, and query it (Spark 2.0+).

	from pyspark import SparkConf
	from pyspark.sql import SparkSession
	from pyspark.sql import DataFrame as SparkDataFrame

	# Spark configuration holder; the only override is the driver memory (4 GB).
	sc = SparkConf().set("spark.driver.memory", "4g")
	# Get the active session, or create one on a local master with 4 worker threads.
	ss = (
	    SparkSession.builder
	    .master("local[4]")
	    .config(conf=sc)
	    .getOrCreate()
	)
	# Build a small two-row DataFrame and register it as the temp view "players"
	# so it can be referenced by name from SQL queries on this session.
	df = ss.createDataFrame(
	    data=[(1, "kevin"), (2, "steph")],
	    schema=["id", "name"],
	)
	df.createOrReplaceTempView("players")

nkthiebaut / test_rollbar_setup.py

Created July 1, 2019 23:34

Test Rollbar configuration

	from unittest.mock import patch

	import rollbar

	@patch("rollbar.log.exception")
	def test_rollbar_connection(log_exception_mock):
	"""
	Test sending an exception to Rollbar. The Rollbar Python SDK reports requests
	errors through exceptions logs. Here we check that the exception function
	of the logging module has NOT been called after reporting an exception to Rollbar.

nkthiebaut / xkcd-trend.py

Last active July 2, 2019 23:56

Get and plot the volume of Google Scholar search results for different keywords, using the XKCD plot style. Results come from direct queries to Google Scholar.

	"""Based on https://github.com/Pold87/academic-keyword-occurrence"""
	import re
	import urllib
	from functools import partial
	from typing import Iterable
	from urllib.parse import urlencode
	from urllib.request import Request, build_opener

	import matplotlib
	import matplotlib.pyplot as plt

nkthiebaut / plot_top_k_accuracies

Created July 14, 2019 01:34

Plot top-k accuracies

	import matplotlib.pyplot as plt
	from sklearn.datasets import load_digits
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split

	# Switch matplotlib to its XKCD (hand-drawn sketch) style for the figures drawn afterwards.
	plt.xkcd()

	# Load the scikit-learn digits dataset directly as a (features, labels) pair.
	X, y = load_digits(return_X_y=True)
	# Hold out a test set using scikit-learn's default split settings; no random_state
	# is passed, so the partition is not pinned to a seed.
	X_train, X_test, y_train, y_test = train_test_split(X, y)

nkthiebaut / soc

Created July 14, 2019 05:35

Codes to names mapping for the O*NET-SOC job titles classification (https://www.onetcenter.org/taxonomy.html)

	SOC_MAJOR_GROUPS = {
	"11": "Management Occupations",
	"13": "Business and Financial Operations Occupations",
	"15": "Computer and Mathematical Occupations",
	"17": "Architecture and Engineering Occupations",
	"19": "Life, Physical, and Social Science Occupations",
	"21": "Community and Social Service Occupations",
	"23": "Legal Occupations",
	"25": "Education, Training, and Library Occupations",
	"27": "Arts, Design, Entertainment, Sports, and Media Occupations",

nkthiebaut / commit.APPLESCRIPT

Created July 24, 2019 17:27

Commit applescript

	on run
		-- Ask for the GitHub password; "with hidden answer" masks the typed characters.
		-- Stored in its own variable rather than AppleScript's implicit `result`,
		-- which is overwritten by every subsequent command.
		set githubPassword to text returned of (display dialog "Enter github password:" default answer "" with hidden answer)
		-- Build the authenticated push URL and pass it to the shell via `quoted form of`,
		-- so spaces, quotes, `;`, `&`, `$` etc. in the password cannot break or inject
		-- into the command line.
		-- NOTE(review): characters that are special inside a URL (e.g. "@", "/", ":")
		-- would still need percent-encoding before being embedded here.
		set pushURL to "https://nkthiebaut:" & githubPassword & "@github.com/nkthiebaut/notes.git"
		-- Stage everything, commit with a fixed message and push; `do shell script`
		-- raises an error on a non-zero exit status, so the confirmation below is only
		-- spoken when the whole chain succeeded.
		do shell script "cd /Users/nicolas/Google\\ Drive/notes && git add . && git commit -m new_commit && git push " & quoted form of pushURL
		say "Notes successfully committed and pushed"
	end run

nkthiebaut / hyperopt.py

Created December 30, 2019 19:35

Hyperopt usage example with sklearn

	from sklearn.model_selection import cross_val_score
	from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
	from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


	def hyperopt_train_test(params):
	t = params['type']
	del params['type']
	if t == 'gb':
	clf = GradientBoostingClassifier(**params)

nkthiebaut / plot_graph.py

Created January 19, 2020 21:06

Plot a graph with Matplotlib

	import matplotlib as mpl
	import matplotlib.pyplot as plt
	%matplotlib inline
	import networkx as nx


	def plot_graph(df, directed=False):
	graph_engine = nx.DiGraph() if directed else None
	G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr=True, create_using=graph_engine)
	costs = G.edges.data('cost')

nkthiebaut / pandas_json_to_dataframe.py

Created March 23, 2020 20:13

Pandas: column of JSON strings to DataFrame

	# inspired by https://stackoverflow.com/a/50658993/5174617
	import pandas as pd
	import json

	# Sample frame with a single "json" column whose cells are JSON objects serialized
	# as strings. Each cell must be bare JSON: a leading row label such as `0 {...}`
	# makes json.loads fail with "Extra data".
	df = pd.DataFrame([['{"a":"1","b":"2","c":"3"}'],['{"a" :"4","b":"5","c":"6"}']], columns=['json'])
	# Parse every cell into a dict, then expand each dict into a row of columns
	# (one column per JSON key).
	exploded_df = df['json'].apply(json.loads).apply(pd.Series)

nkthiebaut / logger.py

Created June 26, 2020 17:25

	# Official doc: https://docs.python.org/3/howto/logging-cookbook.html
	import os

	import logging

	from logging import StreamHandler
	from logging import Formatter

	LOGS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../logs')
	if not os.path.exists(LOGS_DIR):

Nicolas nkthiebaut