al102964’s gists

al102964 / pps_plots.py

Created June 19, 2020 08:50

	import ppscore as pps
	import matplotlib.pyplot as plt

	# Absolute pairwise correlation matrix of `df`, rendered as an inline table
	# with its own caption so it can sit next to the PPS table.
	df1_styler = (
		df.corr()
		.abs()
		.style
		.set_table_attributes("style='display:inline'")
		.set_caption('Correlation Table')
	)
	# ppscore matrix of the same frame, styled the same way.
	df2_styler = (
		pps.matrix(df)
		.style
		.set_table_attributes("style='display:inline'")
		.set_caption('PPS Table')
	)

	# 2x2 grid of axes; not used in the visible part of this script.
	fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

	# Concatenate both HTML renderings and show them as raw HTML.
	# NOTE(review): display_html is not imported in the visible lines --
	# presumably `from IPython.display import display_html`; confirm.
	display_html(
		df1_styler._repr_html_() + df2_styler._repr_html_(),
		raw=True,
	)

al102964 / pps_variables.py

Last active June 19, 2020 08:44

	# Import libraries
	import numpy as np
	import pandas as pd

	# Define the test variables
	# a = linear space between 0 and 50 + normally distributed noise (mu=0, s2=1)
	# b = linear space between -50 and 50 + normally distributed noise (mu=0, s2=1)
	# c = a^2
	# d = sin(a)
	# e = b^3

al102964 / install.sh

Last active April 15, 2020 07:06

	# Refresh the package index, then install git and pip for Python 3.
	sudo apt update
	sudo apt install -y git
	sudo apt install -y python3-pip
	# Install Airflow with the crypto and postgres extras. The requirement is
	# quoted so the shell never treats the square brackets as a glob pattern.
	sudo pip install 'apache-airflow[crypto]'
	sudo pip install 'apache-airflow[postgres]'
	# Install six 1.10.0 with HOME set to the target user's home (-H), then
	# upgrade it. The original line read "sudo-H", which is not a command.
	sudo -H pip install six==1.10.0
	sudo pip install --upgrade six
	# Install MarkupSafe and then upgrade it.
	sudo pip install markupsafe
	sudo pip install --upgrade MarkupSafe
	# Pin SQLAlchemy to 1.3.15.
	sudo pip install SQLAlchemy==1.3.15

al102964 / linear_regression.py

Created April 13, 2020 01:17

	from pyspark.ml.feature import VectorAssembler
	from pyspark.ml.regression import LinearRegression
	import mlflow
	import mlflow.spark

	# Read the Parquet data stored under the training/ prefix of the bucket.
	# `spark` is expected to exist already; it is not created in the visible lines.
	completa_metricas = spark.read.parquet(
		"s3://<s3-bucket>/movielens-parquet/training/"
	)

	# Input columns: every column of the frame except 'promedio_rating'
	# (presumably the regression label -- confirm against the rest of the script).
	# list.remove raises ValueError if that column is missing.
	columnas_input = [*completa_metricas.columns]
	columnas_input.remove('promedio_rating')

al102964 / joins.py

Created April 13, 2020 01:15

	import pyspark.sql.functions as f

	categorias = ['action','adaptation','adventure','apocalypse','artistic',\
	'assassination','based on a true story','biblical','blood','brutal','biographical','bollywood','boring',\
	'cars','cerebral','classic','censorship','comedy','computers','confusing',\
	'cooking','comic','cartoon','court','crime','cult','dark','death','disaster','documentary','drama',
	'depressing','drugs','environment','erotic','fantasy','fighting','football','freedom',\
	'friendship','genius','god','gothic','high school','historical','hollywood','horror',\
	'humor','homosexuality','holiday','independent film','kids','love','magic','marriage',\
	'military','murder','musical','nature','nostalgia','nudity','olympics','original','oscar',\

al102964 / genome_scores.scala

Created April 13, 2020 01:12

	// Load genome-scores.csv from S3 as a DataFrame, treating the first row as a
	// header and letting Spark infer the column types.
	val genome_scores_df = spark.read.format("csv").option("header", "true").option("inferschema", "true").load("s3://<s3-bucket>/genome-scores.csv")

	// Write the same data back out as Parquet, replacing any existing output at
	// the destination path (mode "overwrite").
	genome_scores_df.write.mode("overwrite").parquet("s3://<s3-bucket>/movielens-parquet/genome-scores/")

al102964 / movielens_dag.py

Created April 13, 2020 01:02

	import airflowlib.emr_lib as emr
	import os

	from airflow import DAG
	from airflow.operators.python_operator import PythonOperator
	from datetime import datetime, timedelta

	default_args = {
	'owner': 'airflow',
	'depends_on_past': False,

al102964 / bootstrap.sh

Created April 12, 2020 22:46

al102964 / airflow-mlflow.yaml

Created April 12, 2020 22:34

	AWSTemplateFormatVersion: '2010-09-09'

	Description: MLflow server backed by Postgres RDS

	Parameters:
	KeyName:
	Description: Name of an existing EC2 KeyPair to enable SSH access into the Airflow web server
	Type: AWS::EC2::KeyPair::KeyName
	ConstraintDescription: Must be the name of an existing EC2 KeyPair
	S3BucketNameAirflow:

al102964 / Nationality.csv

Created February 14, 2020 07:54

Arturo Gonzalez al102964