Skip to content

Instantly share code, notes, and snippets.

View al102964's full-sized avatar

Arturo Gonzalez al102964

  • Data Scientist ITAM
  • México city
View GitHub Profile
import ppscore as pps
import matplotlib.pyplot as plt
# Absolute pairwise correlation matrix of df, styled so the table renders inline.
df1_styler = (
    df.corr()
    .abs()
    .style
    .set_table_attributes("style='display:inline'")
    .set_caption('Correlation Table')
)
# PPS matrix of the same frame (from ppscore), styled the same way.
df2_styler = (
    pps.matrix(df)
    .style
    .set_table_attributes("style='display:inline'")
    .set_caption('PPS Table')
)
# NOTE(review): fig/ax are not used again within this snippet — confirm they are needed.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))
# Concatenate both HTML renderings so the two tables are emitted as one raw HTML output.
display_html(
    df1_styler._repr_html_() + df2_styler._repr_html_(),
    raw=True,
)
# Import libraries
import numpy as np
import pandas as pd
# Define the test variables
# a = linear space between 0 and 50 + normally distributed noise (mu=0, s2=1)
# b = linear space between -50 and 50 + normally distributed noise (mu=0, s2=1)
# c = a^2
# d = sin(a)
# e = b^3
# Install system packages first, then the Python packages for Apache Airflow.
sudo apt update
sudo apt install -y git
sudo apt install -y python3-pip
# Extras are quoted so the shell never treats the brackets as a glob pattern.
sudo pip install 'apache-airflow[crypto]'
sudo pip install 'apache-airflow[postgres]'
# Fixed: "sudo-H" is not a command; -H is a sudo option and needs a separating space.
sudo -H pip install six==1.10.0
sudo pip install --upgrade six
sudo pip install markupsafe
sudo pip install --upgrade MarkupSafe
sudo pip install SQLAlchemy==1.3.15
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import mlflow
import mlflow.spark
# Load the training dataset from Parquet on S3 (the bucket name is a placeholder).
completa_metricas = spark.read.parquet("s3://<s3-bucket>/movielens-parquet/training/")
# Start from every column name in the frame...
columnas_input = list(completa_metricas.columns)
# ...and drop 'promedio_rating'; presumably it is the prediction target — confirm.
# list.remove raises ValueError if that column is absent.
columnas_input.remove('promedio_rating')
import pyspark.sql.functions as f
categorias = ['action','adaptation','adventure','apocalypse','artistic',\
'assassination','based on a true story','biblical','blood','brutal','biographical','bollywood','boring',\
'cars','cerebral','classic','censorship','comedy','computers','confusing',\
'cooking','comic','cartoon','court','crime','cult','dark','death','disaster','documentary','drama',
'depressing','drugs','environment','erotic','fantasy','fighting','football','freedom',\
'friendship','genius','god','gothic','high school','historical','hollywood','horror',\
'humor','homosexuality','holiday','independent film','kids','love','magic','marriage',\
'military','murder','musical','nature','nostalgia','nudity','olympics','original','oscar',\
// Read the genome-scores CSV from S3, using the header row and schema inference.
val genome_scores_df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferschema", "true")
  .load("s3://<s3-bucket>/genome-scores.csv"))

// Write the frame as Parquet, overwriting any existing output at the target path.
(genome_scores_df.write
  .mode("overwrite")
  .parquet("s3://<s3-bucket>/movielens-parquet/genome-scores/"))
import airflowlib.emr_lib as emr
import os
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
default_args = {
'owner': 'airflow',
'depends_on_past': False,
#!/bin/bash
# Install MLflow pinned to 1.6.0 and boto3, upgrading any copies already present.
sudo pip install --upgrade mlflow==1.6.0
sudo pip install --upgrade boto3
AWSTemplateFormatVersion: '2010-09-09'
Description: MLflow server backed by Postgres RDS
Parameters:
KeyName:
Description: Name of an existing EC2 KeyPair to enable SSH access into the Airflow web server
Type: AWS::EC2::KeyPair::KeyName
ConstraintDescription: Must be the name of an existing EC2 KeyPair
S3BucketNameAirflow:
Original Level1
russian [European]
english [European]
french [European]
spanish [European]
italian [European]
swedish [European]
canadian [American]
mexican [American]
colombian [American]