This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import numpy as np | |
#import pandas as pd | |
#from tqdm import tqdm | |
#import cv2 | |
#from src.commons import utils | |
#from constants import | |
def resize_images(data, size=None): | |
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import pandas as pd | |
#from constants import * | |
#from src.commons import utils | |
#import matplotlib.pyplot as plt | |
#import numpy as np | |
# Read the list of all paintings (scraped data) | |
data = pd.read_csv('{}/data.csv'.format(DATA_FOLDER)) | |
for year in data['year'].unique(): | |
full_data = pd.DataFrame() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val t0 = System.nanoTime()
// Declare the join only: this builds the execution plan, nothing runs yet
fact_table = fact_table.join(
  dimension_table,
  fact_table("dimension_id") === dimension_table("id")
)
// count() is an action, so this is the call that actually executes the plan
fact_table.count()
val t1 = System.nanoTime()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val t0 = System.nanoTime()
// Declare the join, wrapping dimension_table in broadcast(...) -- here's the magic!
fact_table = fact_table.join(
  broadcast(dimension_table),
  fact_table("dimension_id") === dimension_table("id")
)
// count() is an action, so this is the call that actually executes the plan
fact_table.count()
val t1 = System.nanoTime()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val t0 = System.nanoTime()
// Create the Execution Plan: broadcast join against dimension_table2.
// The right-hand key has to come from dimension_table2, the table being joined;
// assumes dimension_table2 exposes its key as "id" -- TODO confirm
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table2.col("id"))
// Perform an action to run the execution
fact_table.count()
val t1 = System.nanoTime()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val t0 = System.nanoTime()
// Do repartitioning: 200 partitions, partitioned by uniformly_distributed_column
fact_table = fact_table.repartition(200, col("uniformly_distributed_column"))
// Create the Execution Plan: broadcast join against dimension_table2.
// The right-hand key has to come from dimension_table2, the table being joined;
// assumes dimension_table2 exposes its key as "id" -- TODO confirm
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table2.col("id"))
// Perform an action to run the execution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Create a dummy column holding a number from 0 to 999 (a generated id modulo 1000),
// then append it, separated by "-", to the dimension_2_id column
fact_table = fact_table
  .withColumn("dummy", monotonically_increasing_id() % 1000)
  .withColumn("dimension_2_id_suffix", concat(col("dimension_2_id"), lit("-"), col("dummy")))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Handle on Scala's random number generator
val r = scala.util.Random
// Build the "population": one List(dummy_key, suffix) per integer from 0 to 1000
// inclusive, where dummy_key is always 1
val population: List[List[Int]] =
  (0 to 1000).toList.map(suffix => List(1, suffix))
// Turn every pair into a row of a two-column DataFrame
val df = population
  .map(pair => (pair(0), pair(1)))
  .toDF("dummy_key", "suffix")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val t0 = System.nanoTime()
// Create the Execution Plan: join on the suffixed keys, with no explicit broadcast hint.
// The right-hand key has to come from dimension_table2, the table being joined;
// assumes dimension_table2 carries the "id_suffix" column -- TODO confirm
fact_table = fact_table.join(dimension_table2,
  fact_table.col("dimension_2_id_suffix") === dimension_table2.col("id_suffix"))
// Perform an action to run the execution
fact_table.count()
val t1 = System.nanoTime()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Setting the threshold to -1 prevents automatic broadcasting: dimension_table2
// is very small, so my configuration would otherwise broadcast it
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
// I'm using caching to simplify the DAG; count() is run right after cache()
dimension_table2.cache().count()
fact_table = fact_table.repartition(400)
OlderNewer