aialenti’s gists

aialenti / block1.py

Created February 14, 2019 22:07

	#import numpy as np
	#import pandas as pd
	#from tqdm import tqdm
	#import cv2
	#from src.commons import utils
	#from constants import


	def resize_images(data, size=None):
	'''

aialenti / block.py

Created February 14, 2019 23:26

	#import pandas as pd
	#from constants import *
	#from src.commons import utils
	#import matplotlib.pyplot as plt
	#import numpy as np

	# Read the list of all paintings (scraped data)
	data = pd.read_csv('{}/data.csv'.format(DATA_FOLDER))
	for year in data['year'].unique():
	full_data = pd.DataFrame()

aialenti / count_rows.scala

Last active October 27, 2019 22:56

	// Start of the timed section (System.nanoTime is in nanoseconds)
	val t0 = System.nanoTime()

	// Create the Execution Plan
	// Join fact_table with dimension_table where fact_table.dimension_id equals
	// dimension_table.id; the joined result is assigned back to fact_table.
	fact_table = fact_table.join(dimension_table,
	fact_table.col("dimension_id") === dimension_table.col("id"))

	// Perform an action to run the execution
	fact_table.count

	// End of the timed section; (t1 - t0) is the elapsed time in nanoseconds
	val t1 = System.nanoTime()

aialenti / broadcast.scala

Created October 27, 2019 22:59

	// Start of the timed section (System.nanoTime is in nanoseconds)
	val t0 = System.nanoTime()

	// Create the Execution Plan
	// Same join condition as a plain join on dimension_id == id, but dimension_table
	// is wrapped in broadcast(...) before being passed to join.
	fact_table = fact_table.join(broadcast(dimension_table), // Here's the magic!
	fact_table.col("dimension_id") === dimension_table.col("id"))

	// Perform an action to run the execution
	fact_table.count

	// End of the timed section; (t1 - t0) is the elapsed time in nanoseconds
	val t1 = System.nanoTime()

aialenti / file.scala

Created October 28, 2019 00:08

	// Start of the timed section (System.nanoTime is in nanoseconds)
	val t0 = System.nanoTime()

	// Create the Execution Plan
	// Join fact_table with a broadcast-wrapped dimension_table2 on dimension_2_id.
	// NOTE(review): the table being joined is dimension_table2, but the condition
	// reads the "id" column from dimension_table (no "2") -- confirm this is
	// intentional and not meant to be dimension_table2.col("id").
	fact_table = fact_table.join(broadcast(dimension_table2),
	fact_table.col("dimension_2_id") === dimension_table.col("id"))

	// Perform an action to run the execution
	fact_table.count

	// End of the timed section; (t1 - t0) is the elapsed time in nanoseconds
	val t1 = System.nanoTime()

aialenti / snippet.scala

Created October 28, 2019 00:28

	// Start of the timed section (System.nanoTime is in nanoseconds)
	val t0 = System.nanoTime()

	// Do repartitioning
	// Redistribute fact_table into 200 partitions keyed on uniformly_distributed_column
	fact_table = fact_table.repartition(200, col("uniformly_distributed_column"))

	// Create the Execution Plan
	// Join fact_table with a broadcast-wrapped dimension_table2 on dimension_2_id.
	// NOTE(review): the table being joined is dimension_table2, but the condition
	// reads the "id" column from dimension_table (no "2") -- confirm this is
	// intentional and not meant to be dimension_table2.col("id").
	fact_table = fact_table.join(broadcast(dimension_table2),
	fact_table.col("dimension_2_id") === dimension_table.col("id"))

	// Perform an action to run the execution

aialenti / gist:20650adea8347e4459478f5bf651b436

Created October 28, 2019 01:07

	// Create a dummy column equal to monotonically_increasing_id modulo 1000
	// (i.e. a number from 0 to 999), then build dimension_2_id_suffix by
	// concatenating dimension_2_id, "-" and that dummy value
	fact_table = fact_table.withColumn("dummy", monotonically_increasing_id % 1000).
	withColumn("dimension_2_id_suffix",concat(col("dimension_2_id"),lit("-"), col("dummy")))

aialenti / gist:27a0853a69f56057c6d6ed94df9be2a7

Last active June 21, 2020 11:28

	// Scala random numbers generator
	// (not used in the lines shown here)
	val r = scala.util.Random

	// Create a "population" dataset with the numbers between 0 and 1000
	// (inclusive, so 1001 rows). Each row is List(d, i) where d is always 1
	// (the inner generator iterates over the single value 1) and i is the number.
	val population:List[List[Int]] = for {
	i <- (0 to 1000).toList
	d <- 1 to 1
	} yield List(d,i)
	// Turn each two-element list into a (dummy_key, suffix) tuple and build a
	// DataFrame with those two column names
	val df = population.map(x =>(x(0), x(1))).toDF(Seq("dummy_key","suffix"):_*)

aialenti / gist:dddf4189c0db4694e77ae216cb8c757f

Last active June 21, 2020 11:32

	// Start of the timed section (System.nanoTime is in nanoseconds)
	val t0 = System.nanoTime()

	// Create the Execution Plan
	// Join fact_table with dimension_table2 on the suffixed key columns
	// (dimension_2_id_suffix on the fact side, id_suffix on the dimension side).
	// NOTE(review): the table being joined is dimension_table2, but the condition
	// reads "id_suffix" from dimension_table (no "2") -- confirm this is
	// intentional and not meant to be dimension_table2.col("id_suffix").
	fact_table = fact_table.join(dimension_table2,
	fact_table.col("dimension_2_id_suffix") === dimension_table.col("id_suffix"))

	// Perform an action to run the execution
	fact_table.count

	// End of the timed section; (t1 - t0) is the elapsed time in nanoseconds
	val t1 = System.nanoTime()

aialenti / file.scala

Last active December 7, 2019 15:41

	// The following line prevents automatic broadcasting: dimension_table2
	// is very small, and with my configuration Spark would otherwise broadcast it.
	// Setting spark.sql.autoBroadcastJoinThreshold to -1 turns that behaviour off.
	spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)

	// I'm using caching to simplify the DAG:
	// mark dimension_table2 for caching, then run count as an action on it
	// (presumably so the cache is populated before the join -- confirm)
	dimension_table2.cache
	dimension_table2.count

	// Redistribute fact_table into 400 partitions
	fact_table = fact_table.repartition(400)