This demonstrates Spark Job, Stage and Task listeners
1) Start spark-shell
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\
      /_/
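The listener code itself is not shown in these notes. As a Python-side sketch (PySpark does not expose the JVM SparkListener API directly), the StatusTracker can poll job, stage and task state after running a query; the job group name below is illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

sc.setJobGroup("demo-group", "listener demo")            # tag the jobs so we can find them
spark.range(10_000_000).selectExpr("sum(id)").collect()  # run something to observe

tracker = sc.statusTracker()
for job_id in tracker.getJobIdsForGroup("demo-group"):
    job = tracker.getJobInfo(job_id)
    print(job_id, job.status, job.stageIds)
    for stage_id in job.stageIds:
        stage = tracker.getStageInfo(stage_id)
        if stage:
            print("  stage", stage_id, stage.name, stage.numTasks, stage.numCompletedTasks)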
// Chess: analysing games with Stockfish and python-chess
$ sudo apt install stockfish
$ sudo apt upgrade stockfish
$ pip install chess

import chess.pgn
import chess.engine

# Load the PGN file
pgn = open("./file.pgn")
# Start Stockfish as a UCI engine (assumes the binary is on the PATH)
engine = chess.engine.SimpleEngine.popen_uci("stockfish")
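A minimal sketch of analysing the loaded game, assuming ./file.pgn holds at least one game and Stockfish is on the PATH:

import chess.pgn
import chess.engine

pgn = open("./file.pgn")
game = chess.pgn.read_game(pgn)                   # first game in the file

engine = chess.engine.SimpleEngine.popen_uci("stockfish")
board = game.board()
for move in game.mainline_moves():
    board.push(move)                              # replay to the final position
info = engine.analyse(board, chess.engine.Limit(time=0.1))
print(info["score"])                              # engine evaluation of the final position
engine.quit()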
// Window frames: rowsBetween / rangeBetween
Both framing functions, rowsBetween and rangeBetween, take two arguments, the start and the end of the frame, which can be specified as follows:
- Window.unboundedPreceding, Window.unboundedFollowing — the entire window, from the beginning to the end
- Window.unboundedPreceding, Window.currentRow — from the beginning of the window to the current row; this is used for the cumulative sum
- numerical values, for example 0 means the current row, but the meaning of other values differs between the framing functions: rowsBetween counts row offsets, rangeBetween counts offsets in the ordering value.
df.withColumn('activity_sum', sum('activity').over(w))
Illustration: https://miro.medium.com/max/1400/1*WYO-zRP1SlrzGqT4S_5Jvw.webp
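The snippet above references an undefined window w. A minimal runnable sketch of the cumulative sum, with the column names user and activity assumed for illustration:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import sum

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["user", "activity"])

# Frame from the start of the partition to the current row => running total
w = (Window.partitionBy("user")
           .orderBy("activity")
           .rowsBetween(Window.unboundedPreceding, Window.currentRow))
df.withColumn("activity_sum", sum("activity").over(w)).show()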
// Hadoop
Download winutils.exe and hadoop.dll: https://github.com/kontext-tech/winutils
Put them inside the folder C:\hadoop\bin.
Add the environment variable HADOOP_HOME (and, for the JVM, the system property hadoop.home.dir) with the value C:\hadoop.
Add %HADOOP_HOME%\bin to the PATH.
Copy hadoop.dll to C:\Windows\System32.
Make sure your JVM is 64-bit.
// Spark
Download Spark from https://spark.apache.org/downloads.html
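A minimal smoke test for the setup above, assuming the paths from the previous steps:

import os
os.environ.setdefault("HADOOP_HOME", r"C:\hadoop")        # as configured above
os.environ["PATH"] += os.pathsep + r"C:\hadoop\bin"

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("setup-smoke-test").getOrCreate()
spark.range(5).show()                                     # should print ids 0..4
spark.stop()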
// In case of FetchFailedException or MetadataFetchFailedException (and to avoid issues with BroadcastNestedLoopJoin), increase the memory overhead:
- spark.executor.memoryOverhead=1g
- spark.kubernetes.memoryOverheadFactor=0.2
// Avoid skew (Spark >= 3.0; requires spark.sql.adaptive.enabled=true)
spark.sql.adaptive.skewJoin.enabled=true
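A minimal sketch of setting these options when building a session (values are illustrative):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .config("spark.sql.adaptive.enabled", "true")            # enable AQE
         .config("spark.sql.adaptive.skewJoin.enabled", "true")   # split skewed partitions
         .config("spark.executor.memoryOverhead", "1g")           # must be set before launch
         .getOrCreate())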
// When the join key is skewed, salting spreads the hot keys across partitions:
def saltedJoin(df: DataFrame, buildDf: DataFrame, joinExpression: Column, joinType: String, salt: Int): DataFrame = {
  import org.apache.spark.sql.functions._
  // Replicate every build-side row once per salt value 0..salt-1
  val tmpDf = buildDf.withColumn("slt_range", array(Range(0, salt).toList.map(lit): _*))
  val tableDf = tmpDf.withColumn("slt_ratio_s", explode(tmpDf("slt_range"))).drop("slt_range")
  // Assign each stream-side row a pseudo-random salt in 0..salt-1
  val streamDf = df.withColumn("slt_ratio", monotonically_increasing_id() % salt)
  // Join on the original expression plus the matching salt, then drop the helper columns
  val saltedExpr = streamDf("slt_ratio") === tableDf("slt_ratio_s") && joinExpression
  streamDf.join(tableDf, saltedExpr, joinType).drop("slt_ratio_s").drop("slt_ratio")
}
// Usage (illustrative): saltedJoin(bigDf, smallDf, bigDf("id") === smallDf("id"), "inner", 8)
// Cucumber DataTable to Spark DataFrame
import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
import io.cucumber.datatable.DataTable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
def dataTableToDataframe(table: DataTable): DataFrame = {
  import sparkSession.implicits._  // a SparkSession named sparkSession is assumed to be in scope
  // The first row of the table holds the column names
  val columns: Seq[String] = table.cells().head.toSeq
  val data = table.cells().drop(1).toSeq.map(r => r.toList)
  // Each row becomes one array column named "value"; split it into named columns
  data.toDF().select(columns.indices.map(i => col("value")(i).alias(columns(i))): _*)
}
// styles.scss
@import '~@angular/material/prebuilt-themes/indigo-pink.css';
// HTML (displayedColumns, e.g. ['id'], is defined in the component)
<mat-table class="lessons-table mat-elevation-z8" [dataSource]="dataSource">
  <ng-container matColumnDef="id">
    <mat-header-cell *matHeaderCellDef>#</mat-header-cell>
    <mat-cell *matCellDef="let customer">{{customer.id}}</mat-cell>
  </ng-container>
  <mat-header-row *matHeaderRowDef="displayedColumns"></mat-header-row>
  <mat-row *matRowDef="let row; columns: displayedColumns"></mat-row>
</mat-table>
# OpenCV: HOG people detection on screen captures
import numpy as np
from PIL import ImageGrab
import cv2

def draw_detections(img, rects, thickness=1):
    # HOG returns generous boxes; shrink them a little before drawing
    for x, y, w, h in rects:
        pad_w, pad_h = int(0.15 * w), int(0.05 * h)
        cv2.rectangle(img, (x + pad_w, y + pad_h), (x + w - pad_w, y + h - pad_h), (0, 255, 0), thickness)

hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())  # built-in people detector
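A minimal sketch of the capture-and-detect loop this sets up (press q to stop):

while True:
    frame = np.array(ImageGrab.grab())                 # RGB screenshot as ndarray
    img = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)       # OpenCV expects BGR
    found, _ = hog.detectMultiScale(img, winStride=(8, 8), padding=(32, 32), scale=1.05)
    draw_detections(img, found)
    cv2.imshow("detections", img)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
cv2.destroyAllWindows()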
# scikit-learn: preprocessing and linear regression
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing in 0.22
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
# Importing the dataset
dataset = pd.read_csv('train.csv')
df = dataset  # read_csv already returns a DataFrame, no need to wrap it
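A minimal sketch of how these imports fit together, assuming the last column of train.csv is the target and the features are numeric:

X = df.iloc[:, :-1].select_dtypes(include=[np.number])   # numeric feature columns
y = df.iloc[:, -1]                                       # assumed target column
X = SimpleImputer(strategy="mean").fit_transform(X)      # fill NaNs with column means
model = LinearRegression().fit(X, y)
print(model.score(X, y))                                 # R^2 on the training data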