Attention: this is the key used to sign certificate requests. Anyone holding it can sign certificates on your behalf, so keep it in a safe place!
openssl genrsa -des3 -out rootCA.key 4096
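From this key you would typically issue a self-signed root certificate next; a minimal sketch (the output file name and validity period are illustrative, not from the source):

openssl req -x509 -new -nodes -key rootCA.key -sha256 -days 1024 -out rootCA.pem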
# Concurrent file downloader -- a minimal completion of the original snippet,
# assuming 'urls' is a text file with one URL per line.
import concurrent.futures
import os

import requests
from tqdm import tqdm

URLS = 'urls'

def download(url):
    # Save each URL under its last path segment in the current directory
    with open(os.path.basename(url), 'wb') as f:
        f.write(requests.get(url).content)

with open(URLS) as f:
    urls = [line.strip() for line in f if line.strip()]

# Fetch in parallel; tqdm shows progress as downloads complete
with concurrent.futures.ThreadPoolExecutor() as pool:
    list(tqdm(pool.map(download, urls), total=len(urls)))
FROM apache/zeppelin:0.8.0

# Workaround to "fix" https://issues.apache.org/jira/browse/ZEPPELIN-3586
RUN echo "$LOG_TAG Download Spark binary" && \
    wget -O /tmp/spark-2.3.1-bin-hadoop2.7.tgz http://apache.panu.it/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz && \
    tar -zxvf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
    rm -rf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
    mv spark-2.3.1-bin-hadoop2.7 /spark-2.3.1-bin-hadoop2.7
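For Zeppelin to actually use the downloaded distribution, the image plausibly also needs SPARK_HOME pointed at it; this line is an assumption, not part of the source snippet:

ENV SPARK_HOME /spark-2.3.1-bin-hadoop2.7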
// reference: https://stackoverflow.com/questions/36795680/copy-schema-from-one-dataframe-to-another-dataframe?rq=1
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import spark.implicits._

case class Person(Dummy: String, Name: String, Timestamp: String, Age: Int)

val personDF = spark.sparkContext.parallelize(
  Seq(Person("dummy", "Ray", "12345", 23), Person("dummy", "John", "12345", 44))).toDF()

val personSchema = StructType(Seq(
  StructField("Name", StringType, true),
  StructField("Age", IntegerType, true)))

// Build an empty DataFrame that carries only the target schema
var dataRDD = spark.sparkContext.emptyRDD[Row]
val emptyDF = spark.createDataFrame(dataRDD, personSchema)
FROM openjdk:8-jre-alpine

RUN mkdir -p /opt/app
WORKDIR /opt/app
COPY ./run_jar.sh ./app-assembly.jar ./
ENTRYPOINT ["./run_jar.sh"]
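The run_jar.sh wrapper itself is not shown; a minimal, hypothetical sketch of what it might contain (the JAVA_OPTS handling is an assumption):

#!/bin/sh
# Hypothetical launcher -- the real run_jar.sh is not part of the source
exec java $JAVA_OPTS -jar /opt/app/app-assembly.jar "$@"

Using exec keeps the JVM as PID 1 so the container's stop signals reach it directly.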
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
tag = HTML('''<script>
code_show=true;
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
}
$(document).ready(code_toggle);
</script>
<a href="javascript:code_toggle()">Toggle this cell's input</a>''')
display(tag)
#!/usr/bin/env python
# Stream raw mic audio over TCP -- a minimal completion of the original snippet;
# taking host and port from the CLI is an assumption, not from the source.
import pyaudio
import socket
import sys

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 4096

audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
sock = socket.create_connection((sys.argv[1], int(sys.argv[2])))
while True:
    sock.sendall(stream.read(CHUNK))
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.spnotes.spark</groupId>
    <artifactId>HelloSparkStreaming</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
Query the status of a driver submitted through the standalone cluster's REST endpoint (port 6066):

curl http://spark-cluster-ip:6066/v1/submissions/status/driver-20151008145126-0000
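The same REST interface also exposes a kill endpoint for a running driver; for example (the driver ID is the one returned at submission time):

curl -X POST http://spark-cluster-ip:6066/v1/submissions/kill/driver-20151008145126-0000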
Mostly taken from [3]
The RDD is how Spark simplifies complex operations like join or groupBy and hides the fact that under the hood, you're dealing with fragmented data.

The number of partitions is important because each task in a stage operates on one partition at a time (and loads that partition's data into memory). Consequently, if you have fewer partitions than available task slots, you will wind up under-utilizing your cluster. Furthermore, with fewer partitions there is more data in each partition, which increases the memory pressure on your program. On the flip side, with too many partitions your performance may degrade, as you take a greater hit from network and disk I/O.
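As a minimal, illustrative sketch (the input path and partition counts below are assumptions, not from the source), you can inspect and tune partitioning from the RDD API:

val rdd = spark.sparkContext.textFile("hdfs:///data/events", 16)  // ask for at least 16 partitions
println(rdd.getNumPartitions)                                     // how many partitions we actually got

val wider = rdd.repartition(64)   // full shuffle: more partitions, more parallel tasks
val narrower = wider.coalesce(8)  // reduce partition count without a full shuffle

repartition trades a shuffle for better parallelism, while coalesce merges existing partitions cheaply, which is why it is preferred when shrinking the partition count.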