Created
June 24, 2016 20:31
-
-
Save jongwook/79bfd00f34f5f525ac9e40ab5b2a3808 to your computer and use it in GitHub Desktop.
Script to build Apache Spark from a release tag or branch against a specified CDH (Cloudera Hadoop) version.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Build Apache Spark from a tag/branch against a specific CDH Hadoop version.
#
# Jenkins build parameters.  Environment values take precedence over the
# hard-coded defaults so that Jenkins-injected parameters are not clobbered.
TAG=${TAG:-v2.0.0-rc1}       # release tag to build; empty -> build HEAD of $BRANCH
SNAPSHOT=${SNAPSHOT:-false}  # "true" -> append -SNAPSHOT to the Maven version
BRANCH=${BRANCH:-branch-2.0} # branch used when TAG is empty
CDH=${CDH:-5.7.1}            # target CDH release
DEPLOY=${DEPLOY:-false}      # "true" -> mvn install/deploy to the internal Nexus

# should abort when any command fails
set -e
# The jenkins script below.
# Work in a throwaway workspace so repeated builds start from a clean slate.
WORKSPACE=/tmp/spark-build
rm -rf "$WORKSPACE"
mkdir -p "$WORKSPACE"
cd "$WORKSPACE"

# Prefer the JDK selected via JAVA_HOME, when one is configured.
# (The original prepended "$JAVA_HOME/bin" unconditionally, which put a bogus
# "/bin" at the front of PATH whenever JAVA_HOME was empty or unset.)
if [[ -n "${JAVA_HOME:-}" ]]; then
  export PATH=$JAVA_HOME/bin:$PATH
fi

# proxy settings for kakao — query the FQDN once instead of twice
fqdn=$(hostname -f)
if [[ "$fqdn" == *"iwilab"* || "$fqdn" == *"akao"* ]]; then
  export http_proxy=http://proxy.iwilab.com:8082
  export https_proxy=http://proxy.iwilab.com:8082
  export no_proxy="localhost,127.0.0.1,.iwilab.com"
fi
# Fetch the Spark sources into the (freshly emptied) workspace.
git init .
git config remote.origin.url https://github.com/apache/spark
# Quote the refspec so its '*' can never be expanded by a stray glob match.
git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'
git fetch --tags
rm -rf -- *   # drop any stale checkout; the .git directory is kept

if [[ -z "$TAG" ]]; then
  # No explicit tag: build the tip of $BRANCH and mark it as a snapshot.
  git checkout "$BRANCH"
  git reset --hard
  git pull
  TAG=$(git describe --tags)
  echo "TAG=$TAG"
  SNAPSHOT=true
else
  # Build the requested tag exactly; normalize TAG via git-describe.
  git reset "$TAG" --hard
  TAG=$(git describe --tags)
fi
# Derive the Maven version string, e.g. "2.0.0-cdh5.7.1[-SNAPSHOT]".
# ${TAG#v} strips only a LEADING "v" (the original ${TAG/v/} removed the
# first "v" found anywhere in the tag).
VERSION=${TAG#v}-cdh$CDH
echo "SNAPSHOT=$SNAPSHOT"
if [[ "$SNAPSHOT" == "true" ]]; then
  VERSION=$VERSION-SNAPSHOT
fi
echo "VERSION=$VERSION"

# Collect every pom.xml one or two directory levels deep (plus the root pom),
# replacing the fragile ls-based scan with a single find.
poms="pom.xml $(find . -mindepth 2 -maxdepth 3 -name pom.xml 2> /dev/null | sed 's|^\./||')"
# ---------------------------------------------------------------------------
# Generate patch.py: an inline Python (lxml) script, invoked later as
# "./patch.py <pom.xml> <version>", that rewrites one Maven pom in place:
#   * sets the module <version> (and the spark-parent parent version) to the
#     CDH-suffixed version string,
#   * re-parents the apache root pom onto Cloudera's cdh-root pom,
#   * replaces <distributionManagement> with the internal Nexus repos and
#     adds the Cloudera artifact repository,
#   * removes test-scoped dependencies (re-adding jline, which hive-beeline
#     needs at runtime),
#   * strips any <shadeTestJar> plugin configuration,
#   * pins version <properties> to the CDH-managed "${cdh.*}" placeholders
#     (those are Maven property references, NOT shell expansions),
#   * in the root pom only, pins the datanucleus versions in
#     <dependencyManagement> and drops the scalastyle plugin.
# The script body below is single-quoted, so the shell expands NOTHING inside
# it except the deliberate "'$CDH'" quote-splice, which closes the quote,
# injects the shell's $CDH value, and reopens it.
# NOTE(review): the generated script needs lxml at build time and appears to
# assume every patched pom has a <dependencies> element — confirm before
# reusing it on other projects.
# ---------------------------------------------------------------------------
echo '#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from lxml import etree
nexus = """
<distributionManagement>
    <repository>
        <id>dk-aa-release</id>
        <name>DaumKakao AA Releases</name>
        <url>http://maven.daumcorp.com/content/repositories/dk-aa-release/</url>
    </repository>
    <snapshotRepository>
        <id>dk-aa-snapshots</id>
        <name>Daumkakao AA Snapshots</name>
        <url>http://maven.daumcorp.com/content/repositories/dk-aa-snapshots/</url>
    </snapshotRepository>
</distributionManagement>
"""
cloudera = """
<repository>
    <id>cloudera</id>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
"""
filename = sys.argv[1]
tag = sys.argv[2]
with open(filename) as input:
    pom = etree.XML(input.read().encode("utf-8"))

def childtags(xml, tag):
    return [el for el in xml.iterchildren() if type(el.tag) is str and el.tag.split("}")[-1] == tag]

def childtag(xml, tag):
    try:
        return childtags(xml, tag)[0]
    except:
        return None

# update package versions
version = childtag(pom, "version")
if version is not None:
    version.text = tag

# update the parent pom version
parent = childtag(pom, "parent")
if parent is not None:
    group = childtag(parent, "groupId")
    artifact = childtag(parent, "artifactId")
    version = childtag(parent, "version")
    if group.text == "org.apache" and artifact.text == "apache":
        group.text = "com.cloudera.cdh"
        artifact.text = "cdh-root"
        version.text = "'$CDH'"
    elif group.text == "org.apache.spark" and artifact.text == "spark-parent_2.11":
        version.text = tag
    if artifact.text == "cdh-root":
        for elem in childtags(pom, "distributionManagement"):
            pom.remove(elem)
        pom.append(etree.XML(nexus))
        repos = childtag(pom, "repositories")
        repos.append(etree.XML(cloudera))

# remove test-only dependencies
dependencies = childtag(pom, "dependencies")
for dependency in childtags(dependencies, "dependency"):
    scope = childtag(dependency, "scope")
    if scope is not None and scope.text == "test":
        dependencies.remove(dependency)
    group = childtag(dependency, "groupId")
    artifact = childtag(dependency, "artifactId")
    if artifact.text == "hive-beeline":
        dependencies.append(etree.XML("""
            <dependency>
                <groupId>jline</groupId>
                <artifactId>jline</artifactId>
                <version>2.12</version>
            </dependency>
        """))

# remove shadeTestJar configuration
build = childtag(pom, "build")
if build is not None:
    plugins = childtag(build, "plugins")
    if plugins is not None:
        for plugin in childtags(plugins, "plugin"):
            configuration = childtag(plugin, "configuration")
            if configuration is not None:
                shade = childtag(configuration, "shadeTestJar")
                if shade is not None:
                    configuration.remove(shade)

# edit properties to follow CDH versions
properties = childtag(pom, "properties")
if properties is not None:
    values = {
        "slf4j.version": "${cdh.slf4j.version}",
        "hadoop.version": "${cdh.hadoop.version}",
        "protobuf.version": "${cdh.protobuf.version}",
        "hbase.version": "${cdh.hbase.version}",
        "hbase.artifact": "hbase-server",
        "flume.version": "${cdh.flume-ng.version}",
        "zookeeper.version": "${cdh.zookeeper.version}",
        "hive.group": "org.spark-project.hive",
        "hive.version": "1.2.1.spark2",
        "hive.version.short": "1.2.1",
        "jline.version": "${scala.version}",
        "derby.version": "10.11.1.1",
        "parquet.version": "1.8.1",
        "avro.version": "${cdh.avro.version}",
        "avro.mapred.classifier": "hadoop2",
        "jets3t.version": "${cdh.jets3t.version}",
        "codehaus.jackson.version": "${cdh.jackson.version}",
        "fasterxml.jackson.version": "2.6.5",
        "snappy.version": "${cdh.hadoop-snappy.version}"
    }
    for name, value in values.items():
        property = childtag(properties, name)
        if property is not None:
            property.text = value

# fix datanucleus versions
if filename == "pom.xml":
    management = childtag(pom, "dependencyManagement")
    dependencies = childtag(management, "dependencies")
    dependencies.append(etree.XML("""
        <dependency>
            <groupId>org.datanucleus</groupId>
            <artifactId>datanucleus-rdbms</artifactId>
            <version>3.2.9</version>
        </dependency>
    """))
    dependencies.append(etree.XML("""
        <dependency>
            <groupId>org.datanucleus</groupId>
            <artifactId>datanucleus-api-jdo</artifactId>
            <version>3.2.6</version>
        </dependency>
    """))

# remove scalastyle
build = childtag(pom, "build")
if build is not None:
    plugins = childtag(build, "plugins")
    if plugins is not None:
        for plugin in childtags(plugins, "plugin"):
            artifact = childtag(plugin, "artifactId")
            if artifact is not None and artifact.text == "scalastyle-maven-plugin":
                plugins.remove(plugin)

with open(filename, "w") as output:
    output.write(etree.tostring(pom).decode("utf-8"))
    output.write("\n")
' > patch.py
# Make the generated patcher directly executable for the loop below.
chmod +x patch.py
# Apply patch.py to every collected pom, rewriting it in place.
for pom in $poms; do
  echo "Patching $pom..."
  ./patch.py "$pom" "$VERSION"
done

# Prefer GNU sed when available (macOS ships BSD sed, whose -i flag differs).
# `command -v` is the portable replacement for `which`.
if command -v gsed > /dev/null; then
  SED=gsed
else
  SED=sed
fi

# Keep the full application name in the Spark UI: rewrite the declaration of
# shortAppName to "val shortAppName = appName //", which comments out the
# original truncating right-hand side.
$SED -i "s/val shortAppName/val shortAppName = appName \/\//g" core/src/main/scala/org/apache/spark/ui/UIUtils.scala
# Write the default configuration files that get shipped inside the
# distribution tarball.  Quoted heredoc delimiters ('EOF') prevent any shell
# expansion, matching the original single-quoted echo behavior.
cat > conf/spark-defaults.conf <<'EOF'

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs:///user/spark/applicationHistory
spark.driver.memory 4g
spark.executor.memory 4g
spark.executor.instances 4
spark.executor.extraJavaOptions -XX:+PrintGCDetails
#spark.yarn.historyServer.address http://raccon015.kr2.iwilab.com:18088

EOF

cat > conf/spark-env.sh <<'EOF'

#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
export HADOOP_CONF_DIR=/etc/hive/conf

EOF

# BUGFIX: log4j.rootCategory was assigned twice (INFO at the top, WARN at the
# bottom); log4j takes the last value, so only WARN ever took effect.  The
# dead INFO assignment has been dropped and the effective WARN level kept.
cat > conf/log4j.properties <<'EOF'

# Set everything to be logged to the console
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

EOF
# Pick the Hadoop artifact version and Maven profile matching the CDH line.
if [[ $CDH == 5.3.* ]]; then
  HADOOP=2.5.0-cdh$CDH
  HADOOP_PROFILE=hadoop-2.4
else
  HADOOP=2.6.0-cdh$CDH
  HADOOP_PROFILE=hadoop-2.6
fi

export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"

# Build the binary distribution tarball.
./dev/make-distribution.sh --name 2.11 --tgz -Psparkr -Phive -Pyarn -Dhadoop.version=$HADOOP -P$HADOOP_PROFILE -DskipTests -Dmaven.test.skip=true -Dscala-2.11

if [[ $DEPLOY == "true" ]]; then
  # BUGFIX: the original install/deploy invocations hard-coded -Phadoop-2.4
  # and passed the computed profile as a meaningless -D define; activate
  # -P$HADOOP_PROFILE instead, matching the distribution build above.
  build/mvn install -Phive -Pyarn -Dhadoop.version=$HADOOP -P$HADOOP_PROFILE -Dmaven.test.skip=true -Dscala-2.11
  build/mvn deploy -Phive -Pyarn -Dhadoop.version=$HADOOP -P$HADOOP_PROFILE -Dmaven.test.skip=true -Dscala-2.11
fi
# Copy the default configuration files into each distribution tarball, then
# publish the re-gzipped archive one directory up.
for tgz in *.tgz; do
  # Guard against an unmatched glob: without nullglob, "*.tgz" stays literal
  # and the original loop body would run gunzip on a nonexistent file and
  # `mkdir -p */conf` would create a directory literally named "*".
  [[ -e "$tgz" ]] || continue
  name=${tgz%.tgz}   # e.g. spark-2.0.0-cdh5.7.1-bin-2.11
  tar=$name.tar      # gunzip turns foo.tgz into foo.tar
  # (the unused "assembly" variable from the original was removed)
  gunzip "$tgz"
  mkdir -p "$name/conf"
  cp conf/spark-defaults.conf conf/spark-env.sh "$name/conf/"
  tar -rf "$tar" "$name/conf/spark-defaults.conf" "$name/conf/spark-env.sh"
  gzip "$tar"
  mv "$tar.gz" "../$tgz"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment