On each node:
Set up packages and install Hadoop:
#!/bin/bash
sudo yum install java-1.8.0-openjdk-devel wget git bzip2 -y
echo 'export JAVA_HOME=/usr/lib/jvm/java' >> ~/.bashrc
source ~/.bashrc
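The snippet above only installs the Java prerequisites. A minimal sketch of the Hadoop download-and-unpack step the heading refers to might look like the following (the version number and mirror URL are assumptions, not from the original notes):
HADOOP_VERSION=2.7.3  # assumed version; match your cluster
wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
sudo tar -xzf hadoop-${HADOOP_VERSION}.tar.gz -C /usr/lib/
echo "export HADOOP_HOME=/usr/lib/hadoop-${HADOOP_VERSION}" >> ~/.bashrc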
package com.github.geofbot;

import org.apache.commons.cli.*;
import org.apache.commons.collections.CollectionUtils;
import org.apache.flink.api.common.functions.*;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.operators.IterativeDataSet;
#!/bin/bash
# Runs after installation of included Flink
set -e
cd ~
sudo rm /usr/lib/flink/lib/flink-*  # make sure we don't have two versions of jars
sudo tar -xzf flinkerations-emr.tgz -C /usr/lib/
rm flinkerations-emr.tgz
# Copy over EMRFS jars to Flink lib path
sudo cp /usr/share/aws/emr/s3-dist-cp/lib/*.jar /usr/lib/flink/lib/
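After the script runs, a quick sanity check (not part of the original script) is to list the jars Flink will load and confirm that only one Flink version remains alongside the copied EMRFS jars:
ls /usr/lib/flink/lib/*.jar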
# Barebones test file to check for issues
import math

from flink.functions.Aggregation import Sum
from flink.functions.GroupReduceFunction import GroupReduceFunction
from flink.plan.Environment import get_environment


class NormalizeVectorGroupReducer(GroupReduceFunction):
    """
[Unit]
Description=Apache Flink

[Service]
Type=forking
User=ec2-user
ExecStart=/home/ec2-user/flink/bin/start-cluster.sh
ExecStop=-/home/ec2-user/flink/bin/stop-cluster.sh
Restart=always
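Assuming the unit above is saved as /etc/systemd/system/flink.service (the file name is an assumption), it can be registered and started with:
sudo systemctl daemon-reload
sudo systemctl enable flink
sudo systemctl start flink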
# modified from http://www.willmcginnis.com/2015/11/08/getting-started-with-python-and-apache-flink/
from flink.plan.Environment import get_environment
from flink.plan.Constants import INT, STRING, WriteMode
from flink.functions.GroupReduceFunction import GroupReduceFunction


class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))
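A plan like this is submitted through Flink's Python driver rather than the plain python interpreter. The exact script name and path depend on the Flink release (pyflink.sh here is an assumption; some releases ship pyflink2.sh/pyflink3.sh instead):
/usr/lib/flink/bin/pyflink.sh wordcount.py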
package org.apache.flink;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple1;
NOTE: HDFS is required for Flink's DistributedCache, which distributes Python plans to the worker nodes. We use BlueData Hadoop CDH nodes.
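Before submitting plans, it is worth confirming that HDFS is reachable from the node (an illustrative check, not from the original notes):
hdfs dfs -ls /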
Make sure you aren't calling env.execute(local=True) in your Python plans; call env.execute() so the plan actually runs on the cluster.
On the master node:
Install git and other useful things that we like:
sudo yum install git bzip2 -y