On each node:
Set up packages and install Hadoop:
#!/bin/bash
sudo yum install java-1.8.0-openjdk-devel wget git bzip2 -y
echo export JAVA_HOME=/usr/lib/jvm/java >> ~/.bashrc
source ~/.bashrc
package com.github.geofbot;

import org.apache.commons.cli.*;
import org.apache.commons.collections.CollectionUtils;
import org.apache.flink.api.common.functions.*;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.operators.IterativeDataSet;
#!/bin/bash
# Runs after installation of included Flink
set -e
cd ~
sudo rm /usr/lib/flink/lib/flink-*  # make sure we don't have two versions of the jars
sudo tar -xzf flinkerations-emr.tgz -C /usr/lib/
rm flinkerations-emr.tgz
# Copy over EMRFS jars to Flink lib path
sudo cp /usr/share/aws/emr/s3-dist-cp/lib/*.jar /usr/lib/flink/lib/
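With the EMRFS jars in place, plans can point at s3:// URIs directly. A minimal sketch, assuming the old Python API's read_text and a hypothetical bucket:
from flink.plan.Environment import get_environment

env = get_environment()
# s3:// resolves through the EMRFS jars copied above; the bucket and key are hypothetical
data = env.read_text("s3://my-bucket/input/lines.txt")
data.output()
env.execute()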
# Barebones test file to check for issues
import math
from flink.functions.Aggregation import Sum
from flink.functions.GroupReduceFunction import GroupReduceFunction
from flink.plan.Environment import get_environment

class NormalizeVectorGroupReducer(GroupReduceFunction):
    """Normalizes each group's values to unit length (assumes (key, value) tuples)."""
    def reduce(self, iterator, collector):
        data = list(iterator)
        norm = math.sqrt(sum(value ** 2 for _, value in data)) or 1.0  # avoid divide-by-zero on all-zero groups
        for key, value in data:
            collector.collect((key, value / norm))
[Unit]
Description=Apache Flink

[Service]
Type=forking
User=ec2-user
ExecStart=/home/ec2-user/flink/bin/start-cluster.sh
ExecStop=-/home/ec2-user/flink/bin/stop-cluster.sh
Restart=always
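The unit as captured here has no [Install] section, so systemctl enable won't work. Assuming the file lives at /etc/systemd/system/flink.service, appending this (an assumed addition, not part of the original unit) lets it start at boot:
[Install]
WantedBy=multi-user.target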
# modified from http://www.willmcginnis.com/2015/11/08/getting-started-with-python-and-apache-flink/
from flink.plan.Environment import get_environment
from flink.plan.Constants import INT, STRING, WriteMode
from flink.functions.GroupReduceFunction import GroupReduceFunction

class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        # first element seeds the count, then add the counts of the rest of the group
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))
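The file continues roughly as in the linked tutorial; this reconstruction is a sketch (the sample sentence is a placeholder, and the (INT, STRING) type hints were required by the 0.x Python API but dropped in later versions):
env = get_environment()
data = env.from_elements("a flink a emr flink flink")

data \
    .flat_map(lambda x, c: [(1, word) for word in x.lower().split()], (INT, STRING)) \
    .group_by(1) \
    .reduce_group(Adder(), (INT, STRING), combinable=True) \
    .output()

env.execute()  # bare execute(): run on the cluster, not locally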
package org.apache.flink;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple1;
NOTE: HDFS is required for Flink's DistributedCache, which distributes Python plans to worker nodes. We use BlueData Hadoop CDH nodes.
Remember to make sure you aren't using env.execute(local=True) in your Python plans!
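That is, a plan should end with the bare form:
# local=True would force single-machine execution instead of using the cluster
env.execute()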
On the master node:
Install git and other useful things that we like:
sudo yum install git bzip2 -y