Toshiaki Toyama manboubird

Set up a pseudo-distributed YARN cluster with CDH 5.1.2

(C-x means ctrl+x, M-x means alt+x)

The default prefix is C-b. If you (or your muscle memory) prefer C-a, you need to add this to ~/.tmux.conf:

	Personal vim command memo

	- Tab
	gt : go to next tab
	gT : go to previous tab
	{i}gt : go to tab in position i
	:tab split : copy the current window to a new tab of its own

	- FuzzyFinder.vim
	:FufFile **/

	#!/bin/bash
	# Start HiveServer2 in the background unless a PID file already exists.
	#
	# When ${PID_FILE} is present nothing is started: the script only prints the
	# commands an operator can run by hand (kill the recorded PID, remove the
	# file) and then exits.
	PID_FILE=/usr/local/hive/pid/hiveserver2.pid
	if [ -f "${PID_FILE}" ]; then
		echo "PID_FILE exists. ${PID_FILE}"
		echo "kill $(cat "${PID_FILE}")"
		echo "rm ${PID_FILE}"
		exit
	fi
	# Append stdout and stderr to a per-day log file; `date +%F` yields YYYY-MM-DD.
	/usr/local/hive/bin/hiveserver2 >> "/usr/local/hive/logs/hiveserver2.$(date +%F).log" 2>&1 &
	# PID of the background job launched on the previous line.
	# NOTE(review): nothing in the visible part of this script writes ${PID} to
	# ${PID_FILE}; confirm that the remainder of the script persists it.
	PID=$!

	# http://www.gnu.org/software/parallel/parallel_tutorial.html#controling_the_output

	# issue hive queries
	# Every line of hqls.hql becomes one `hive -e` job. In the command template
	# {} is the input line and {#} is the job sequence number, so each job writes
	# to its own result_<seq>_<query>.txt file.
	# The pipe must be a bare `|`: an escaped `\|` would be handed to cat as a
	# file name and parallel would never receive the queries on stdin.
	cat hqls.hql | parallel --line-buffer --tagstring "[ {#} - {} ]" --results parallel_std_logs --eta --progress --joblog parallel_job_logs -j0 --halt 1 --load 100% --noswap "hive -e {} > result_{#}_{}.txt"

	# dry run
	# Same invocation with --dry-run, which prints the commands instead of running them.
	cat hqls.hql | parallel --dry-run --line-buffer --tagstring "[ {#} - {} ]" --results parallel_std_logs --eta --progress --joblog parallel_job_logs -j0 --halt 1 --load 100% --noswap "hive -e {} > result_{#}_{}.txt"

	# transfer jar and submit job
	# GNU parallel runs the quoted `hadoop jar` command on hadoop@localhost (-S).
	# The jar path given after `:::` is the single input: it replaces {} in the
	# command and, because of --transfer, is copied to the remote side first.
	# NOTE(review): `--wd ...` is passed literally; per the GNU parallel manual a
	# workdir of `...` selects a temporary directory — confirm this is intended
	# and not an elided path.
	/usr/local/bin/parallel -S hadoop@localhost --wd ... --transfer "/usr/local/hadoop-mr1/bin/hadoop jar {} scalding.marketing.JobRunner scalding.marketing.UniqueUser --hdfs --output /tmp/output --dt 2000-02-08" ::: ./target/scalding-marketing-0.0.1.jar

	#!/usr/bin/ruby

	class IPGenerator
	public
	# Stores the two sizing parameters and starts with an empty session table.
	#
	# @param session_count stored unchanged in @session_count
	#   (presumably the number of sessions to generate — confirm against callers)
	# @param session_length stored unchanged in @session_length
	#   (presumably the number of entries per session — confirm against callers)
	def initialize(session_count, session_length)
	@session_count = session_count
	@session_length = session_length

	# Empty Hash; how it gets populated is not visible in this excerpt.
	@sessions = {}
	end

	; http://nathanmarz.com/blog/introducing-cascalog-a-clojure-based-query-language-for-hado.html
	;
	; Run with method executing a script:
	; drake -w cascalog.drake %playground
	;
	; Run with code eval:
	; drake -w cascalog.drake %playground.eval
	;
	CASCALOG_JAR:=$[HOME]/local/bin/cascalog-standalone.jar

	// transcribed from an Apache Spark 1.0 spark-shell session
	// using data from http://chriswhong.com/open-data/foil_nyc_taxi/
	// and the QTree algorithm for approximate quantiles over large datasets
	// each of the distanceRange and minutesRange calculations below takes about 15 minutes on my four-core SSD-based Macbook Pro

	import com.twitter.algebird._
	import com.twitter.algebird.Operators._
	// Implicit semigroup instance for QTree[Double], placed in scope for the rest of the session.
	// NOTE(review): the constructor argument 6 is presumably the QTree level/precision
	// parameter — confirm against the Algebird QTreeSemigroup documentation.
	implicit val qtSemigroupD = new QTreeSemigroup[Double](6)

	// Reads the input as text through the shell's SparkContext (`sc`).
	val in = sc.textFile("trip_data") // a directory containing all the trip_data*.csv files downloaded from the above link

	#!/bin/bash

	# pipefail: a pipeline's exit status becomes that of the last stage that
	# failed, instead of only the final command's status.
	set -o pipefail # Trace errors through pipes
	# errtrace: shell functions, command substitutions and subshells inherit the
	# ERR trap, so the trap also fires for failures inside them.
	set -o errtrace # Trace ERR through 'time command'

	error() {
	JOB="$0" # job name
	LASTLINE="$1" # line of error occurrence
	LASTERR="$2" # error code
	echo "ERROR in ${JOB}:line ${LASTLINE} - exit code ${LASTERR}"