cipri7329 cipri7329

zeppelin 0.6.1 with cdh 5.7.0 on ubuntu 14.04 lts

Installation

resources

required packages

# On Ubuntu 14.04 the `node` package is an unrelated amateur packet radio program.
# Node.js is packaged as `nodejs`; `nodejs-legacy` supplies the `node` command name.
sudo apt-get install nodejs nodejs-legacy npm

load from file

// Lines of the movies.csv dump read from HDFS at localhost:8020.
// `sc` is presumably the SparkContext provided by the notebook/shell - confirm.
val moviesDump = {
  val moviesCsvPath = "hdfs://localhost:8020/user/datalake/movies/ml-latest/movies.csv"
  sc.textFile(moviesCsvPath)
}

/** A movie record.
  *
  * NOTE(review): `Integer` resolves to the boxed `java.lang.Integer`; `Int` is the
  * usual Scala choice, but switching changes nullability - confirm before changing.
  *
  * @param movieId identifier of the movie
  * @param title   title of the movie
  * @param genres  genres attached to the movie
  */
case class Movie(
    movieId: Integer,
    title: String,
    genres: List[String]
)
 
val movies = moviesDump.map(s => s.split(",")).filter(s => s(0)!="movieId")
    .map( 
        s => Movie(s(0).toInt,
            s.slice(1, s.size-1).mkString(""),

GET _cluster/state?pretty

GET _search
{
    "query": {
        "match_all": {}
 }

  code sample

Scala Code Snippets

Scalar product

Given two lists of doubles, their scalar product is the sum of the products of each pair of corresponding elements.

	/**
	*
	Find longest sequence of zeros in binary representation of an integer.
	*/

	public class BinaryLongestZeroSequence {
	/**
	* worst-case time complexity is O(log(N));
	* number of bits = log(N) ==> worst case is O(N)
	* @param N

	import time

	__author__ = 'user'

	import base64
	import json
	from kafka import KafkaConsumer

	from kafka import TopicPartition

	import time
	from kafka import KafkaProducer
	import json
	import base64

	# Kafka broker address and topic name.
	KAFKA_HOST = "localhost:9092"
	KAFKA_TOPIC = "scraped-data"

	# Producer whose message values are serialized to UTF-8 encoded JSON.
	producer = KafkaProducer(
		bootstrap_servers=KAFKA_HOST,
		value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
	)


	# Flume-style agent definition for the agent named "wikiagent":
	# it declares one source (spool), one channel (memChannel) and one sink (HDFS).
	wikiagent.sources = spool
	wikiagent.channels = memChannel
	wikiagent.sinks = HDFS

	# source config
	# The "spool" source is of type spooldir and delivers its events to memChannel.
	wikiagent.sources.spool.type = spooldir
	wikiagent.sources.spool.channels = memChannel

	# Directory the spool source picks its input files up from.
	wikiagent.sources.spool.spoolDir = /home/ubuntu/datalake/processed

	//In the cell below, determine what is the most frequent CHARACTER in the README, and how many times was it used?
	//spark and scala

	// Most frequent character in `readme` (spaces and newlines excluded) with its count,
	// as a (Char, Int) pair. Assumes `readme` is an RDD[String] - confirm against its definition.
	// `line.toList` yields Chars, so the exclusions must be Char comparisons: comparing a
	// Char with a String via equals is always false and would filter nothing.
	// Trailing dots keep the chain pasteable line by line into the shell / notebook.
	var charCounts2 = readme.flatMap(line => line.toList).
	filter(ch => ch != '\n' && ch != ' ').
	map(ch => (ch, 1)).
	reduceByKey(_ + _).
	reduce((best, next) => if (best._2 > next._2) best else next) // reduce fails on an empty RDD
	//take(55).

	//example from https://courses.bigdatauniversity.com/courses/course-v1:BigDataUniversity+BD0212EN+2016/ exercises

	// Read everything matching data/trips/* as lines of text.
	val input1 = sc.textFile("data/trips/*")

	// The first line is treated as the header row; it is filtered out below.
	val header1 = input1.first()

	// Drop header lines, split each remaining line on commas and hand the fields to utils.Trip.parse.
	val trips = input1.
	filter(line => line != header1).
	map(line => line.split(",")).
	map(fields => utils.Trip.parse(fields))