sudo vim /etc/yum.repos.d/cassandra.repo
[cassandra]
name=Apache Cassandra
# NOTE(review): www.apache.org/dist was retired as a download host; current RPM repos
# live at https://redhat.cassandra.apache.org/ (EOL releases: archive.apache.org/dist/) — verify before use
baseurl=https://www.apache.org/dist/cassandra/redhat/311x/
-- Athena/Presto CTAS: materialize sampledb.test_empty_array as a Parquet-backed
-- table stored under the given S3 prefix.
CREATE TABLE sampledb.test_empty_array_parquet
WITH (
    format = 'PARQUET',
    external_location = 's3://somewhere'
)
AS SELECT *
FROM sampledb.test_empty_array
/*
 Add following dependencies:
   com.microsoft.azure:azure-storage:2.0.0
   org.apache.hadoop:hadoop-azure:2.7.3
 Exclude:
   com.fasterxml.jackson.core:*:*
*/
// Register the storage-account key in the Spark session so WASB (Azure Blob
// Storage) paths on this account can be read/written.
spark.conf.set(
  "fs.azure.account.key.<your-storage-account-name>.blob.core.windows.net",
  "<your-storage-account-access-key>")
#! /bin/sh
### BEGIN INIT INFO
# Provides:          elasticsearch
# Required-Start:    $all
# Required-Stop:     $all
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Starts elasticsearch
# Description:       Starts elasticsearch using start-stop-daemon
### END INIT INFO
import org.apache.spark.sql.functions.udf
import spark.implicits._
// Read all Avro files under the given glob into a DataFrame.
val input = "/Users/hakanilter/dev/workspace/mc/data/avroFiles/*"
val data = spark.read
  .format("com.databricks.spark.avro")
  // NOTE(review): "header" is a CSV option; it is ignored by the Avro reader —
  // kept as-is to preserve behavior, but it can likely be dropped.
  .option("header","true")
  .load(input)
# Script for generating csv partitions report for Impala
IMPALA_DAEMON=localhost
# List databases matching dl* / ods_* prefixes; --delimited gives one name per line.
databases=$(impala-shell --quiet -i "$IMPALA_DAEMON" -d default --delimited -q "SHOW DATABASES" | cut -f1 | grep -e dl -e ods_)
for database in $databases
do
    echo "$database"
    # One output directory per database for the per-table partition reports.
    directory="partitions/$database"
    mkdir -p "$directory"
done  # NOTE(review): source snippet was truncated mid-loop; closed here so the script parses
tier1.sources = source1
tier1.channels = channel1
tier1.sinks = sink1
# sources
tier1.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
tier1.sources.source1.zookeeperConnect = localhost:2181
tier1.sources.source1.topic = network-data
tier1.sources.source1.groupId = flume-kafka-test
tier1.sources.source1.channels = channel1
#!/bin/bash
# install git (-y so the script does not block on the yum confirmation prompt)
sudo yum install -y git
# maven: add the EPEL apache-maven repo, pin $releasever to 6, then install
sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
# single-quoted so the shell passes the literal $releasever through to sed
sudo sed -i 's/$releasever/6/g' /etc/yum.repos.d/epel-apache-maven.repo
sudo yum install -y apache-maven
mvn --version
// Elasticsearch client pointed at a local single node, logging at 'info' level.
var elasticsearch = require('elasticsearch');
var elastic = new elasticsearch.Client({
    host: 'localhost:9200',
    log: 'info'
});
var kafka = require('kafka-node'), | |
HighLevelConsumer = kafka.HighLevelConsumer, | |
client = new kafka.Client(), | |
consumer = new HighLevelConsumer( |
// Spark setup: app name derived from the job class, master taken from external config.
SparkConf sparkConf = new SparkConf()
        .setAppName(JdbcDynamoDbExportJob.class.getSimpleName())
        .setMaster(config.getProperty("spark.master"));
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(jsc);
// read from database — JDBC credentials come from the same config source
Properties properties = new Properties();
properties.setProperty("user", config.getProperty("jdbc.user"));
properties.setProperty("password", config.getProperty("jdbc.pass"));