sudo vim /etc/yum.repos.d/cassandra.repo
[cassandra]
name=Apache Cassandra
baseurl=https://www.apache.org/dist/cassandra/redhat/311x/
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://www.apache.org/dist/cassandra/KEYS
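With the repo file in place, Cassandra can be installed and started; a minimal sketch, assuming the service wrapper shipped with the package:

sudo yum install -y cassandra
sudo service cassandra start
nodetool status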
sudo su
apt-get install -y libyaml-dev python-dev python3-dev python3-pip
pip3 install awscli-cwlogs
# link the aws CLI where the awslogs agent expects to find it
if [ ! -d /var/awslogs/bin ] ; then
    mkdir -p /var/awslogs/bin
    ln -s /usr/local/bin/aws /var/awslogs/bin/aws
fi
mkdir -p /opt/awslogs
cd /opt/awslogs
curl https://s3.amazonaws.com/aws-cloudwatch/downloads/latest/awslogs-agent-setup.py -O
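Once downloaded, the setup script performs the actual agent installation; a sketch, assuming the us-east-1 region:

sudo python ./awslogs-agent-setup.py --region us-east-1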
CREATE TABLE sampledb.test_empty_array_parquet
WITH (
    format = 'PARQUET',
    external_location = 's3://somewhere'
)
AS SELECT *
FROM sampledb.test_empty_array
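The resulting table can be queried from the shell with the AWS CLI; a sketch, where the output location path is an assumption:

aws athena start-query-execution \
    --query-string "SELECT COUNT(*) FROM sampledb.test_empty_array_parquet" \
    --result-configuration OutputLocation=s3://somewhere/query-results/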
/*
Add the following dependencies:
    com.microsoft.azure:azure-storage:2.0.0
    org.apache.hadoop:hadoop-azure:2.7.3
Exclude:
    com.fasterxml.jackson.core:*:*
*/
spark.conf.set(
  "fs.azure.account.key.<your-storage-account-name>.blob.core.windows.net",
  "<your-storage-account-access-key>")
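Outside Databricks, the same dependencies can be supplied on the spark-shell command line; a sketch in which the exclusion names one concrete Jackson artifact, since --exclude-packages takes exact groupId:artifactId pairs rather than wildcards:

spark-shell \
    --packages com.microsoft.azure:azure-storage:2.0.0,org.apache.hadoop:hadoop-azure:2.7.3 \
    --exclude-packages com.fasterxml.jackson.core:jackson-core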
#!/bin/sh
### BEGIN INIT INFO
# Provides:          elasticsearch
# Required-Start:    $all
# Required-Stop:     $all
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Starts elasticsearch
# Description:       Starts elasticsearch using start-stop-daemon
### END INIT INFO
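After saving this as /etc/init.d/elasticsearch, the script can be registered and started on a Debian-style system:

sudo chmod +x /etc/init.d/elasticsearch
sudo update-rc.d elasticsearch defaults
sudo service elasticsearch start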
import org.apache.spark.sql.functions.udf
import spark.implicits._

// read avro (the "header" option applies only to CSV, so it is dropped here)
val input = "/Users/hakanilter/dev/workspace/mc/data/avroFiles/*"
val data = spark.read
  .format("com.databricks.spark.avro")
  .load(input)
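The spark-avro connector itself has to be on the classpath; one way to launch, assuming Scala 2.11 and spark-avro 4.0.0:

spark-shell --packages com.databricks:spark-avro_2.11:4.0.0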
#!/bin/bash
# Script for generating a CSV partitions report for Impala
IMPALA_DAEMON=localhost
databases=$(impala-shell --quiet -i $IMPALA_DAEMON -d default --delimited -q "SHOW DATABASES" | cut -f1 | grep -e dl -e ods_)
for database in $databases
do
    echo $database
    directory="partitions/$database"
    mkdir -p "$directory"
    # assumed completion: dump each table's partition list to a CSV file
    tables=$(impala-shell --quiet -i $IMPALA_DAEMON -d $database --delimited -q "SHOW TABLES" | cut -f1)
    for table in $tables
    do
        impala-shell --quiet -i $IMPALA_DAEMON -d $database --delimited -q "SHOW PARTITIONS $table" > "$directory/$table.csv"
    done
done
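A run of the report might then look like this, assuming the script is saved as partitions_report.sh:

chmod +x partitions_report.sh
./partitions_report.sh
ls partitions/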
tier1.sources = source1
tier1.channels = channel1
tier1.sinks = sink1

# sources
tier1.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
tier1.sources.source1.zookeeperConnect = localhost:2181
tier1.sources.source1.topic = network-data
tier1.sources.source1.groupId = flume-kafka-test
tier1.sources.source1.channels = channel1

# channels (assumed completion: in-memory channel)
tier1.channels.channel1.type = memory
tier1.channels.channel1.capacity = 10000

# sinks (assumed completion: logger sink for testing)
tier1.sinks.sink1.type = logger
tier1.sinks.sink1.channel = channel1
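The agent can then be started against this configuration; a sketch, assuming the file is saved as kafka-source.conf:

flume-ng agent --conf conf --conf-file kafka-source.conf --name tier1 -Dflume.root.logger=INFO,console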
#!/bin/bash
# install git
sudo yum install -y git
# maven
sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
sudo sed -i s/\$releasever/6/g /etc/yum.repos.d/epel-apache-maven.repo
sudo yum install -y apache-maven
mvn --version
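With git and Maven installed, a typical next step is cloning and building a project; a sketch with a placeholder repository URL:

git clone https://github.com/<your-org>/<your-project>.git
cd <your-project>
mvn clean package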
var elasticsearch = require('elasticsearch');
var elastic = new elasticsearch.Client({
    host: 'localhost:9200',
    log: 'info'
});
var kafka = require('kafka-node'),
    HighLevelConsumer = kafka.HighLevelConsumer,
    client = new kafka.Client(),
    consumer = new HighLevelConsumer(
        client,
        [{ topic: 'network-data' }],
        { groupId: 'es-indexer' }); // assumed completion: topic and group id
// index each Kafka message into Elasticsearch (handler is an assumed completion)
consumer.on('message', function (message) {
    elastic.index({ index: 'network-data', type: 'event', body: message.value });
});
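To run the indexer, the two client libraries must be installed first; a sketch, assuming the code above is saved as indexer.js:

npm install elasticsearch kafka-node
node indexer.js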