Denis A da115115

Spark - File formats and storage options

In this document, I'm using a data file containing 40 million records. The file is a text file with one record per line.

The following Scala code is run in a spark-shell:

val filename = "<path to the file>"
val file = sc.textFile(filename)
file.count()

Syntax: cat <filename> | jq -c '.[] | select( .<key> | contains("<value>"))'

Example: To get the JSON records whose _id equals 611:

cat my.json | jq -c '.[] | select( ._id | contains(611))'

Remember: if the JSON value has no double quotes (e.g. a numeric value), do not put quotes around it in the filter either, i.e. write contains(611).

Go to the download site - http://www.oracle.com/technetwork/topics/intel-macsoft-096467.html
Download "instantclient-basic-macos.x64-12.1.0.2.0.zip"
Download "instantclient-sqlplus-macos.x64-12.1.0.2.0.zip"
Download "instantclient-sdk-macos.x64-12.1.0.2.0.zip"

	from flask import Flask
	from flask import render_template
	import csv
	import json

	app = Flask(__name__)

	@app.route('/')
	def my_runs():
	runs = []

	import java.net.URI;
	import java.net.URISyntaxException;
	import java.sql.*;

	public class PrestoJDBC {
	// JDBC driver name and database URL
	static final String JDBC_DRIVER = "com.facebook.presto.jdbc.PrestoDriver";
	//static final String JDBC_DRIVER = "com.teradata.presto.jdbc42.Driver";

	static final String DB_URL = "jdbc:presto://ec2-xx-xx-xxx-xxx.ap-northeast-1.compute.amazonaws.com:8889/hive/default";

	--
	-- This will register the "planet" table within your AWS account
	--
	CREATE EXTERNAL TABLE planet (
	id BIGINT,
	type STRING,
	tags MAP<STRING,STRING>,
	lat DECIMAL(9,7),
	lon DECIMAL(10,7),
	nds ARRAY<STRUCT<ref: BIGINT>>,

	# Below are the dependencies required for installing the common combination of numpy, scipy, pandas and matplotlib
	# in an Alpine based Docker image.
	FROM alpine:3.4
	# Append the edge/community repository to apk's repository list so its packages can be installed below.
	RUN echo "http://dl-8.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories
	# Install the C and Fortran compilers, Python with its headers and pip, and the native libraries
	# (freetype, libpng, openblas) in one layer, without keeping the apk index cache in the image.
	RUN apk --no-cache --update-cache add gcc gfortran python python-dev py-pip build-base wget freetype-dev libpng-dev openblas-dev
	# Make locale.h also reachable as xlocale.h.
	# NOTE(review): presumably needed because one of the packages built below includes <xlocale.h> - confirm.
	RUN ln -s /usr/include/locale.h /usr/include/xlocale.h
	# Install the scientific Python packages with pip.
	RUN pip install numpy scipy pandas matplotlib

	#!/bin/sh

	git filter-branch --env-filter '

	OLD_EMAIL="[email protected]"
	CORRECT_NAME="The Dark Knight"
	CORRECT_EMAIL="[email protected]"

	if [ "$GIT_COMMITTER_EMAIL" = "$OLD_EMAIL" ]
	then

	from airflow import DAG
	from airflow.operators import BashOperator
	from datetime import datetime
	import os
	import sys

	args = {
	'owner': 'airflow'
	, 'start_date': datetime(2017, 1, 27)
	, 'provide_context': True

	GATEWAY_FLAGS := -I. -I/usr/local/include -I$(GOPATH)/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis -I/usr/local/include

	GRPC_FLAGS := --python_out=. --grpc_python_out=.

	code:
	python -m grpc_tools.protoc $(GRPC_FLAGS) $(GATEWAY_FLAGS) *.proto

	gw:
	protoc $(GATEWAY_FLAGS) \
	--go_out=Mgoogle/api/annotations.proto=github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis/google/api,plugins=grpc:. \