Yes, Drill can query SequenceFile data via the Hive metastore. Here's how.
hadoop fs -put /opt/mapr/hive/hive-0.13/examples/files/kv1.seq /user/vgonzalez/tmp
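From there, one way to wire it up is to define a Hive table stored as SequenceFile, load the file into it, and query it from Drill through the hive storage plugin. The table name and the key/value schema below are assumptions about kv1.seq, not something from the original walkthrough. In the Hive shell:

CREATE TABLE kv_seq (key INT, value STRING) STORED AS SEQUENCEFILE;
LOAD DATA INPATH '/user/vgonzalez/tmp/kv1.seq' INTO TABLE kv_seq;

With the hive plugin enabled, Drill can then read it directly:

SELECT * FROM hive.`kv_seq` LIMIT 5;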
It's a good idea to create a volume, so you can see how much space the data consumes and how much compression helps:
maprcli volume create -name eoddata -path /user/vgonzalez/eoddata
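Once data starts landing in the volume, volume info reports both logical and actual usage, which is a quick way to see what compression is saving; for example:

maprcli volume info -name eoddata -json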
Assuming you've installed log-synth under /opt, the following creates 10 million rows using 50 threads, with each thread producing one file:
/opt/log-synth/synth -schema eoddata.json -count $((10 * 10**6)) -format json -output /mapr/se1/user/vgonzalez/eoddata/2015-05-18 -threads 50
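As a sanity check, Drill can count what landed there, assuming the generated files are picked up as JSON (either by a .json extension or via the workspace's default input format):

select count(*) from dfs.`/user/vgonzalez/eoddata/2015-05-18`;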
import os
import luigi

class Foo(luigi.Task):
    def run(self):
        print "Running Foo"

    def requires(self):
        # no upstream dependencies in this minimal example
        return []
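Assuming the snippet is saved as foo.py and ends with a standard luigi.run() entry point, it can be kicked off with the local scheduler:

python foo.py Foo --local-scheduler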
DELETE /tweets-2015-04-29

POST tweets-2015-04-28

PUT /tweets-2015-04-29
{
  "settings": {
    "analysis": {
      "analyzer": {
        "tweet_text_analyzer": {
          "type": "english"
        }
      }
    }
  }
}
import datetime
import luigi

class TaskX(luigi.Task):
    x = luigi.IntParameter(default=777)

    def output(self):
        # target path is an assumption, added so run() has somewhere to write
        return luigi.LocalTarget("/tmp/task_x_%d.txt" % self.x)

    def run(self):
        with self.output().open("w") as f:
            print >>f, self.x
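Saved as, say, task_x.py with a luigi.run() entry point, the default of 777 can be overridden from the command line:

python task_x.py TaskX --x 123 --local-scheduler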
server1 ansible_ssh_host=10.255.134.34
server2 ansible_ssh_host=10.255.134.35
server3 ansible_ssh_host=10.255.134.36
server4 ansible_ssh_host=10.255.134.37
server5 ansible_ssh_host=10.255.134.38

[cluster]
server[1:5]
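With that inventory saved as hosts (the filename is arbitrary), a quick connectivity check against the whole group:

ansible cluster -i hosts -m ping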
create or replace view MapRTweets as select
CAST(t.`dir3` as INT) as `hour`,
CAST(t.`dir2` as INT) as `day`,
CAST(t.`dir1` as INT) as `month`,
CAST(t.`dir0` as INT) as `year`,
CAST(t.`id` as BIGINT) as `id`,
CAST(t.`user`.`id` as BIGINT) as `user_id`,
CAST(t.`text` as VARCHAR(140)) as `tweet`,
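Assuming the view definition is completed with a from clause over the raw tweet files, queries can then group on the directory-derived columns, for example:

select `year`, `month`, count(*) as tweets
from MapRTweets
group by `year`, `month`
order by `year`, `month`;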
#!/bin/sh
REGIONS="ap-southeast-2 ap-southeast-1 ap-northeast-1 us-east-1 us-west-1 us-west-2 eu-west-1 eu-central-1"

for region in $REGIONS; do
  aws ec2 describe-spot-instance-requests --region $region > ~/data/requests/spot-instance-requests-$region.json
done

for region in $REGIONS; do
  aws ec2 describe-instances --region $region > ~/data/instances/instances-$region.json
done
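Once the per-region files are down, jq is handy for a quick look; the field paths follow the describe-instances output format, and the region file picked here is just an example:

jq '.Reservations[].Instances[] | {id: .InstanceId, state: .State.Name}' ~/data/instances/instances-us-east-1.json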
{
  "type": "file",
  "enabled": true,
  "connection": "file:///",
  "workspaces": {
    "root": {
      "location": "/",
      "writable": false,
      "defaultInputFormat": null
    },
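With the root workspace defined like this, any file under / can be queried by path from Drill; the file name below is only a placeholder:

select * from dfs.root.`/user/vgonzalez/tmp/sample.json` limit 10;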