Yes, Drill can query SequenceFile, via the Hive metastore. Here's how. First, copy the sample SequenceFile that ships with Hive into the cluster:
hadoop fs -put /opt/mapr/hive/hive-0.13/examples/files/kv1.seq /user/vgonzalez/tmp
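The file needs a table definition in the metastore before Drill can see it. Here's a minimal sketch, assuming the sample uses Hive's default \001-delimited key/value layout; the table name kv_seq is my own, not from Hive's examples:

# Illustrative only: define an external Hive table over the directory holding
# the SequenceFile, then sanity-check it from Hive itself.
hive -e "
CREATE EXTERNAL TABLE kv_seq (key INT, value STRING)
STORED AS SEQUENCEFILE
LOCATION '/user/vgonzalez/tmp';
SELECT * FROM kv_seq LIMIT 5;"
# Once the table exists, Drill can reach it through the hive storage plugin,
# e.g. from sqlline:  SELECT * FROM hive.kv_seq LIMIT 5;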
It's a good idea to create a volume for this data, so you can track space consumption and see how much compression helps:
maprcli volume create -name eoddata -path /user/vgonzalez/eoddata
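Once data lands in the volume, you can compare logical size with actual disk usage to see what compression buys you. A quick check might look like the following (the -json flag is standard maprcli; the grep is just a convenience):

# Compare logical vs. actual (post-compression) usage for the volume.
maprcli volume info -name eoddata -json | grep -iE 'used|size'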
Assuming you have installed log-synth to /opt, the following will create 10 million rows using 50 threads, with each thread producing its own file:
/opt/log-synth/synth -schema eoddata.json -count $((10 * 10**6)) -format json -output /mapr/se1/user/vgonzalez/eoddata/2015-05-18 -threads 50
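Because MapR-FS is NFS-mountable, ordinary shell tools can sanity-check the output. Assuming log-synth writes one JSON record per line, the per-file counts should sum to 10 million:

# Count records across the 50 per-thread output files via the NFS mount.
wc -l /mapr/se1/user/vgonzalez/eoddata/2015-05-18/* | tail -1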
import os
import luigi

class Foo(luigi.Task):
    def run(self):
        print("Running Foo")

    def requires(self):
        return []  # no upstream dependencies in this minimal sketch

if __name__ == "__main__":
    luigi.run()
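With the luigi.run() entry point above, the task runs straight from the shell (the file name foo.py is an assumption):

# --local-scheduler avoids needing a central luigi scheduler daemon.
python foo.py Foo --local-scheduler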
DELETE /tweets-2015-04-29
POST /tweets-2015-04-28
PUT /tweets-2015-04-29
{
  "settings": {
    "analysis": {
      "analyzer": {
        "tweet_text_analyzer": {
          "type": "english"
        }
      }
    }
  }
}
import datetime
import luigi

class TaskX(luigi.Task):
    x = luigi.IntParameter(default=777)

    def output(self):
        # target name is an assumption; derive it from the parameter value
        return luigi.LocalTarget("x-%d.txt" % self.x)

    def run(self):
        with self.output().open("w") as f:
            f.write("{}\n".format(self.x))

if __name__ == "__main__":
    luigi.run()
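Since x is an IntParameter, it becomes a --x flag on the command line; the default of 777 applies when it's omitted (taskx.py is an assumed file name):

# Run with the default, then again with an explicit parameter value.
python taskx.py TaskX --local-scheduler
python taskx.py TaskX --x 42 --local-scheduler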
server1 ansible_ssh_host=10.255.134.34
server2 ansible_ssh_host=10.255.134.35
server3 ansible_ssh_host=10.255.134.36
server4 ansible_ssh_host=10.255.134.37
server5 ansible_ssh_host=10.255.134.38

[cluster]
server[1:5]
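A quick way to verify the inventory is wired up correctly (assuming the file above is saved as hosts):

# Ping every host in the [cluster] group.
ansible cluster -i hosts -m ping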
create or replace view MapRTweets as select
  CAST(t.`dir3` as INT) as `hour`,
  CAST(t.`dir2` as INT) as `day`,
  CAST(t.`dir1` as INT) as `month`,
  CAST(t.`dir0` as INT) as `year`,
  CAST(t.`id` as BIGINT) as `id`,
  CAST(t.`user`.`id` as BIGINT) as `user_id`,
  CAST(t.`text` as VARCHAR(140)) as `tweet`
-- the path below is a placeholder for the year/month/day/hour tweet directory
from dfs.`/path/to/tweets` t;
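With the view in place, queries no longer need the dirN-to-column gymnastics. For example, from sqlline (the sqlline path and zookeeper connect string are assumptions based on a default MapR install):

# Connect to Drill and query the view.
/opt/mapr/drill/drill-*/bin/sqlline -u jdbc:drill:zk=localhost:5181
# then, at the prompt:
#   SELECT `year`, `month`, COUNT(*) FROM MapRTweets GROUP BY `year`, `month`;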
#!/bin/sh
# Pull spot request and instance metadata from each region into local JSON files.
REGIONS="ap-southeast-2 ap-southeast-1 ap-northeast-1 us-east-1 us-west-1 us-west-2 eu-west-1 eu-central-1"

for region in $REGIONS; do
  aws ec2 describe-spot-instance-requests --region $region > ~/data/requests/spot-instance-requests-$region.json
done

for region in $REGIONS; do
  aws ec2 describe-instances --region $region > ~/data/instances/instances-$region.json
done
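The resulting JSON files are easy to slice with jq. For instance, a per-region instance count could be appended to the script above (jq itself is my addition, not part of the original):

# Count instances captured per region.
for region in $REGIONS; do
  printf '%s: ' "$region"
  jq '[.Reservations[].Instances[]] | length' ~/data/instances/instances-$region.json
done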
{
  "type": "file",
  "enabled": true,
  "connection": "file:///",
  "workspaces": {
    "root": {
      "location": "/",
      "writable": false,
      "defaultInputFormat": null
    }
  }
}
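Plugin definitions like this can be created or updated through the Drill web console, or pushed over its REST API (the file name and drillbit host below are assumptions):

# Register the plugin definition under the name "dfs".
curl -X POST -H "Content-Type: application/json" \
  -d @dfs-plugin.json http://localhost:8047/storage/dfs.json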