Yes, Drill can query SequenceFile data via the Hive metastore. Here's how.
hadoop fs -put /opt/mapr/hive/hive-0.13/examples/files/kv1.seq /user/vgonzalez/tmp
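From there, one way to wire it up is to define a Hive table stored as SequenceFile, load the file into it, and query it from Drill through the hive storage plugin. The table name and the key/value schema below are assumptions about kv1.seq, not something from the original walkthrough. In the Hive shell:

CREATE TABLE kv_seq (key INT, value STRING) STORED AS SEQUENCEFILE;
LOAD DATA INPATH '/user/vgonzalez/tmp/kv1.seq' INTO TABLE kv_seq;

With the hive plugin enabled, Drill can then read it directly:

SELECT * FROM hive.`kv_seq` LIMIT 5;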
It's a good idea to create a volume, so you can see how much space the data consumes and how much compression helps:
maprcli volume create -name eoddata -path /user/vgonzalez/eoddata
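Once data starts landing in the volume, volume info reports both logical and actual usage, which is a quick way to see what compression is saving; for example:

maprcli volume info -name eoddata -json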
Assuming you've installed log-synth under /opt, the following creates 10 million rows using 50 threads, with each thread producing one file:
/opt/log-synth/synth -schema eoddata.json -count $((10 * 10**6)) -format json -output /mapr/se1/user/vgonzalez/eoddata/2015-05-18 -threads 50
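As a sanity check, Drill can count what landed there, assuming the generated files are picked up as JSON (either by a .json extension or via the workspace's default input format):

select count(*) from dfs.`/user/vgonzalez/eoddata/2015-05-18`;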
import os
import luigi

class Foo(luigi.Task):
    def run(self):
        print "Running Foo"

    def requires(self):
        # no upstream dependencies in this minimal example
        return []
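Assuming the snippet is saved as foo.py and ends with a standard luigi.run() entry point, it can be kicked off with the local scheduler:

python foo.py Foo --local-scheduler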
DELETE /tweets-2015-04-29

POST tweets-2015-04-28

PUT /tweets-2015-04-29
{
  "settings": {
    "analysis": {
      "analyzer": {
        "tweet_text_analyzer": {
          "type": "english"
        }
      }
    }
  }
}
import datetime
import luigi

class TaskX(luigi.Task):
    x = luigi.IntParameter(default=777)

    def output(self):
        # target path is an assumption, added so run() has somewhere to write
        return luigi.LocalTarget("/tmp/task_x_%d.txt" % self.x)

    def run(self):
        with self.output().open("w") as f:
            print >>f, self.x
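Saved as, say, task_x.py with a luigi.run() entry point, the default of 777 can be overridden from the command line:

python task_x.py TaskX --x 123 --local-scheduler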
server1 ansible_ssh_host=10.255.134.34
server2 ansible_ssh_host=10.255.134.35
server3 ansible_ssh_host=10.255.134.36
server4 ansible_ssh_host=10.255.134.37
server5 ansible_ssh_host=10.255.134.38

[cluster]
server[1:5]
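With that inventory saved as hosts (the filename is arbitrary), a quick connectivity check against the whole group:

ansible cluster -i hosts -m ping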
create or replace view MapRTweets as select
CAST(t.`dir3` as INT) as `hour`,
CAST(t.`dir2` as INT) as `day`,
CAST(t.`dir1` as INT) as `month`,
CAST(t.`dir0` as INT) as `year`,
CAST(t.`id` as BIGINT) as `id`,
CAST(t.`user`.`id` as BIGINT) as `user_id`,
CAST(t.`text` as VARCHAR(140)) as `tweet`,
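Assuming the view definition is completed with a from clause over the raw tweet files, queries can then group on the directory-derived columns, for example:

select `year`, `month`, count(*) as tweets
from MapRTweets
group by `year`, `month`
order by `year`, `month`;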
#!/bin/sh
REGIONS="ap-southeast-2 ap-southeast-1 ap-northeast-1 us-east-1 us-west-1 us-west-2 eu-west-1 eu-central-1"

for region in $REGIONS; do
  aws ec2 describe-spot-instance-requests --region $region > ~/data/requests/spot-instance-requests-$region.json
done

for region in $REGIONS; do
  aws ec2 describe-instances --region $region > ~/data/instances/instances-$region.json
done
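Once the per-region files are down, jq is handy for a quick look; the field paths follow the describe-instances output format, and the region file picked here is just an example:

jq '.Reservations[].Instances[] | {id: .InstanceId, state: .State.Name}' ~/data/instances/instances-us-east-1.json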
{
  "type": "file",
  "enabled": true,
  "connection": "file:///",
  "workspaces": {
    "root": {
      "location": "/",
      "writable": false,
      "defaultInputFormat": null
    },
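With the root workspace defined like this, any file under / can be queried by path from Drill; the file name below is only a placeholder:

select * from dfs.root.`/user/vgonzalez/tmp/sample.json` limit 10;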