/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.testing.TestingConnectorSession;
import com.google.common.collect.ImmutableMap;
import io.airlift.units.DataSize;
import org.joda.time.DateTimeZone;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;

import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static io.airlift.units.DataSize.Unit.MEGABYTE;

@SuppressWarnings("UseOfSystemOutOrSystemErr")
public final class ScanOrcFile
{
    public static void main(String... args)
            throws Exception
    {
        File file = new File(args[0]);
        FileOrcDataSource orcDataSource = new FileOrcDataSource(file, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE));
        OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE));

        //
        // Set your column types here
        //
        Map<Integer, Type> columnTypes = ImmutableMap.<Integer, Type>builder()
                .put(0, BIGINT)
                .put(1, DOUBLE)
                .put(2, VARCHAR)
                .build();

        OrcRecordReader recordReader = orcReader.createRecordReader(columnTypes, OrcPredicate.TRUE, DateTimeZone.getDefault());

        long rows = 0;
        for (int batchSize = recordReader.nextBatch(); batchSize > 0; batchSize = recordReader.nextBatch()) {
            rows += batchSize;
            readBatch(columnTypes, recordReader);
        }
        System.out.println();
        System.out.println("rows: " + rows);
    }

    // Reads every selected column of the current batch and prints the first row.
    // Note: this must NOT call nextBatch() itself; the loop in main() already
    // advances the reader, and calling it here too would skip every other batch.
    private static void readBatch(Map<Integer, Type> columnTypes, OrcRecordReader recordReader)
            throws IOException
    {
        for (Entry<Integer, Type> entry : columnTypes.entrySet()) {
            Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
            System.out.print(entry.getValue().getObjectValue(TestingConnectorSession.SESSION, block, 0));
            System.out.print(",");
        }
        System.out.println();
    }
}
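Note on the columnTypes map: the keys are zero-based column ordinals in the ORC file's schema, and stream readers are only created for columns present in the map, so leaving a column out skips it entirely. A minimal sketch (same classes as above, not part of the original gist) that projects just the first column:

// Sketch, not from the original gist: scan only column 0. Columns left
// out of the map are never materialized, so projecting fewer columns
// reads less data from the file.
Map<Integer, Type> projected = ImmutableMap.<Integer, Type>builder()
        .put(0, BIGINT) // key = zero-based column ordinal in the file schema
        .build();
OrcRecordReader projectedReader = orcReader.createRecordReader(projected, OrcPredicate.TRUE, DateTimeZone.getDefault());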
Hi Dain,
Thanks for the example code. It's been very helpful for getting started with ORC.
Could you explain how to get lazy materialization to work? I adapted the above code into the simple example below. After reading the blocks, I found their class to be FixedWidthBlock rather than LazyFixedWidthBlock. If I understand correctly, that means the blocks are not lazily loaded. Do you know how I can turn on lazy materialization?
Thanks.
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions;
import org.joda.time.DateTimeZone;

import com.facebook.presto.orc.FileOrcDataSource;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcPredicate;
import com.facebook.presto.orc.OrcReader;
import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ImmutableMap;

import io.airlift.units.DataSize;
import io.airlift.units.DataSize.Unit;

public class TestOrc {
    private static final String ORC_FILE = "/tmp/test.orc";

    private static class TestStruct {
        public final long key;
        public final double value;

        public TestStruct(long key, double value) {
            this.key = key;
            this.value = value;
        }
    }

    // Writes 10000 (key, value) rows to ORC_FILE using the Hive ORC writer.
    private static void write() throws IOException {
        ObjectInspector inspector = ObjectInspectorFactory
                .getReflectionObjectInspector(
                        TestStruct.class,
                        ObjectInspectorOptions.JAVA);
        OrcFile.WriterOptions options = OrcFile.writerOptions(new Configuration())
                .inspector(inspector);
        Writer writer = OrcFile.createWriter(new Path(ORC_FILE), options);
        for (int i = 0; i < 10000; ++i) {
            writer.addRow(new TestStruct(i, i * 2));
        }
        writer.close();
    }

    // Reads the file back with the Presto ORC reader and prints the
    // concrete Block class for each column of every batch.
    private static void read() throws IOException {
        OrcDataSource source = new FileOrcDataSource(
                new File(ORC_FILE),
                new DataSize(1, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE));
        OrcReader reader = new OrcReader(
                source,
                new OrcMetadataReader(),
                new DataSize(1, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE));
        Map<Integer, Type> columns = ImmutableMap.<Integer, Type>builder()
                .put(0, BIGINT)
                .put(1, DOUBLE)
                .build();
        OrcRecordReader recordReader = reader.createRecordReader(
                columns, OrcPredicate.TRUE, DateTimeZone.UTC,
                new AggregatedMemoryContext());
        long rows = 0;
        for (int batchSize = recordReader.nextBatch(); batchSize > 0;
                batchSize = recordReader.nextBatch()) {
            rows += batchSize;
            for (Entry<Integer, Type> entry : columns.entrySet()) {
                Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
                System.out.println(block.getClass());
            }
        }
        System.out.println(rows);
    }

    public static void main(String[] args) throws IOException {
        write();
        read();
    }
}
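(For reference: running main() writes 10000 rows to /tmp/test.orc, then prints the concrete Block class for each column of every batch, followed by the total row count, 10000.)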
Hi Dain,
Thanks for the above code.
I'm using it to scan an ORC file, but I'm facing this issue:
Exception in thread "main" java.lang.IllegalArgumentException: Unsupported type: VARCHAR
at com.facebook.presto.orc.reader.StreamReaders.createStreamReader(StreamReaders.java:57)
at com.facebook.presto.orc.OrcRecordReader.createStreamReaders(OrcRecordReader.java:385)
at com.facebook.presto.orc.OrcRecordReader.&lt;init&gt;(OrcRecordReader.java:175)
at com.facebook.presto.orc.OrcReader.createRecordReader(OrcReader.java:193)
at com.facebook.presto.orc.OrcReader.createRecordReader(OrcReader.java:168)
at com.facebook.presto.orc.ScanOrcFile.main(ScanOrcFile.java:58)
Could you please let me know what the problem is?
The ORC file I'm trying to read was created using Hive. By the way, I'm using 0.126.
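(A guess, not a confirmed fix: "Unsupported type: VARCHAR" is thrown by StreamReaders for the ORC column kind, so this Presto version may simply not handle Hive VARCHAR(n) columns, which Hive writes with the ORC VARCHAR kind rather than STRING. An untested workaround sketch is to leave the VARCHAR column out of the map, since stream readers are only created for the listed columns:

// Untested sketch: restrict the map to columns that scan cleanly,
// leaving out the VARCHAR column that StreamReaders rejects.
Map<Integer, Type> columnTypes = ImmutableMap.<Integer, Type>builder()
        .put(0, BIGINT)
        .put(1, DOUBLE)
        .build();

Declaring the Hive column as STRING instead of VARCHAR(n) when writing the file may also avoid it.)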