Apache Gobblin job file for pulling CSV files from an S3 bucket
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to our local system
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to our local system
fs.uri=file:///
# Set working directory
work.dir=/Users/tilak/gobblin/mopar-demo
writer.staging.dir=${work.dir}/taskStaging
writer.output.dir=${work.dir}/taskOutput
mr.job.root.dir=${work.dir}/working
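# fs.uri=file:/// points the job at the local filesystem, so the staging, output and
# working directories above are plain local paths; change work.dir to a directory that
# exists on your machine.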
# Set state store
state.store.enabled=true
state.store.type=mysql
state.store.db.jdbc.driver=com.mysql.jdbc.Driver
state.store.db.url=jdbc:mysql://localhost/mopar_demo
state.store.db.user=gobblin
state.store.db.password=gobblin
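# Assumes a local MySQL server with a database named mopar_demo and a gobblin/gobblin
# user that can create and write tables in it; something along these lines (names taken
# from the properties above) would set that up:
#   CREATE DATABASE mopar_demo;
#   CREATE USER 'gobblin'@'localhost' IDENTIFIED BY 'gobblin';
#   GRANT ALL PRIVILEGES ON mopar_demo.* TO 'gobblin'@'localhost';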
# Set writer and publisher
writer.fs.uri=file:///
data.publisher.final.dir=${work.dir}/output
writer.builder.class=org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder
data.publisher.type=org.apache.gobblin.data.management.copy.publisher.CopyDataPublisher
writer.destination.type=HDFS
data.publisher.fs.uri=${fs.uri}
data.publisher.metadata.output.dir=${work.dir}/metadata_out
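# FileAwareInputStreamDataWriter / CopyDataPublisher is Gobblin's distcp-style path:
# files are streamed byte-for-byte rather than parsed into records, and the publisher
# moves them into data.publisher.final.dir. writer.destination.type=HDFS just selects
# the filesystem writer family; with writer.fs.uri=file:/// the copies land on local disk.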
# Source Configuration
source.class=org.apache.gobblin.data.management.copy.CopySource
gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.CopyableGlobDatasetFinder
gobblin.dataset.pattern=pricing.products_*.csv
# To copy from a particular directory: gobblin.dataset.pattern=some_folder/*.csv
gobblin.copy.recursive.update=true
fork.record.queue.capacity=1
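# CopySource copies whole files rather than parsing records; gobblin.dataset.pattern is
# a glob matched against paths on the source filesystem configured below (the S3 bucket),
# and gobblin.copy.recursive.update=true is intended to make repeated runs pick up only
# new or changed files, in the spirit of distcp -update.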
# Source S3 Configuration
source.filebased.fs.uri=s3a://<bucket_name>
source.filebased.preserve.file.name=true
source.filebased.encrypted.fs.s3a.access.key=<s3-access-key>
source.filebased.encrypted.fs.s3a.secret.key=<s3-secret-key>
fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.buffer.dir=${work.dir}/buffer-dir
fs.s3a.connection.ssl.enabled=false
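# <bucket_name>, <s3-access-key> and <s3-secret-key> are placeholders; fill in your own.
# The "encrypted." infix is meant to route the values through Gobblin's credential
# decryption support so an encrypted secret can be stored instead of plain text; the
# plain Hadoop equivalents are fs.s3a.access.key and fs.s3a.secret.key (mentioned only
# as an alternative, this job file does not use them).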
# Converters
converter.classes=org.apache.gobblin.converter.IdentityConverter
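# converter.classes takes a comma-separated chain of converters; IdentityConverter
# passes every record through unchanged, which is all a raw file copy needs.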
# ====================================================================
# Distcp configurations (do not change)
# ====================================================================
job.class=org.apache.gobblin.azkaban.AzkabanJobLauncher
extract.namespace=org.apache.gobblin.copy
distcp.persist.dir=/tmp/distcp-persist-dir
task.maxretries=0
workunit.retry.enabled=false
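# Retries are disabled (task.maxretries=0, workunit.retry.enabled=false), so a failed
# copy fails the task immediately instead of being re-attempted.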
# Job History server
job.history.store.enabled=true
job.history.store.url=jdbc:mysql://localhost/mopar_demo
job.history.store.jdbc.driver=com.mysql.jdbc.Driver
job.history.store.user=gobblin
job.history.store.password=gobblin
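# The job history store reuses the same local MySQL database and gobblin/gobblin
# credentials as the state store above.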
# Other s3a settings
# Must be greater than 5 MB, otherwise distcp will not work
fs.s3a.multipart.size=67108864
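# 67108864 bytes = 64 * 1024 * 1024 = 64 MiB, comfortably above the 5 MB minimum.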