Last active
March 23, 2019 07:44
-
-
Save prashant-shahi/bf4eec7b472543009fe4b0ca968f3d4a to your computer and use it in GitHub Desktop.
Python application that uses a SQL-like SELECT expression to read a gzip-compressed CSV object in S3 and output the result as CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <!-- | |
| Then, open the file $HADOOP_HOME/etc/hadoop/core-site.xml for editing. | |
| In this example Minio server is running at http://127.0.0.1:9000 with access key minio and secret key minio123. | |
| Make sure to update relevant sections with valid Minio server endpoint and credentials. | |
| --> | |
| <?xml version="1.0" encoding="UTF-8"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <configuration> | |
| <property> | |
| <name>fs.s3a.endpoint</name> | |
| <description>AWS S3 endpoint to connect to. An up-to-date list is | |
| provided in the AWS Documentation: regions and endpoints. Without this | |
| property, the standard region (s3.amazonaws.com) is assumed. | |
| </description> | |
| <value>http://127.0.0.1:9000</value> | |
| </property> | |
| <property> | |
| <name>fs.s3a.access.key</name> | |
| <description>AWS access key ID.</description> | |
| <value>minio</value> | |
| </property> | |
| <property> | |
| <name>fs.s3a.secret.key</name> | |
| <description>AWS secret key.</description> | |
| <value>minio123</value> | |
| </property> | |
| <property> | |
| <name>fs.s3a.path.style.access</name> | |
| <value>true</value> | |
| <description>Enable S3 path-style access, i.e. disable the default virtual-hosting behaviour. | |
| Useful for S3A-compliant storage providers as it removes the need to set up DNS for virtual hosting. | |
| </description> | |
| </property> | |
| <property> | |
| <name>fs.s3a.impl</name> | |
| <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value> | |
| <description>The implementation class of the S3A Filesystem</description> | |
| </property> | |
| </configuration> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Query a gzip-compressed CSV object with S3 Select and print the rows as CSV.

The client talks to the S3-compatible endpoint at http://localhost:9000.
Credentials, bucket name and object key are taken from the environment
variables AWS_ACCESS, AWS_SECRET, BUCKET_NAME and OBJECT_PATH; each one
falls back to the sample value hard-coded below when unset.
"""
import codecs
import os
import sys


def render_events(events, encoding="utf-8"):
    """Yield the text to write to stdout for a stream of S3 Select events.

    Args:
        events: Iterable of event dicts as produced by iterating the
            ``Payload`` of a ``select_object_content`` response. Only
            ``Records`` and ``Stats`` events produce output; any other
            event is ignored.
        encoding: Text encoding of the ``Records`` payload bytes.

    Yields:
        str: Decoded record text exactly as received (no separators are
        added between chunks), and, for a ``Stats`` event, the scanned
        and processed byte counts, each on its own labelled line.

    Raises:
        UnicodeDecodeError: If the payload bytes are not valid for
            ``encoding``, including a stream that ends in the middle of
            a multi-byte character.
    """
    # A Records payload is an arbitrary slice of the result bytes, so a
    # multi-byte character (or a CSV row) may straddle two events. The
    # incremental decoder keeps the dangling bytes until the next chunk.
    decoder = codecs.getincrementaldecoder(encoding)()
    for event in events:
        if 'Records' in event:
            text = decoder.decode(event['Records']['Payload'])
            if text:
                yield text
        elif 'Stats' in event:
            details = event['Stats']['Details']
            yield "Stats details bytesScanned: \n"
            yield f"{details['BytesScanned']}\n"
            yield "Stats details bytesProcessed: \n"
            yield f"{details['BytesProcessed']}\n"
    # Flush the decoder; this raises if the stream stopped mid-character.
    tail = decoder.decode(b"", final=True)
    if tail:
        yield tail


def main():
    """Run the S3 Select query and stream its output to stdout."""
    # Imported here so render_events stays importable without boto3.
    import boto3

    s3 = boto3.client(
        's3',
        endpoint_url='http://localhost:9000',
        aws_access_key_id=os.environ.get("AWS_ACCESS", 'minio'),
        aws_secret_access_key=os.environ.get("AWS_SECRET", 'minio123'),
        region_name='us-east-1',
    )
    response = s3.select_object_content(
        Bucket=os.environ.get("BUCKET_NAME", 'mycsvbucket'),
        Key=os.environ.get("OBJECT_PATH", 'sampledata/TotalPopulation.csv.gz'),
        ExpressionType='SQL',
        Expression="select * from s3object s where s.Location like '%United States%'",
        InputSerialization={
            'CSV': {
                "FileHeaderInfo": "USE",
            },
            'CompressionType': 'GZIP',
        },
        OutputSerialization={'CSV': {}},
    )
    # Write chunks verbatim: print() would append a newline after every
    # chunk and break rows that span a chunk boundary.
    for piece in render_events(response['Payload']):
        sys.stdout.write(piece)
    sys.stdout.flush()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment