Created
July 13, 2018 14:28
-
-
Save pbrumblay/8b6d462b027cb96630d6e3a265be107d to your computer and use it in GitHub Desktop.
Python script to extract the schema from an Avro file stored in Google Cloud Storage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from google.cloud import storage | |
| import sys | |
| from avro.datafile import DataFileReader | |
| from avro.io import DatumReader | |
| import json | |
# Upper byte bound passed to the ranged download. Only the beginning of the
# object is fetched because the schema is read from the file's metadata, not
# from its records. NOTE(review): a schema larger than this bound would be
# truncated -- raise the limit if that ever happens.
HEADER_BYTE_LIMIT = 100000


def format_schema(raw_schema):
    """Return an Avro schema as pretty-printed JSON with sorted keys.

    Args:
        raw_schema: The schema JSON document as ``str`` or UTF-8 ``bytes``.

    Returns:
        The schema re-serialized with a 4-space indent and sorted keys.

    Raises:
        ValueError: If ``raw_schema`` is not valid JSON.
    """
    if isinstance(raw_schema, bytes):
        # Decode explicitly so the result does not depend on json.loads
        # accepting bytes input.
        raw_schema = raw_schema.decode("utf-8")
    return json.dumps(json.loads(raw_schema), indent=4, sort_keys=True)


def fetch_schema(bucket_name, blob_name, file_name):
    """Download the start of a blob and return its raw ``avro.schema`` metadata.

    Args:
        bucket_name: Name of the bucket holding the Avro file.
        blob_name: Name of the blob (object) inside the bucket.
        file_name: Local path the partial download is written to.

    Returns:
        The value of the ``avro.schema`` metadata entry, as returned by the
        Avro reader.

    Raises:
        ValueError: If the bucket lookup returns ``None``.
    """
    client = storage.Client()
    bucket = client.lookup_bucket(bucket_name)
    if bucket is None:
        raise ValueError('Could not find bucket %s' % bucket_name)

    blob = bucket.blob(blob_name)
    blob.download_to_filename(file_name, start=0, end=HEADER_BYTE_LIMIT)

    # The context manager guarantees the local file is closed even if the
    # Avro reader fails to parse the (partial) file.
    with open(file_name, "rb") as avro_file:
        reader = DataFileReader(avro_file, DatumReader())
        return reader.get_meta('avro.schema')


def main(argv=None):
    """Print the schema of the Avro file named on the command line.

    Args:
        argv: Argument list ``[bucket_name, blob_name, file_name]``; defaults
            to ``sys.argv[1:]``. Extra trailing arguments are ignored.

    Raises:
        SystemExit: If fewer than three arguments are supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit('usage: <script> BUCKET_NAME BLOB_NAME LOCAL_FILE_NAME')

    bucket_name, blob_name, file_name = args[:3]
    print(format_schema(fetch_schema(bucket_name, blob_name, file_name)))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment