This code is a companion to my blog post: https://blog.sandipb.net/2015/05/20/serializing-structured-data-into-avro-using-python/
Last active
January 20, 2024 22:40
-
-
Save sandipb/53ce9e81569adf29d37a to your computer and use it in GitHub Desktop.
Avro schema for weather data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Serialize weather_2012.csv into Avro container files.

Reads the record schema from weather.avsc, converts every CSV row to the
typed values built below, and writes the same records to three output
files, one per codec: "null", "deflate" and "snappy".
"""
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
import csv
import calendar
import time

# Read the schema inside a ``with`` block so the file handle is closed
# as soon as the text has been parsed.
with open("weather.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

# Header: Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
fields = "time temp dew_point_temp humidity wind_speed visibility pressure weather".split()
# NOTE(review): wind_speed is parsed as int here while the companion schema
# declares it as float -- confirm which one is intended.
int_fields = "humidity wind_speed".split()
float_fields = "temp dew_point_temp visibility pressure".split()

writers = []
try:
    # One output file per codec; every record is appended to all of them.
    for codec in ("null", "deflate", "snappy"):
        output = open("weather_data_%s.avro" % codec, "wb")
        writers.append(DataFileWriter(output, DatumWriter(), schema, codec=codec))

    with open("weather_2012.csv") as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header line. The builtin next() works on Python 2.6+ and
        # Python 3 (reader.next() is Python 2 only); the default keeps an
        # empty file from raising StopIteration.
        next(reader, None)
        for boring_row in reader:
            row = dict(zip(fields, boring_row))

            # Convert fields to the right type.
            # time.mktime() interprets the parsed timestamp as local time.
            parsed_time = time.strptime(row["time"], "%Y-%m-%d %H:%M:%S")
            row["time"] = int(time.mktime(parsed_time))
            for int_field in int_fields:
                row[int_field] = int(row[int_field])
            for float_field in float_fields:
                row[float_field] = float(row[float_field])

            for writer in writers:
                writer.append(row)
finally:
    # Close every writer that was opened, even when a row fails to convert,
    # so that buffered records are flushed to disk.
    for writer in writers:
        writer.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "namespace": "net.sandipb.avro.example.weather",
  "type": "record",
  "name": "Reading",
  "doc": "Weather reading at a point in time",
  "fields": [
    {"name": "time", "type": "int", "doc": "Seconds since epoch"},
    {"name": "temp", "type": "float", "doc": "Temperature in Celsius"},
    {"name": "dew_point_temp", "type": "float", "doc": "Dew point temperature in Celsius"},
    {"name": "humidity", "type": "int", "doc": "Relative humidity %"},
    {"name": "wind_speed", "type": "float", "doc": "Wind speed in km/h"},
    {"name": "visibility", "type": "float", "doc": "Visibility in km"},
    {"name": "pressure", "type": "float", "doc": "Atmospheric pressure in kPa"},
    {"name": "weather", "type": "string", "doc": "Weather summary"}
  ]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Will you explain the code?