A tool to convert Ameriflux CSV files into a CF-compliant netCDF4 format
""" | |
Converts csv files downloaded from the Ameriflux archive into a CF-compliant netCDF4 file. | |
""" | |
import argparse | |
import glob | |
import re | |
import xlrd | |
import numpy as np | |
import cftime as cf | |
from netCDF4 import Dataset | |
from cf_units import Unit | |
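# Note on inputs (an assumption based on what the parser below expects): the variable
# table "amf.txt" is a tab-separated dump of the Ameriflux variable list, one variable
# per line as name<TAB>description<TAB>units, e.g. an illustrative line such as
# "FC<TAB>Carbon Dioxide (CO2) turbulent flux<TAB>µmolCO2 m-2 s-1". ParseAMFUnitsTable()
# rewrites such units into a form cf_units can parse ("1e-6 mol m-2 s-1") and keeps
# only the entries whose units validate.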
def ParseAMFUnitsTable():
    """Parse 'amf.txt' into dictionaries of variable descriptions and udunits-compatible units."""
    change = {"adimensional":"1", "deg C":"degC", "Decimal degrees":"degrees", "‰ (permil)":"1e-3"}
    prefix = {"n":"1e-9", "µ":"1e-6", "m":"1e-3"}
    desc = {}
    unit = {}
    for line in open("amf.txt").readlines():
        line = line.replace("\n", "")
        if "TIMESTAMP" in line: continue
        if line.count("\t") == 0: continue
        key, d, u = line.split("\t")
        # rewrite units like 'µmolCO2 m-2 s-1' as '1e-6 mol m-2 s-1', moving the species into the description
        match = re.search("(.*)mol(.*)", u.split()[0])
        if match:
            U = u.replace("%smol" % (match.group(1)),
                          "%s mol" % (prefix[match.group(1)]))
            u = U.replace(match.group(2), "")
            d = d + " " + match.group(2)
        for chg in change.keys():
            u = u.replace(chg,change[chg])
        try:
            # keep only entries whose units cf_units can parse
            Unit(u)
            desc[key] = d
            unit[key] = u
        except ValueError:
            pass
    return desc, unit
def FindBaseName(name, names):
    """Return the longest underscore-delimited prefix of name found in names (e.g. 'SWC' for 'SWC_1_1_1')."""
    name = name.split("_")
    base = ""
    for i in range(len(name)):
        tmp = "_".join(name[:(i+1)])
        if tmp in names:
            base = tmp
    if base == "":
        print(name,names)
    return base
def ParseAMFExcelFile(filename):
    """Read the site metadata spreadsheet into a dictionary keyed by the 4th column with values from the 5th."""
    data = {}
    with xlrd.open_workbook(filename) as wb:
        ws = wb.sheet_by_index(0)
        for i in range(1,ws.nrows):
            val = ws.cell_value(i,4)
            try:
                val = float(val)
                if val.is_integer(): val = int(val)
            except ValueError:
                pass
            data[ws.cell_value(i,3)] = val
    return data
def ToNetCDF4(dset,name,lat,lon,t,tb,v,attributes=None,prealloc=[]):
    """Encode a single timeseries variable into the open netCDF4 dataset (prealloc is accepted but unused)."""
    if not dset.dimensions.keys():
        # first variable written: create the dimensions and coordinate variables
        dset.createDimension("ndata",size=1)
        dset.createDimension("nb",size=2)
        dset.createDimension("time")
        T = dset.createVariable("time","double",("time"))
        T.setncattr("units","days since 1850-01-01 00:00:00")
        T.setncattr("calendar","standard")
        T.setncattr("axis","T")
        T.setncattr("long_name","time")
        T.setncattr("standard_name","time")
        T.setncattr("bounds","time_bnds")
        T[...] = t
        TB = dset.createVariable("time_bnds","double",("time","nb"))
        TB[...] = tb
        LA = dset.createVariable("lat","double",("ndata"))
        LA[...] = np.asarray([lat])
        LO = dset.createVariable("lon","double",("ndata"))
        LO[...] = np.asarray([lon])
    V = dset.createVariable(name,"double",("time","ndata"),zlib=True,chunksizes=(t.size,1))
    V[...] = v.reshape((-1,1))
    if attributes:
        for key in attributes.keys():
            V.setncattr(key,attributes[key])
csvs = glob.glob("*.csv")
xlss = glob.glob("*.xlsx")
sites = [csv.split("_")[1] for csv in csvs]
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--sites', dest="sites", metavar='SITES', type=str, nargs="+", default=sites)
args = parser.parse_args()
desc, unit = ParseAMFUnitsTable()
for site in args.sites:
    csv = [f for f in csvs if site in f]
    xls = [f for f in xlss if site in f]
    if not csv:
        print("No csv file found for the site %s" % (site))
        continue
    if len(csv) > 1:
        print("Multiple csv files found for the site %s: " % (site), ", ".join(csv))
        continue
    csv = csv[0]
    print("Parsing site %s..." % site)

    # parse the xlsx file
    if not xls:
        print(" No Excel file found")
        gattrs = {}
    else:
        print(" getting meta data from %s" % xls[0])
        gattrs = ParseAMFExcelFile(xls[0])
    version = float(csv.replace(".csv", "").split("_")[-1].replace("-", "."))
    gattrs["version"] = version

    # search for the site latitude/longitude
    lats = [key for key in gattrs.keys() if "LOCATION_LAT" in key]
    lons = [key for key in gattrs.keys() if "LOCATION_LONG" in key]
    if not ((len(lats) == 1) and (len(lons) == 1)):
        print(" Unknown site location, skipping")
        continue
    lat = gattrs[lats[0]]
    lon = gattrs[lons[0]]

    # parse the csv file
    rec = np.genfromtxt(csv, delimiter=",", skip_header=2, names=True) #, max_rows=10)
    t0 = rec['TIMESTAMP_START'].astype(str)
    t0 = cf.date2num([cf.datetime(int(t[:4]), int(t[4:6]), int(t[6:8]), int(t[8:10]), int(t[10:12]))
                      for t in t0], "days since 1850-01-01")
    tf = rec['TIMESTAMP_END'].astype(str)
    tf = cf.date2num([cf.datetime(int(t[:4]), int(t[4:6]), int(t[6:8]), int(t[8:10]), int(t[10:12]))
                      for t in tf], "days since 1850-01-01")
    tb = np.vstack([t0, tf]).T
    t = tb.mean(axis=1)
    vnames = [vname for vname in rec.dtype.names if "TIMESTAMP" not in vname]

    # write one netCDF4 file per site
    with Dataset("AMF_%s.nc" % site, mode="w") as dset:
        for attr in gattrs.keys():
            dset.setncattr(attr,gattrs[attr])
        for vname in vnames:
            v = np.ma.masked_values(rec[vname], -9999)
            if v.mask.all():
                # if all the data in a variable is marked invalid,
                # then we will skip the variable
                print(" skipping %s, all invalid" % vname)
                continue
            print(" encoding %s" % vname)
            base = FindBaseName(vname, unit.keys())
            ToNetCDF4(dset, vname, lat, lon, t, tb, v,
                      attributes={"units":unit[base],
                                  "standard_name":desc[base]},
                      prealloc=vnames)
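Usage sketch (names here are illustrative, not part of the script): run it from a directory
containing the downloaded Ameriflux csv files, the matching site metadata .xlsx files, and the
"amf.txt" variable table. With no arguments every site found is converted; pass --sites to
restrict it, e.g. "python amf_to_nc.py --sites US-UMB" if the file is saved as amf_to_nc.py.
The result can then be inspected with netCDF4, for example:

    from netCDF4 import Dataset
    with Dataset("AMF_US-UMB.nc") as dset:   # one file per site, named AMF_<site>.nc
        print(dset.ncattrs())                # global attributes copied from the Excel metadata
        print(dset["time"].units)            # "days since 1850-01-01 00:00:00"
        print(dset["FC"][:5])                # FC is only an example variable name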