GlobalForestWatch Output Audits
#%%
import os
import numpy as np
import xarray

gt_dir = "/opt/data/gfw2-data/forest_change/umd_landsat_alerts/prod/analysis"
pro_dir = "/opt/data/gfwpro/gfwpro-raster-data"

gt_rasters = [file for file in os.listdir(gt_dir) if file.endswith(".tif")]
pro_rasters = [file for file in os.listdir(pro_dir) if file.endswith(".tif")]

# Audit: every production raster must be pixel-identical to the
# ground-truth raster of the same name.
for path in gt_rasters:
    print(f"Checking: {path}")
    with xarray.open_rasterio(f"{gt_dir}/{path}") as gt_raster:
        with xarray.open_rasterio(f"{pro_dir}/{path}") as pro_raster:
            assert np.array_equal(gt_raster.data, pro_raster.data)
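#%% optional: parallel version of the raster audit
# A minimal sketch, not part of the original audit, showing how the per-file
# comparison above could be fanned out over a multiprocessing pool (the
# original cell imported multiprocessing without using it). _check_pair is a
# hypothetical helper introduced here for illustration.
import multiprocessing as mp

def _check_pair(path):
    with xarray.open_rasterio(f"{gt_dir}/{path}") as gt_raster:
        with xarray.open_rasterio(f"{pro_dir}/{path}") as pro_raster:
            return path, np.array_equal(gt_raster.data, pro_raster.data)

with mp.Pool() as pool:
    for path, ok in pool.imap_unordered(_check_pair, gt_rasters):
        print(f"{path}: {'OK' if ok else 'MISMATCH'}")
        assert ok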
#%%
from datetime import date, timedelta
from typing import Tuple

def decode_alert(v: int) -> Tuple[date, bool]:
    """Decode a packed GLAD alert pixel: values >= 30000 are high-confidence,
    values >= 20000 are low-confidence, and the remainder is the day offset
    from 2014-12-31."""
    if v >= 30000:
        confidence = True
        days = v - 30000
    elif v >= 20000:
        confidence = False
        days = v - 20000
    else:
        raise ValueError(v)
    base = date(year=2014, month=12, day=31)
    delta = timedelta(days=days)
    return (base + delta, confidence)

assert decode_alert(20732) == (date(2017, 1, 1), False)
assert decode_alert(30732) == (date(2017, 1, 1), True)
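#%%
# Illustrative inverse of decode_alert, added here as a sketch (it is not part
# of the original notebook): it simply mirrors the decoding rules above, so a
# round trip through encode/decode is a cheap sanity check on both directions.
def encode_alert(d: date, confidence: bool) -> int:
    days = (d - date(year=2014, month=12, day=31)).days
    return (30000 if confidence else 20000) + days

assert encode_alert(date(2017, 1, 1), False) == 20732
assert decode_alert(encode_alert(date(2020, 7, 15), True)) == (date(2020, 7, 15), True)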
#%%
from itertools import islice
from typing import List, Tuple
import math

from numpy import linspace

def split_bbox(bbox, x_step, y_step) -> List[Tuple]:
    """Split a (xmin, ymin, xmax, ymax) bbox into a grid of sub-boxes no
    larger than x_step by y_step."""
    def n_grams(a, n):
        # Sliding windows of length n, e.g. [0, 1, 2] -> [(0, 1), (1, 2)].
        z = (islice(a, i, None) for i in range(n))
        return list(zip(*z))

    (xmin, ymin, xmax, ymax) = bbox
    x_tiles = math.ceil((xmax - xmin) / x_step)
    y_tiles = math.ceil((ymax - ymin) / y_step)
    x_breaks = linspace(xmin, xmax, num=x_tiles + 1)
    y_breaks = linspace(ymin, ymax, num=y_tiles + 1)
    return [(xmin, ymin, xmax, ymax)
            for (ymin, ymax) in n_grams(y_breaks, 2)
            for (xmin, xmax) in n_grams(x_breaks, 2)]
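#%%
# Quick sanity check of split_bbox with made-up values (not from the audit):
# a 1 x 1 degree box split with 0.5-degree steps should yield a 2 x 2 grid of
# cells that tile the original extent.
cells = split_bbox((0.0, 0.0, 1.0, 1.0), x_step=0.5, y_step=0.5)
assert len(cells) == 4
assert cells[0] == (0.0, 0.0, 0.5, 0.5)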
#%%
glad_pro = "/opt/data/gfwpro/gfwpro-raster-data/050W_20S_040W_10S.tif"
glad_gt = "/opt/data/gfw2-data/forest_change/umd_landsat_alerts/prod/analysis/050W_20S_040W_10S.tif"
#%%
import rasterio as rio
from shapely.geometry import box
import geopandas as gpd
from wri_list_tools import write_list_tsv

with rio.open(glad_gt) as ra:
    bounds = ra.bounds
geom = box(*bounds)

# Split the tile footprint into a 100 x 100 grid of test locations.
windows = split_bbox(bounds,
                     x_step=(bounds[2] - bounds[0]) / 100,
                     y_step=(bounds[3] - bounds[1]) / 100)

glad_test_df = gpd.GeoDataFrame.from_dict({
    "list_id": [1] * len(windows),
    "location_id": list(range(0, len(windows))),
    "geometry": [box(*bounds) for bounds in windows]
})

write_list_tsv(glad_test_df, "/tmp/glad-test-location-split-100.tsv")
#%%
from wri_list_tools import read_list_tsv

read_list_tsv("/tmp/glad-test-location-split-100.tsv") \
    .to_file("/tmp/glad-test-location-split-100.geojson", driver="GeoJSON")
#%%
import xarray

arr = xarray.open_rasterio(glad_gt)
#%%
import numpy as np

# Keep only the pixels that actually carry an alert (zero means no alert).
glad_alerts = arr.data[np.where(arr.data > 0)]
(arr.data.size, len(glad_alerts))
#%%
# Count alerts per day, ISO week, and month from the decoded pixel values.
day_alerts = {}
week_alerts = {}
month_alerts = {}
for alert in glad_alerts:
    assert alert > 0
    (d, conf) = decode_alert(int(alert))
    month_key = f"{d.year}-{d.month:02}"
    isoc = d.isocalendar()
    week_key = f"{isoc.year}-{isoc.week:02}"
    day_key = d.isoformat()
    day_alerts[day_key] = day_alerts.get(day_key, 0) + 1
    week_alerts[week_key] = week_alerts.get(week_key, 0) + 1
    month_alerts[month_key] = month_alerts.get(month_key, 0) + 1

from pprint import pprint
pprint(month_alerts)
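#%%
# Cross-check sketch, not the original approach: decode every alert into a
# DataFrame and let pandas do the monthly grouping; under the same key format
# the result should match the dict-based month_alerts counts exactly.
import pandas as pd

decoded = pd.DataFrame([decode_alert(int(a)) for a in glad_alerts],
                       columns=["date", "confidence"])
monthly_check = decoded.groupby(
    decoded["date"].map(lambda d: f"{d.year}-{d.month:02}")).size()
assert monthly_check.to_dict() == month_alerts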
#%% load Dashboard job output
from wri_list_tools import Dashboard

dash_test = Dashboard("/tmp/dashboard-glad-test-matched")
dash_test.df
# dash_test['glad_alerts_monthly'].reset_index()['gadm_id'].unique()
#%% compare one month: Python count vs Dashboard output
month_key = '2020-07'
gt_ans = (dash_test['glad_alerts_monthly']
          .reset_index()
          .query(f"month == '{month_key}'")
          ['glad_alerts_monthly'].sum())
py_ans = month_alerts[month_key]
(py_ans, gt_ans)
#%% GDAL valid alerts
# Nonzero-pixel fraction reported by GDAL stats times the 40000 x 40000 tile.
int(0.00856 * 40000 * 40000)

#%% total GLAD alerts from Python
int(sum(month_alerts.values()))

#%% total GLAD alerts from GT
dash_test['glad_alerts_daily']['glad_alerts_daily'].sum()
#%% Monthly aggregate from GT
gt_glad_alerts_monthly = (dash_test['glad_alerts_monthly']
                          .reset_index()[['month', 'glad_alerts_monthly']]
                          .groupby(['month']).sum().sort_index())
gt_glad_alerts_monthly

#%% monthly DF from Python
import pandas as pd

py_glad_alerts_monthly = (pd.DataFrame
                          .from_dict(month_alerts.copy(), orient="index",
                                     columns=['glad_alerts_monthly'])
                          .rename_axis("month").sort_index())
py_glad_alerts_monthly

#%% join monthly alerts
gt_glad_alerts_monthly.compare(py_glad_alerts_monthly)
#%% Weekly aggregate from GT
gt_glad_alerts_weekly = (dash_test['glad_alerts_weekly']
                         .reset_index()[['week', 'glad_alerts_weekly']]
                         .groupby(['week']).sum().sort_index())
gt_glad_alerts_weekly

#%% weekly DF from Python
py_glad_alerts_weekly = (pd.DataFrame
                         .from_dict(week_alerts, orient="index",
                                    columns=['glad_alerts_weekly'])
                         .rename_axis("week").sort_index())
py_glad_alerts_weekly

#%% show only the weeks where the two counts disagree
joined = py_glad_alerts_weekly.join(gt_glad_alerts_weekly, lsuffix="_py", rsuffix="_gt")
# joined.to_csv("/tmp/weekly_diff.csv")
joined.query("glad_alerts_weekly_py != glad_alerts_weekly_gt")
#%% Daily aggregate from GT
gt_glad_alerts_daily = (dash_test['glad_alerts_daily']
                        .reset_index()[['date', 'glad_alerts_daily']]
                        .groupby(['date']).sum().sort_index())
gt_glad_alerts_daily

#%% daily DF from Python
py_glad_alerts_daily = (pd.DataFrame
                        .from_dict(day_alerts.copy(), orient="index",
                                   columns=['glad_alerts_daily'])
                        .rename_axis("date").sort_index())
py_glad_alerts_daily

#%% show only the days where the two counts disagree
joined = py_glad_alerts_daily.join(gt_glad_alerts_daily, lsuffix="_py", rsuffix="_gt")
# joined.to_csv("/tmp/daily_diff.csv")
joined.query("glad_alerts_daily_py != glad_alerts_daily_gt")
## Prepare test inputs from GADM context features
#%% What is the Dashboard job being intersected with?
import pandas as pd
from shapely import wkb
import geopandas as gpd

context_df = pd.read_csv("/opt/data/gfwpro/gadm36_adm2_1_1.tsv", delimiter="\t")
context_df['geometry'] = context_df['geom'].map(lambda s: wkb.loads(s, hex=True))
# gpd.GeoDataFrame(context_df.drop('geom', axis=1)).to_file("/tmp/gadm36_adm2_1_1.gpkg", driver="GPKG")

# Build an input list from the GADM features on the 10S_050W tile.
input_df = context_df.query("tile_id == '10S_050W'")[['fid', 'geometry']]
input_df = input_df.rename(columns={'fid': 'location_id'})
input_df['list_id'] = 1
input_df = input_df.set_index(['location_id', 'list_id'])
write_list_tsv(input_df, '/tmp/gadm36_input_10S_050W.tsv')
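#%%
# Round-trip sanity check, added as a sketch: read the list back with the same
# read_list_tsv helper used earlier and confirm no locations were lost in the
# TSV encoding (assumes the helper returns one row per location).
roundtrip_df = read_list_tsv('/tmp/gadm36_input_10S_050W.tsv')
assert len(roundtrip_df) == len(input_df)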