# fetch temperature data
# (tbl_series, tbl_variables, tbl_values, tbl_locations, tbl_agencies are assumed to be
#  dplyr references to the corresponding database tables, created elsewhere)
library(dplyr)
library(lubridate)  # for with_tz()

# note: the right-hand side is evaluated before tbl_values is reassigned, so the
# left_join(tbl_values, ...) below still refers to the original values table
tbl_values <- left_join(tbl_series,
                        select(tbl_variables, variable_id, variable_name),
                        by=c('variable_id'='variable_id')) %>%
  select(-file_id) %>%
  filter(location_id %in% df_locations$location_id,
         variable_name=="TEMP") %>%
  left_join(tbl_values,
            by=c('series_id'='series_id')) %>%
  left_join(select(tbl_locations, location_id, location_name, latitude, longitude, featureid=catchment_id),
            by=c('location_id'='location_id')) %>%
  left_join(tbl_agencies,
            by=c('agency_id'='agency_id')) %>%
  mutate(year = date_part('year', datetime))
df_values <- collect(tbl_values)
df_values <- df_values %>%
  mutate(datetime=with_tz(datetime, tzone='EST'))
summary(df_values)
# create climateData input dataset (too big without a pre-filter or smaller join)
# tried to do the year filter within the postgres tbl query but was getting errors with recognizing
climate <- tbl_daymet %>%
  mutate(year = date_part('year', date)) %>%
  filter(featureid %in% df_locations$featureid)
tbl_climate <- climate %>%
  filter(year %in% unique(df_values$year))  # distinct() doesn't work on numeric values
climateData <- collect(tbl_climate)
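As a quick sanity check, you could confirm that the collected table only covers the target catchments and years (a sketch, assuming the featureid and year columns created above survive collect()):
# sanity check (sketch): climateData should only contain the pre-filtered catchments and years
stopifnot(all(climateData$featureid %in% df_locations$featureid))
stopifnot(all(climateData$year %in% unique(df_values$year)))
dim(climateData)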
Ok I think this will work for getting the daily daymet values associated with only the catchments and years that have data. The first part (WITH loc_year AS (...)) is called a Common Table Expression (CTE) and basically creates a temporary table named loc_year that can then be used in the join with the daymet table. Note this table is not saved in the database, it is only used by the query when it is run.
The loc_year CTE will be a table containing the unique combinations of featureid, year, variable, and number of values (n). But it is also filtered to only include water temperature (so the variable column is kind of moot). The query below also filters for only two featureids ('740020', '746011') just for testing. You can remove the last line of the WHERE clause (AND l.catchment_id IN ('740020', '746011')) to retrieve all the featureids with data.
WITH loc_year AS (
SELECT l.catchment_id AS featureid,
DATE_PART('year', v.datetime) AS year,
var.name as variable, count(v.value) AS N
FROM series s, values v, locations l, variables var
WHERE s.id = v.series_id
AND s.location_id=l.id
AND s.variable_id=var.id
AND var.name='TEMP'
AND l.catchment_id IN ('740020', '746011')
GROUP BY var.name, featureid, year
)
SELECT date_part('year', d.date) as year, d.featureid, ly.n as n_values,
d.date, d.tmax, d.tmin, d.prcp, d.dayl, d.srad, d.vp, d.swe
FROM daymet d
INNER JOIN loc_year ly
ON d.featureid=ly.featureid
AND date_part('year', d.date)=ly.year
ORDER BY d.featureid, d.date;
The multicolumn index (featureid, year) on daymet is still being created, which might take a few hours, but once it is ready this query should hopefully be at least somewhat faster.
Here's code to try running in R:
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
# create connection
con <- dbConnect(drv, dbname="conte_dev", host="127.0.0.1", user="conte", password="conte")
# create sql query string
qry <- "WITH loc_year AS (
SELECT l.catchment_id AS featureid,
DATE_PART('year', v.datetime) AS year,
var.name as variable, count(v.value) AS N
FROM series s, values v, locations l, variables var
WHERE s.id = v.series_id
AND s.location_id=l.id
AND s.variable_id=var.id
AND var.name='TEMP'
AND l.catchment_id IN ('740020', '746011')
GROUP BY var.name, featureid, year
)
SELECT date_part('year', d.date) as year, d.featureid, ly.n as n_values,
d.date, d.tmax, d.tmin, d.prcp, d.dayl, d.srad, d.vp, d.swe
FROM daymet d
INNER JOIN loc_year ly
ON d.featureid=ly.featureid
AND date_part('year', d.date)=ly.year
ORDER BY d.featureid, d.date;"
# submit query
result <- dbSendQuery(con, qry)
# fetch results (n=-1 means return all rows, use n=5 to return just first 5 rows, for example)
df <- fetch(result, n=-1)
dbClearResult(result)  # release the result set once all rows have been fetched
# check that each featureid has one or more complete years of daily daymet values
table(df$year, df$featureid)
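# (optional sketch, not part of the original workflow) tally daymet rows per featureid and year;
# Daymet uses a 365-day calendar, so a complete year should have 365 rows
library(dplyr)
df %>%
  group_by(featureid, year) %>%
  summarise(n_days = n())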
# plot
library(ggplot2)
theme_set(theme_bw())
ggplot(df, aes(date, tmax)) +
geom_line() +
facet_wrap(~featureid)
FYI, I'm building a new index on the daymet table that is based on both featureid and date_part('year', daymet.date). This should drastically speed up these queries since most daymet queries will involve filtering by both featureid and year. Cross your fingers...
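For reference, the statement behind that index is presumably something along these lines (a sketch; the index name is made up and the actual command being run may differ):
CREATE INDEX daymet_featureid_year_idx
  ON daymet (featureid, date_part('year', date));
Because the query joins on the same date_part('year', d.date) expression, the planner should be able to use this expression index for the year condition as well as the featureid match.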