Skip to content

Instantly share code, notes, and snippets.

@boshek
Created August 23, 2018 16:27
Show Gist options
  • Save boshek/d2393bc0edb2f77b50a83d88532ae284 to your computer and use it in GitHub Desktop.
Save boshek/d2393bc0edb2f77b50a83d88532ae284 to your computer and use it in GitHub Desktop.
title output
Simple SQL and dplyr
html_document
keep_md
true

Load Packages

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x purrr::is_null() masks testthat::is_null()
## x dplyr::lag()     masks stats::lag()
## x dplyr::matches() masks testthat::matches()
library(tidyhydat)
library(DBI) 

Setup database connection

conn <- hy_src()

Example dplyr-SQL equivalencies

Basic select a tbl

tbl(conn, "DLY_FLOWS") %>% 
  show_query()
## <SQL>
## SELECT *
## FROM `DLY_FLOWS`

Now select a table and filter

tbl(conn, "DLY_FLOWS") %>% 
  filter(STATION_NUMBER == "08MF005") %>% 
  show_query()
## <SQL>
## SELECT *
## FROM `DLY_FLOWS`
## WHERE (`STATION_NUMBER` = '08MF005')

Two filters

tbl(conn, "DLY_FLOWS") %>% 
  filter(STATION_NUMBER == "08MF005" & YEAR > 2013) %>% 
  show_query()
## <SQL>
## SELECT *
## FROM `DLY_FLOWS`
## WHERE (`STATION_NUMBER` = '08MF005' AND `YEAR` > 2013.0)

Let's create a group and take the mean of year

tbl(conn, "DLY_FLOWS") %>% 
  group_by(STATION_NUMBER) %>%
  summarise(year_mean = mean(YEAR, na.rm = TRUE)) %>% 
  show_query()
## <SQL>
## SELECT `STATION_NUMBER`, AVG(`YEAR`) AS `year_mean`
## FROM `DLY_FLOWS`
## GROUP BY `STATION_NUMBER`

How selecting just two columns from a table

tbl(conn, "DLY_FLOWS") %>% 
  select(STATION_NUMBER, YEAR) %>% 
  show_query()
## <SQL>
## SELECT `STATION_NUMBER`, `YEAR`
## FROM `DLY_FLOWS`

Mutating a column

tbl(conn, "DLY_FLOWS") %>% 
  select(YEAR) %>% 
  mutate(Year_plus_one = YEAR + 1) %>% 
  show_query()
## <SQL>
## SELECT `YEAR`, `YEAR` + 1.0 AS `Year_plus_one`
## FROM (SELECT `YEAR`
## FROM `DLY_FLOWS`)

Let's try a join

tbl(conn, "STN_DATUM_CONVERSION") %>% 
  select(STATION_NUMBER, cf = CONVERSION_FACTOR) %>% 
  left_join(tbl(conn, "SED_SAMPLES") %>% 
              select(STATION_NUMBER, fl = FLOW), 
            by = c("STATION_NUMBER")) %>% 
  show_query()
## <SQL>
## SELECT `TBL_LEFT`.`STATION_NUMBER` AS `STATION_NUMBER`, `TBL_LEFT`.`cf` AS `cf`, `TBL_RIGHT`.`fl` AS `fl`
##   FROM (SELECT `STATION_NUMBER`, `CONVERSION_FACTOR` AS `cf`
## FROM `STN_DATUM_CONVERSION`) AS `TBL_LEFT`
##   LEFT JOIN (SELECT `STATION_NUMBER`, `FLOW` AS `fl`
## FROM `SED_SAMPLES`) AS `TBL_RIGHT`
##   ON (`TBL_LEFT`.`STATION_NUMBER` = `TBL_RIGHT`.`STATION_NUMBER`)
hy_src_disconnect(conn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment