sql_build creates a select_query S3 object, that is rendered to a SQL string
by sql_render. The output from sql_build is designed to be easy to test, as
it's database diagnostic, and has a hierarchical structure.
-
opA sequence of lazy operations -
conA database connection. The defaultNULLuses a set of rules that should be very similar to ANSI 92, and allows for testing without an active database connection. -
...Other arguments passed on to the methods. Not currently used.
sql_build(op, con, ...)
select_query(from, select = sql("*"), where = character(),
group_by = character(), having = character(), order_by = character(),
limit = NULL, distinct = FALSE)
join_query(x, y, type = "inner", by = NULL, suffix = c(".x", ".y"))
semi_join_query(x, y, anti = FALSE, by = NULL)
set_op_query(x, y, type = type)
sql_render(query, con = NULL, ...)sql_buildis generic over the lazy operations, lazy_ops, and generates an S3 object that represents the query.sql_rendertakes a query object and then calls a function that is generic over the database. For example,sql_build.op_mutategenerates aselect_query, andsql_render.select_querycallssql_select, which has different methods for different databases.- The default methods should generate ANSI 92 SQL where possible, so backends only need to override the methods if they are not ANSI compliant.
# ----------------------------- Single table ops -----------------------------
sql_build.op_select(op, con, ...){
vars <- select_vars_(op_vars(op$x), op$dots, include = op_grps(op$x))
select_query(sql_build(op$x, con), ident(vars))}
sql_build.op_rename(op, con, ...) {
vars <- rename_vars_(op_vars(op$x), op$dots)
select_query(sql_build(op$x, con), ident(vars))}
sql_build.op_arrange(op, con, ...) {
order_vars <- translate_sql_(op$dots, con, op_vars(op$x))
group_vars <- c.sql(ident(op_grps(op$x)), con = con)
select_query(sql_build(op$x, con), order_by = order_vars)}
sql_build.op_summarise(op, con, ...){
select_vars <- translate_sql_(op$dots, con, op_vars(op$x), window = FALSE)
group_vars <- c.sql(ident(op_grps(op$x)), con = con)
select_query(sql_build(op$x, con),
select = c.sql(group_vars, select_vars, con = con),
group_by = group_vars)}
sql_build.op_mutate(op, con, ...){
vars <- op_vars(op$x)
new_vars <- translate_sql_(op$dots, con, vars,
vars_group = op_grps(op),
vars_order = op_sort(op))
old_vars <- ident(setdiff(vars, names(new_vars)))
select_query(sql_build(op$x, con),
select = c.sql(old_vars, new_vars, con = con))}
sql_build.op_head(op, con, ...){
select_query(sql_build(op$x, con), limit = op$args$n)}
sql_build.op_group_by(op, con, ...){
sql_build(op$x, con, ...)}
sql_build.op_ungroup(op, con, ...) {
sql_build(op$x, con, ...)}
sql_build.op_filter(op, con, ...) {
vars <- op_vars(op$x)
where_sql <- translate_sql_(op$dots, con, vars = vars)
select_query(sql_build(op$x, con),
where = where_sql)}
sql_build.op_distinct(op, con, ...) {
if (length(op$dots) == 0) {
select_query(sql_build(op$x, con),
distinct = TRUE)}
else {
if (op$args$.keep_all) {
stop("Can't calculate distinct only on specified columns
with SQL unless .keep_all is FALSE", call. = FALSE)}
group_vars <- c.sql(ident(names(op$dots)), con = con)
select_query(sql_build(op$x, con),
select = group_vars,
group_by = group_vars)}}
# ----------------------------- Dual table ops -----------------------------
sql_build.op_join(op, con, ...) {
# Ensure tables have unique names
x_names <- op_vars(op$x)
y_names <- op_vars(op$y)
by <- op$args$by
uniques <- unique_names(x_names, y_names, by = by, suffix = op$args$suffix)
if (is.null(uniques)) {
x <- op$x; y <- op$y}
else {
# TODO: it would be better to construct an explicit FROM statement
# that used the table names to disambiguate the fields names: this
# would remove a layer of subqueries and would make sql_join more
# flexible.
x <- select_(op$x, .dots = setNames(x_names, uniques$x))
y <- select_(op$y, .dots = setNames(y_names, uniques$y))
by$x <- unname(uniques$x[by$x])
by$y <- unname(uniques$y[by$y])}
join_query(x, y,
type = op$args$type,
by = by)}
sql_build.op_semi_join(op, con, ...) {
semi_join_query(op$x, op$y, anti = op$args$anti, by = op$args$by)}
sql_build.op_set_op(op, con, ...) {
set_op_query(op$x, op$y, type = op$args$type)}select_query <- function(from,
select = sql("*"),
where = character(),
group_by = character(),
having = character(),
order_by = character(),
limit = NULL,
distinct = FALSE) {
stopifnot(is.character(select))
stopifnot(is.character(where))
stopifnot(is.character(group_by))
stopifnot(is.character(having))
stopifnot(is.character(order_by))
stopifnot(is.null(limit) || (is.numeric(limit) && length(limit) == 1L))
stopifnot(is.logical(distinct), length(distinct) == 1L)
structure(
list(
from = from,
select = select,
where = where,
group_by = group_by,
having = having,
order_by = order_by,
distinct = distinct,
limit = limit
),
class = c("select_query", "query")
)
}
print.select_query <- function(x, ...) {
cat("<SQL SELECT", if (x$distinct) " DISTINCT", ">\n", sep = "")
cat("From: ", x$from, "\n", sep = "")
if (length(x$select)) cat("Select: ", named_commas(x$select), "\n", sep = "")
if (length(x$where)) cat("Where: ", named_commas(x$where), "\n", sep = "")
if (length(x$group_by)) cat("Group by: ", named_commas(x$group_by), "\n", sep = "")
if (length(x$order_by)) cat("Order by: ", named_commas(x$order_by), "\n", sep = "")
if (length(x$having)) cat("Having: ", named_commas(x$having), "\n", sep = "")
if (length(x$limit)) cat("Limit: ", x$limit, "\n", sep = "")}
join_query <- function(x, y, type = "inner", by = NULL, suffix = c(".x", ".y")) {
structure(
list(
x = x,
y = y,
type = type,
by = by,
suffix = suffix
),
class = c("join_query", "query")
)
}
print.join_query <- function(x, ...) {
cat("<SQL JOIN (", toupper(x$type), ")>\n", sep = "")
cat("By: ", paste0(x$by$x, "-", x$by$y, collapse = ", "), "\n", sep = "")
cat(named_rule("X"), "\n", sep = "")
print(x$x$ops)
cat(named_rule("Y"), "\n", sep = "")
print(x$y$ops)
}
semi_join_query <- function(x, y, anti = FALSE, by = NULL) {
structure(
list(
x = x,
y = y,
anti = anti,
by = by
),
class = c("semi_join_query", "query")
)
}
print.semi_join_query <- function(x, ...) {
cat("<SQL ", if (x$anti) "ANTI" else "SEMI", " JOIN>\n", sep = "")
cat("By: ", paste0(x$by$x, "-", x$by$y, collapse = ", "), "\n", sep = "")
cat(named_rule("X"), "\n", sep = "")
print(x$x$ops)
cat(named_rule("Y"), "\n", sep = "")
print(x$y$ops)
}
set_op_query <- function(x, y, type = type) {
structure(
list(
x = x,
y = y,
type = type
),
class = c("set_op_query", "query")
)
}
print.set_op_query <- function(x, ...) {
cat("<SQL ", x$type, ">\n", sep = "")
cat(named_rule("X"), "\n", sep = "")
print(x$x$ops)
cat(named_rule("Y"), "\n", sep = "")
print(x$y$ops)
}This set of S3 classes describe the action of dplyr verbs. These are currently used for SQL sources to separate the description of operations in R from their computation in SQL. This API is very new so is likely to evolve in the future.
op_vars and op_grps compute the variables and groups from a sequence of lazy operations. op_sort tracks the order of the data for use in window functions.
op_base_remote(src, x, vars = NULL) {
# If not literal sql, must be a table identifier
if (!is.sql(x)) { x <- ident(x) }
if (is.null(vars)) { vars <- db_query_fields(src$con, x) }
op_base("remote", src, x, vars)
}
print.op_base_remote(x, ...) {
cat("Source: ", src_desc(x$src), "\n", sep = "")
if (inherits(x$x, "ident")){ cat("From: ", x$x, "\n", sep = "") }
else { cat("From: <derived table>\n") }
cat("<Table: ", x$x, ">\n", sep = "")
}
op_base_local(df, env = parent.frame()) { op_base("local", src_df(env = env), df, names(df)) }
print.op_base_local(x, ...) { cat("<Local data frame> ", dim_desc(x$x), "\n", sep = "") }
op_base(name, src, x, vars) {
stopifnot(is.character(vars))
structure(
list(
src = src,
x = x,
vars = vars
),
class = c(paste0("op_base_", name), "op_base", "op")
)
}
op_single(name, x, dots = list(), args = list()) {
structure(
list(
name = name,
x = x,
dots = dots,
args = args
),
class = c(paste0("op_", name), "op_single", "op")
)
}
add_op_single(name, .data, dots = list(), args = list()) {
.data$ops <- op_single(name, x = .data$ops, dots = dots, args = args)
.data
}
print.op_single(x, ...) {
print(x$x)
cat("-> ", x$name, "()\n", sep = "")
for (dot in x$dots) {
cat(" - ", deparse_trunc(dot$expr), "\n", sep = "")
}
}
<!--# from R/utils.r-->
<!--deparse_trunc <- function(x, width = getOption("width")) {-->
<!-- text <- deparse(x, width.cutoff = width)-->
<!-- if (length(text) == 1 && nchar(text) < width) return(text)-->
<!-- paste0(substr(text[1], 1, width - 3), "...")-->
<!--}-->
op_double(name, x, y, args = list()) {
structure(
list(
name = name,
x = x,
y = y,
args = args
),
class = c(paste0("op_", name), "op_double", "op")
)
}
# op_grps -----------------------------------------------------------------
op_grps.op_base(op){ character() }
op_grps.op_group_by(op) {
if (isTRUE(op$args$add)){ union(op_grps(op$x), names(op$dots)) }
else { names(op$dots) }}
op_grps.op_ungroup(op) { NULL }
op_grps.op_summarise(op) {
grps <- op_grps(op$x)
if (length(grps) == 1) { NULL }
else { grps[-length(grps)] }}
op_grps.op_single(op){ op_grps(op$x) }
op_grps.op_double(op){ op_grps(op$x) }
op_grps.tbl_lazy(op) { op_grps(op$ops) }
# op_vars -----------------------------------------------------------------
op_vars.op_base(op){ op$vars }
op_vars.op_select(op){ names(select_vars_(op_vars(op$x), op$dots, include = op_grps(op$x))) }
op_vars.op_rename(op){ names(rename_vars_(op_vars(op$x), op$dots)) }
op_vars.op_summarise(op) { c(op_grps(op$x), names(op$dots)) }
op_vars.op_mutate(op){ unique(c(op_vars(op$x), names(op$dots))) }
op_vars.op_single(op){ op_vars(op$x) }
op_vars.op_join <- function(op) {
by <- op$args$by
x_vars <- op_vars(op$x)
y_vars <- op_vars(op$y)
unique <- unique_names(x_vars, y_vars, by = by, suffix = op$args$suffix)
if (is.null(unique)) { c(by$x, setdiff(x_vars, by$x), setdiff(y_vars, by$y)) }
else { union(unique$x, unique$y) }
}
op_vars.op_semi_join(op){ op_vars(op$x) }
op_vars.op_set_op(op){ op_vars(op$x) }
op_vars.tbl_lazy(op){ op_vars(op$ops) }
# op_sort -----------------------------------------------------------------
# This is only used to determine the order for window functions
# so it purposely ignores grouping.
op_sort(op){ UseMethod("op_sort") }
op_sort.op_base(op){ NULL }
op_sort.op_summarise(op){ NULL }
op_sort.op_arrange(op){
order_vars <- translate_sql_(op$dots, NULL, op_vars(op))
c.sql(op_sort(op$x), order_vars, drop_null = TRUE)}
op_sort.op_single(op){ op_sort(op$x) }
op_sort.op_double(op){ op_sort(op$x) }
op_sort.tbl_lazy(op){ op_sort(op$ops) }sql_render.op(query, con = NULL, ...) {
sql_render(sql_build(query, ...), con = con, ...)}
sql_render.tbl_sql(query, con = NULL, ...) {
sql_render(sql_build(query$ops, query$src$con, ...), con = query$src$con, ...)}
sql_render.tbl_lazy(query, con = NULL, ...) {
sql_render(sql_build(query$ops, con = NULL, ...), con = NULL, ...)}
sql_render.select_query(query, con = NULL, ..., root = FALSE) {
from <- sql_subquery(con, sql_render(query$from, con, ..., root = root), name = NULL)
sql_select(
con, query$select, from, where = query$where, group_by = query$group_by,
having = query$having, order_by = query$order_by, limit = query$limit,
distinct = query$distinct,
...)}
sql_render.ident(query, con = NULL, ..., root = TRUE) {
if (root) {
sql_select(con, sql("*"), query)
} else {
query}}
sql_render.sql(query, con = NULL, ...) { query }
sql_render.join_query(query, con = NULL, ..., root = FALSE) {
from_x <- sql_subquery(con, sql_render(query$x, con, ..., root = root), name = NULL)
from_y <- sql_subquery(con, sql_render(query$y, con, ..., root = root), name = NULL)
sql_join(con, from_x, from_y, type = query$type, by = query$by)}
sql_render.semi_join_query(query, con = NULL, ..., root = FALSE) {
from_x <- sql_subquery(con, sql_render(query$x, con, ..., root = root), name = "_LEFT")
from_y <- sql_subquery(con, sql_render(query$y, con, ..., root = root), name = "_RIGHT")
sql_semi_join(con, from_x, from_y, anti = query$anti, by = query$by)}
sql_render.set_op_query(query, con = NULL, ..., root = FALSE) {
from_x <- sql_render(query$x, con, ..., root = TRUE)
from_y <- sql_render(query$y, con, ..., root = TRUE)
sql_set_op(con, from_x, from_y, method = query$type)}These generics are used to run build various SQL queries. A default method generates ANSI 92 compliant SQL, but variations in SQL across databases means that it's likely that a backend will require at least a few methods.
conA database connection.
A SQL string.
sql_select(con, select, from, where = NULL, group_by = NULL,
having = NULL, order_by = NULL, limit = NULL, distinct = FALSE, ...)
sql_subquery(con, from, name = random_table_name(), ...)
sql_join(con, x, y, type = "inner", by = NULL, ...)
sql_semi_join(con, x, y, anti = FALSE, by = NULL, ...)
sql_set_op(con, x, y, method)
sql_escape_string(con, x)
sql_escape_ident(con, x)
sql_select(con, select, from, where = NULL,
group_by = NULL, having = NULL,
order_by = NULL, limit = NULL,
distinct = FALSE, ...){
out <- vector("list", 7)
names(out) <- c("select","from","where","group_by","having","order_by","limit")
assert_that(is.character(select), length(select) > 0L)
out$select <- build_sql(
"SELECT ",
if (distinct) sql("DISTINCT "),
escape(select, collapse = ", ", con = con))
assert_that(is.character(from), length(from) == 1L)
out$from <- build_sql("FROM ", from, con = con)
if (length(where) > 0L) {
assert_that(is.character(where))
where_paren <- escape(where, parens = TRUE, con = con)
out$where <- build_sql("WHERE ", sql_vector(where_paren, collapse = " AND "))}
if (length(group_by) > 0L) {
assert_that(is.character(group_by))
out$group_by <- build_sql("GROUP BY ",
escape(group_by, collapse = ", ", con = con))}
if (length(having) > 0L) {
assert_that(is.character(having))
out$having <- build_sql("HAVING ",
escape(having, collapse = ", ", con = con))}
if (length(order_by) > 0L) {
assert_that(is.character(order_by))
out$order_by <- build_sql("ORDER BY ",
escape(order_by, collapse = ", ", con = con))}
if (!is.null(limit)) {
assert_that(is.numeric(limit), length(limit) == 1L)
out$limit <- build_sql("LIMIT ",
sql(format(trunc(limit), scientific = FALSE)),
con = con)}
escape(unname(compact(out)), collapse = "\n", parens = FALSE, con = con)}
sql_subquery(con, from, name = unique_name(), ...) {
if (is.ident(from)){
setNames(from, name)}
else {
build_sql("(", from, ") ", ident(name %||% random_table_name()), con = con)}}
sql_join(con, x, y, type = "inner", by = NULL, ...) {
join <- switch(type,
left = sql("LEFT"),
inner = sql("INNER"),
right = sql("RIGHT"),
full = sql("FULL"),
stop("Unknown join type:", type, call. = FALSE))
using <- all(by$x == by$y)
if (using) {
cond <- build_sql("USING ", lapply(by$x, ident), con = con)
} else {
on <- sql_vector(paste0(sql_escape_ident(con, by$x), " = ", sql_escape_ident(con, by$y)),
collapse = " AND ", parens = TRUE)
cond <- build_sql("ON ", on, con = con)}
build_sql(
'SELECT * FROM ',x, "\n\n",
join, " JOIN\n\n" ,
y, "\n\n",
cond,
con = con)}
sql_semi_join(con, x, y, anti = FALSE, by = NULL, ...) {
# X and Y are subqueries named _LEFT and _RIGHT
left <- escape(ident("_LEFT"), con = con)
right <- escape(ident("_RIGHT"), con = con)
on <- sql_vector(
paste0(left, ".", sql_escape_ident(con, by$x), " = ",
right, ".", sql_escape_ident(con, by$y)),
collapse = " AND ",
parens = TRUE,
con = con)
build_sql(
'SELECT * FROM ', x, '\n\n',
'WHERE ', if (anti) sql('NOT '), 'EXISTS (\n',
' SELECT 1 FROM ', y, '\n',
' WHERE ', on, '\n',
')',
con = con)}
sql_set_op(con, x, y, method) {
build_sql(x, "\n", sql(method), "\n", y)}
sql_escape_string(con, x) {
sql_quote(x, "'")}
sql_escape_ident(con, x) {
sql_quote(x, '"')}These functions are critical when writing functions that translate R functions to sql functions. Typically a conversion function should escape all it's inputs and return an sql object.
...Character vectors that will be combined into a single SQL expression.identflags its input as a identifier, to ensure that it gets the correct quoting.xAn object to escape. Existing sql vectors will be left as is, character vectors are escaped with single quotes, numeric vectors have trailing.0added if they're whole numbers, identifiers are escaped with double quotes.parenscollapse Controls behaviour when multiple values are supplied.parens-
should be a logical flag, or if `NA`, will wrap in parens if length > 1.
Default behaviour: lists are always wrapped in parens and separated by commas, identifiers are separated by commas and never wrapped, atomic vectors are separated by spaces and wrapped in parens if needed.
# Doubles vs. integers
escape(1:5)
escape(c(1, 5.4))
# String vs known sql vs. sql identifier
escape("X")
escape(sql("X"))
escape(ident("X"))
# Escaping is idempotent
escape("X")
escape(escape("X"))
escape(escape(escape("X")))
sql(...) {
x <- c(...)
if (length(x) == 0) {
structure(character(), class = c("sql", "character"))
} else {
stopifnot(is.character(x))
structure(x, class = c("sql", "character"))}}
ident(...) {
x <- c(...)
if (length(x) == 0) return(sql())
stopifnot(is.character(x))
structure(x, class = c("ident", "sql", "character"))}
c.sql(..., drop_null = FALSE, con = NULL) {
input <- list(...)
if (drop_null) input <- compact(input)
sql(unlist(lapply(input, escape, collapse = NULL, con = con)))}
unique.sql(x, ...){ sql(NextMethod()) }
setOldClass(c("sql", "character"))
setOldClass(c("ident", "sql", "character"))
is.sql(x){ inherits(x, "sql") }
is.ident(x){ inherits(x, "ident") }
print.sql(x, ...){ cat(format(x, ...), sep = "\n") }
format.sql(x, ...){ paste0("<SQL> ", x) }
format.ident(x, ...){ paste0("<VAR> ", escape(x)) }
escape(x, parens = NA, collapse = " ", con = NULL) {
UseMethod("escape") }
escape.ident(x, parens = FALSE, collapse = ", ", con = NULL) {
y <- sql_escape_ident(con, x)
sql_vector(names_to_as(y, con), parens, collapse)}
escape.logical(x, parens = NA, collapse = ", ", con = NULL) {
x <- as.character(x)
x[is.na(x)] <- "NULL"
sql_vector(x, parens, collapse)}
escape.factor(x, parens = NA, collapse = ", ", con = NULL) {
x <- as.character(x)
escape.character(x, parens = parens, collapse = collapse, con = con)}
escape.Date(x, parens = NA, collapse = ", ", con = NULL) {
x <- as.character(x)
escape.character(x, parens = parens, collapse = collapse, con = con)}
escape.POSIXt(x, parens = NA, collapse = ", ", con = NULL) {
x <- strftime(x, "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
escape.character(x, parens = parens, collapse = collapse, con = con)}
escape.character(x, parens = NA, collapse = ", ", con = NULL) {
sql_vector(sql_escape_string(con, x), parens, collapse, con = con)}
escape.double(x, parens = NA, collapse = ", ", con = NULL) {
missing <- is.na(x)
x <- ifelse(is.wholenumber(x), sprintf("%.1f", x), as.character(x))
x[missing] <- "NULL"
sql_vector(x, parens, collapse)}
escape.integer(x, parens = NA, collapse = ", ", con = NULL) {
x[is.na(x)] <- "NULL"
sql_vector(x, parens, collapse)}
escape.NULL(x, parens = NA, collapse = " ", con = NULL) { sql("NULL") }
escape.sql(x, parens = NULL, collapse = NULL, con = NULL) {
sql_vector(x, isTRUE(parens), collapse, con = con)}
escape.list(x, parens = TRUE, collapse = ", ", con = NULL) {
pieces <- vapply(x, escape, character(1), con = con)
sql_vector(pieces, parens, collapse)}
sql_vector(x, parens = NA, collapse = " ", con = NULL) {
if (is.na(parens)) { parens <- length(x) > 1L }
x <- names_to_as(x, con = con)
x <- paste(x, collapse = collapse)
if (parens) x <- paste0("(", x, ")")
sql(x)}
names_to_as(x, con = NULL) {
names <- names2(x)
as <- ifelse(names == '', '', paste0(' AS ', sql_escape_ident(con, names)))
paste0(x, as)}Build a SQL string.
This is a convenience function that should prevent sql injection attacks (which in the context of dplyr are most likely to be accidental not deliberate) by automatically escaping all expressions in the input, while treating bare strings as sql. This is unlikely to prevent any serious attack, but should make it unlikely that you produce invalid sql.
...input to convert to SQL. Usesqlto preserve user input as is (dangerous), andidentto label user input as sql identifiers (safe).envthe environment in which to evaluate the arguments. Should not be needed in typical use.condatabase connection; used to select correct quoting characters.
It makes use of the dots function (explained here, defined in the following way:
dots <- function(...){
eval(substitute(alist(...)))
}
See also: https://github.com/hadley/pryr/blob/master/R/dots.r
build_sql("SELECT * FROM TABLE")
x <- "TABLE"
build_sql("SELECT * FROM ", x)
build_sql("SELECT * FROM ", ident(x))
build_sql("SELECT * FROM ", sql(x))
# http://xkcd.com/327/
name <- "Robert'); DROP TABLE Students;--"
build_sql("INSERT INTO Students (Name) VALUES (", name, ")")
build_sql(..., .env = parent.frame(), con = NULL) {
escape_expr <- function(x) {
# If it's a string, leave it as is
if (is.character(x)) return(x)
val <- eval(x, .env)
# Skip nulls, so you can use if statements like in paste
if (is.null(val)) return("")
escape(val, con = con)
}
pieces <- vapply(dots(...), escape_expr, character(1))
sql(paste0(pieces, collapse = ""))
}
#' Helper function for quoting sql elements.
#'
#' If the quote character is present in the string, it will be doubled.
#' \code{NA}s will be replaced with NULL.
#'
#' @export
#' @param x Character vector to escape.
#' @param quote Single quoting character.
#' @export
#' @keywords internal
#' @examples
#' sql_quote("abc", "'")
#' sql_quote("I've had a good day", "'")
#' sql_quote(c("abc", NA), "'")
sql_quote(x, quote) {
y <- gsub(quote, paste0(quote, quote), x, fixed = TRUE)
y <- paste0(quote, y, quote)
y[is.na(x)] <- "NULL"
names(y) <- names(x)
y
}Translate an expression to sql.
When creating a package that maps to a new SQL based src, you'll often want to provide some additional mappings from common R commands to the commands that your tbl provides. These three functions make that easy.
sql_infix and sql_prefix create default SQL infix and prefix functions
given the name of the SQL function. They don't perform any input checking,
but do correctly escape their input, and are useful for quickly providing
default wrappers for a new SQL variant.
scalar,aggregate,windowThe three families of functions than a SQL variant can supply.....funs named functions, used to add custom converters from standard R functions to sql functions. Specify individually in..., or provide a list of.funs.parentthe sql variant that this variant should inherit from. Defaults tobase_sqlwhich provides a standard set of mappings for the most common operators and functions.fthe name of the sql function as a stringnfor `sql_infix`, an optional number of arguments to expect. Will signal error if not correct.
See also: sql for an example of a more customised sql conversion function.
sql_variant(scalar = sql_translator(),
aggregate = sql_translator(),
window = sql_translator()) {
stopifnot(is.environment(scalar))
stopifnot(is.environment(aggregate))
stopifnot(is.environment(window))
structure(list(scalar = scalar, aggregate = aggregate, window = window),
class = "sql_variant")}
is.sql_variant(x) inherits(x, "sql_variant")
print.sql_variant(x, ...) {
wrap_ls <- function(x, ...) {
vars <- sort(ls(envir = x))
wrapped <- strwrap(paste0(vars, collapse = ", "), ...)
if (identical(wrapped, "")) return()
paste0(wrapped, "\n", collapse = "")}
cat("<sql_variant>\n")
cat(wrap_ls(x$scalar, prefix = "scalar: "))
cat(wrap_ls(x$aggregate, prefix = "aggregate: "))
cat(wrap_ls(x$window, prefix = "window: "))}
names.sql_variant(x) {
c(ls(envir = x$scalar), ls(envir = x$aggregate), ls(envir = x$window))}
sql_translator(..., .funs = list(),
.parent = new.env(parent = emptyenv())) {
funs <- c(list(...), .funs)
if (length(funs) == 0) return(.parent)
list2env(funs, copy_env(.parent))}
copy_env(from, to = NULL, parent = parent.env(from)) {
list2env(as.list(from), envir = to, parent = parent)}
sql_infix(f) {
assert_that(is.string(f))
f <- toupper(f)
function(x, y) { build_sql(x, " ", sql(f), " ", y) }}
sql_prefix(f, n = NULL) {
assert_that(is.string(f))
f <- toupper(f)
function(..., na.rm) {
if (!missing(na.rm)) {
message("na.rm not needed in SQL: NULL are always dropped", call. = FALSE)}
args <- list(...)
if (!is.null(n) && length(args) != n) {
stop("Invalid number of args to SQL ", f, ". Expecting ", n,
call. = FALSE)}
if (any(names2(args) != "")) {
warning("Named arguments ignored for SQL ", f, call. = FALSE)}
build_sql(sql(f), args)}}
sql_not_supported(f) {
assert_that(is.string(f))
f <- toupper(f)
function(...) { stop(f, " is not available in this SQL variant", call. = FALSE) }
}
win_rank(f) {
force(f)
function(order = NULL) {
over(build_sql(sql(f), list()), partition_group(), order %||% partition_order())}}
win_recycled(f) {
force(f)
function(x) { over(build_sql(sql(f), list(x)), partition_group()) }}
win_cumulative(f) {
force(f)
function(x) { over(build_sql(sql(f), list(x)), partition_group(), partition_order(), frame = c(-Inf, 0))}}
win_absent(f) {
force(f)
function(...) { stop("Window function `", f, "()` is not supported by this database", call. = FALSE) }}
# Use a global variable to communicate state of partitioning between
# tbl and sql translator. This isn't the most amazing design, but it keeps
# things loosely coupled and is straightforward to understand.
partition <- new.env(parent = emptyenv())
partition$group_by <- NULL
partition$order_by <- NULL
partition$con <- NULL
set_partition_con(con) {
old <- partition$con
partition$con <- con
invisible(old)}
set_partition_group(vars) {
stopifnot(is.null(vars) || is.character(vars))
old <- partition$group_by
partition$group_by <- vars
invisible(old)}
set_partition_order(vars) {
stopifnot(is.null(vars) || is.character(vars))
old <- partition$order_by
partition$order_by <- vars
invisible(old)}
set_partition(group_by, order_by, con = NULL) {
old <- list(partition$group_by, partition$order_by)
if (is.list(group_by)) {
order_by <- group_by[[2]]
group_by <- group_by[[1]]}
partition$group_by <- group_by
partition$order_by <- order_by
partition$con <- con
invisible(old)}
partition_group() partition$group_by
partition_order() partition$order_by
partition_con() partition$conThe base translator, base_sql, provides custom mappings for ! (to NOT), && and & to
AND, || and | to OR, ^ to POWER, \%>\% to \%, ceiling to CEIL, mean to
AVG, var to VARIANCE, tolower to LOWER, toupper to UPPER and nchar to length.
c and : keep their usual R behaviour so you can easily create vectors that are passed to sql.
All other functions will be preserved as is. R's infix functions (e.g. \%like\%) will be
converted to their sql equivalents (e.g. LIKE). You can use this to access SQL string
concatenation: || is mapped to OR, but \%||\% is mapped to ||. To suppress this behaviour,
and force errors immediately when dplyr doesn't know how to translate a function it encounters,
using set the dplyr.strict_sql option to TRUE. You can also use sql to insert a raw sql string.
sql_if(cond, if_true, if_false = NULL) {
build_sql(
"CASE WHEN (", cond, ")",
" THEN (", if_true, ")",
if (!is.null(if_false)) build_sql(" ELSE (", if_false, ")"),
" END")}
base_scalar <- sql_translator(
`+` = sql_infix("+"),
`*` = sql_infix("*"),
`/` = sql_infix("/"),
`%%` = sql_infix("%"),
`^` = sql_prefix("power", 2),
`-` = function(x, y = NULL) {
if (is.null(y)) {
if (is.numeric(x)) {
-x }
else {
build_sql(sql("-"), x)}}
else {
build_sql(x, sql(" - "), y)}
},
`!=` = sql_infix("!="),
`==` = sql_infix("="),
`<` = sql_infix("<"),
`<=` = sql_infix("<="),
`>` = sql_infix(">"),
`>=` = sql_infix(">="),
`!` = sql_prefix("not"),
`&` = sql_infix("and"),
`&&` = sql_infix("and"),
`|` = sql_infix("or"),
`||` = sql_infix("or"),
xor = function(x, y) {
sql(sprintf("%1$s OR %2$s AND NOT (%1$s AND %2$s)", escape(x), escape(y)))
},
abs = sql_prefix("abs", 1),
acos = sql_prefix("acos", 1),
acosh = sql_prefix("acosh", 1),
asin = sql_prefix("asin", 1),
asinh = sql_prefix("asinh", 1),
atan = sql_prefix("atan", 1),
atan2 = sql_prefix("atan2", 2),
atanh = sql_prefix("atanh", 1),
ceil = sql_prefix("ceil", 1),
ceiling = sql_prefix("ceil", 1),
cos = sql_prefix("cos", 1),
cosh = sql_prefix("cosh", 1),
cot = sql_prefix("cot", 1),
coth = sql_prefix("coth", 1),
exp = sql_prefix("exp", 1),
floor = sql_prefix("floor", 1),
log = function(x, base = exp(1)) {
build_sql(sql("log"), list(x, base))
},
log10 = sql_prefix("log10", 1),
round = sql_prefix("round", 2),
sign = sql_prefix("sign", 1),
sin = sql_prefix("sin", 1),
sinh = sql_prefix("sinh", 1),
sqrt = sql_prefix("sqrt", 1),
tan = sql_prefix("tan", 1),
tolower = sql_prefix("lower", 1),
toupper = sql_prefix("upper", 1),
nchar = sql_prefix("length", 1),
`if` = sql_if,
if_else = sql_if,
ifelse = sql_if,
sql = function(...) sql(...),
`(` = function(x) {
build_sql("(", x, ")")},
`{` = function(x) {
build_sql("(", x, ")")},
desc = function(x) {
build_sql(x, sql(" DESC"))},
is.null = function(x) {
build_sql("(", x, ") IS NULL")},
is.na = function(x) {
build_sql("(", x, ") IS NULL")},
na_if = sql_prefix("NULL_IF", 2),
as.numeric = function(x) build_sql("CAST(", x, " AS NUMERIC)"),
as.integer = function(x) build_sql("CAST(", x, " AS INTEGER)"),
as.character = function(x) build_sql("CAST(", x, " AS TEXT)"),
c = function(...) escape(c(...)),
`:` = function(from, to) escape(from:to),
between = function(x, left, right) {
build_sql(x, " BETWEEN ", left, " AND ", right)},
pmin = sql_prefix("min"),
pmax = sql_prefix("max"),
`__dplyr_colwise_fun` = function(...) {
stop("colwise verbs only accept bare functions with local sources",
call. = FALSE)
}
)
base_symbols <- sql_translator(
pi = sql("PI()"),
`*` = sql("*"),
`NULL` = sql("NULL")
)
base_agg <- sql_translator(
# SQL-92 aggregates
# http://db.apache.org/derby/docs/10.7/ref/rrefsqlj33923.html
n = sql_prefix("count"),
mean = sql_prefix("avg", 1),
var = sql_prefix("variance", 1),
sum = sql_prefix("sum", 1),
min = sql_prefix("min", 1),
max = sql_prefix("max", 1),
n_distinct = function(x) {
build_sql("COUNT(DISTINCT ", x, ")")
}
)
base_win <- sql_translator(
# rank functions have a single order argument that overrides the default
row_number = win_rank("row_number"),
min_rank = win_rank("rank"),
rank = win_rank("rank"),
dense_rank = win_rank("dense_rank"),
percent_rank = win_rank("percent_rank"),
cume_dist = win_rank("cume_dist"),
ntile = function(order_by, n) {
over(
build_sql("NTILE", list(as.integer(n))),
partition_group(),
order_by %||% partition_order()
)
},
# Recycled aggregate fuctions take single argument, don't need order and
# include entire partition in frame.
mean = win_recycled("avg"),
sum = win_recycled("sum"),
min = win_recycled("min"),
max = win_recycled("max"),
n = function() {
over(sql("COUNT(*)"), partition_group())
},
# Cumulative function are like recycled aggregates except that R names
# have cum prefix, order_by is inherited and frame goes from -Inf to 0.
cummean = win_cumulative("mean"),
cumsum = win_cumulative("sum"),
cummin = win_cumulative("min"),
cummax = win_cumulative("max"),
# Finally there are a few miscellaenous functions that don't follow any
# particular pattern
nth = function(x, order = NULL) {
over(build_sql("NTH_VALUE", list(x)), partition_group(), order %||% partition$order())
},
first = function(x, order = NULL) {
over(build_sql("FIRST_VALUE", list(x)), partition_group(), order %||% partition_order())
},
last = function(x, order = NULL) {
over(build_sql("LAST_VALUE", list(x)), partition_group(), order %||% partition_order())
},
lead = function(x, n = 1L, default = NA, order = NULL) {
over(
build_sql("LEAD", list(x, n, default)),
partition_group(),
order %||% partition_order()
)
},
lag = function(x, n = 1L, default = NA, order = NULL) {
over(
build_sql("LAG", list(x, n, default)),
partition_group(),
order %||% partition_order()
)
},
order_by = function(order_by, expr) {
old <- set_partition(partition_group(), order_by)
on.exit(set_partition(old))
expr
}
)
base_no_win <- sql_translator(
row_number = win_absent("row_number"),
min_rank = win_absent("rank"),
rank = win_absent("rank"),
dense_rank = win_absent("dense_rank"),
percent_rank = win_absent("percent_rank"),
cume_dist = win_absent("cume_dist"),
ntile = win_absent("ntile"),
mean = win_absent("avg"),
sum = win_absent("sum"),
min = win_absent("min"),
max = win_absent("max"),
n = win_absent("n"),
cummean = win_absent("mean"),
cumsum = win_absent("sum"),
cummin = win_absent("min"),
cummax = win_absent("max"),
nth = win_absent("nth_value"),
first = win_absent("first_value"),
last = win_absent("last_value"),
lead = win_absent("lead"),
lag = win_absent("lag"),
order_by = win_absent("order_by")
)translate_sql(..., con = NULL, vars = character(), vars_group = NULL,
vars_order = NULL, window = TRUE)
translate_sql_(dots, con = NULL, vars = character(), vars_group = NULL,
vars_order = NULL, window = TRUE)
-
..., dotsExpressions to translate. sql_translate automatically quotes them for you. sql_translate_ expects a list of already quoted objects. -
conAn optional database connection to control the details of the translation. The default, NULL, generates ANSI SQL. -
varsA character vector giving variable names in the remote data source. If this is supplied, translate_sql will call partial_eval to interpolate in the values from local variables. -
vars_group, vars_orderGrouping and ordering variables used for windowed functions. -
windowUse FALSE to suppress generation of the OVER statement used for window functions. This is necessary when generating SQL for a grouped summary.
translate_sql <- function(...,
con = NULL,
vars = character(),
vars_group = NULL,
vars_order = NULL,
window = TRUE) {
dots <- lazyeval::lazy_dots(...)
translate_sql_(dots,
con = con,
vars = vars,
vars_group = vars_group,
vars_order = vars_order,
window = window
)
}
translate_sql_ <- function(dots,
con = NULL,
vars = character(),
vars_group = NULL,
vars_order = NULL,
window = TRUE) {
expr <- lazyeval::as.lazy_dots(dots, env = parent.frame())
if (!any(has_names(expr))) {
names(expr) <- NULL
}
if (length(vars) > 0) {
# If variables are known, partially evaluate input
expr <- partial_eval2(expr, vars)
} else {
# Otherwise just extract expressions, ignoring the environment
# from which they came
expr <- lapply(expr, "[[", "expr")
}
variant <- sql_translate_env(con)
if (window) {
old_con <- set_partition_con(con)
on.exit(set_partition_con(old_con), add = TRUE)
old_group <- set_partition_group(vars_group)
on.exit(set_partition_group(old_group), add = TRUE)
old_order <- set_partition_order(vars_order)
on.exit(set_partition_order(old_order), add = TRUE)
}
pieces <- lapply(expr, function(x) {
if (is.atomic(x)) return(escape(x, con = con))
env <- sql_env(x, variant, con, window = window)
escape(eval(x, envir = env))
})
sql(unlist(pieces))
}
sql_env <- function(expr, variant, con, window = FALSE,
strict = getOption("dplyr.strict_sql")) {
stopifnot(is.sql_variant(variant))
# Default for unknown functions
if (!strict) {
unknown <- setdiff(all_calls(expr), names(variant))
default_env <- ceply(unknown, default_op, parent = emptyenv())
} else {
default_env <- new.env(parent = emptyenv())
}
# Known R -> SQL functions
special_calls <- copy_env(variant$scalar, parent = default_env)
if (!window) {
special_calls2 <- copy_env(variant$aggregate, parent = special_calls)
} else {
special_calls2 <- copy_env(variant$window, parent = special_calls)
}
# Existing symbols in expression
names <- all_names(expr)
name_env <- ceply(names, function(x) escape(ident(x), con = con),
parent = special_calls2)
# Known sql expressions
symbol_env <- copy_env(base_symbols, parent = name_env)
symbol_env
}
default_op <- function(x) {
assert_that(is.string(x))
infix <- c("::", "$", "@", "^", "*", "/", "+", "-", ">", ">=", "<", "<=",
"==", "!=", "!", "&", "&&", "|", "||", "~", "<-", "<<-")
if (x %in% infix) {
sql_infix(x)
} else if (grepl("^%.*%$", x)) {
x <- substr(x, 2, nchar(x) - 1)
sql_infix(x)
} else {
sql_prefix(x)
}
}
all_calls <- function(x) {
if (!is.call(x)) return(NULL)
fname <- as.character(x[[1]])
unique(c(fname, unlist(lapply(x[-1], all_calls), use.names = FALSE)))
}
all_names <- function(x) {
if (is.name(x)) return(as.character(x))
if (!is.call(x)) return(NULL)
unique(unlist(lapply(x[-1], all_names), use.names = FALSE))
}
# character vector -> environment
ceply <- function(x, f, ..., parent = parent.frame()) {
if (length(x) == 0) return(new.env(parent = parent))
l <- lapply(x, f, ...)
names(l) <- x
# ‘names’ is a generic accessor function, and ‘names<-’ is a generic
# replacement function. The default methods get and set the
# ‘"names"’ attribute of a vector (including a list) or pairlist.
list2env(l, parent = parent)
# list2env: From a _named_ ‘list l’, create an ‘environment’ containing
# all list components as objects, or “multi-assign” from ‘l’ into a
# pre-existing environment.
# parent: (for the case ‘envir = NULL’): a parent frame aka enclosing
# environment, see ‘new.env’
}