Skip to content

Instantly share code, notes, and snippets.

@bukzor
Created October 4, 2024 17:51
Show Gist options
  • Save bukzor/ff7e600caf5aa8aa2074a750884167f7 to your computer and use it in GitHub Desktop.
Save bukzor/ff7e600caf5aa8aa2074a750884167f7 to your computer and use it in GitHub Desktop.
Unit testing for bigquery SQL
#!/usr/bin/env py.test
from __future__ import annotations
import subprocess
from pathlib import Path
from typing import IO
from typing import Iterable
import pandas
from _pytest.python import Metafunc
from pytest import fixture
from devinfra_metrics.gcp_clients import bigquery
# for cleaner pytest tracebacks
subprocess.__tracebackhide__ = ( # pyright:ignore[reportAttributeAccessIssue]
True
)
QUERY_TEMPLATE = """\
WITH
# input sql
{input_sql}
# query under test
{query}
# prevent enormous results
LIMIT 100
"""
def all_query_under_test() -> Iterable[Path]:
return Path(__file__).parent.glob("*.sql")
def all_input_sql_for(query_under_test: Path) -> Iterable[Path]:
testdata = query_under_test.parent / (query_under_test.name + ".test")
for input_sql in testdata.glob("*.input.sql"):
input_sql = ( # prettier and git-diff work better w/o symlinks
input_sql.resolve()
)
yield input_sql
def get_testid(path: Path):
return (
path.name.removesuffix(".sql")
.removesuffix(".bq")
.removesuffix(".input")
)
def pytest_generate_tests(metafunc: Metafunc):
if (
"query_under_test" in metafunc.fixturenames
and "input_sql" in metafunc.fixturenames
):
metafunc.parametrize(
argnames=["query_under_test", "input_sql"],
argvalues=[
(query_under_test, input_sql)
for query_under_test in all_query_under_test()
for input_sql in all_input_sql_for(query_under_test)
],
ids=get_testid,
)
@fixture
def test_query(input_sql: Path, query_under_test: Path) -> str:
query = query_under_test.read_text()
before, sep, after = ("\n" + query).partition("\nWITH\n")
if sep:
query = "," + after
else:
query = before
test_query = QUERY_TEMPLATE.format(
input_sql=input_sql.read_text().replace("\n", "\n "), query=query
)
print(test_query) # very helpful for debug
return test_query
@fixture
def bigquery_results(test_query: str) -> pandas.DataFrame:
query_result = bigquery.query(test_query).result()
return query_result.to_dataframe() # type:ignore
def sh_run(
cmd: tuple[str, ...],
input: str | None = None,
stdout: IO[str] | None = None,
):
__tracebackhide__ = True
proc = subprocess.run(
("sh", "-exc", '"$@"', "-") + cmd,
input=input,
stdout=stdout,
encoding="UTF-8",
)
if proc.returncode != 0:
import shlex
raise subprocess.CalledProcessError(proc.returncode, shlex.join(cmd))
@fixture
def output_json(input_sql: Path, bigquery_results: pandas.DataFrame) -> Path:
result = input_sql.with_name(
input_sql.name.removesuffix(".input.sql") + ".output.json"
)
if not result.exists():
result.touch()
if result.read_text() == "":
# it's too easy to forget, otherwise
sh_run(("git", "add", str(result)))
with result.open("w") as f:
bigquery_results.to_json( # type:ignore
f, orient="table", indent=2
)
print(bigquery_results)
return result
def assert_expected(output_json: Path):
__tracebackhide__ = True
# workaround for: https://github.com/prettier/prettier/issues/16683
sh_run(
("prettier", "--parser=json"),
input=output_json.read_text(),
stdout=output_json.open("w"),
)
sh_run((
"git",
"diff",
"--color=always",
"--no-relative",
"--exit-code",
str(output_json),
))
class DescribeGocdBigqueryView:
def it_works_for_examples(self, output_json: Path):
assert_expected(output_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment