Skip to content

Instantly share code, notes, and snippets.

@takemikami
Last active December 24, 2024 06:48
Show Gist options
  • Save takemikami/5f6967684c7eadd184647e804229166b to your computer and use it in GitHub Desktop.
Save takemikami/5f6967684c7eadd184647e804229166b to your computer and use it in GitHub Desktop.
# dbt_score_rules/dbt_best_practice.py
#
# dbt-score: https://dbt-score.picnic.tech/
#
# How we style our dbt models - Fields and model names
# https://docs.getdbt.com/best-practices/how-we-style/1-how-we-style-our-dbt-models
import re
from dbt_score import Model, RuleViolation, rule
import inflect
inflect_engine = inflect.engine()
@rule
def model_pluralized(model: Model) -> RuleViolation | None:
"""
Models should be pluralized, for example, customers, orders, products.
"""
if not inflect_engine.singular_noun(model.name):
return RuleViolation(message=f"Model should be pluralized.")
@rule
def have_primary_key(model: Model) -> RuleViolation | None:
"""
Each model should have a primary key.
The primary key of a model should be named <object>_id, for example, account_id.
This makes it easier to know what id is being referenced in downstream joined models.
"""
if len([c.name for c in model.columns if re.match(".*_id$", c.name)]) == 0:
return RuleViolation(message=f"Model should have a primary key like <object>_id.")
@rule
def model_use_underscores(model: Model) -> RuleViolation | None:
"""
Use underscores for naming dbt models; avoid dots.
ok: models_without_dots
ng: models.with.dots
"""
if "." in model.name:
return RuleViolation(message=f"Use underscores for naming dbt models; avoid dots.")
@rule
def key_is_string_type(model: Model) -> RuleViolation | None:
"""
Keys should be string data types.
"""
ng_cols = [
c.name for c in model.columns if c.data_type and re.match(".*_id$", c.name) and c.data_type.lower() != "string"
]
if ng_cols:
return RuleViolation(message=f"Keys should be string data types: {','.join(ng_cols)}")
@rule
def column_avoid_reserved_words(model: Model, reserved_words=None) -> RuleViolation | None:
"""
Avoid reserved words as column names.
"""
if not reserved_words:
reserved_words = RESERVED_WORDS
ng_cols = [c.name for c in model.columns if c.name in reserved_words]
if ng_cols:
return RuleViolation(message=f"Avoid reserved words as column names: {','.join(ng_cols)}")
@rule
def column_name_boolean(model: Model, boolean_types=None) -> RuleViolation | None:
"""
Booleans should be prefixed with is_ or has_.
"""
if not boolean_types:
boolean_types = ["boolean", "bool"]
ng_cols = [
c.name
for c in model.columns
if c.data_type
and c.data_type.lower() in boolean_types
and not (re.match("^has_.*", c.name) or re.match("^is_.*", c.name))
]
if ng_cols:
return RuleViolation(message=f"Boolean columns should be prefixed with is_ or has_: {','.join(ng_cols)}")
@rule
def column_name_timestamp(model: Model, timestamp_types=None) -> RuleViolation | None:
"""
Timestamp columns should be named <event>_at(for example, created_at) and
should be in UTC. If a different timezone is used, this should be indicated with a suffix
(created_at_pt).
"""
if not timestamp_types:
timestamp_types = ["timestamp"]
ng_cols = [
c.name
for c in model.columns
if c.data_type
and c.data_type.lower() in timestamp_types
and not (re.match(".*_at$", c.name) or re.match(".*_at_[^_]*$", c.name))
]
if ng_cols:
return RuleViolation(
message=f"Timestamp columns should be named <event>_at or <event>_at_<tz>: {','.join(ng_cols)}"
)
@rule
def column_name_date(model: Model, date_types=None) -> RuleViolation | None:
"""
Dates should be named <event>_date. For example, created_date.
"""
if not date_types:
date_types = ["date"]
ng_cols = [
c.name
for c in model.columns
if c.data_type and c.data_type.lower() in date_types and not re.match(".*_date$", c.name)
]
if ng_cols:
return RuleViolation(message=f"Date columns should be named <event>_date: {','.join(ng_cols)}")
@rule
def schema_snake_case(model: Model) -> RuleViolation | None:
"""
Schema name should be in snake_case.
"""
if "-" in model.schema or (re.match("[A-Z]+", model.schema) and re.match("[a-z]+", model.schema)):
return RuleViolation(message=f"Schema name should be in snake_case: {model.schema}")
@rule
def model_snake_case(model: Model) -> RuleViolation | None:
"""
Table name should be in snake_case.
"""
if "-" in model.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name)):
return RuleViolation(message=f"Table name should be in snake_case: {model.name}")
@rule
def column_snake_case(model: Model) -> RuleViolation | None:
"""
Column name should be in snake_case.
"""
ng_cols = [
c.name
for c in model.columns
if "-" in c.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name))
]
if ng_cols:
return RuleViolation(message=f"Column name should be in snake_case: {','.join(ng_cols)}")
@rule
def column_type_order(
model: Model,
string_types=None,
numeric_types=None,
boolean_types=None,
date_types=None,
timestamp_types=None,
) -> RuleViolation | None:
"""
Use a consistent ordering of data types and consider grouping and labeling columns by type,
as in the example below. This will minimize join errors and make it easier to read the model, as
well as help downstream consumers of the data understand the data types and scan models for
the columns they need. We prefer to use the following order: ids, strings, numerics, booleans,
dates, and timestamps.
"""
if not string_types:
string_types = ["string"]
if not numeric_types:
numeric_types = [
"int64",
"int",
"smallint",
"integer",
"bigint",
"tinyint",
"byteint",
"numeric",
"decimal",
"bignumeric",
"bigdecimal",
"float64",
]
if not boolean_types:
boolean_types = ["boolean", "bool"]
if not date_types:
date_types = ["date"]
if not timestamp_types:
timestamp_types = ["timestamp"]
def detect_type(col):
if re.match(".*_id$", col.name):
return 0
elif col.data_type.lower() in string_types:
return 1
elif col.data_type.lower() in numeric_types:
return 2
elif col.data_type.lower() in boolean_types:
return 3
elif col.data_type.lower() in date_types:
return 4
elif col.data_type.lower() in timestamp_types:
return 5
return -1
cols_types = [detect_type(c) for c in model.columns]
cols_types_filtered = [e for e in cols_types if e >= 0]
for e in zip(cols_types_filtered[0:-1], cols_types_filtered[1:]):
if e[0] > e[1]:
return RuleViolation(
message=f"Columns order should be ids, strings, numerics, booleans, dates, and timestamps."
)
RESERVED_WORDS = [
"all",
"and",
"any",
"array",
"as",
"asc",
"assert_rows_modified",
"at",
"between",
"by",
"case",
"cast",
"collate",
"contains",
"create",
"cross",
"cube",
"current",
"default",
"define",
"desc",
"distinct",
"else",
"end",
"enum",
"escape",
"except",
"exclude",
"exists",
"extract",
"false",
"fetch",
"following",
"for",
"from",
"full",
"group",
"grouping",
"groups",
"hash",
"having",
"if",
"ignore",
"in",
"inner",
"intersect",
"interval",
"into",
"is",
"join",
"lateral",
"left",
"like",
"limit",
"lookup",
"merge",
"natural",
"new",
"no",
"not",
"null",
"nulls",
"of",
"on",
"or",
"order",
"outer",
"over",
"partition",
"preceding",
"proto",
"qualify",
"range",
"recursive",
"respect",
"right",
"rollup",
"rows",
"select",
"set",
"some",
"struct",
"tablesample",
"then",
"to",
"treat",
"true",
"unbounded",
"union",
"unnest",
"using",
"when",
"where",
"window",
"with",
"within",
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment