Last active
December 24, 2024 06:48
-
-
Save takemikami/5f6967684c7eadd184647e804229166b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dbt_score_rules/dbt_best_practice.py | |
# | |
# dbt-score: https://dbt-score.picnic.tech/ | |
# | |
# How we style our dbt models - Fields and model names | |
# https://docs.getdbt.com/best-practices/how-we-style/1-how-we-style-our-dbt-models | |
import re | |
from dbt_score import Model, RuleViolation, rule | |
import inflect | |
inflect_engine = inflect.engine() | |
@rule | |
def model_pluralized(model: Model) -> RuleViolation | None: | |
""" | |
Models should be pluralized, for example, customers, orders, products. | |
""" | |
if not inflect_engine.singular_noun(model.name): | |
return RuleViolation(message=f"Model should be pluralized.") | |
@rule | |
def have_primary_key(model: Model) -> RuleViolation | None: | |
""" | |
Each model should have a primary key. | |
The primary key of a model should be named <object>_id, for example, account_id. | |
This makes it easier to know what id is being referenced in downstream joined models. | |
""" | |
if len([c.name for c in model.columns if re.match(".*_id$", c.name)]) == 0: | |
return RuleViolation(message=f"Model should have a primary key like <object>_id.") | |
@rule | |
def model_use_underscores(model: Model) -> RuleViolation | None: | |
""" | |
Use underscores for naming dbt models; avoid dots. | |
ok: models_without_dots | |
ng: models.with.dots | |
""" | |
if "." in model.name: | |
return RuleViolation(message=f"Use underscores for naming dbt models; avoid dots.") | |
@rule | |
def key_is_string_type(model: Model) -> RuleViolation | None: | |
""" | |
Keys should be string data types. | |
""" | |
ng_cols = [ | |
c.name for c in model.columns if c.data_type and re.match(".*_id$", c.name) and c.data_type.lower() != "string" | |
] | |
if ng_cols: | |
return RuleViolation(message=f"Keys should be string data types: {','.join(ng_cols)}") | |
@rule | |
def column_avoid_reserved_words(model: Model, reserved_words=None) -> RuleViolation | None: | |
""" | |
Avoid reserved words as column names. | |
""" | |
if not reserved_words: | |
reserved_words = RESERVED_WORDS | |
ng_cols = [c.name for c in model.columns if c.name in reserved_words] | |
if ng_cols: | |
return RuleViolation(message=f"Avoid reserved words as column names: {','.join(ng_cols)}") | |
@rule | |
def column_name_boolean(model: Model, boolean_types=None) -> RuleViolation | None: | |
""" | |
Booleans should be prefixed with is_ or has_. | |
""" | |
if not boolean_types: | |
boolean_types = ["boolean", "bool"] | |
ng_cols = [ | |
c.name | |
for c in model.columns | |
if c.data_type | |
and c.data_type.lower() in boolean_types | |
and not (re.match("^has_.*", c.name) or re.match("^is_.*", c.name)) | |
] | |
if ng_cols: | |
return RuleViolation(message=f"Boolean columns should be prefixed with is_ or has_: {','.join(ng_cols)}") | |
@rule | |
def column_name_timestamp(model: Model, timestamp_types=None) -> RuleViolation | None: | |
""" | |
Timestamp columns should be named <event>_at(for example, created_at) and | |
should be in UTC. If a different timezone is used, this should be indicated with a suffix | |
(created_at_pt). | |
""" | |
if not timestamp_types: | |
timestamp_types = ["timestamp"] | |
ng_cols = [ | |
c.name | |
for c in model.columns | |
if c.data_type | |
and c.data_type.lower() in timestamp_types | |
and not (re.match(".*_at$", c.name) or re.match(".*_at_[^_]*$", c.name)) | |
] | |
if ng_cols: | |
return RuleViolation( | |
message=f"Timestamp columns should be named <event>_at or <event>_at_<tz>: {','.join(ng_cols)}" | |
) | |
@rule | |
def column_name_date(model: Model, date_types=None) -> RuleViolation | None: | |
""" | |
Dates should be named <event>_date. For example, created_date. | |
""" | |
if not date_types: | |
date_types = ["date"] | |
ng_cols = [ | |
c.name | |
for c in model.columns | |
if c.data_type and c.data_type.lower() in date_types and not re.match(".*_date$", c.name) | |
] | |
if ng_cols: | |
return RuleViolation(message=f"Date columns should be named <event>_date: {','.join(ng_cols)}") | |
@rule | |
def schema_snake_case(model: Model) -> RuleViolation | None: | |
""" | |
Schema name should be in snake_case. | |
""" | |
if "-" in model.schema or (re.match("[A-Z]+", model.schema) and re.match("[a-z]+", model.schema)): | |
return RuleViolation(message=f"Schema name should be in snake_case: {model.schema}") | |
@rule | |
def model_snake_case(model: Model) -> RuleViolation | None: | |
""" | |
Table name should be in snake_case. | |
""" | |
if "-" in model.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name)): | |
return RuleViolation(message=f"Table name should be in snake_case: {model.name}") | |
@rule | |
def column_snake_case(model: Model) -> RuleViolation | None: | |
""" | |
Column name should be in snake_case. | |
""" | |
ng_cols = [ | |
c.name | |
for c in model.columns | |
if "-" in c.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name)) | |
] | |
if ng_cols: | |
return RuleViolation(message=f"Column name should be in snake_case: {','.join(ng_cols)}") | |
@rule | |
def column_type_order( | |
model: Model, | |
string_types=None, | |
numeric_types=None, | |
boolean_types=None, | |
date_types=None, | |
timestamp_types=None, | |
) -> RuleViolation | None: | |
""" | |
Use a consistent ordering of data types and consider grouping and labeling columns by type, | |
as in the example below. This will minimize join errors and make it easier to read the model, as | |
well as help downstream consumers of the data understand the data types and scan models for | |
the columns they need. We prefer to use the following order: ids, strings, numerics, booleans, | |
dates, and timestamps. | |
""" | |
if not string_types: | |
string_types = ["string"] | |
if not numeric_types: | |
numeric_types = [ | |
"int64", | |
"int", | |
"smallint", | |
"integer", | |
"bigint", | |
"tinyint", | |
"byteint", | |
"numeric", | |
"decimal", | |
"bignumeric", | |
"bigdecimal", | |
"float64", | |
] | |
if not boolean_types: | |
boolean_types = ["boolean", "bool"] | |
if not date_types: | |
date_types = ["date"] | |
if not timestamp_types: | |
timestamp_types = ["timestamp"] | |
def detect_type(col): | |
if re.match(".*_id$", col.name): | |
return 0 | |
elif col.data_type.lower() in string_types: | |
return 1 | |
elif col.data_type.lower() in numeric_types: | |
return 2 | |
elif col.data_type.lower() in boolean_types: | |
return 3 | |
elif col.data_type.lower() in date_types: | |
return 4 | |
elif col.data_type.lower() in timestamp_types: | |
return 5 | |
return -1 | |
cols_types = [detect_type(c) for c in model.columns] | |
cols_types_filtered = [e for e in cols_types if e >= 0] | |
for e in zip(cols_types_filtered[0:-1], cols_types_filtered[1:]): | |
if e[0] > e[1]: | |
return RuleViolation( | |
message=f"Columns order should be ids, strings, numerics, booleans, dates, and timestamps." | |
) | |
RESERVED_WORDS = [ | |
"all", | |
"and", | |
"any", | |
"array", | |
"as", | |
"asc", | |
"assert_rows_modified", | |
"at", | |
"between", | |
"by", | |
"case", | |
"cast", | |
"collate", | |
"contains", | |
"create", | |
"cross", | |
"cube", | |
"current", | |
"default", | |
"define", | |
"desc", | |
"distinct", | |
"else", | |
"end", | |
"enum", | |
"escape", | |
"except", | |
"exclude", | |
"exists", | |
"extract", | |
"false", | |
"fetch", | |
"following", | |
"for", | |
"from", | |
"full", | |
"group", | |
"grouping", | |
"groups", | |
"hash", | |
"having", | |
"if", | |
"ignore", | |
"in", | |
"inner", | |
"intersect", | |
"interval", | |
"into", | |
"is", | |
"join", | |
"lateral", | |
"left", | |
"like", | |
"limit", | |
"lookup", | |
"merge", | |
"natural", | |
"new", | |
"no", | |
"not", | |
"null", | |
"nulls", | |
"of", | |
"on", | |
"or", | |
"order", | |
"outer", | |
"over", | |
"partition", | |
"preceding", | |
"proto", | |
"qualify", | |
"range", | |
"recursive", | |
"respect", | |
"right", | |
"rollup", | |
"rows", | |
"select", | |
"set", | |
"some", | |
"struct", | |
"tablesample", | |
"then", | |
"to", | |
"treat", | |
"true", | |
"unbounded", | |
"union", | |
"unnest", | |
"using", | |
"when", | |
"where", | |
"window", | |
"with", | |
"within", | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment