takemikami · December 24, 2024 06:48
diff --git a/dbt_best_practice.py b/dbt_best_practice.py
 # dbt_score_rules/dbt_best_practice.py
 #
 # dbt-score: https://dbt-score.picnic.tech/
 #
 # How we style our dbt models - Fields and model names
 # https://docs.getdbt.com/best-practices/how-we-style/1-how-we-style-our-dbt-models

 import re
 from dbt_score import Model, RuleViolation, rule
 import inflect

 inflect_engine = inflect.engine()


 @rule
 def model_pluralized(model: Model) -> RuleViolation | None:
    """
    Models should be pluralized, for example, customers, orders, products.
    """
    if not inflect_engine.singular_noun(model.name):
        return RuleViolation(message=f"Model should be pluralized.")


 @rule
 def have_primary_key(model: Model) -> RuleViolation | None:
    """
    Each model should have a primary key.
    The primary key of a model should be named <object>_id, for example, account_id.
    This makes it easier to know what id is being referenced in downstream joined models.
    """
    if len([c.name for c in model.columns if re.match(".*_id$", c.name)]) == 0:
        return RuleViolation(message=f"Model should have a primary key like <object>_id.")


 @rule
 def model_use_underscores(model: Model) -> RuleViolation | None:
    """
    Use underscores for naming dbt models; avoid dots.
    ok: models_without_dots
    ng: models.with.dots
    """
    if "." in model.name:
        return RuleViolation(message=f"Use underscores for naming dbt models; avoid dots.")


 @rule
 def key_is_string_type(model: Model) -> RuleViolation | None:
    """
    Keys should be string data types.
    """
    ng_cols = [
        c.name for c in model.columns if c.data_type and re.match(".*_id$", c.name) and c.data_type.lower() != "string"
    ]
    if ng_cols:
        return RuleViolation(message=f"Keys should be string data types: {','.join(ng_cols)}")


 @rule
 def column_avoid_reserved_words(model: Model, reserved_words=None) -> RuleViolation | None:
    """
    Avoid reserved words as column names.
    """
    if not reserved_words:
        reserved_words = RESERVED_WORDS
    ng_cols = [c.name for c in model.columns if c.name in reserved_words]
    if ng_cols:
        return RuleViolation(message=f"Avoid reserved words as column names: {','.join(ng_cols)}")


 @rule
 def column_name_boolean(model: Model, boolean_types=None) -> RuleViolation | None:
    """
    Booleans should be prefixed with is_ or has_.
    """
    if not boolean_types:
        boolean_types = ["boolean", "bool"]
    ng_cols = [
        c.name
        for c in model.columns
        if c.data_type
        and c.data_type.lower() in boolean_types
        and not (re.match("^has_.*", c.name) or re.match("^is_.*", c.name))
    ]
    if ng_cols:
        return RuleViolation(message=f"Boolean columns should be prefixed with is_ or has_: {','.join(ng_cols)}")


 @rule
 def column_name_timestamp(model: Model, timestamp_types=None) -> RuleViolation | None:
    """
    Timestamp columns should be named <event>_at(for example, created_at) and
    should be in UTC. If a different timezone is used, this should be indicated with a suffix
    (created_at_pt).
    """
    if not timestamp_types:
        timestamp_types = ["timestamp"]
    ng_cols = [
        c.name
        for c in model.columns
        if c.data_type
        and c.data_type.lower() in timestamp_types
        and not (re.match(".*_at$", c.name) or re.match(".*_at_[^_]*$", c.name))
    ]
    if ng_cols:
        return RuleViolation(
            message=f"Timestamp columns should be named <event>_at or <event>_at_<tz>: {','.join(ng_cols)}"
        )


 @rule
 def column_name_date(model: Model, date_types=None) -> RuleViolation | None:
    """
    Dates should be named <event>_date. For example, created_date.
    """
    if not date_types:
        date_types = ["date"]
    ng_cols = [
        c.name
        for c in model.columns
        if c.data_type and c.data_type.lower() in date_types and not re.match(".*_date$", c.name)
    ]
    if ng_cols:
        return RuleViolation(message=f"Date columns should be named <event>_date: {','.join(ng_cols)}")


 @rule
 def schema_snake_case(model: Model) -> RuleViolation | None:
    """
    Schema name should be in snake_case.
    """
    if "-" in model.schema or (re.match("[A-Z]+", model.schema) and re.match("[a-z]+", model.schema)):
        return RuleViolation(message=f"Schema name should be in snake_case: {model.schema}")


 @rule
 def model_snake_case(model: Model) -> RuleViolation | None:
    """
    Table name should be in snake_case.
    """
    if "-" in model.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name)):
        return RuleViolation(message=f"Table name should be in snake_case: {model.name}")


 @rule
 def column_snake_case(model: Model) -> RuleViolation | None:
    """
    Column name should be in snake_case.
    """
    ng_cols = [
        c.name
        for c in model.columns
        if "-" in c.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name))
    ]
    if ng_cols:
        return RuleViolation(message=f"Column name should be in snake_case: {','.join(ng_cols)}")


 @rule
 def column_type_order(
    model: Model,
    string_types=None,
    numeric_types=None,
    boolean_types=None,
    date_types=None,
    timestamp_types=None,
 ) -> RuleViolation | None:
    """
    Use a consistent ordering of data types and consider grouping and labeling columns by type,
    as in the example below. This will minimize join errors and make it easier to read the model, as
    well as help downstream consumers of the data understand the data types and scan models for
    the columns they need. We prefer to use the following order: ids, strings, numerics, booleans,
    dates, and timestamps.
    """
    if not string_types:
        string_types = ["string"]
    if not numeric_types:
        numeric_types = [
            "int64",
            "int",
            "smallint",
            "integer",
            "bigint",
            "tinyint",
            "byteint",
            "numeric",
            "decimal",
            "bignumeric",
            "bigdecimal",
            "float64",
        ]
    if not boolean_types:
        boolean_types = ["boolean", "bool"]
    if not date_types:
        date_types = ["date"]
    if not timestamp_types:
        timestamp_types = ["timestamp"]

    def detect_type(col):
        if re.match(".*_id$", col.name):
            return 0
        elif col.data_type.lower() in string_types:
            return 1
        elif col.data_type.lower() in numeric_types:
            return 2
        elif col.data_type.lower() in boolean_types:
            return 3
        elif col.data_type.lower() in date_types:
            return 4
        elif col.data_type.lower() in timestamp_types:
            return 5
        return -1

    cols_types = [detect_type(c) for c in model.columns]
    cols_types_filtered = [e for e in cols_types if e >= 0]
    for e in zip(cols_types_filtered[0:-1], cols_types_filtered[1:]):
        if e[0] > e[1]:
            return RuleViolation(
                message=f"Columns order should be ids, strings, numerics, booleans, dates, and timestamps."
            )


 RESERVED_WORDS = [
    "all",
    "and",
    "any",
    "array",
    "as",
    "asc",
    "assert_rows_modified",
    "at",
    "between",
    "by",
    "case",
    "cast",
    "collate",
    "contains",
    "create",
    "cross",
    "cube",
    "current",
    "default",
    "define",
    "desc",
    "distinct",
    "else",
    "end",
    "enum",
    "escape",
    "except",
    "exclude",
    "exists",
    "extract",
    "false",
    "fetch",
    "following",
    "for",
    "from",
    "full",
    "group",
    "grouping",
    "groups",
    "hash",
    "having",
    "if",
    "ignore",
    "in",
    "inner",
    "intersect",
    "interval",
    "into",
    "is",
    "join",
    "lateral",
    "left",
    "like",
    "limit",
    "lookup",
    "merge",
    "natural",
    "new",
    "no",
    "not",
    "null",
    "nulls",
    "of",
    "on",
    "or",
    "order",
    "outer",
    "over",
    "partition",
    "preceding",
    "proto",
    "qualify",
    "range",
    "recursive",
    "respect",
    "right",
    "rollup",
    "rows",
    "select",
    "set",
    "some",
    "struct",
    "tablesample",
    "then",
    "to",
    "treat",
    "true",
    "unbounded",
    "union",
    "unnest",
    "using",
    "when",
    "where",
    "window",
    "with",
    "within",
 ]
	# dbt_score_rules/dbt_best_practice.py
	#
	# dbt-score: https://dbt-score.picnic.tech/
	#
	# How we style our dbt models - Fields and model names
	# https://docs.getdbt.com/best-practices/how-we-style/1-how-we-style-our-dbt-models

	import re
	from dbt_score import Model, RuleViolation, rule
	import inflect

	inflect_engine = inflect.engine()


	@rule
	def model_pluralized(model: Model) -> RuleViolation \| None:
	"""
	Models should be pluralized, for example, customers, orders, products.
	"""
	if not inflect_engine.singular_noun(model.name):
	return RuleViolation(message=f"Model should be pluralized.")


	@rule
	def have_primary_key(model: Model) -> RuleViolation \| None:
	"""
	Each model should have a primary key.
	The primary key of a model should be named <object>_id, for example, account_id.
	This makes it easier to know what id is being referenced in downstream joined models.
	"""
	if len([c.name for c in model.columns if re.match(".*_id$", c.name)]) == 0:
	return RuleViolation(message=f"Model should have a primary key like <object>_id.")


	@rule
	def model_use_underscores(model: Model) -> RuleViolation \| None:
	"""
	Use underscores for naming dbt models; avoid dots.
	ok: models_without_dots
	ng: models.with.dots
	"""
	if "." in model.name:
	return RuleViolation(message=f"Use underscores for naming dbt models; avoid dots.")


	@rule
	def key_is_string_type(model: Model) -> RuleViolation \| None:
	"""
	Keys should be string data types.
	"""
	ng_cols = [
	c.name for c in model.columns if c.data_type and re.match(".*_id$", c.name) and c.data_type.lower() != "string"
	]
	if ng_cols:
	return RuleViolation(message=f"Keys should be string data types: {','.join(ng_cols)}")


	@rule
	def column_avoid_reserved_words(model: Model, reserved_words=None) -> RuleViolation \| None:
	"""
	Avoid reserved words as column names.
	"""
	if not reserved_words:
	reserved_words = RESERVED_WORDS
	ng_cols = [c.name for c in model.columns if c.name in reserved_words]
	if ng_cols:
	return RuleViolation(message=f"Avoid reserved words as column names: {','.join(ng_cols)}")


	@rule
	def column_name_boolean(model: Model, boolean_types=None) -> RuleViolation \| None:
	"""
	Booleans should be prefixed with is_ or has_.
	"""
	if not boolean_types:
	boolean_types = ["boolean", "bool"]
	ng_cols = [
	c.name
	for c in model.columns
	if c.data_type
	and c.data_type.lower() in boolean_types
	and not (re.match("^has_.", c.name) or re.match("^is_.", c.name))
	]
	if ng_cols:
	return RuleViolation(message=f"Boolean columns should be prefixed with is_ or has_: {','.join(ng_cols)}")


	@rule
	def column_name_timestamp(model: Model, timestamp_types=None) -> RuleViolation \| None:
	"""
	Timestamp columns should be named <event>_at(for example, created_at) and
	should be in UTC. If a different timezone is used, this should be indicated with a suffix
	(created_at_pt).
	"""
	if not timestamp_types:
	timestamp_types = ["timestamp"]
	ng_cols = [
	c.name
	for c in model.columns
	if c.data_type
	and c.data_type.lower() in timestamp_types
	and not (re.match("._at$", c.name) or re.match("._at_[^_]*$", c.name))
	]
	if ng_cols:
	return RuleViolation(
	message=f"Timestamp columns should be named <event>_at or <event>_at_<tz>: {','.join(ng_cols)}"
	)


	@rule
	def column_name_date(model: Model, date_types=None) -> RuleViolation \| None:
	"""
	Dates should be named <event>_date. For example, created_date.
	"""
	if not date_types:
	date_types = ["date"]
	ng_cols = [
	c.name
	for c in model.columns
	if c.data_type and c.data_type.lower() in date_types and not re.match(".*_date$", c.name)
	]
	if ng_cols:
	return RuleViolation(message=f"Date columns should be named <event>_date: {','.join(ng_cols)}")


	@rule
	def schema_snake_case(model: Model) -> RuleViolation \| None:
	"""
	Schema name should be in snake_case.
	"""
	if "-" in model.schema or (re.match("[A-Z]+", model.schema) and re.match("[a-z]+", model.schema)):
	return RuleViolation(message=f"Schema name should be in snake_case: {model.schema}")


	@rule
	def model_snake_case(model: Model) -> RuleViolation \| None:
	"""
	Table name should be in snake_case.
	"""
	if "-" in model.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name)):
	return RuleViolation(message=f"Table name should be in snake_case: {model.name}")


	@rule
	def column_snake_case(model: Model) -> RuleViolation \| None:
	"""
	Column name should be in snake_case.
	"""
	ng_cols = [
	c.name
	for c in model.columns
	if "-" in c.name or (re.match("[A-Z]+", model.name) and re.match("[a-z]+", model.name))
	]
	if ng_cols:
	return RuleViolation(message=f"Column name should be in snake_case: {','.join(ng_cols)}")


	@rule
	def column_type_order(
	model: Model,
	string_types=None,
	numeric_types=None,
	boolean_types=None,
	date_types=None,
	timestamp_types=None,
	) -> RuleViolation \| None:
	"""
	Use a consistent ordering of data types and consider grouping and labeling columns by type,
	as in the example below. This will minimize join errors and make it easier to read the model, as
	well as help downstream consumers of the data understand the data types and scan models for
	the columns they need. We prefer to use the following order: ids, strings, numerics, booleans,
	dates, and timestamps.
	"""
	if not string_types:
	string_types = ["string"]
	if not numeric_types:
	numeric_types = [
	"int64",
	"int",
	"smallint",
	"integer",
	"bigint",
	"tinyint",
	"byteint",
	"numeric",
	"decimal",
	"bignumeric",
	"bigdecimal",
	"float64",
	]
	if not boolean_types:
	boolean_types = ["boolean", "bool"]
	if not date_types:
	date_types = ["date"]
	if not timestamp_types:
	timestamp_types = ["timestamp"]

	def detect_type(col):
	if re.match(".*_id$", col.name):
	return 0
	elif col.data_type.lower() in string_types:
	return 1
	elif col.data_type.lower() in numeric_types:
	return 2
	elif col.data_type.lower() in boolean_types:
	return 3
	elif col.data_type.lower() in date_types:
	return 4
	elif col.data_type.lower() in timestamp_types:
	return 5
	return -1

	cols_types = [detect_type(c) for c in model.columns]
	cols_types_filtered = [e for e in cols_types if e >= 0]
	for e in zip(cols_types_filtered[0:-1], cols_types_filtered[1:]):
	if e[0] > e[1]:
	return RuleViolation(
	message=f"Columns order should be ids, strings, numerics, booleans, dates, and timestamps."
	)


	RESERVED_WORDS = [
	"all",
	"and",
	"any",
	"array",
	"as",
	"asc",
	"assert_rows_modified",
	"at",
	"between",
	"by",
	"case",
	"cast",
	"collate",
	"contains",
	"create",
	"cross",
	"cube",
	"current",
	"default",
	"define",
	"desc",
	"distinct",
	"else",
	"end",
	"enum",
	"escape",
	"except",
	"exclude",
	"exists",
	"extract",
	"false",
	"fetch",
	"following",
	"for",
	"from",
	"full",
	"group",
	"grouping",
	"groups",
	"hash",
	"having",
	"if",
	"ignore",
	"in",
	"inner",
	"intersect",
	"interval",
	"into",
	"is",
	"join",
	"lateral",
	"left",
	"like",
	"limit",
	"lookup",
	"merge",
	"natural",
	"new",
	"no",
	"not",
	"null",
	"nulls",
	"of",
	"on",
	"or",
	"order",
	"outer",
	"over",
	"partition",
	"preceding",
	"proto",
	"qualify",
	"range",
	"recursive",
	"respect",
	"right",
	"rollup",
	"rows",
	"select",
	"set",
	"some",
	"struct",
	"tablesample",
	"then",
	"to",
	"treat",
	"true",
	"unbounded",
	"union",
	"unnest",
	"using",
	"when",
	"where",
	"window",
	"with",
	"within",
	]