Skip to content

Instantly share code, notes, and snippets.

View hakanilter's full-sized avatar

Hakan İlter hakanilter

View GitHub Profile
@hakanilter
hakanilter / pyspark_schema_util.py
Last active March 2, 2023 16:05
PySpark schema save/load example
import json
from pyspark.sql.types import *
def save_schema_as_json(df, schema_file):
    """
    Saves dataframe schema as json

    Pretty-prints the Spark dataframe's schema (the JSON produced by
    ``StructType.json()``) and writes it to ``schema_file``.

    :param df: pyspark.sql.DataFrame whose schema is serialized.
    :param schema_file: path of the JSON file to write.
    """
    # StructType.json() returns a compact single-line JSON string; round-trip
    # through json.loads/json.dumps solely to re-serialize with 4-space indent.
    schema = df.schema.json()
    schema = json.dumps(json.loads(schema), indent=4)
    with open(schema_file, "w") as f:
        # NOTE(review): gist preview is truncated here — the body of the
        # with-block (presumably f.write(schema)) is missing from this view.
@hakanilter
hakanilter / dynamic.tf
Created May 13, 2022 12:42
Terraform dynamic replication config for an S3 bucket
# Conditionally emit a replication_configuration block on the S3 bucket:
# when var.replicate_data_bucket is true, for_each iterates once ([1]) and
# the block is rendered; otherwise ([]) it is omitted entirely.
dynamic "replication_configuration" {
  for_each = var.replicate_data_bucket ? [1] : []
  content {
    # IAM role S3 assumes to replicate objects to the destination bucket.
    role = aws_iam_role.data_backup_replication_role.arn
    rules {
      # Replicate only objects under the data/raw/ prefix.
      id     = "raw-data-replication"
      prefix = "data/raw/"
      status = "Enabled"
      # NOTE(review): gist preview is truncated here — the destination block
      # and closing braces are missing from this view.
@hakanilter
hakanilter / execute_athena_query.py
Last active March 15, 2023 16:24
Boto3 Athena Query Example
def execute_athena_query(query, database="default", timeout=30, sleep_time=10):
    """
    Start an Athena query and poll for its completion.

    :param query: SQL text to execute.
    :param database: Athena/Glue database used to resolve table names.
    :param timeout: total seconds to wait for the query (default 30).
    :param sleep_time: seconds between status polls (default 10).

    NOTE(review): `os` and `athena` (the boto3 Athena client) are not defined
    in this visible snippet — presumably module-level; verify against the
    full gist.
    """
    # Execution context: which database unqualified table names resolve in.
    context = {"Database": database}
    # Query results are written to the S3 location from the ATHENA_BUCKET
    # environment variable (os.environ[...] raises KeyError if unset).
    config = {"OutputLocation": os.environ["ATHENA_BUCKET"]}
    # Execute query
    request = athena.start_query_execution(QueryString=query, QueryExecutionContext=context, ResultConfiguration=config)
    # Wait for query result
    num_tries = int(timeout / sleep_time)  # number of polls within `timeout`
    status = athena.get_query_execution(QueryExecutionId=request["QueryExecutionId"])
    # NOTE(review): gist preview is truncated here — the polling loop that
    # uses num_tries/sleep_time is missing from this view.
@hakanilter
hakanilter / aws_eventbridge_ecs_fargate_task_scheduler.tf
Last active March 24, 2023 14:08
AWS EventBridge ECS Fargate Task Scheduler Terraform Example
# EventBridge Scheduler group that the ECS task schedule below belongs to.
resource "aws_scheduler_schedule_group" "ecs_schedule_group" {
  name = "ecs-schedule-group"
}

# Schedule that triggers the data-import ECS Fargate task.
resource "aws_scheduler_schedule" "data_import_job_schedule" {
  name       = "${var.name}-data-import-job-schedule-${var.env_name}"
  group_name = aws_scheduler_schedule_group.ecs_schedule_group.name
  # mode = "OFF" disables the flexible invocation window, so the task is
  # started exactly at the scheduled time.
  flexible_time_window {
    mode = "OFF"
    # NOTE(review): gist preview is truncated here — schedule_expression,
    # target block, and closing braces are missing from this view.
@hakanilter
hakanilter / automatic_dynamic_masking.sql
Created November 1, 2023 10:20
DBT Databricks automatic data masking macro, must be configured as a post-hook
{# Thin wrapper exposing dbt_utils' adapter-dispatched implementation of
   get_query_results_as_dict under a local macro name. #}
{% macro get_query_results_as_dict(query) %}
    {{ return(adapter.dispatch('get_query_results_as_dict', 'dbt_utils')(query)) }}
{% endmacro %}

{# Post-hook macro (per the gist description): inspects the freshly built
   relation ({{ this }}) for sensitive-looking column names in order to apply
   dynamic masking to them. #}
{% macro automatic_dynamic_masking() %}
    {# Column names treated as PII. #}
    {% set sensitive_columns = ['email', 'firstname', 'lastname', 'middlename', 'name', 'phone', 'telephone'] %}
    {# LIMIT 0 returns no rows — only the column metadata of the relation. #}
    {% set query %}
        SELECT * FROM {{ this }} LIMIT 0
    {# NOTE(review): gist preview is truncated here — the endset, the masking
       logic, and endmacro are missing from this view. #}
@hakanilter
hakanilter / jdbc_import_ssh.py
Last active November 22, 2024 13:31
Spark JDBC Import Over SSH
import os
import json
import boto3
import base64
from sshtunnel import SSHTunnelForwarder
"""
This Python class, **`JDBCImportOverSSH`**, facilitates the transfer of data from a JDBC-compatible source to a Delta table via an SSH tunnel. Here's an overview:
1. **Configuration Handling**: