Marcell Biemann (mbiemann) - GitHub Gists

Glue PySpark Job

Connect to AWS using SAML2AWS

saml2aws login --skip-prompt --disable-keychain

role=arn:aws:iam::xxx:role/xxx
credentials=$(aws sts assume-role --role-arn "$role" --role-session-name tmp --profile saml)
export AWS_ACCESS_KEY_ID=$(echo "$credentials" | grep -o '"AccessKeyId": "[^"]*' | cut -d '"' -f 4)
# The gist preview is truncated here; the remaining exports presumably follow the same pattern
export AWS_SECRET_ACCESS_KEY=$(echo "$credentials" | grep -o '"SecretAccessKey": "[^"]*' | cut -d '"' -f 4)
export AWS_SESSION_TOKEN=$(echo "$credentials" | grep -o '"SessionToken": "[^"]*' | cut -d '"' -f 4)
@mbiemann
mbiemann / git_copy_between_branches.sh
Last active February 15, 2024 09:57
Git: Copy a file from one branch to another
git checkout <target_branch>
git diff <source_branch> --name-only | cat
git checkout <source_branch> -- <file_name>
@mbiemann
mbiemann / aws_s3_list_objects_bulletproof.py
Last active April 27, 2023 10:31
List all S3 objects, saving partial results to avoid restarting from the beginning if interrupted
bucket = 'bucket_name'
prefix = 'path/folder/'
file_meta = './local/folder_meta.json'
file_detail = './local/folder_detail.json'
file_partial = './local/folder_partial.json'
stop_key = 'path/folder/partition9/filename9999.csv.gz'
data_meta = {}
data_detail = {}
try:
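The gist preview cuts off at the try block. Below is a minimal sketch of the resumable-listing idea, assuming boto3 and the variables defined above; the restart logic, partial-file format, and stop-key check are reconstructions, not the author's exact code.

import json
import os
import boto3

s3 = boto3.client("s3")

# Resume from a previous partial run if one exists (hypothetical restart logic)
start_after = None
if os.path.exists(file_partial):
    with open(file_partial) as f:
        data_detail = json.load(f)
    if data_detail:
        start_after = max(data_detail)  # S3 lists keys lexicographically, so resume after the last key seen

kwargs = {"Bucket": bucket, "Prefix": prefix}
if start_after:
    kwargs["StartAfter"] = start_after

for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
    for obj in page.get("Contents", []):
        data_detail[obj["Key"]] = obj["Size"]
    # Persist progress after every page so an interrupt loses at most one page
    with open(file_partial, "w") as f:
        json.dump(data_detail, f)
    if stop_key in data_detail:
        break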
CloudWatch Logs Insights query: parse log level, timestamp, UUID and event payload from @message, then count events
filter @message like '[INFO]'
| filter @message not like 'Found credentials in environment'
| fields @timestamp, @message, substr(@message, 0, 6) as msg_level, substr(@message, 7, 24) as msg_timestamp, substr(@message, 32, 33) as msg_uuid, substr(@message, 69) as msg_body
| parse msg_body '"event": "*"' as event_str
| parse msg_body '"event": {*}' as event_json
| parse event_json '"messageType": "*"' as messageType
| stats count(*) as qty by event_str, messageType
| sort qty desc
Python log helpers printing a consistent timestamped format
from json import dumps
from datetime import datetime

# ==============================================================================

def info(msg):
    print(f"{datetime.now().isoformat(' ')[:19]} [INFO] {msg}")

def debug(msg):
    print(f"{datetime.now().isoformat(' ')[:19]} [DEBUG] {msg}")
@mbiemann
mbiemann / databricks-kinesis-spark.py
Last active April 24, 2022 20:23
Databricks Kinesis PySpark - shardsPerTask
import boto3
print("check || cluster_workes >= target || cluster_workes >= stream_shards / shards_per_task || cluster_workes >= stream_shards / (stream_shards / cluster_workes)")
cluster_workes = int(spark.sparkContext.getConf().get("spark.databricks.clusterUsageTags.clusterWorkers"))
stream_shards = boto3.client("kinesis").describe_stream_summary(StreamName=source_stream)["StreamDescriptionSummary"]["OpenShardCount"]
shards_per_task = int(stream_shards / cluster_workes)
if shards_per_task < 1:
raise Exception(f"Sizing Error: Cluster Workers can't be {cluster_workes}. It must be up to {stream_shards}.")
target = int(stream_shards / shards_per_task)
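A minimal sketch of where the computed value would plug in, assuming the Databricks Kinesis connector and its shardsPerTask option; source_stream comes from the code above, and the region and initial position are assumptions.

# Hypothetical streaming read sized with the computed shardsPerTask
df = (
    spark.readStream.format("kinesis")
    .option("streamName", source_stream)
    .option("region", "eu-west-1")  # assumed region
    .option("initialPosition", "latest")  # assumed starting position
    .option("shardsPerTask", str(shards_per_task))
    .load()
)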
@mbiemann
mbiemann / emr_pyspark_redshift_boto3.py
Created March 28, 2022 19:33
EMR PySpark Redshift Data API Boto3
import boto3
import time
redshift = boto3.client("redshift-data")
def redshift_sql(query):
    resp = redshift.execute_statement(
        ClusterIdentifier="xxx",
        Database="xxx",
        DbUser="xxx",
        Sql=query,
    )
    # The gist preview is truncated here; a plausible completion polls until the statement finishes
    while True:
        desc = redshift.describe_statement(Id=resp["Id"])
        if desc["Status"] in ("FINISHED", "FAILED", "ABORTED"):
            return desc
        time.sleep(1)
@mbiemann
mbiemann / Makefile
Last active November 15, 2022 12:25
Some Tips and Examples for Makefiles
#!make
MAKEFLAGS += --always-make
MAKEFILE_VERSION = 0.0.2 2022-11-15 12:17 PM
# ==============================================================================
.EXPORT_ALL_VARIABLES:
ifneq ($(wildcard ~/.makeconfig),)
# The gist preview is truncated; the conditional presumably includes the user config
include ~/.makeconfig
endif
@mbiemann
mbiemann / loop.sh
Created March 12, 2022 16:31
[SHELL] Using for to iterate over an array variable
WHATIHAVE=("a dog" "three cats" debts)
# Quote the expansion so multi-word items aren't split
for i in "${WHATIHAVE[@]}"
do
  echo "I have ${i}."
done