Dminor7 / azure_blob_storage_dataframe.py
Last active November 11, 2023 23:09
Upload a DataFrame to Azure Blob Storage as a CSV file and download a CSV file back into a DataFrame. Azure Python SDK v12.5.0
import os, uuid
from io import BytesIO
from datetime import datetime
from urllib.parse import urlparse
from azure.storage.blob import BlobServiceClient
import pandas as pd
def azure_upload_df(container=None, dataframe=None, filename=None):
    """
    Upload DataFrame to Azure Blob Storage for given container
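The preview is cut off mid-docstring. Below is a minimal sketch of the upload half, assuming the connection string is read from an AZURE_STORAGE_CONNECTION_STRING environment variable; the variable name, the .csv suffix, and overwrite=True are assumptions, not taken from the gist.

import os
from io import BytesIO

import pandas as pd
from azure.storage.blob import BlobServiceClient


def azure_upload_df(container=None, dataframe=None, filename=None):
    """Upload a DataFrame to the given container as <filename>.csv."""
    if container and filename and dataframe is not None:
        # Assumption: the connection string is supplied via an environment variable.
        service = BlobServiceClient.from_connection_string(
            os.environ["AZURE_STORAGE_CONNECTION_STRING"]
        )
        blob = service.get_blob_client(container=container, blob=f"{filename}.csv")
        buffer = BytesIO()
        dataframe.to_csv(buffer, index=False)
        buffer.seek(0)
        blob.upload_blob(buffer.getvalue(), overwrite=True)

The download direction can mirror this: blob.download_blob().readall() fed into pd.read_csv(BytesIO(...)).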
Dminor7 / export_to_sheets.py
Last active October 17, 2020 23:13
Python pandas DataFrame to Google Sheets [Read, Write, and Append]
import gspread_dataframe as gd
import gspread as gs
gc = gs.service_account(filename="credentials.json")
def export_to_sheets(sheet_name, df, mode='r'):
    ws = gc.open("Hashnode").worksheet(sheet_name)
    if mode == 'w':
        ws.clear()
        gd.set_with_dataframe(worksheet=ws, dataframe=df, include_index=False, include_column_header=True, resize=True)
        return True
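    # The preview stops after the write branch; a hedged completion of the
    # append and read modes follows. The append strategy (re-reading the sheet
    # to find the next free row) is an assumption, not taken from the gist.
    elif mode == 'a':
        # Grow the grid, then write the new rows without a header row,
        # starting just below the existing data (the header sits in row 1).
        existing = gd.get_as_dataframe(worksheet=ws).dropna(how="all")
        ws.add_rows(df.shape[0])
        gd.set_with_dataframe(
            worksheet=ws,
            dataframe=df,
            include_index=False,
            include_column_header=False,
            row=len(existing) + 2,
            resize=False,
        )
        return True
    else:
        # Default mode 'r': read the worksheet back into a DataFrame.
        return gd.get_as_dataframe(worksheet=ws)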
Dminor7 / google_sheets.py
Last active May 8, 2021 17:11
Python script to write data to Google Sheets
"""
# Worksheet header to data key mapping
header_to_key = {
    "Name": "name",
    "Age": "age",
    "Weight(Kg)": "weight_kg",
}
# List of objects to load into worksheet
data = [
    {
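        # The record fields below are hypothetical placeholders matching
        # header_to_key; the preview cuts off before the gist's real data.
        "name": "John",
        "age": 30,
        "weight_kg": 72.5,
    },
]

# A hedged sketch of the load step (the spreadsheet and worksheet names are
# placeholders): build rows in the worksheet's header order and push them
# with gspread.
import gspread

gc = gspread.service_account(filename="credentials.json")
ws = gc.open("Hashnode").worksheet("people")
headers = list(header_to_key.keys())
rows = [[record.get(header_to_key[h], "") for h in headers] for record in data]
ws.update([headers] + rows)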
Dminor7 / gce-persistent-volume.yml
Created July 24, 2021 11:51
GCE Persistent Disk Volume for KubernetesPodOperator in Cloud Composer
apiVersion: v1
kind: PersistentVolume
metadata:
  name: my-volume
spec:
  storageClassName: ""
  capacity:
    storage: 10G
  accessModes:
    - ReadOnlyMany
Dminor7 / dag.py
Created July 24, 2021 12:03
KubernetesPodOperator with a volume mounted read-only.
from datetime import datetime
from airflow import models
from kubernetes.client import models as k8s
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import (
    KubernetesPodOperator,
)
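The preview shows only the imports. A minimal sketch of how the operator can mount the volume read-only, assuming the GCE disk is exposed through a PersistentVolumeClaim; the claim name, mount path, image, and DAG settings are placeholders, not taken from the gist.

volume = k8s.V1Volume(
    name="my-volume",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name="my-claim", read_only=True
    ),
)
volume_mount = k8s.V1VolumeMount(name="my-volume", mount_path="/data", read_only=True)

with models.DAG(
    dag_id="kubernetes_pod_operator_read_only_volume",
    start_date=datetime(2021, 7, 24),
    schedule_interval=None,
) as dag:
    read_from_disk = KubernetesPodOperator(
        task_id="read-from-disk",
        name="read-from-disk",
        namespace="default",
        image="python:3.8-slim",
        cmds=["ls", "/data"],
        volumes=[volume],
        volume_mounts=[volume_mount],
    )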
Dminor7 / slack-notification-dag.py
Last active September 9, 2021 18:21
Airflow DAG that sends a Slack notification for failed tasks.
from datetime import datetime
import uuid
from airflow.providers.slack.hooks.slack_webhook import SlackWebhookHook
from airflow.operators.python_operator import PythonOperator
from airflow.utils.state import State
from airflow import DAG
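The preview shows only the imports. A minimal sketch of the pattern they suggest: a final PythonOperator with trigger_rule="all_done" that collects failed task instances from the DagRun and posts a message through SlackWebhookHook. The connection id, message format, and hook call assume the 2021-era Slack provider interface (http_conn_id, message, execute()) and are not taken from the gist.

def notify_failed_tasks(**context):
    dag_run = context["dag_run"]
    failed = dag_run.get_task_instances(state=State.FAILED)
    if failed:
        message = "Failed tasks in {}: {}".format(
            dag_run.dag_id, ", ".join(ti.task_id for ti in failed)
        )
        # Assumption: an Airflow connection named "slack_webhook" holds the webhook.
        SlackWebhookHook(http_conn_id="slack_webhook", message=message).execute()


with DAG(
    dag_id="slack-notification-dag",
    start_date=datetime(2021, 9, 1),
    schedule_interval=None,
) as dag:
    notify = PythonOperator(
        task_id="notify_slack_on_failure",
        python_callable=notify_failed_tasks,
        trigger_rule="all_done",  # run even when upstream tasks have failed
    )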
Dminor7 / flatten.py
Created November 9, 2021 08:22
Flatten a nested dictionary that sits inside a list or contains list items. #flatten #python
"""
Using a generator, we can `yield` key/value pairs when the value is not an instance of `list` or `dict`;
otherwise we recursively call the function to flatten the object. To preserve empty `list` and `dict`
values, we add an additional truthiness check on obj alongside the type check.
"""
def flatten(obj, prefix=[], sep="_"):
if isinstance(obj, list) and obj:
for i, o in enumerate(obj):
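            # Hedged completion of the truncated body: recurse into list items
            # and dict values, and yield everything else (including empty
            # lists/dicts) with the joined key path.
            yield from flatten(o, prefix + [str(i)], sep)
    elif isinstance(obj, dict) and obj:
        for k, v in obj.items():
            yield from flatten(v, prefix + [str(k)], sep)
    else:
        yield sep.join(prefix), obj


# Example: dict(flatten({"a": {"b": [1, 2], "c": {}}})) ->
# {"a_b_0": 1, "a_b_1": 2, "a_c": {}}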
Dminor7 / customer_mean_event_vector.sql
Last active May 17, 2023 06:21
Calculate the average of vectors in Presto
SELECT
    customer_id,
    reduce(
        event_vectors,
        repeat(0.0, 512), -- dimension of vector here it is 512
        (sum_array, element_array) -> zip_with(sum_array, element_array, (s, e) -> s + e),
        state_array -> transform(state_array, s -> s / cardinality(event_vectors))
    ) as mean_event_vector
FROM (
    SELECT
Dminor7 / nested_parquet_to_bigquery.py
Last active August 21, 2023 07:07
Functions for moving data between a DataFrame and Google Cloud Storage/BigQuery using the Arrow and GCS libraries.
"""
Check this Stack Overflow question: https://stackoverflow.com/questions/68303327/unnecessary-list-item-nesting-in-bigquery-schemas-from-pyarrow-upload-dataframe
Check this GitHub issue: https://github.com/googleapis/python-bigquery/issues/19
"""
from google.cloud.bigquery._pandas_helpers import *
from google.cloud.bigquery import _helpers
from google.cloud import storage
from google.cloud import bigquery
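The preview shows only the imports (the gist reuses BigQuery's private pandas helpers). A minimal sketch of one way to move a nested DataFrame into BigQuery via GCS while avoiding the extra list/element nesting, assuming pyarrow's use_compliant_nested_type option; the bucket, blob, and table names are placeholders, and this is not necessarily the gist's exact approach.

import pyarrow as pa
import pyarrow.parquet as pq


def dataframe_to_bigquery(df, bucket_name, blob_name, table_id):
    # Write Parquet with the compliant nested layout so repeated fields load
    # into BigQuery without an extra "list"/"element" level.
    table = pa.Table.from_pandas(df)
    buffer = pa.BufferOutputStream()
    pq.write_table(table, buffer, use_compliant_nested_type=True)

    storage.Client().bucket(bucket_name).blob(blob_name).upload_from_string(
        buffer.getvalue().to_pybytes(), content_type="application/octet-stream"
    )

    job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)
    load_job = bigquery.Client().load_table_from_uri(
        f"gs://{bucket_name}/{blob_name}", table_id, job_config=job_config
    )
    load_job.result()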
Dminor7 / generate_30min_time_range.py
Created November 29, 2023 06:19
Generate a time range (start_time, end_time) from a given timestamp, aligned to the last completed 30-minute interval, using the pendulum package
import pendulum
def generate_30min_time_range(timestamp_str):
    timestamp = pendulum.parse(timestamp_str)
    start_time = timestamp.start_of('hour')
    if timestamp.minute >= 30:
        start_time = start_time.add(minutes=30)
    result = []
    while start_time <= timestamp:
        result.append(start_time)
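        # Hedged completion of the truncated loop: advance in 30-minute steps,
        # then return the last fully completed interval. The advancing step and
        # the return value are assumptions, not taken from the gist.
        start_time = start_time.add(minutes=30)
    return result[-1].subtract(minutes=30), result[-1]


# Example: generate_30min_time_range("2023-11-29 06:19:00")
# returns (2023-11-29 05:30:00, 2023-11-29 06:00:00)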