Skip to content

Instantly share code, notes, and snippets.

@karpanGit
karpanGit / python function, multiple unpackings.py
Last active January 9, 2022 14:53
python function, multiple unpackings
# experiment with multiple parameter unpacking (lists)
def f(*args):
    """Return the positional arguments exactly as received, as a tuple."""
    return args
# several iterables (a tuple, a set and a list) are unpacked in a single call;
# their elements are passed as consecutive positional arguments
res = f(*(1,3,4), *{5,6,7}, *[10,20,30])
print(res)
# prints (1, 3, 4, 5, 6, 7, 10, 20, 30)
# NOTE: sets are unordered, so the relative position of 5, 6 and 7 is not guaranteed
# same experiment, this time with a lambda that collects its arguments in *args
f = lambda *args: args
res = f(*(1,3,4), *{5,6,7}, *[10,20,30])
print(res)
# prints (1, 3, 4, 5, 6, 7, 10, 20, 30)
# NOTE: sets are unordered, so the relative position of 5, 6 and 7 is not guaranteed
@karpanGit
karpanGit / understanding numpy broadcasting.py
Created December 7, 2021 18:04
understanding numpy broadcasting
# experiment with numpy broadcasting
import numpy as np
# shapes: (2, 2) + (2,) -> (2, 2) + (1, 2) -> (2, 2) + (2, 2)
# b first gains a leading axis of length 1, which is then stretched to length 2
print('example 1 2x2 + 2, -> 2x2 + 1x2 -> 2x2 + 2x2')
a = np.array([[1, 2], [3, 4]])
b = np.array([10, 20])
print(a.shape, b.shape)
print(a)
print(b)
# broadcast_arrays returns both operands expanded to the common shape
a2, b2 = np.broadcast_arrays(a, b)
@karpanGit
karpanGit / pandas, reorder categorical.py
Created November 13, 2021 13:26
pandas, reorder categorical
# experiment with ordering and sorting
import pandas as pd
# build an ordered categorical from the labels a, b, c, a and wrap it in a series
s = pd.Series(pd.Categorical(list('abca'), ordered=True))
# show the series, then its categories, then the integer code of every element
print(s, s.cat.categories, s.cat.codes, sep='\n')
@karpanGit
karpanGit / pandas, order of groups and order within groups in groupby
Created November 6, 2021 21:01
pandas, order of groups and order within groups in groupby.py
# experiment with groupby cumcount and ngroup
import pandas as pd
import numpy as np
df = pd.DataFrame({'a': list('aaaabbba'), 'b': 1})
# group once and reuse the object:
#   cumcount() numbers the rows inside each group, starting at 0
#   ngroup() numbers the groups themselves
by_a = df.groupby(['a'])
summary = pd.concat(
    [
        by_a.cumcount().rename('order within group'),
        by_a.ngroup().rename('group order'),
    ],
    axis='columns',
)
# a bare expression is only displayed in a REPL/notebook; print it so that
# running the file as a script shows the result as well
print(summary)
@karpanGit
karpanGit / python, daylight time savings.py
Last active October 30, 2021 19:40
python, daylight time savings
# dateutil understands daylight saving time, but adding a timedelta is wall-clock arithmetic: it does not account for the DST shift
from datetime import datetime, timedelta
from dateutil import tz, parser
# parse a naive datetime (no timezone attached yet)
tm = parser.parse('27 March 2021 00:00')
# attach the Helsinki zone; replace() keeps the clock fields unchanged
tm = tm.replace(tzinfo=tz.gettz('Europe/Helsinki'))
print(tm, tm.tzinfo)
# adding a timedelta to an aware datetime moves the wall clock by 48 hours
# and makes no timezone adjustment
# NOTE(review): DST in Helsinki presumably started on 28 March 2021, inside this
# window, so the elapsed real time would be 47 hours rather than 48 - confirm
tm2 = tm + timedelta(hours=48)
print(tm2)
@karpanGit
karpanGit / python, convert timezone.py
Last active October 30, 2021 16:56
python, convert timezone
# convert timezone (vanilla python)
from datetime import datetime, timezone, timedelta
# get the current local time, including the timezone
now = datetime.now().astimezone()
print(now)
# replace() only swaps the tzinfo label: the clock fields stay as they are, so
# this is a different instant unless the local zone already has a zero offset
print(now.replace(tzinfo=timezone.utc))
# astimezone() converts properly: same instant, clock fields shifted to UTC
# (no round trip through a float timestamp, so microseconds are kept exactly)
print(now.astimezone(timezone.utc))
@karpanGit
karpanGit / pandas, groupby, aggregation (named).py
Created September 5, 2021 09:32
pandas, groupby, aggregation (named)
import pandas as pd
import numpy as np
import random
# build a frame of n rows: a random label column plus two random numeric columns
n = 50
df = pd.DataFrame(
    {
        'a': random.choices(['foo', 'boo', 'bah'], k=n),
        'b': np.random.rand(n),
        'c': np.random.rand(n) * 10,
    }
)
# group the rows by their label
grouped = df.groupby(by='a')
# example 1, one column, one function, name(s) not specified
@karpanGit
karpanGit / pandas, multi index, select with cross section or index slice.py
Created September 4, 2021 12:59
pandas, multi index, select with cross section or index slice
import pandas as pd
import numpy as np
# create a dataset with multi indices for both index and columns
def indx_names(prefix: str, num: int) -> list:
    """Return ``num`` labels made of ``prefix`` followed by a padded counter.

    The counter runs from 0 to ``num - 1`` and is right-aligned in a field of
    width 2, padded with '_'; e.g. ``indx_names('A', 3)`` gives
    ``['A_0', 'A_1', 'A_2']``.
    """
    # '_>2' in the format spec: fill character '_', right-align, width 2
    return [f'{prefix}{i:_>2}' for i in range(num)]
# row index: cartesian product of four label levels
idx = pd.MultiIndex.from_product([
    indx_names('A', 4),
    indx_names('B', 2),
    indx_names('C', 4),
    indx_names('D', 2),
])
# column index: four explicit (lvl0, lvl1) pairs
cols = pd.MultiIndex.from_tuples(
    [('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')],
    names=['lvl0', 'lvl1'],
)
# fill the frame with consecutive integers, one row per index entry
df = pd.DataFrame(
    np.arange(len(idx) * len(cols)).reshape(len(idx), -1),
    index=idx,
    columns=cols,
)
@karpanGit
karpanGit / pandas, slicing, multiindex.py
Created September 4, 2021 12:15
pandas, slicing, multiindex
import pandas as pd
import numpy as np
# create a dataset with multi indices for both index and columns
def indx_names(prefix: str, num: int) -> list:
    """Return ``num`` labels made of ``prefix`` followed by a padded counter.

    The counter runs from 0 to ``num - 1`` and is right-aligned in a field of
    width 2, padded with '_'; e.g. ``indx_names('A', 3)`` gives
    ``['A_0', 'A_1', 'A_2']``.
    """
    # '_>2' in the format spec: fill character '_', right-align, width 2
    return [f'{prefix}{i:_>2}' for i in range(num)]
# rows: every combination of the A, B, C and D labels
idx = pd.MultiIndex.from_product(
    [
        indx_names('A', 4),
        indx_names('B', 2),
        indx_names('C', 4),
        indx_names('D', 2),
    ]
)
# columns: two levels, named lvl0 and lvl1
cols = pd.MultiIndex.from_tuples(
    [('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')],
    names=['lvl0', 'lvl1'],
)
# consecutive integers laid out as len(idx) rows by len(cols) columns
df = pd.DataFrame(
    np.arange(len(idx) * len(cols)).reshape((len(idx), len(cols))),
    index=idx,
    columns=cols,
)
@karpanGit
karpanGit / pyspark, create dataframe with columns of struct (Row) type.py
Last active July 29, 2021 15:17
pyspark, create dataframe with columns of struct (Row) type
# experiment with complex elements
import os
import sys

# use the same interpreter for the pyspark workers and for the driver
os.environ.update({
    'PYSPARK_PYTHON': '/bin/python3',
    'PYSPARK_DRIVER_PYTHON': '/bin/python3',
})

# setup the python path: put the spark client and its bundled py4j in front
sys.path = [
    '/usr/hdp/current/spark2-client/python',
    '/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip',
    *sys.path,
]
from pyspark.sql import SparkSession
from pyspark.sql import Row