karpanGit’s gists

karpanGit / python function, multiple unpackings.py

Last active January 9, 2022 14:53

python function, multiple unpackings

	# experiment with multiple parameter unpacking (tuple, set and list in one call)
	def f(*args):
	    """Return the positional arguments exactly as received, as a tuple."""
	    return args
	# every iterable needs its own * to be spread into separate positional arguments;
	# without the * the tuple and the set would each arrive as one single argument
	res = f(*(1, 3, 4), *{5, 6, 7}, *[10, 20, 30])
	print(res)
	# prints (1, 3, 4, 5, 6, 7, 10, 20, 30)
	# NOTE: a set has no guaranteed iteration order, so 5, 6, 7 could in principle appear in another order
	# the same collector written as a lambda
	f = lambda *args: args
	res = f(*(1, 3, 4), *{5, 6, 7}, *[10, 20, 30])
	print(res)
	# prints (1, 3, 4, 5, 6, 7, 10, 20, 30)

karpanGit / understanding numpy broadcasting.py

Created December 7, 2021 18:04

understanding numpy broadcasting

	# experiment with numpy broadcasting
	import numpy as np
	# 2x2 + 2, -> 2x2 + 1x2 -> 2x2 + 2x2
	# (the 1-d operand first gets a leading axis of length 1, then that axis is stretched to length 2)
	print('example 1 2x2 + 2, -> 2x2 + 1x2 -> 2x2 + 2x2')
	a = np.array([[1, 2], [3, 4]])
	b = np.array([10, 20])
	print(a.shape, b.shape)
	print(a)
	print(b)
	# broadcast_arrays returns both operands expanded to their common shape
	a2, b2 = np.broadcast_arrays(a, b)

karpanGit / pandas. reorder cetegorical.py

Created November 13, 2021 13:26

pandas, reorder categorical



	# experiment with ordering and sorting
	import pandas as pd
	# build the ordered categorical first, then wrap it in a series
	letters = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
	s = pd.Series(letters)
	# show the series, then its categories, then the integer code stored for every value
	for view in (s, s.cat.categories, s.cat.codes):
	    print(view)

karpanGit / pandas, order of groups and order within groups in groupby

Created November 6, 2021 21:01

pandas, order of groups and order within groups in groupby.py

	# experiment with groupby cumcount and ngroup
	import pandas as pd
	import numpy as np
	df = pd.DataFrame({'a': list('aaaabbba'), 'b': 1})
	# cumcount numbers the rows inside each group, ngroup numbers the groups themselves
	position_in_group = df.groupby(['a']).cumcount().rename('order within group')
	group_number = df.groupby(['a']).ngroup().rename('group order')
	# side by side view of both numberings (bare expression, only displayed in an interactive session)
	pd.concat([position_in_group, group_number], axis='columns')

karpanGit / python, daylight time savings.py

Last active October 30, 2021 19:40

python, daylight time savings

	# dateutil understands daylight saving time, but timedelta is not applied correctly
	from datetime import datetime, timedelta
	from dateutil import tz, parser
	# parse a naive timestamp and attach the Helsinki timezone to it
	helsinki = tz.gettz('Europe/Helsinki')
	tm = parser.parse('27 March 2021 00:00').replace(tzinfo=helsinki)
	print(tm, tm.tzinfo)
	# add 48 hours and show the resulting timestamp
	two_days = timedelta(hours=48)
	tm2 = tm + two_days
	print(tm2)

karpanGit / python, convert timezone.py

Last active October 30, 2021 16:56

python, convert timezone

	# convert timezone (vanilla python)
	from datetime import datetime, timezone, timedelta
	# current local time as an aware datetime (astimezone() attaches the local timezone)
	now = datetime.now().astimezone()
	print(now)
	# replace() only swaps the tzinfo label to UTC, the clock fields stay untouched
	relabelled = now.replace(tzinfo=timezone.utc)
	print(relabelled)
	# going through the POSIX timestamp converts the same instant to UTC
	epoch_seconds = now.timestamp()
	converted = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
	print(converted)

karpanGit / pandas, groupby, aggregation (named).py

Created September 5, 2021 09:32

pandas, groupby, aggregation (named)

	import pandas as pd
	import numpy as np
	import random

	# build a frame with one random label column and two random numeric columns
	n = 50
	labels = random.choices(['foo', 'boo', 'bah'], k=n)
	unit_values = np.random.rand(n)
	scaled_values = np.random.rand(n)*10
	df = pd.DataFrame({'a': labels, 'b': unit_values, 'c': scaled_values})
	# group the rows by their label
	grouped = df.groupby('a')

	# example 1, one column, one function, name(s) not specified

karpanGit / pandas, multi index, select with cross section or index slice.py

Created September 4, 2021 12:59

pandas, multi index, select with cross section or index slice

	import pandas as pd
	import numpy as np

	# create a dataset with multi indices for both index and columns
	def indx_names(prefix: str, num: int) -> list:
	    """Return num labels: prefix followed by a counter right-aligned to width 2 with '_' as fill, e.g. 'A_0'."""
	    return [f'{prefix}{i:_>2}' for i in range(num)]
	# four index levels built as the cartesian product of the generated labels
	idx = pd.MultiIndex.from_product([indx_names('A',4), indx_names('B',2), indx_names('C',4), indx_names('D',2)])
	cols = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1'])
	# one row per index entry, one column per column tuple, filled with consecutive integers
	df = pd.DataFrame(np.arange(len(idx)*len(cols)).reshape(len(idx), len(cols)), index=idx, columns=cols)

karpanGit / pandas, slicing, multiindex.py

Created September 4, 2021 12:15

pandas, slicing, multiindex

	import pandas as pd
	import numpy as np

	# create a dataset with multi indices for both index and columns
	def indx_names(prefix: str, num: int) -> list:
	    """Return num labels: prefix followed by a counter right-aligned to width 2 with '_' as fill, e.g. 'A_0'."""
	    return [f'{prefix}{i:_>2}' for i in range(num)]
	# four index levels built as the cartesian product of the generated labels
	idx = pd.MultiIndex.from_product([indx_names('A',4), indx_names('B',2), indx_names('C',4), indx_names('D',2)])
	cols = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1'])
	# one row per index entry, one column per column tuple, filled with consecutive integers
	df = pd.DataFrame(np.arange(len(idx)*len(cols)).reshape(len(idx), len(cols)), index=idx, columns=cols)

karpanGit / pyspark, create dataframe with columns of struct (Row) type.py

Last active July 29, 2021 15:17

pyspark, create dataframe with columns of struct (Row) type

	# experiment with complex elements
	import os
	import sys
	# both pyspark interpreter variables are set to the same python binary
	for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
	    os.environ[env_name] = '/bin/python3'
	# setup the python path: the spark client entries go in front of the existing ones
	spark_entries = [
	    '/usr/hdp/current/spark2-client/python',
	    '/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip',
	]
	sys.path = spark_entries + sys.path
	# pyspark is imported only after the environment and the path are in place
	from pyspark.sql import SparkSession
	from pyspark.sql import Row

karpan karpanGit