This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # experiment with unpacking multiple iterables (tuple, set, list) into a single call | |
| def f(*args): | |
| return(args) | |
| res = f(*(1,3,4), *{5,6,7}, *[10,20,30]) | |
| print(res) | |
| # prints (1, 3, 4, 5, 6, 7, 10, 20, 30) | |
| f = lambda *args: args | |
| res = f(*(1,3,4), *{5,6,7}, *[10,20,30]) | |
| print(res) | |
| # prints (1, 3, 4, 5, 6, 7, 10, 20, 30) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # experiment with numpy broadcasting | |
| import numpy as np | |
| # 2x2 + 2, -> 2x2 + 1x2 -> 2x2 + 2x2 | |
| print('example 1 2x2 + 2, -> 2x2 + 1x2 -> 2x2 + 1x2') | |
| a = np.array([[1, 2], [3, 4]]) | |
| b = np.array([10, 20]) | |
| print(a.shape, b.shape) | |
| print(a) | |
| print(b) | |
| a2, b2 = np.broadcast_arrays(a, b) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # experiment with ordering and sorting | |
| import pandas as pd | |
| # create a categorical series | |
| s = pd.Series(pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)) | |
| print(s) | |
| print(s.cat.categories) | |
| print(s.cat.codes) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # experiment with groupby cumcount and ngroup | |
| import pandas as pd | |
| import numpy as np | |
| df = pd.DataFrame({'a':list('aaaabbba'), 'b':1}) | |
| pd.concat([df.groupby(['a']).cumcount().rename('order within group'), df.groupby(['a']).ngroup().rename('group order')], axis='columns') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
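| # dateutil understands daylight saving time, but timedelta arithmetic on an aware datetime is wall-clock arithmetic: adding 48 hours across the DST change (28 March 2021) gives 29 March 00:00+03:00, which is only 47 elapsed hours later | |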
| from datetime import datetime, timedelta | |
| from dateutil import tz, parser | |
| tm = parser.parse('27 March 2021 00:00') | |
| tm = tm.replace(tzinfo=tz.gettz('Europe/Helsinki')) | |
| print(tm, tm.tzinfo) | |
| tm2 = tm + timedelta(hours=48) | |
| print(tm2) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # convert timezone (vanilla python) | |
| from datetime import datetime, timezone, timedelta | |
| # get the current local time, including the timezone | |
| now = datetime.now().astimezone() | |
| print(now) | |
| # this only relabels the timezone as UTC without adjusting the clock time, so it refers to a different instant | |
| print(now.replace(tzinfo=timezone.utc)) | |
| # this converts the time correctly to UTC | |
| print(datetime.fromtimestamp(now.timestamp(), tz=timezone.utc)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import random | |
| # create a dataframe | |
| n = 50 | |
| df = pd.DataFrame({'a': random.choices(['foo', 'boo', 'bah'], k=n), 'b': np.random.rand(n), 'c': np.random.rand(n)*10}) | |
| grouped = df.groupby('a') | |
| # example 1, one column, one function, name(s) not specified |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| # create a dataset with multi indices for both index and columns | |
| def indx_names(prefix: str, num: int): | |
| return [f'{prefix}{_:_>2}' for _ in range(num)] | |
| idx = pd.MultiIndex.from_product([indx_names('A',4), indx_names('B',2), indx_names('C',4), indx_names('D',2)]) | |
| cols = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1']) | |
| df = pd.DataFrame(np.arange(len(idx)*len(cols)).reshape(len(idx), len(cols)), index=idx, columns=cols) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| # create a dataset with multi indices for both index and columns | |
| def indx_names(prefix: str, num: int): | |
| return [f'{prefix}{_:_>2}' for _ in range(num)] | |
| idx = pd.MultiIndex.from_product([indx_names('A',4), indx_names('B',2), indx_names('C',4), indx_names('D',2)]) | |
| cols = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1']) | |
| df = pd.DataFrame(np.arange(len(idx)*len(cols)).reshape(len(idx), len(cols)), index=idx, columns=cols) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # experiment with complex elements | |
| import os | |
| os.environ['PYSPARK_PYTHON'] = '/bin/python3' | |
| os.environ['PYSPARK_DRIVER_PYTHON'] = '/bin/python3' | |
| # setup the python path | |
| import sys | |
| sys.path = ['/usr/hdp/current/spark2-client/python', | |
| '/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip'] + sys.path | |
| from pyspark.sql import SparkSession | |
| from pyspark.sql import Row |