Skip to content

Instantly share code, notes, and snippets.

@andersy005
Last active February 20, 2025 01:35
Show Gist options
  • Select an option

  • Save andersy005/20763b0be62bc9f7541cce8555b7614a to your computer and use it in GitHub Desktop.

Select an option

Save andersy005/20763b0be62bc9f7541cce8555b7614a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "88083538-5edc-4f7b-ac73-fca589cf5bca",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import inspect\n",
"import re\n",
"import itertools\n",
"import requests"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "038860a0-d7b6-4cc3-a3ce-8b76f57b1947",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>project_id</th>\n",
" <th>quantity</th>\n",
" <th>retirement_account</th>\n",
" <th>retirement_beneficiary</th>\n",
" <th>retirement_note</th>\n",
" <th>retirement_reason</th>\n",
" <th>transaction_date</th>\n",
" <th>transaction_type</th>\n",
" <th>vintage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>VCS1</td>\n",
" <td>12630</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2009-03-26 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>VCS1</td>\n",
" <td>9074</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2014-01-21 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>VCS10</td>\n",
" <td>153460</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>VCS10</td>\n",
" <td>368968</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>VCS10</td>\n",
" <td>505908</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2008</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" project_id quantity retirement_account retirement_beneficiary \\\n",
"0 VCS1 12630 None None \n",
"1 VCS1 9074 None None \n",
"2 VCS10 153460 None None \n",
"3 VCS10 368968 None None \n",
"4 VCS10 505908 None None \n",
"\n",
" retirement_note retirement_reason transaction_date \\\n",
"0 None None 2009-03-26 00:00:00+00:00 \n",
"1 None None 2014-01-21 00:00:00+00:00 \n",
"2 None None 2009-04-22 00:00:00+00:00 \n",
"3 None None 2009-04-22 00:00:00+00:00 \n",
"4 None None 2009-04-22 00:00:00+00:00 \n",
"\n",
" transaction_type vintage \n",
"0 issuance 2007 \n",
"1 issuance 2006 \n",
"2 issuance 2006 \n",
"3 issuance 2007 \n",
"4 issuance 2008 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_parquet(\"s3://carbonplan-offsets-db/final/2025-02-14/credits-augmented.parquet\", engine=\"fastparquet\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ff566ce7-ee2a-490b-9813-bce9d2880841",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>project_id</th>\n",
" <th>quantity</th>\n",
" <th>retirement_account</th>\n",
" <th>retirement_beneficiary</th>\n",
" <th>retirement_note</th>\n",
" <th>retirement_reason</th>\n",
" <th>transaction_date</th>\n",
" <th>transaction_type</th>\n",
" <th>vintage</th>\n",
" <th>merged_beneficiary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>VCS1</td>\n",
" <td>12630</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2009-03-26 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2007</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>VCS1</td>\n",
" <td>9074</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2014-01-21 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2006</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>VCS10</td>\n",
" <td>153460</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2006</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>VCS10</td>\n",
" <td>368968</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2007</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>VCS10</td>\n",
" <td>505908</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2009-04-22 00:00:00+00:00</td>\n",
" <td>issuance</td>\n",
" <td>2008</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438196</th>\n",
" <td>GLD338</td>\n",
" <td>455</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>Essent Retail Energie</td>\n",
" <td>;%</td>\n",
" <td>2009-03-30 00:00:00+00:00</td>\n",
" <td>retirement</td>\n",
" <td>2007</td>\n",
" <td>;%;%Essent Retail Energie;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438197</th>\n",
" <td>GLD338</td>\n",
" <td>32039</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>green gas sales until september 2009; Essent R...</td>\n",
" <td>;%</td>\n",
" <td>2009-11-02 00:00:00+00:00</td>\n",
" <td>retirement</td>\n",
" <td>2007</td>\n",
" <td>;%;%green gas sales until september 2009; Esse...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438198</th>\n",
" <td>GLD338</td>\n",
" <td>7961</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>Essent Retail Energie</td>\n",
" <td>;%</td>\n",
" <td>2009-03-30 00:00:00+00:00</td>\n",
" <td>retirement</td>\n",
" <td>2007</td>\n",
" <td>;%;%Essent Retail Energie;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438199</th>\n",
" <td>GLD338</td>\n",
" <td>8000</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2009-06-23 00:00:00+00:00</td>\n",
" <td>retirement</td>\n",
" <td>2007</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438200</th>\n",
" <td>GLD338</td>\n",
" <td>1000</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>;%</td>\n",
" <td>2008-07-01 00:00:00+00:00</td>\n",
" <td>retirement</td>\n",
" <td>2007</td>\n",
" <td>;%;%;%;%</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>438201 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" project_id quantity retirement_account retirement_beneficiary \\\n",
"0 VCS1 12630 ;% ;% \n",
"1 VCS1 9074 ;% ;% \n",
"2 VCS10 153460 ;% ;% \n",
"3 VCS10 368968 ;% ;% \n",
"4 VCS10 505908 ;% ;% \n",
"... ... ... ... ... \n",
"438196 GLD338 455 ;% ;% \n",
"438197 GLD338 32039 ;% ;% \n",
"438198 GLD338 7961 ;% ;% \n",
"438199 GLD338 8000 ;% ;% \n",
"438200 GLD338 1000 ;% ;% \n",
"\n",
" retirement_note retirement_reason \\\n",
"0 ;% ;% \n",
"1 ;% ;% \n",
"2 ;% ;% \n",
"3 ;% ;% \n",
"4 ;% ;% \n",
"... ... ... \n",
"438196 Essent Retail Energie ;% \n",
"438197 green gas sales until september 2009; Essent R... ;% \n",
"438198 Essent Retail Energie ;% \n",
"438199 ;% ;% \n",
"438200 ;% ;% \n",
"\n",
" transaction_date transaction_type vintage \\\n",
"0 2009-03-26 00:00:00+00:00 issuance 2007 \n",
"1 2014-01-21 00:00:00+00:00 issuance 2006 \n",
"2 2009-04-22 00:00:00+00:00 issuance 2006 \n",
"3 2009-04-22 00:00:00+00:00 issuance 2007 \n",
"4 2009-04-22 00:00:00+00:00 issuance 2008 \n",
"... ... ... ... \n",
"438196 2009-03-30 00:00:00+00:00 retirement 2007 \n",
"438197 2009-11-02 00:00:00+00:00 retirement 2007 \n",
"438198 2009-03-30 00:00:00+00:00 retirement 2007 \n",
"438199 2009-06-23 00:00:00+00:00 retirement 2007 \n",
"438200 2008-07-01 00:00:00+00:00 retirement 2007 \n",
"\n",
" merged_beneficiary \n",
"0 ;%;%;%;% \n",
"1 ;%;%;%;% \n",
"2 ;%;%;%;% \n",
"3 ;%;%;%;% \n",
"4 ;%;%;%;% \n",
"... ... \n",
"438196 ;%;%Essent Retail Energie;% \n",
"438197 ;%;%green gas sales until september 2009; Esse... \n",
"438198 ;%;%Essent Retail Energie;% \n",
"438199 ;%;%;%;% \n",
"438200 ;%;%;%;% \n",
"\n",
"[438201 rows x 10 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.fillna(';%')\n",
"df['merged_beneficiary'] = df['retirement_account'] + df['retirement_beneficiary'] + df['retirement_note'] + df['retirement_reason']\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9932c647-fd2a-442a-852b-158cc3bc43a1",
"metadata": {},
"outputs": [],
"source": [
"rule_mapping = requests.get('https://github.com/user-attachments/files/18803757/beneficiacy-mappings.json').json()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f46318b4-3d24-43d8-a297-9eee59cbf2b0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"723"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(rule_mapping)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b54ad059-73df-40de-b12d-cbb9fac1e4ba",
"metadata": {},
"outputs": [],
"source": [
"languages = set()\n",
"grel_rules = []\n",
"jython_rules = []\n",
"other_rules = []\n",
"for rule in rule_mapping:\n",
" expression = rule['expression']\n",
" try:\n",
" language, code = expression.split(':', 1)\n",
" languages.add(language)\n",
" if language == 'jython' and rule['columnName'] == 'merged_beneficiary':\n",
" jython_rules.append(rule)\n",
" elif language == 'grel' and rule['columnName'] == 'merged_beneficiary':\n",
" grel_rules.append(rule)\n",
" except ValueError:\n",
" other_rules.append(rule)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5a62b2f2-a6be-46b0-87f5-1b42485a4597",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"({'grel', 'jython'}, 70, 34, 619)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"languages, len(grel_rules), len(jython_rules), len(other_rules)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b89900bd-4265-4bfe-b1e6-f64046dce936",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function __main__.expression_func(value)>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# https://github.com/jezcope/pyrefine/blob/develop/pyrefine/expressions.py\n",
"def _indent_lines(lines, size=4):\n",
" \"\"\"Indent each line given by size spaces.\"\"\"\n",
" return ((' ' * size) + line for line in lines)\n",
"\n",
"_ERROR_HANDLERS = {\n",
" 'set-to-blank': ' return None',\n",
" 'store-error': ' return e',\n",
" 'keep-original': ' return value',\n",
"}\n",
"def _handle_errors(lines, on_error):\n",
" \"\"\"Wrap the given source lines in an error-handling block.\"\"\"\n",
" yield 'try:'\n",
" yield from _indent_lines(lines)\n",
" yield 'except Exception as e:'\n",
" yield _ERROR_HANDLERS[on_error]\n",
"\n",
"def compile_expression(expression, on_error='set-to-blank'):\n",
" \"\"\"Compile the given expression into a callable function.\"\"\"\n",
" try:\n",
" language, code = expression.split(':', 1)\n",
" except ValueError as error:\n",
" raise ValueError(f'No language given in expression: {expression}',\n",
" error)\n",
"\n",
" if language not in ('jython', 'grel'):\n",
" raise ValueError(f'Unknown expression language: {language}')\n",
"\n",
" if language == 'jython':\n",
"\n",
" # Clean up the code - remove any trailing semicolons which are valid in Jython but not Python\n",
" #code = code.replace(';', '')\n",
" # Convert .contains() calls to use Python's \"in\" operator.\n",
" # This regex captures patterns like: object.contains(pattern) followed by an optional colon.\n",
" code = re.sub(r'(\\S+)\\.contains\\(\\s*(.+?)\\s*\\)(:?)', r'\\2 in \\1\\3:', code)\n",
"\n",
" elif language == 'grel':\n",
" raise ValueError(f'GREL expressions not supported yet')\n",
" func_body = _handle_errors(\n",
" code.splitlines(),\n",
" on_error)\n",
" func_code = '\\n'.join(itertools.chain(['def expression_func(value):'],\n",
" _indent_lines(func_body)))\n",
" context = {}\n",
" eval(compile(func_code, 'pyrefine_exp', 'exec'), None, context)\n",
" func = context['expression_func']\n",
" func.__source__ = func_code\n",
" return func\n",
"\n",
"\n",
"t = compile_expression(jython_rules[0]['expression'], jython_rules[0]['onError'])\n",
"t"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5ba815bf-0abd-4fe8-b4af-b92a05131b45",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def expression_func(value):\n",
" try:\n",
" if value.startswith(\";%;%ExxonMobil de Colombia;%\"):\n",
" return \"Exxon Mobil\"\n",
" else:\n",
" return value\n",
" except Exception as e:\n",
" return value\n"
]
}
],
"source": [
"print(compile_expression(jython_rules[0]['expression'], jython_rules[0]['onError']).__source__)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "eca01e68-984b-4ee0-8c05-30ce3781222e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"invalid syntax (pyrefine_exp, line 3) {'op': 'core/text-transform', 'engineConfig': {'facets': [{'type': 'text', 'name': 'merged_beneficiary', 'columnName': 'merged_beneficiary', 'query': ';%', 'mode': 'text', 'caseSensitive': False, 'invert': False}, {'type': 'text', 'name': 'merged_beneficiary', 'columnName': 'merged_beneficiary', 'query': ';%;%[a-zA-Z,\\\\.]+;%', 'mode': 'regex', 'caseSensitive': False, 'invert': False}, {'type': 'list', 'name': 'merged_beneficiary', 'expression': 'isNull(value)', 'columnName': 'merged_beneficiary', 'invert': False, 'omitBlank': False, 'omitError': False, 'selection': [{'v': {'v': False, 'l': 'false'}}], 'selectBlank': False, 'selectError': False}], 'mode': 'row-based'}, 'columnName': 'merged_beneficiary', 'expression': 'jython:if value.contains(\"%;%;%Anonymously;%\");\\n return \\'\\'\\nelse:\\n return value', 'onError': 'keep-original', 'repeat': False, 'repeatCount': 10, 'description': 'Text transform on cells in column merged_beneficiary using expression jython:if value.contains(\"%;%;%Anonymously;%\");\\n return \\'\\'\\nelse:\\n return value'}\n"
]
}
],
"source": [
"py_transforms = []\n",
"errors = []\n",
"for rule in jython_rules:\n",
" try:\n",
" py_transforms.append(compile_expression(rule['expression'], rule['onError']))\n",
" except Exception as e:\n",
" errors.append(rule)\n",
" print(e, rule)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a298d4c0-ef87-4d93-a89b-321ddb410e0a",
"metadata": {},
"outputs": [],
"source": [
"def apply_all_rules(value, transforms):\n",
" for transform in transforms:\n",
" value = transform(value)\n",
" return value"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "77715675-a21e-4081-9b13-31c59809e832",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Airbnb'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"apply_all_rules(\";%;%AirBNB;%\", py_transforms)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "465ce36d-b36f-4f08-b88b-75fdcd445760",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"';%;%airBNB;%'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"apply_all_rules(\";%;%airBNB;%\", py_transforms)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2070ebe3-5e64-4d3b-8d48-f897f5047be0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment