Skip to content

Instantly share code, notes, and snippets.

@phobson
Created December 15, 2015 20:42
Show Gist options
  • Save phobson/4603e138982870db9811 to your computer and use it in GitHub Desktop.
Save phobson/4603e138982870db9811 to your computer and use it in GitHub Desktop.
upload only new values from one db to another
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merging DFs to avoid duplicates"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas\n",
"\n",
"def addSecondColumnLevel(df, levelval, levelname):\n",
" \"\"\" Add a second level to the column-index of a dataframe.\n",
" \"\"\"\n",
"\n",
" if isinstance(df.columns, pandas.MultiIndex):\n",
" raise ValueError('Dataframe already has MultiIndex on columns')\n",
"\n",
" origlevel = 'quantity'\n",
" if df.columns.names[0] is not None:\n",
" origlevel = df.columns.names[0]\n",
"\n",
" # define the index\n",
" colarray = [[levelval]*len(df.columns), df.columns]\n",
" colindex = pandas.MultiIndex.from_arrays(colarray)\n",
"\n",
" # copy the dataframe and redefine the columns\n",
" newdf = df.copy()\n",
" newdf.columns = colindex\n",
" newdf.columns.names = [levelname, origlevel]\n",
" return newdf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the existing destination data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th>db</th>\n",
" <th colspan=\"2\" halign=\"left\">destination</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quantity</th>\n",
" <th>adult</th>\n",
" <th>bebbeh</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pk_col</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>dog</td>\n",
" <td>puppy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>cat</td>\n",
" <td>kitten</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td>bear</td>\n",
" <td>cub</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>sheep</td>\n",
" <td>lamb</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"db destination \n",
"quantity adult bebbeh\n",
"pk_col \n",
"a dog puppy\n",
"b cat kitten\n",
"c bear cub\n",
"d sheep lamb"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data1 = {\n",
" 'pk_col': list('abcd'),\n",
" 'adult': ['dog', 'cat', 'bear', 'sheep'],\n",
" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb'],\n",
"}\n",
"df1 = (\n",
" pandas.DataFrame(data1)\n",
" .set_index(['pk_col']) \n",
" .pipe(addSecondColumnLevel, 'destination', 'db')\n",
")\n",
"df1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the most recent source data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th>db</th>\n",
" <th colspan=\"2\" halign=\"left\">source</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quantity</th>\n",
" <th>adult</th>\n",
" <th>bebbeh</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pk_col</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>dog</td>\n",
" <td>puppy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>cat</td>\n",
" <td>kitten</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td>bear</td>\n",
" <td>cub</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>sheep</td>\n",
" <td>lamb</td>\n",
" </tr>\n",
" <tr>\n",
" <th>e</th>\n",
" <td>chicken</td>\n",
" <td>chick</td>\n",
" </tr>\n",
" <tr>\n",
" <th>f</th>\n",
" <td>cow</td>\n",
" <td>calf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"db source \n",
"quantity adult bebbeh\n",
"pk_col \n",
"a dog puppy\n",
"b cat kitten\n",
"c bear cub\n",
"d sheep lamb\n",
"e chicken chick\n",
"f cow calf"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data2 = {\n",
" 'pk_col': list('abcdef'),\n",
" 'adult': ['dog', 'cat', 'bear', 'sheep', 'chicken', 'cow'],\n",
" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb', 'chick', 'calf'],\n",
"}\n",
"df2 = (\n",
" pandas.DataFrame(data2)\n",
" .set_index(['pk_col'])\n",
" .pipe(addSecondColumnLevel, 'source', 'db')\n",
")\n",
"df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Join the dataframes, keep only the rows not in the `destination`, & upload\n",
"Rows not in the `destination` will be null"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pk_col</th>\n",
" <th>adult</th>\n",
" <th>bebbeh</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>e</td>\n",
" <td>chicken</td>\n",
" <td>chick</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>f</td>\n",
" <td>cow</td>\n",
" <td>calf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" pk_col adult bebbeh\n",
"0 e chicken chick\n",
"1 f cow calf"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_new_values(df, dstcol='destination', srccol='source'):\n",
" not_in_destination = pandas.isnull(df[dstcol])\n",
" new_df = (\n",
" df.copy() # make a copy\n",
" [not_in_destination] # nulls out rows where `dest` == `src`\n",
" [srccol] # select only the `source` columns\n",
" .dropna(how='all') # drop null rows\n",
" )\n",
" new_df.columns.names = [None]\n",
" return new_df\n",
"\n",
"(\n",
" df1.join(df2, how='right')\n",
" .pipe(get_new_values)\n",
" .reset_index()\n",
" #.to_sql('<TABLENAME>', <ENGINE>, if_exists='append', index=False)\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment