-
-
Save olivx/64d657ff67ecfcc38917f5f34cc1495b to your computer and use it in GitHub Desktop.
Annotations of Pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import timeit\n", | |
"import xlrd\n", | |
"from django.contrib.auth.models import User" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tic = timeit.default_timer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = '/tmp/example.xlsx'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_excel(filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NAME</th>\n", | |
" <th>EMAIL</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2011</th>\n", | |
" <td>Angela Brown</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2824</th>\n", | |
" <td>Angela Brown</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3599</th>\n", | |
" <td>Angela Jones</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1227</th>\n", | |
" <td>Angela Jones</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2605</th>\n", | |
" <td>Anthony Evans</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NAME EMAIL\n", | |
"2011 Angela Brown [email protected]\n", | |
"2824 Angela Brown [email protected]\n", | |
"3599 Angela Jones [email protected]\n", | |
"1227 Angela Jones [email protected]\n", | |
"2605 Anthony Evans [email protected]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"email = df['EMAIL']\n", | |
"# Retornando os itens duplicados\n", | |
"df[email.isin(email[email.duplicated()])].sort_values(by=['EMAIL']).head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>NAME</th>\n", | |
" <th>EMAIL</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>4957</th>\n", | |
" <td>4995</td>\n", | |
" <td>Helen Mcallister</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4958</th>\n", | |
" <td>4996</td>\n", | |
" <td>Scott Hall</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4959</th>\n", | |
" <td>4997</td>\n", | |
" <td>Dawn Dowling</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4960</th>\n", | |
" <td>4998</td>\n", | |
" <td>John Campbell</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4961</th>\n", | |
" <td>4999</td>\n", | |
" <td>Barbara Alldredge</td>\n", | |
" <td>[email protected]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" index NAME EMAIL\n", | |
"4957 4995 Helen Mcallister [email protected]\n", | |
"4958 4996 Scott Hall [email protected]\n", | |
"4959 4997 Dawn Dowling [email protected]\n", | |
"4960 4998 John Campbell [email protected]\n", | |
"4961 4999 Barbara Alldredge [email protected]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Removendo os itens duplicados\n", | |
"dffinal = df.drop_duplicates('EMAIL').reset_index()\n", | |
"dffinal.tail()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def create_user(new_users):\n", | |
" # Separa full_name em first_name e last_name\n", | |
" users = []\n", | |
" for new_user in new_users:\n", | |
" full_name = new_user['full_name']\n", | |
" email = new_user['email']\n", | |
" first_name = full_name.split()[0]\n", | |
" last_name = full_name.split()[1:]\n", | |
" last_name = ' '.join(last_name)\n", | |
" username = email\n", | |
" user = User(\n", | |
" first_name=first_name,\n", | |
" last_name=last_name,\n", | |
" email=email,\n", | |
" username=username\n", | |
" )\n", | |
" users.append(user)\n", | |
" return users" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def import_users(new_users):\n", | |
" User.objects.bulk_create(new_users)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"new_users = []\n", | |
"emails = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"emails = dffinal['EMAIL'].values.tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"exist_users = User.objects.filter(email__in=emails).values_list('email', flat=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Manuel Marshall [email protected]\n", | |
"Joy Flores [email protected]\n", | |
"Nicholas Waits [email protected]\n", | |
"Laura Cross [email protected]\n", | |
"Felipe Garland [email protected]\n" | |
] | |
} | |
], | |
"source": [ | |
"for row in dffinal.head().itertuples():\n", | |
" print(row.NAME, row.EMAIL)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for row in dffinal.itertuples():\n", | |
" full_name = row.NAME\n", | |
" email = row.EMAIL\n", | |
" if email not in exist_users:\n", | |
" data = dict(full_name=full_name, email=email)\n", | |
" new_users.append(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"new_users_ = create_user(new_users)\n", | |
"import_users(new_users_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"17 existentes\n", | |
"4945 novos\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(exist_users), 'existentes')\n", | |
"print(len(new_users), 'novos')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"toc = timeit.default_timer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2.1476561470044544" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"toc - tic" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Django Shell-Plus", | |
"language": "python", | |
"name": "django_extensions" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment