Skip to content

Instantly share code, notes, and snippets.

@olivx
Forked from rg3915/FromDict.ipynb
Created December 5, 2018 21:29
Show Gist options
  • Save olivx/64d657ff67ecfcc38917f5f34cc1495b to your computer and use it in GitHub Desktop.
Save olivx/64d657ff67ecfcc38917f5f34cc1495b to your computer and use it in GitHub Desktop.
Annotations of Pandas DataFrame
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import timeit\n",
"import xlrd\n",
"from django.contrib.auth.models import User"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tic = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"filename = '/tmp/example.xlsx'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_excel(filename)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NAME</th>\n",
" <th>EMAIL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2011</th>\n",
" <td>Angela Brown</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2824</th>\n",
" <td>Angela Brown</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3599</th>\n",
" <td>Angela Jones</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1227</th>\n",
" <td>Angela Jones</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2605</th>\n",
" <td>Anthony Evans</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NAME EMAIL\n",
"2011 Angela Brown [email protected]\n",
"2824 Angela Brown [email protected]\n",
"3599 Angela Jones [email protected]\n",
"1227 Angela Jones [email protected]\n",
"2605 Anthony Evans [email protected]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"email = df['EMAIL']\n",
"# Retornando os itens duplicados\n",
"df[email.isin(email[email.duplicated()])].sort_values(by=['EMAIL']).head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>NAME</th>\n",
" <th>EMAIL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4957</th>\n",
" <td>4995</td>\n",
" <td>Helen Mcallister</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4958</th>\n",
" <td>4996</td>\n",
" <td>Scott Hall</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4959</th>\n",
" <td>4997</td>\n",
" <td>Dawn Dowling</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4960</th>\n",
" <td>4998</td>\n",
" <td>John Campbell</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4961</th>\n",
" <td>4999</td>\n",
" <td>Barbara Alldredge</td>\n",
" <td>[email protected]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index NAME EMAIL\n",
"4957 4995 Helen Mcallister [email protected]\n",
"4958 4996 Scott Hall [email protected]\n",
"4959 4997 Dawn Dowling [email protected]\n",
"4960 4998 John Campbell [email protected]\n",
"4961 4999 Barbara Alldredge [email protected]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Removendo os itens duplicados\n",
"dffinal = df.drop_duplicates('EMAIL').reset_index()\n",
"dffinal.tail()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def create_user(new_users):\n",
" # Separa full_name em first_name e last_name\n",
" users = []\n",
" for new_user in new_users:\n",
" full_name = new_user['full_name']\n",
" email = new_user['email']\n",
" first_name = full_name.split()[0]\n",
" last_name = full_name.split()[1:]\n",
" last_name = ' '.join(last_name)\n",
" username = email\n",
" user = User(\n",
" first_name=first_name,\n",
" last_name=last_name,\n",
" email=email,\n",
" username=username\n",
" )\n",
" users.append(user)\n",
" return users"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def import_users(new_users):\n",
" User.objects.bulk_create(new_users)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"new_users = []\n",
"emails = []"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"emails = dffinal['EMAIL'].values.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"exist_users = User.objects.filter(email__in=emails).values_list('email', flat=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Manuel Marshall [email protected]\n",
"Joy Flores [email protected]\n",
"Nicholas Waits [email protected]\n",
"Laura Cross [email protected]\n",
"Felipe Garland [email protected]\n"
]
}
],
"source": [
"for row in dffinal.head().itertuples():\n",
" print(row.NAME, row.EMAIL)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"for row in dffinal.itertuples():\n",
" full_name = row.NAME\n",
" email = row.EMAIL\n",
" if email not in exist_users:\n",
" data = dict(full_name=full_name, email=email)\n",
" new_users.append(data)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"new_users_ = create_user(new_users)\n",
"import_users(new_users_)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17 existentes\n",
"4945 novos\n"
]
}
],
"source": [
"print(len(exist_users), 'existentes')\n",
"print(len(new_users), 'novos')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"toc = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.1476561470044544"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toc - tic"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Django Shell-Plus",
"language": "python",
"name": "django_extensions"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment