Skip to content

Instantly share code, notes, and snippets.

@rg3915
Last active December 15, 2021 18:51
Show Gist options
  • Save rg3915/5fb3a2e7338115bc92e82b7a9a2b372b to your computer and use it in GitHub Desktop.
Save rg3915/5fb3a2e7338115bc92e82b7a9a2b372b to your computer and use it in GitHub Desktop.
Annotations of Pandas DataFrame
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import timeit\n",
"from django.contrib.auth.models import User"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"max_value = 5"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tic = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"filename = '/tmp/example.xlsx'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_excel(filename)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NOME</th>\n",
" <th>EMAIL</th>\n",
" <th>CPF</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Alex Swartz Wertz</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Valerie Berry Mattlin</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Brooke Coley Roy</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Joe Levin Collins</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Lynn Alford Timmons</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NOME EMAIL CPF\n",
"0 Alex Swartz Wertz [email protected] 00000000000000\n",
"1 Valerie Berry Mattlin [email protected] 00000000000000\n",
"2 Brooke Coley Roy [email protected] 00000000000000\n",
"3 Joe Levin Collins [email protected] 00000000000000\n",
"4 Lynn Alford Timmons [email protected] 00000000000000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"email = df['EMAIL']\n",
"# Retornando os itens duplicados\n",
"# df[email.isin(email[email.duplicated()])]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>NOME</th>\n",
" <th>EMAIL</th>\n",
" <th>CPF</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>49991</th>\n",
" <td>49995</td>\n",
" <td>Warren Stowell Young</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49992</th>\n",
" <td>49996</td>\n",
" <td>Patricia Cheek Mitchell</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49993</th>\n",
" <td>49997</td>\n",
" <td>Linda Denton Breton</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49994</th>\n",
" <td>49998</td>\n",
" <td>Clyde Lopes Leonard</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49995</th>\n",
" <td>49999</td>\n",
" <td>Josefina Wilkinson Graves</td>\n",
" <td>[email protected]</td>\n",
" <td>00000000000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index NOME EMAIL \\\n",
"49991 49995 Warren Stowell Young [email protected] \n",
"49992 49996 Patricia Cheek Mitchell [email protected] \n",
"49993 49997 Linda Denton Breton [email protected] \n",
"49994 49998 Clyde Lopes Leonard [email protected] \n",
"49995 49999 Josefina Wilkinson Graves [email protected] \n",
"\n",
" CPF \n",
"49991 00000000000000 \n",
"49992 00000000000000 \n",
"49993 00000000000000 \n",
"49994 00000000000000 \n",
"49995 00000000000000 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Removendo os itens duplicados\n",
"dffinal = df.drop_duplicates('EMAIL').reset_index()\n",
"dffinal.tail()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"49996"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dffinal)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def create_user2(new_users):\n",
" # Separa full_name em first_name e last_name\n",
" users = []\n",
" for new_user in new_users:\n",
" full_name = new_user['full_name']\n",
" email = new_user['email']\n",
" first_name = full_name.split()[0]\n",
" last_name = full_name.split()[1:]\n",
" last_name = ' '.join(last_name)\n",
" username = email\n",
" sha_password = 'kkkkkkkkkkkkkkkkkkkk'\n",
" user = User(\n",
" first_name=first_name,\n",
" last_name=last_name,\n",
" email=email,\n",
" username=username,\n",
" password=sha_password\n",
" )\n",
" users.append(user)\n",
" return users"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def import_users2(new_users):\n",
" User.objects.bulk_create(new_users)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"new_users = []\n",
"emails = []\n",
"emails = dffinal['EMAIL'].values.tolist()\n",
"exist_users = User.objects.filter(email__in=emails).values_list('email', flat=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# for row in dffinal.head().itertuples():\n",
"# print(row.NOME, row.EMAIL, row.CPF)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"for row in dffinal.head(max_value).itertuples():\n",
" full_name = row.NOME\n",
" email = row.EMAIL\n",
" cpf = row.CPF\n",
" if email not in exist_users:\n",
" data = dict(full_name=full_name, email=email, cpf=cpf)\n",
" new_users.append(data)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"new_users_ = create_user2(new_users)\n",
"import_users2(new_users_)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0, 'existentes')\n",
"(5, 'novos')\n"
]
}
],
"source": [
"print(len(exist_users), 'existentes')\n",
"print(len(new_users), 'novos')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"toc = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.6345059871673584"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toc - tic"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"users_query_list = User.objects.filter(email__in=new_users_)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# users_query_list[:5]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"simpleuser = Group.objects.get(name='simpleuser')\n",
"simpleuser.id"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"user_groups_list = []\n",
"# https://stackoverflow.com/a/34093544\n",
"for user in users_query_list:\n",
" user_groups = User.groups.through(user_id=user.pk, group_id=simpleuser.pk)\n",
" user_groups_list.append(user_groups)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<User_groups: User_groups object>,\n",
" <User_groups: User_groups object>,\n",
" <User_groups: User_groups object>,\n",
" <User_groups: User_groups object>,\n",
" <User_groups: User_groups object>]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"User.groups.through.objects.bulk_create(user_groups_list)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"toc = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.3119399547576904"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toc - tic"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# UserCPF\n",
"exist_usercpfs = UserCPF.objects.filter(user__email__in=emails).values_list('user__email', flat=True)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[email protected]\n",
"[email protected]\n",
"[email protected]\n",
"[email protected]\n",
"[email protected]\n"
]
}
],
"source": [
"new_usercpfs = []\n",
"for row in dffinal.head(max_value).itertuples():\n",
" email = row.EMAIL\n",
" cpf = row.CPF\n",
" # Verifica se UserCPF ja existe\n",
" if email not in exist_usercpfs:\n",
" print(email)\n",
" user = User.objects.get(email=email)\n",
" data = dict(user=user, cpf=cpf)\n",
" new_usercpfs.append(data)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# função create_usercpf(new_usercpfs)\n",
"new_usercpfs2 = []\n",
"for user in new_usercpfs:\n",
" new_usercpfs2.append(UserCPF(user=user['user'], cpf=user['cpf']))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<UserCPF: Alex Swartz Wertz>,\n",
" <UserCPF: Valerie Berry Mattlin>,\n",
" <UserCPF: Brooke Coley Roy>,\n",
" <UserCPF: Joe Levin Collins>,\n",
" <UserCPF: Lynn Alford Timmons>]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"UserCPF.objects.bulk_create(new_usercpfs2)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"toc = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4.112852096557617"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toc - tic"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Employee\n",
"firm = Company.objects.get(name='Empresa')\n",
"exist_employees = Employee.objects.filter(user__email__in=emails).values_list('user__email', flat=True)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Company: Empresa>"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"firm"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"new_employees = []\n",
"for row in dffinal.head(max_value).itertuples():\n",
" email = row.EMAIL\n",
" if email not in exist_employees:\n",
" user = User.objects.get(email=email)\n",
" data = dict(user=user, firm=firm, department='14')\n",
" new_employees.append(data)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'department': '14', 'firm': <Company: Empresa>, 'user': <User: [email protected]>}\n",
"{'department': '14', 'firm': <Company: Empresa>, 'user': <User: [email protected]>}\n",
"{'department': '14', 'firm': <Company: Empresa>, 'user': <User: [email protected]>}\n",
"{'department': '14', 'firm': <Company: Empresa>, 'user': <User: [email protected]>}\n",
"{'department': '14', 'firm': <Company: Empresa>, 'user': <User: [email protected]>}\n"
]
}
],
"source": [
"# função create_employee(new_employees)\n",
"new_employees2 = []\n",
"for user in new_employees:\n",
" print(user)\n",
" new_employees2.append(Employee(user=user['user'], firm=user['firm'], department=user['department']))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Employee: Alex Swartz Wertz>,\n",
" <Employee: Valerie Berry Mattlin>,\n",
" <Employee: Brooke Coley Roy>,\n",
" <Employee: Joe Levin Collins>,\n",
" <Employee: Lynn Alford Timmons>]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Employee.objects.bulk_create(new_employees2)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Permissões\n",
"def create_permissions():\n",
" # Criando permissão view_manage_career_pages\n",
" content_type = ContentType.objects.get_for_model(Company)\n",
" permission = Permission.objects.get(\n",
" content_type=content_type,\n",
" codename='view_manage_career_pages'\n",
" )\n",
" # Habilitando permissão para simpleuser\n",
" g = Group.objects.get(name='simpleuser')\n",
" g.permissions.add(permission)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def create_permissions_quiz():\n",
" # Criando permissão add_quiz\n",
" content_type = ContentType.objects.get_for_model(Quiz)\n",
" permission = Permission.objects.get(\n",
" content_type=content_type,\n",
" codename='add_quiz'\n",
" )\n",
" # Habilitando permissão para simpleuser\n",
" g = Group.objects.get(name='simpleuser')\n",
" g.permissions.add(permission)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"create_permissions()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"create_permissions_quiz()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"toc = timeit.default_timer()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.437417984008789"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"toc - tic"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Django Shell-Plus",
"language": "python",
"name": "django_extensions"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@paulochf
Copy link

paulochf commented Jul 16, 2018

Código que substitui da célula [6] em diante do separe_email.ipynb:

emails_df = df.email_.str.split(";").apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('emails')

dffinal = df.drop("email_", axis=1).merge(emails_df, left_index=True, right_index=True)

Um detalhe é a falta de um índice único dos registros. Então, supondo o índice automático decorrente da importação do arquivo Excel como ID válido, esse ID é o usado no "join" (merge).

@rg3915
Copy link
Author

rg3915 commented Aug 1, 2018

Para transformar Query do Django em DataFrame:
df = pd.DataFrame(list(User.objects.all().values()))

@rg3915
Copy link
Author

rg3915 commented Aug 1, 2018

Pra mostrar somente algumas colunas:
df[['id', 'email']]

@rg3915
Copy link
Author

rg3915 commented Aug 28, 2018

Transformando todos os campos do DataFrame num jSON
df.T.apply(dict).tolist()

@rg3915
Copy link
Author

rg3915 commented Aug 28, 2018

Renomeando colunas
df.rename(columns={'old_name': 'new_name'})

@rg3915
Copy link
Author

rg3915 commented Aug 28, 2018

Selecionando as colunas de 10 em 10
df.iloc[:, :10]
com transposição
df.T.iloc[:10]

@rg3915
Copy link
Author

rg3915 commented Oct 11, 2018

Novo df com algumas colunas.
new = old.filter(['A','B','D'], axis=1)

@rg3915
Copy link
Author

rg3915 commented Oct 12, 2018

Como não inserir chaves com valor nulo no dicionário:
df.T.apply(lambda x: dict(x.dropna())).tolist()

@rg3915
Copy link
Author

rg3915 commented Oct 17, 2018

Retornando os valores cujo tamanho da string seja maior que...
df[df['foo'].str.len() > 50]['foo']

Mostrando o tamanho de cada célula
df[df['foo'].str.len() > 50]['foo'].str.len()

@rg3915
Copy link
Author

rg3915 commented Nov 9, 2018

Mostrando o tamanho de cada célula da coluna

df['foo_len'] = df['foo'].apply(len)
df[['foo_len', 'foo']]

Ou

df['foo'].str.len()

@rg3915
Copy link
Author

rg3915 commented Dec 5, 2018

Suponha que você tenha Cliente e Obra.

Pegando o ID do Cliente que está em Obra e trocando pelo nome do Cliente que está no outro DataFrame.

# JSON do Cliente com IDCliente e Cliente
dict_cliente = df_cliente[['IDCliente', 'Cliente']].T.apply(dict).tolist()

[
 {'IDCliente': 288, 'Cliente': 'Cliente Um'},
 {'IDCliente': 1, 'Cliente': 'Cliente Dois'},
 {'IDCliente': 959, 'Cliente': 'Cliente Três'},
]

Montando o dicionário que será usado como busca de cada Cliente a partir do seu ID.

_dict_cliente = {}
for item in dict_cliente:
    _dict_cliente[item['IDCliente']] = item['Cliente']

_dict_cliente

{
 288: 'Cliente Um',
 1: 'Cliente Dois',
 959: 'Cliente Três',
}

A partir desse dicionário fazemos a busca no outro DataFrame.

for row in df.itertuples():
    nome_cliente = _dict_cliente.get(row.IDCliente)
    print(row.IDCliente, nome_cliente)

288 Cliente Um
1   Cliente Dois
959 Cliente Três

@rg3915
Copy link
Author

rg3915 commented Dec 9, 2018

Retornando o valor máximo agrupado por ano.

df.groupby(['Ano'])['NumeroOrcamento'].max()
df.reset_index()

@rg3915
Copy link
Author

rg3915 commented Dec 10, 2018

Verificando data vazia:

for row in df.itertuples():
    if row.DataOrcamento is pd.NaT:
        print('Vazio')
    else:
        print(row.DataOrcamento)

@rg3915
Copy link
Author

rg3915 commented Dec 16, 2018

Definindo vários fillna diferentes por coluna:

values = {'last_name': '', 'occupation': '', 'age': 0}
df = df.fillna(value=values)
df.head()

@rg3915
Copy link
Author

rg3915 commented Aug 6, 2019

Se tiver problema com liblzma, faça um downgrade do Pandas para pandas==0.24.2.
https://stackoverflow.com/a/57115325

@rg3915
Copy link
Author

rg3915 commented Aug 6, 2019

Retorna o tamanho do maior objeto de cada coluna.

dict_sizes = {}
for col in df.columns:
    try:
        print(f'{col} max length: {df[col].map(len).max()}\n')
        dict_sizes[col] = df[col].map(len).max()
    except Exception as e:
        raise e
dict_sizes

@rg3915
Copy link
Author

rg3915 commented Sep 12, 2019

@rg3915
Copy link
Author

rg3915 commented Nov 6, 2019

dtype example
df['estoque'] =df['estoque'].fillna(0).astype(int)

@rg3915
Copy link
Author

rg3915 commented Nov 6, 2019

Pandas Dataframe df to Django
https://www.laurivan.com/save-pandas-dataframe-as-django-model/

Produto.objects.bulk_create(
    Produto(**item) for item in df.to_dict('records')
)

@rg3915
Copy link
Author

rg3915 commented Nov 13, 2019

Definindo os tipos das colunas com dtype

dict_types_annot = {
    'produto': str,
    'ncm': str,
    'preco': float,
    'estoque': 'Int64',
}

# Define os tipos das colunas
dff = df.astype(dict_types_annot, errors='ignore')

# Troca 'nan' por None e float por None.
dff = dff.replace({'nan': None, float('nan'): None})

dff.to_dict('records')

Produto.objects.bulk_create(
    Produto(**item) for item in dff.to_dict('records')
)

@rg3915
Copy link
Author

rg3915 commented Jan 22, 2020

Intersecção de dataframes

import pandas as pd
import numpy as np
import datetime
from random import randint

df1 = pd.DataFrame({
    'letters': ['A', 'B', 'C', 'D', 'E', 'J', 'K', 'M'],
    'B': np.random.randint(0, 10, 8),
})
df1

df2 = pd.DataFrame({
    'letters': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'U', 'Z'],
    'B': np.random.randint(0, 10, 26),
})
df2

# Retorna o que tem de comum nos dois dataframes.
pd.merge(df1, df2, how='inner', on='fruits')

# Retorna o que tem de comum, considerando o df1.
pd.merge(df1, df2, how='left', on='fruits')

# Retorna o que tem de comum, considerando o df2.
pd.merge(df1, df2, how='right', on='fruits')

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment