Skip to content

Instantly share code, notes, and snippets.

View dmesquita's full-sized avatar

Déborah Mesquita dmesquita

View GitHub Profile
We can't make this file beautiful and searchable because it's too large.
index,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state
45,2018-01-01 06:04:46,3e8a71fb5dd1b1ccea3cd139908d759d83124a7757d69f8392f28b404c8a480f,118212404,Livro - Primeiro Amor,,Livros,Literatura Estrangeira,Nao chegou,2,No,Nao veio esse produto nao chegou entao não tem como eu avalia,1992.0,F,SP
48,2018-01-01 06:07:02,2bd5d5aca0691c9dd0c12cc66260d2ddeb7151262e99f3da647972f004c5fb20,124499501,Livro - Física Conceitual,,Livros,Ciências Exatas,o produto foi entregue dentro do prazo,5,Yes,"O produto chegou no prazo e em ótimo estado, tenho sido muito bem atendida quando solicito algo no site.",1957.0,F,MG
145,2018-01-01 07:21:09,263617650a44b0d2dae824f83cca33a0e73f913013ea0f35ef13826983951ef8,277621,Livro - Redescobrindo A Sua Beleza,,Livros,Moda e Beleza,Comprei e não recebi,1,No,"Como assim? Comprei e não recebi. Questionei aqui nas lojas americanas
We can make this file beautiful and searchable if this error is corrected: It looks like row 9 should actually have 15 columns, instead of 14 in line 8.
Continente,ISO 3166-1 alpha-3,País,Respondentes,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
África,ZAF,África do Sul,1,South Africa,ZA,ZAF,710,ISO 3166-2:ZA,Africa,Sub-Saharan Africa,Southern Africa,2.0,202.0,18.0
Europa,ALB,Albânia,1,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
Europa,DEU,Alemanha,5,Germany,DE,DEU,276,ISO 3166-2:DE,Europe,Western Europe,,150.0,155.0,
América do Sul,ARG,Argentina,2,Argentina,AR,ARG,32,ISO 3166-2:AR,Americas,Latin America and the Caribbean,South America,19.0,419.0,5.0
Oceania,AUS,Austrália,12,Australia,AU,AUS,36,ISO 3166-2:AU,Oceania,Australia and New Zealand,,9.0,53.0,
Ásia,BGD,Bangladesh,1,Bangladesh,BD,BGD,50,ISO 3166-2:BD,Asia,Southern Asia,,142.0,34.0,
Europa,BEL,Bélgica,3,Belgium,BE,BEL,56,ISO 3166-2:BE,Europe,Western Europe,,150.0,155.0,
América do Sul,BRA,Brasil,21,Brazil,BR,BRA,76,ISO 3166-2:BR,Americas,Latin America and the Caribbean,South America,19.0,41
import sys
import os
import yaml
from sklearn.naive_bayes import MultinomialNB
import pickle
# read the command line params
if len(sys.argv) != 3:
sys.stderr.write('Arguments error. Usage:\n')
sys.stderr.write(
import sys
import os
from sklearn.metrics import precision_recall_curve, auc
import pickle
import json
# read command line parameters
if len(sys.argv) != 5:
sys.stderr.write('Arguments error. Usage:\n')
sys.stderr.write(
import sys
import os
import yaml
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
# read command line params
if len(sys.argv) != 3:
sys.stderr.write('Arguments error. Usage:\n')
import os
import yaml
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
# Read the `prepare` section of params.yaml.
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving an open handle for the garbage collector to reclaim.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['prepare']
# Value stored under the `categories` key of that section.
categories = params['categories']
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, auc
# Categories passed to every fetch_20newsgroups call below.
categories = ["comp.graphics", "sci.space"]
# Fetch the "train", "test" and "all" subsets, in that order, with the same
# category filter; the generator is consumed immediately by the unpacking.
newsgroups_train, newsgroups_test, newsgroups_all = (
    fetch_20newsgroups(subset=subset_name, categories=categories)
    for subset_name in ("train", "test", "all")
)
import pandas as pd
import modin.pandas as pd_modin
import cudf as pd_cudf
results_groupby = []
### Read in the data with Pandas
for run in range(0,30):
df = pd.read_csv("../inep/dados/microdados_educacao_superior_2018//microdados_ed_superior_2018/dados/DM_ALUNO.CSV",
delimiter="|",
import pandas as pd
import modin.pandas as pd_modin
import cudf as pd_cudf
results_fillna = []
### Read in the data with Pandas
for run in range(0,30):
df = pd.read_csv("../inep/dados/microdados_educacao_superior_2018//microdados_ed_superior_2018/dados/DM_ALUNO.CSV")
import pandas as pd
import modin.pandas as pd_modin
import cudf as pd_cudf
results_loading = []
### Read in the data with Pandas
for run in range(0,30):
s = time.time()
df = pd.read_csv("../inep/dados/microdados_educacao_superior_2018//microdados_ed_superior_2018/dados/DM_ALUNO.CSV")