Last active
September 5, 2016 03:08
-
-
Save fernandojunior/12fe947c743b3170704bcf9b4ce7f3cc to your computer and use it in GitHub Desktop.
kickante collector using selenium, beautifulsoup4, python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Who contributed most frequently?
# Who contributed the most in donations?
data = read.csv("data/data.csv")

# Normalize a scraped Brazilian currency string (e.g. "R$1.234,56") to a numeric.
# Undisclosed donations ("Não divulgado") are treated as zero.
parse_amount = function(raw) {
  raw = gsub("\\.", "", raw)   # drop thousands separators first
  raw = gsub(",", ".", raw)    # comma decimal mark -> dot
  raw = gsub("R\\$", "", raw)  # strip currency symbol
  if (raw == "Não divulgado")
    raw = '0.0'
  return(as.numeric(raw))
}
data$amount = sapply(data$amount, parse_amount)

# Top 10 donors by number of donations (recorded output below):
head(sort(table(data$name), decreasing=TRUE), n=10)
# Compra Presencial Anônimo
# 556 211
# Jessica Souza Silva David Guilllem
# 10 6
# Wellcon Treinamento e Consultoria Lmtda Hugo de Jesus Amaral
# 6 5
# Patricia Fatima Crepaldi Bento da Silva Rodrigo Campos Lucie
# 5 5
# Danisa Acrílicos Erinaldo Rodrigues Nascimento
# 4 4

# Top 10 donors by total amount donated (recorded output below):
agg = aggregate(amount ~ name, data, sum)
head(agg[order(agg$amount, decreasing=TRUE),], n=10)
# name amount
# 477 Compra Presencial 224857.2
# 1268 Latino Travel 16000.0
# 231 Anônimo 14544.2
# 1750 Patricia Fatima Crepaldi Bento da Silva 3471.0
# 1346 Luana Gomes Landeiro 2377.0
# 196 André Luiz Cruz 2296.0
# 3 ABILIO ALVES DE CARVALHO NETO 2180.0
# 2069 Sidnei Aparecido De Toledo Junior 2180.0
# 2201 Thiago Breyer 2180.0
# 494 Cristina Maria de Fiori 2153.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wget http://chromedriver.storage.googleapis.com/2.23/chromedriver_linux64.zip | |
# unzip chromedriver_linux64.zip | |
# virtualenv env && . env/bin/activate | |
# pip install -r requirements.txt | |
import time | |
import csv | |
import json | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from bs4 import BeautifulSoup | |
# Campaign backers ("kickadores") page to scrape.
URL = 'http://www.kickante.com.br/campanhas/bel-pesce-por-todas-as-capitais-do-brasil/kickadores'
# Path to the chromedriver binary (downloaded per the header instructions).
CHROMEDRIVER = './chromedriver'
# TOTAL_LOADS = -1 # load full page
TOTAL_LOADS = 2  # load the page only 2 times
# NOTE: module-level side effect — importing this module launches Chrome
# and navigates to URL immediately.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER)
browser.get(URL)
def current_page_source():
    """Snapshot the browser's current DOM as a BeautifulSoup tree."""
    html = browser.page_source
    return BeautifulSoup(html, 'html.parser')
def count_loads(page_source):
    """Count <style> tags inside <body> of the parsed page.

    Used as a proxy for how many content batches the infinite-scroll
    page has appended so far.
    """
    matches = page_source.select('body style')
    return len(matches)
def stop_load(counts_history):
    """Decide whether to stop scrolling for more content.

    Returns True when the load count has been flat across the last five
    polls (nothing new is arriving) or when TOTAL_LOADS polls have been
    made. Mutates counts_history by appending the current count.
    """
    current = count_loads(current_page_source())
    recent = counts_history[-5:]
    if len(counts_history) > 5 and all(current == seen for seen in recent):
        return True  # no more raw data to load
    counts_history.append(current)
    return TOTAL_LOADS > 0 and len(counts_history) == TOTAL_LOADS
def parse_row(row):
    """Extract a single donation record from one '.views-row' element.

    Returns a dict with keys url/name/date/amount/reward (in that order —
    save_csv derives the CSV column order from it); any field missing
    from the markup maps to ''.
    """
    selectors = {
        'url': '.order-author-picture a',
        'name': '.order-author-name',
        'date': '.order-author-date',
        'amount': '.order-amount',
        'reward': '.order-reward',
    }
    found = {field: row.select_one(css) for field, css in selectors.items()}
    # The url comes from the anchor's href; all other fields are text content.
    record = {'url': found['url']['href'] if found['url'] else ''}
    for field in ('name', 'date', 'amount', 'reward'):
        tag = found[field]
        record[field] = tag.text if tag else ''
    return record
def load_full_page():
    """Scroll the page until stop_load() reports no more content, then
    return the fully expanded DOM as a BeautifulSoup tree."""
    history = [count_loads(current_page_source())]
    done = False
    while not done:
        browser.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)
        time.sleep(2)  # give the AJAX batch time to arrive before re-counting
        done = stop_load(history)
    return current_page_source()
def save_csv(data, path='data.csv'):
    """Write a list of flat dicts to *path* as CSV.

    Column order follows the key order of the first record. Does nothing
    when *data* is empty (the original indexed data[0] unconditionally
    and raised IndexError on an empty scrape).

    :param data: list of dicts with identical keys
    :param path: output filename (default 'data.csv' for backward compatibility)
    """
    if not data:
        return  # nothing to write; avoid IndexError on data[0]
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)
def save_json(data, path='data.json'):
    """Serialize *data* to *path* as pretty-printed JSON.

    Opens the file with an explicit UTF-8 encoding (the original relied
    on the platform default) and uses ensure_ascii=False so accented
    donor names (e.g. "Anônimo") stay readable in the output. Streams
    via json.dump instead of building the full string in memory.

    :param data: any JSON-serializable object
    :param path: output filename (default 'data.json' for backward compatibility)
    """
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
if __name__ == '__main__':
    # Expand the infinite-scroll page, parse every donation row,
    # then persist the records in both CSV and JSON form.
    rows = load_full_page().select('.views-row')
    records = [parse_row(r) for r in rows]
    save_csv(records)
    save_json(records)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
selenium | |
beautifulsoup4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment