Skip to content

Instantly share code, notes, and snippets.

@fernandojunior
Last active September 5, 2016 03:08
Show Gist options
  • Save fernandojunior/12fe947c743b3170704bcf9b4ce7f3cc to your computer and use it in GitHub Desktop.
Save fernandojunior/12fe947c743b3170704bcf9b4ce7f3cc to your computer and use it in GitHub Desktop.
kickante collector using selenium, beautifulsoup4, python
# Who contributed most frequently?
# Who donated the most in total?
data = read.csv("data/data.csv")

# Normalize one raw amount string ("R$1.234,56") into a numeric value.
# Undisclosed donations ("Não divulgado") are counted as zero.
parse_amount <- function(raw) {
  value <- gsub("\\.", "", raw)      # drop thousands separators
  value <- gsub(",", ".", value)     # Brazilian decimal comma -> dot
  value <- gsub("R\\$", "", value)   # strip currency symbol
  if (value == "Não divulgado")
    value <- '0.0'
  as.numeric(value)
}

data$amount = sapply(data$amount, parse_amount)
# Top 10 contributors by number of donations.
donation_counts <- table(data$name)
head(sort(donation_counts, decreasing = TRUE), n = 10)
# Compra Presencial Anônimo
# 556 211
# Jessica Souza Silva David Guilllem
# 10 6
# Wellcon Treinamento e Consultoria Lmtda Hugo de Jesus Amaral
# 6 5
# Patricia Fatima Crepaldi Bento da Silva Rodrigo Campos Lucie
# 5 5
# Danisa Acrílicos Erinaldo Rodrigues Nascimento
# 4 4
# Top 10 contributors by total amount donated.
agg = aggregate(amount ~ name, data, sum)
ranked <- agg[order(agg$amount, decreasing = TRUE), ]
head(ranked, n = 10)
# name amount
# 477 Compra Presencial 224857.2
# 1268 Latino Travel 16000.0
# 231 Anônimo 14544.2
# 1750 Patricia Fatima Crepaldi Bento da Silva 3471.0
# 1346 Luana Gomes Landeiro 2377.0
# 196 André Luiz Cruz 2296.0
# 3 ABILIO ALVES DE CARVALHO NETO 2180.0
# 2069 Sidnei Aparecido De Toledo Junior 2180.0
# 2201 Thiago Breyer 2180.0
# 494 Cristina Maria de Fiori 2153.0
# wget http://chromedriver.storage.googleapis.com/2.23/chromedriver_linux64.zip
# unzip chromedriver_linux64.zip
# virtualenv env && . env/bin/activate
# pip install -r requirements.txt
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
# Contributors ("kickadores") page of the crowdfunding campaign to scrape.
URL = 'http://www.kickante.com.br/campanhas/bel-pesce-por-todas-as-capitais-do-brasil/kickadores'
# Path to the chromedriver binary (downloaded by the wget step above).
CHROMEDRIVER = './chromedriver'
# TOTAL_LOADS = -1 # load full page
TOTAL_LOADS = 2 # load the page only 2 times
# Launch a Chrome session via Selenium and open the campaign page.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER)
browser.get(URL)
def current_page_source():
    """Return the browser's current DOM parsed as a BeautifulSoup tree."""
    html = browser.page_source
    return BeautifulSoup(html, 'html.parser')
def count_loads(page_source):
    """Return how many <style> tags sit under <body>.

    Each lazy-load batch appears to add a style tag, so the count is
    used elsewhere as a progress marker for the infinite scroll.
    """
    style_tags = page_source.select('body style')
    return len(style_tags)
def stop_load(counts_history):
    """Decide whether to stop scrolling for more rows.

    Appends the current load count to counts_history (mutated in place).
    Returns True when the count has been identical for the last 5 checks
    (nothing new is loading) or when the TOTAL_LOADS cap is reached.
    """
    counts = count_loads(current_page_source())
    recent = counts_history[-5:]
    if len(counts_history) > 5 and all(previous == counts for previous in recent):
        return True  # no more raw data to load
    counts_history.append(counts)
    reached_cap = TOTAL_LOADS > 0 and len(counts_history) == TOTAL_LOADS
    return reached_cap
def parse_row(row):
    """Extract one contributor record from a row element.

    Returns a dict with keys url, name, date, amount, reward; any field
    whose element is missing becomes ''.
    """
    def text_of(selector):
        # Text content of the first match, or '' when absent.
        node = row.select_one(selector)
        return node.text if node else ''

    link = row.select_one('.order-author-picture a')
    return {
        'url': link['href'] if link else '',
        'name': text_of('.order-author-name'),
        'date': text_of('.order-author-date'),
        'amount': text_of('.order-amount'),
        'reward': text_of('.order-reward')
    }
def load_full_page():
    """Scroll the page until stop_load() says no more content will arrive.

    Returns the final page parsed as a BeautifulSoup tree.
    """
    history = [count_loads(current_page_source())]
    done = False
    while not done:
        browser.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)
        time.sleep(2)  # give the lazy loader time to fetch the next batch
        done = stop_load(history)
    return current_page_source()
def save_csv(data, path='data.csv'):
    """Write a list of flat dicts to a CSV file.

    data -- list of dicts sharing the same keys; the first row's keys
            define the column order.
    path -- output file, default 'data.csv' (backward-compatible).

    Fixes: the original raised IndexError on an empty list (now a no-op)
    and wrote with the locale's default encoding, which mangles accented
    Portuguese names on some platforms (now explicit UTF-8).
    """
    if not data:
        return
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)
def save_json(data, path='data.json'):
    """Serialize data as pretty-printed UTF-8 JSON.

    data -- any json-serializable structure.
    path -- output file, default 'data.json' (backward-compatible).

    Uses ensure_ascii=False with an explicit UTF-8 handle so accented
    contributor names are stored readably instead of as \\uXXXX escapes,
    and streams via json.dump rather than building the whole string.
    """
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
if __name__ == '__main__':
    # Scrape every contributor row, then persist as both CSV and JSON.
    rows = load_full_page().select('.views-row')
    records = [parse_row(row) for row in rows]
    save_csv(records)
    save_json(records)
selenium
beautifulsoup4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment