Last active
October 22, 2015 19:35
-
-
Save 667bdrm/7e8f3057ac9464d49596 to your computer and use it in GitHub Desktop.
betsbc.com results to xlsx converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require 'rubygems' | |
require 'net/http' | |
require 'uri' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'axlsx' | |
# --- Script configuration ---------------------------------------------------

# Base URL that site-relative "/toto/..." links are rewritten onto.
$toto_base = "http://betsbc.com/toto/"

# First list page; further list pages are addressed by appending "&p=<n>".
toto_list = "#{$toto_base}?action=old"
# Alternative source kept from the original script (presumably a locally saved
# copy of the list page used during development -- confirm before enabling):
#toto_list = 'toto-list-example.htm'

# Name of a sample result page; not referenced anywhere in the visible script.
toto_sample = 'toto-example.htm'

# User-Agent header sent with every HTTP request.
$user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.854.0 Safari/535.2"

# Number of seconds passed to sleep between requests.
$timeout = 25

# Upper bound on the number of result pages to export; 0 means "no limit".
$results = 0
# Downloads one result page and copies every HTML table found on it into a new
# worksheet of the given workbook.
#
# Each <tr> of each <table> becomes one worksheet row made of the text of its
# <td> cells. A row without <td> cells (for example a header row built from
# <th>) is still added, as an empty row.
#
# @param result_link [String] URL of the result page; a leading "/toto/" is
#   replaced with $toto_base, anything else is used unchanged
# @param wb [#add_worksheet] workbook that receives the new sheet (the script
#   passes the workbook of an Axlsx::Package)
# @param title [String] name given to the new worksheet
# @return the same workbook object that was passed in
def parse_links(result_link, wb, title)
  url = result_link.gsub(/^\/toto\//, $toto_base)

  # Kernel#open no longer accepts URLs on Ruby 3.0+, so go through the URI
  # object (open-uri adds #open to HTTP/HTTPS URIs on every Ruby version).
  # The block form also makes sure the response stream gets closed.
  result_doc = URI.parse(url).open('User-Agent' => $user_agent) do |io|
    Nokogiri::HTML(io, nil, 'utf-8')
  end
  result_doc.encoding = 'utf-8'

  sheet = wb.add_worksheet(:name => title)
  result_doc.search('//table').each do |table|
    table.search('tr').each do |row|
      sheet.add_row(row.search('td').map(&:text))
    end
  end

  wb
end
# --- Main script --------------------------------------------------------------
# 1. Load the first list page and work out how many list pages there are
#    (the first command-line argument, when given, overrides that number).
# 2. Visit every list page, follow each "/toto/get.php" link on it and export
#    the tables of that result page into a worksheet of its own.
# 3. Write the workbook to "results-<unix timestamp>.xlsx".

# Fetches +url+ with the configured User-Agent and parses it as UTF-8 HTML.
# Kernel#open no longer accepts URLs on Ruby 3.0+, hence URI#open; the block
# form closes the response stream once the document has been parsed.
fetch_page = lambda do |url|
  page = URI.parse(url).open('User-Agent' => $user_agent) do |io|
    Nokogiri::HTML(io, nil, 'utf-8')
  end
  page.encoding = 'utf-8'
  page
end

puts "Loading page list..."
doc = fetch_page.call(toto_list)

# The script takes the "p=" value of the last pagination link as the number of
# the last list page. When there is no such link (or it carries no "p="
# parameter) only the first list page is processed instead of crashing.
pages_links = doc.search('//a[contains(@href, "?action=old")]')
last_link = pages_links.last
page_match = /p=(\d+)/.match(last_link ? last_link.attr('href').to_s : '')
# Base 10 is forced so that a zero-padded number such as "08" is not rejected.
lastpage = page_match ? Integer(page_match[1], 10) : 0
lastpage = Integer(ARGV[0]) if ARGV[0]
puts "Last page: " + lastpage.to_s

# Page 0 is the plain list URL; every later page appends "&p=<n>".
list_links = (0..lastpage).map do |page|
  page > 0 ? toto_list + "&p=" + page.to_s : toto_list
end

package = Axlsx::Package.new
wb = package.workbook
count = 0

# True once $results pages have been exported; a limit of 0 never triggers.
# (The original compared with ">" and therefore exported one page too many.)
limit_reached = lambda { $results > 0 && count >= $results }

sleep($timeout)
list_links.each do |list_link|
  # Checked before the download so no list page is fetched needlessly.
  break if limit_reached.call

  puts "Loading page " + list_link
  res_links = fetch_page.call(list_link).search('//a[contains(@href, "/toto/get.php")]')

  res_links.each do |res_link|
    break if limit_reached.call

    url = res_link.attr('href').gsub(/^\/toto\//, $toto_base)
    # The worksheet name is the link text without these two fixed words.
    title = res_link.text.gsub("Тотализатор ", "").gsub("Тираж ", "")
    puts "Loading results at " + url
    wb = parse_links(url, wb, title)
    count += 1
    sleep($timeout)
  end

  sleep($timeout)
end

package.serialize('results-' + Time.now.to_i.to_s + '.xlsx')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment