Skip to content

Instantly share code, notes, and snippets.

@msva
Created July 28, 2016 11:32
Show Gist options
  • Save msva/f90ff8ce5a2fb0202ef06cc836ac4bec to your computer and use it in GitHub Desktop.
Save msva/f90ff8ce5a2fb0202ef06cc836ac4bec to your computer and use it in GitHub Desktop.
stdin
#!/usr/bin/env luajit
local iconv=require"iconv";
local w2u = iconv.new("utf-8","cp1251");
local u2w = iconv.new("cp1251","utf-8");
local cURL = require("cURL");
local html=require"htmlparser"
-- Seed the PRNG once per run so the User-Agent picks differ between runs.
-- math.randomseed returns nothing on LuaJIT / Lua 5.1, so nesting a second
-- call inside its own argument only re-seeded with 2 * os.time(); a single
-- call is sufficient.
math.randomseed(os.time())

-- Pool of browser User-Agent strings; the script draws one at random with
-- math.random(1, #UAs) before each request.
local UAs = {
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Windows 7, 64-bit
  "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Ubuntu Linux, 32-bit
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.22", -- Chrome 41, Windows 7, 64-bit
  "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2357.10 Safari/537.22", -- Chrome 43, Windows XP
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/42.0.2311.82 Safari/537.22", -- Chrome 42, Mac OS X 10.7
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 64-bit
  "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 32-bit
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", -- Internet Explorer 9, Windows 7, 64-bit
  "Mozilla/5.0 (compatible; MSIE 11.0; Windows NT 8.1; WOW64; Trident/7.0)", -- Internet Explorer 11 (platform token reads "Windows NT 8.1"), 64-bit
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17", -- Safari 6.0.2, Mac OS X 10.8
}

-- Extra HTTP request headers; handed to the curl handle via
-- setopt_httpheader further down in the script.
local headers = {
  "Accept: text/*",
  "Accept-Language: ru,en",
  "Accept-Charset: utf-8,cp1251,koi8-r,iso-8859-5,*",
  "Cache-Control: no-cache",
}
-- Accumulates the body of the response currently being downloaded; it is
-- reset to "" once a response has been parsed.
local buf = ""
-- Site root; category links found on the front page are appended to it.
local base_url = "http://nsk-mahaon.ru"

-- A single easy handle is configured here and reused for the later requests.
local c = cURL.easy_init()
c:setopt_httpheader(headers)
-- NOTE(review): an empty cookie file name presumably just switches on curl's
-- cookie engine without loading a file — confirm against CURLOPT_COOKIEFILE.
c:setopt_cookiefile("")
c:setopt_followlocation(1)

-- Write callback: append each received chunk to `buf` and report success.
local function collect_chunk(result)
  buf = buf .. result
  return true
end

-- Download the front page under a randomly chosen User-Agent.
local front_agent = UAs[math.random(1, #UAs)]
c:setopt_useragent(front_agent)
c:setopt_url(base_url)
c:setopt_writefunction(collect_chunk)
c:perform()

-- Select the category anchors (class attribute matched with |= 'cat').
local front_doc = html.parse(buf)
local main_page = front_doc("a[class|='cat']")
buf = ""
-- Crawl every category linked from the front page: download its first page,
-- read the page count from the pager's "last" link, download the remaining
-- pages, then print the class attribute of every matched table cell.

-- Selector for the cells of the listing table on a category page.
local CELL_SELECTOR = "table.views-table > tbody > tr > td.views-field"
-- Selector for the pager's "last page" link; its href carries ?page=<n>.
local LAST_PAGE_SELECTOR = "div.item-list > ul.pager > li.pager-last.last > a.active"

-- Downloads `url` through the shared curl handle `c` with a randomly chosen
-- User-Agent and returns the parsed document root.
-- Chunks are gathered in a table and joined once, so the shared `buf` string
-- is not touched and no repeated string concatenation happens.
local function fetch_document(url)
  local chunks = {}
  c:setopt_useragent(UAs[math.random(1, #UAs)])
  c:setopt_url(url)
  c:setopt_writefunction(function(result)
    chunks[#chunks + 1] = result
    return true
  end)
  c:perform()
  return html.parse(table.concat(chunks))
end

-- Processes one category anchor element `e` taken from the front page.
local function process_category(e)
  local href = e.attributes.href
  if not href then
    -- Without an href there is nothing to fetch; report it instead of
    -- crashing on string concatenation with nil.
    io.stderr:write("skipping category link without href attribute\n")
    return
  end
  -- An already absolute link must not get the site root prefixed.
  local cat_url = href:find("^https?://") and href or (base_url .. href)
  -- gsub also returns a match count; the single-name assignment drops it.
  local cat_name = e:getcontent():gsub(" ", "_")

  print([[Начало обработки категории "]]..cat_name..[["]])
  print("",[[Обработка страницы 1 в категории "]]..cat_name..[["...]])

  -- Parse the first page once and query it for both the pager and the cells.
  local first_doc = fetch_document(cat_url)
  local last_link = first_doc(LAST_PAGE_SELECTOR)[1]
  local last_href = last_link and last_link.attributes.href
  -- The pager numbers pages from 0, so the "last" link's page=<n> is the
  -- number of additional pages; no pager (or no number) means a single page.
  local pages = last_href and tonumber(last_href:match(".*page=(%d+)")) or 0

  local cat_pages = { first_doc(CELL_SELECTOR) }
  for pc = 1, pages do
    print("",[[Обработка страницы ]]..(pc+1)..[[ в категории "]]..cat_name..[["...]])
    cat_pages[pc + 1] = fetch_document(cat_url.."?page="..pc)(CELL_SELECTOR)
  end

  for _, cat_page in ipairs(cat_pages) do
    for _, cat_e in ipairs(cat_page) do
      print("","", cat_e.attributes.class)
    end
  end
  print([[Конец обработки категории "]]..cat_name..[["]])
end

for _, e in ipairs(main_page) do
  process_category(e)
end

-- Release the curl handle and terminate; nothing below this point runs.
c:close()
os.exit(0)
-- CSV export of collected records.
-- NOTE(review): in this script the code below is not reached, because the
-- script calls os.exit(0) before it, and nothing visible in this file assigns
-- `price_t` or `filename`. The export is therefore written as self-contained
-- helpers and only invoked when both values exist.

-- Renders one CSV cell: the value is stringified, embedded double quotes are
-- doubled, and the result is wrapped in double quotes.
local function csv_cell(value)
  local escaped = tostring(value):gsub('"', '""')
  return '"' .. escaped .. '"'
end

-- Serializes a list of records (tables of key -> value) into CSV text with
-- ";" as the field separator.
-- The header row is the union of all keys, sorted by their string form, so
-- every data row uses the same column order (pairs() order is unspecified and
-- may differ between tables). Cells missing from a record are emitted empty;
-- records with no fields are skipped. Returns "" when there are no columns.
local function rows_to_csv(rows)
  local columns, seen = {}, {}
  for _, row in ipairs(rows) do
    for key in pairs(row) do
      if not seen[key] then
        seen[key] = true
        columns[#columns + 1] = key
      end
    end
  end
  if #columns == 0 then
    return ""
  end
  -- Compare string forms so mixed key types cannot break the sort.
  table.sort(columns, function(a, b) return tostring(a) < tostring(b) end)

  local header = {}
  for i, key in ipairs(columns) do
    header[i] = csv_cell(key)
  end
  local lines = { table.concat(header, ";") }

  for _, row in ipairs(rows) do
    if next(row) ~= nil then
      local cells = {}
      for i, key in ipairs(columns) do
        local value = row[key]
        if value == nil then
          value = ""
        end
        cells[i] = csv_cell(value)
      end
      lines[#lines + 1] = table.concat(cells, ";")
    end
  end
  return table.concat(lines, "\n") .. "\n"
end

-- Writes `rows` as CSV to "<basename>_<ddmmyy_HHMMSS>.csv" and returns the
-- path. Raises an error if the file cannot be opened. The file is opened and
-- closed explicitly instead of redirecting the default output via io.output.
local function save_csv(rows, basename)
  local path = basename .. "_" .. os.date("%d%m%y_%H%M%S") .. ".csv"
  local file = assert(io.open(path, "w"))
  file:write(rows_to_csv(rows))
  file:close()
  return path
end

-- `price_t` and `filename` are globals expected to be provided elsewhere;
-- skip the export instead of failing on nil when they are absent.
if price_t and filename then
  save_csv(price_t, filename)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment