Created
July 22, 2018 12:30
-
-
Save notsobad/e24d6f9fc6f5e54ff91415b93466d50b to your computer and use it in GitHub Desktop.
Page parser using headless Chrome.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer');

// URL of the listing page to scrape, taken from the first CLI argument.
const url = process.argv[2];

// Extra time (ms) to wait after navigation has settled before reading the DOM.
const SETTLE_DELAY_MS = 1500;

/**
 * Collects product data from the listing page. Executed inside the browser
 * page by `page.evaluate`, so it must be self-contained and may only touch
 * the page's own DOM.
 *
 * Fields whose element is missing are reported as `null` (not omitted), so
 * every product object always carries all five keys.
 *
 * @returns {{products: Array<{name: ?string, vendor: ?string, price: ?string, sold: ?string, rate: ?string}>}}
 */
const extractProducts = () => {
  // `el && el.innerText` deliberately yields null for a missing element;
  // optional chaining would yield undefined, which JSON.stringify drops.
  const textOf = (el) => el && el.innerText;

  const blocks = document.querySelectorAll('div.product-list.m-bg-white > ul > li');
  const products = Array.from(blocks, (block) => {
    const rate = block.querySelector('.view-desc > div.line.info.mt6 > div > i');
    return {
      name: textOf(block.querySelector('.view-info > h1 > a')),
      vendor: textOf(block.querySelector('.view-info > div.line.cfg.mt6 > a')),
      price: textOf(block.querySelector('.view-price > div > span')),
      sold: textOf(block.querySelector('.view-desc > div.line.cfg.mt6')),
      // The rating is read from the inline CSS width of the <i> element.
      rate: rate && rate.style.width,
    };
  });

  return { products };
};

/**
 * Entry point: loads `url` in headless Chrome, extracts the product list and
 * prints it as a single JSON line (`{"products": [...], "url": ...}`) on
 * stdout. Errors go to stderr and set a non-zero exit code, so stdout stays
 * either empty or valid JSON.
 */
(async () => {
  if (!url) {
    console.error('Usage: node <script> <url>');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' });
    // Plain timer instead of page.waitFor(), which is deprecated and has been
    // removed from later Puppeteer releases.
    await new Promise((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

    const info = await page.evaluate(extractProducts);
    info.url = url;
    console.log(JSON.stringify(info));
  } finally {
    // Always shut the browser down, even when navigation or extraction fails,
    // so no orphaned Chrome process is left behind.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf8
# Merge the per-page scrape results ./out/1.json .. ./out/465.json into a
# single tab-separated file, out.csv, with one row per product.
import json
import os
import sys
import csv
from mycsv import UnicodeWriter

# Column order of the output rows; also the keys read from each product.
fields = ['name', 'vendor', 'price', 'sold', 'rate']

# 'wb' is kept from the original script (Python 2 style csv output).
# NOTE(review): UnicodeWriter is a project-local helper; presumably it
# encodes unicode rows before writing -- confirm against mycsv.
with open('out.csv', 'wb') as csv_out:
    writer = UnicodeWriter(csv_out, delimiter="\t")
    for i in range(1, 466):
        name = './out/%s.json' % i
        if not os.path.isfile(name):
            print("%s not exists" % name)
            continue
        # Close each input file promptly instead of leaking up to 465 handles.
        with open(name) as json_in:
            try:
                obj = json.load(json_in)
            except ValueError:
                # An empty or truncated file (e.g. left by a failed scrape)
                # is skipped like a missing one rather than aborting the merge.
                print("%s is not valid JSON" % name)
                continue
        for p in obj['products']:
            # Missing values are stored as null in the JSON; write them as ''.
            row = [p[n] or '' for n in fields]
            writer.writerow(row)
            print(p)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape listing pages START..END (inclusive) one at a time, saving each
# page's JSON output to out/<page>.json while echoing it to the terminal.
# Usage: <this script> [START] [END]   (defaults: 9 and 465)
start="${1:-9}"
end="${2:-465}"

# tee cannot create out/<page>.json unless the directory already exists.
mkdir -p out

for ((i = start; i <= end; i++)); do
    # Cap each page at 35 seconds so one hung browser cannot stall the run.
    timeout 35 node x3.js "https://market.aliyun.com/products/?spm=5176.730005-52734001.0.0.F9SiNi&priceTag=1-&pageIndex=$i" | tee "out/$i.json"
    # Pause between requests.
    sleep 5
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment