// Created
// April 26, 2020 16:58
// -
// -
// Save HelloWorld017/e05d5dec7955e9346e992c5e93a4dfa9 to your computer and use it in GitHub Desktop.
// Musinsa Crawler
// This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const promisePipe = require('promisepipe');
const signale = require('signale');
// Origin that list pages are requested from and that relative image URLs are resolved against.
const baseUrl = 'https://store.musinsa.com';

// Shared axios instance: every request is resolved against `baseUrl`, sends a
// desktop-browser User-Agent header, and returns the body as text unless a
// request overrides `responseType` (the image download in `crawl` uses 'stream').
const api = axios.create({
  baseURL: baseUrl,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
  },
  responseType: 'text'
});
/**
 * Extracts the integer formed by all decimal digits in an element's text.
 *
 * Every non-digit character (thousands separators, labels, whitespace) is
 * stripped before parsing, so "1,234 goods" yields 1234.
 *
 * @param {{ text: () => string }} elem - Object exposing a `text()` method
 *   (e.g. a cheerio selection).
 * @returns {number} The parsed integer, or NaN when the text contains no digits.
 */
const getNumber = elem =>
  // Explicit radix: the input is always meant to be read as decimal.
  Number.parseInt(elem.text().replace(/[^0-9]/g, ''), 10);
/**
 * Crawls the item list `/app/items/lists/<id>` page by page and downloads the
 * image referenced by each list item's `data-original` attribute into `dest`.
 *
 * Download failures do not abort the crawl: the failing image URLs are
 * collected and, if there are any, written to `./errors.json` (relative to the
 * current working directory) once crawling finishes.
 *
 * @param {string} id - List identifier appended to `/app/items/lists/`.
 * @param {string} dest - Directory the images are written to; created if missing.
 * @param {?number} [maxPage=null] - Stop after this many pages; a falsy value
 *   means "crawl every page the site reports".
 * @returns {Promise<void>} Resolves when crawling is done. Rejects if the
 *   destination directory cannot be created or a list page cannot be fetched.
 */
const crawl = async (id, dest, maxPage = null) => {
  // `recursive: true` creates missing parent directories and does not fail when
  // the directory already exists, so genuine failures (permissions, `dest` being
  // a regular file) are no longer swallowed by an empty catch.
  await fs.promises.mkdir(dest, { recursive: true });

  const listUrl = `/app/items/lists/${id}`;
  // Pause between list pages so requests are not sent back-to-back.
  const pageDelayMs = 2000;

  // Downloads every image found on one list page; returns the URLs that failed.
  const crawlPage = async page => {
    const { data: body } = await api.get(listUrl, { params: { page } });
    const $ = cheerio.load(body);
    const listItems = $('.list-box .li_box').toArray();

    const images = [];
    for (const listItem of listItems) {
      const elem = $(listItem);

      /*
      Unused extraction snippets kept for reference:
      const title = elem.children('.list_info').text();
      const price = elem.children('.txt_price_member').text();
      const like = getNumber(elem.children('.txt_cnt_like'));
      */

      const imageUrl = elem.find('.list_img img[data-original]').attr('data-original');
      if (!imageUrl) {
        // Without this guard `new URL(undefined, baseUrl)` would silently
        // produce the bogus URL "<baseUrl>/undefined".
        signale.warn(`Skipping a list item without an image on page ${page}.`);
        continue;
      }

      // Resolving against baseUrl also handles relative and protocol-relative URLs.
      images.push(new URL(imageUrl, baseUrl).toString());
    }

    const errors = [];
    for (const imageUrl of images) {
      try {
        const { data: imageStream } = await api({
          url: imageUrl,
          responseType: 'stream'
        });

        // Use only the URL path so a query string never ends up in the file name.
        const fileName = path.basename(new URL(imageUrl).pathname);
        await promisePipe(
          imageStream,
          fs.createWriteStream(path.join(dest, fileName))
        );
      } catch (e) {
        signale.error(e);
        errors.push(imageUrl);
      }
    }

    return errors;
  };

  const { data: body } = await api(listUrl);
  const $ = cheerio.load(body);
  const goods = getNumber($('.box_num_goods'));
  const pages = getNumber($('.totalPagingNum'));
  signale.info(`Found ${goods} goods.`);

  if (Number.isNaN(pages)) {
    // The loop below would run zero times; say so instead of finishing silently.
    signale.warn('Could not determine the number of pages; nothing will be crawled.');
  }

  // `Math.min(null, pages)` is 0, so only clamp when a limit was actually given.
  const lastPage = maxPage ? Math.min(maxPage, pages) : pages;

  const errors = [];
  for (let page = 1; page <= pages; page++) {
    const pageErrors = await crawlPage(page);
    if (pageErrors.length > 0) {
      signale.error(`${pageErrors.length} errors have been occurred while crawling`);
      errors.push(...pageErrors);
    }

    signale.success(`Crawled page ${page}/${lastPage}.`);

    if (maxPage && page >= maxPage) {
      signale.info("Touched the maximum page. Stopping.");
      break;
    }

    // No need to wait once the final page has been processed.
    if (page < pages) {
      await new Promise(resolve => setTimeout(resolve, pageDelayMs));
    }
  }

  if (errors.length > 0) {
    await fs.promises.writeFile('./errors.json', JSON.stringify(errors));
  }

  signale.success(`Done with ${errors.length} errors.`);
};
// Entry point. Alternative target kept for reference:
// crawl('002020', '../dataset/cardigan', 25);
crawl('001001', '../dataset/tshirts', 25).catch(err => {
  // Without a handler a failed crawl is an unhandled promise rejection;
  // log it and signal failure through the process exit code instead.
  signale.error(err);
  process.exitCode = 1;
});
// Sign up for free
// to join this conversation on GitHub.
// Already have an account?
// Sign in to comment