Created
July 21, 2015 13:48
-
-
Save sftblw/9aa5fe6bf5207f1ba6dc to your computer and use it in GitHub Desktop.
hitomiapp2.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var url = require('url'); | |
var path = require('path'); | |
//var fs = require('fs'); | |
var fs = require('fs-extra'); | |
//var mkdirp = require('mkdirp'); | |
var async = require('async'); | |
// Base URL of the site to crawl. The literal below is a placeholder, not a real address.
var hitomi = '웹 사이트 주소 (안알랴줌)';
// Root directory (relative to the working directory) that galleries are saved under.
var targetDirName = 'dl2';
// Legacy work lists; nothing else in the visible code reads or writes them.
var downloadQueue = [];
var indexQueue = [];
/**
 * Serial queue (concurrency 1) for fetching text resources such as index
 * pages and gallery image lists.
 *
 * Usage: reqQueue.push(someUrl, function (body) { ... });
 *
 * NOTE: the task callback receives the response body as its FIRST argument,
 * not an error. Callers in this file rely on that, so there is no error
 * channel.
 *
 * A failed request (transport error or non-200 status) is retried from inside
 * the worker, so the task keeps its callback and its queue slot. Re-pushing
 * the URL instead would drop the callback and, because the worker never
 * finished, leave the only slot occupied so the retry could never start.
 *
 * After MAX_ATTEMPTS failures the task is abandoned: its callback is never
 * invoked and the queue stays blocked on it. With no way to report an error
 * to callers, handing them a missing body would be worse.
 */
var reqQueue = async.queue(function (urlItem, callback) {
    var MAX_ATTEMPTS = 5;
    var RETRY_DELAY_MS = 1000;
    var attempts = 0;

    function attempt() {
        attempts++;
        request(urlItem, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                callback(body);
                return;
            }
            if (attempts < MAX_ATTEMPTS) {
                // Pause briefly so a struggling server is not hammered.
                setTimeout(attempt, RETRY_DELAY_MS);
                return;
            }
            console.log("request failed after " + attempts + " attempts, giving up: " + urlItem);
        });
    }

    attempt();
}, 1);
/**
 * Image download queue (up to 5 downloads in flight).
 *
 * Each task is { imgUrl, directory, name }: the image at `imgUrl` is fetched
 * as raw bytes and written to `directory + "/" + name`.
 *
 * The task callback follows the usual error-first convention: it receives the
 * fs.writeFile result on success, or an Error once the download has failed
 * MAX_ATTEMPTS times.
 *
 * Failed downloads are retried inside the worker. Pushing the task into
 * reqQueue instead would hand that queue an object it cannot request, lose
 * the task callback, and leak one of this queue's worker slots.
 */
var imgDlQueue = async.queue(function (urlItem, callback) {
    var MAX_ATTEMPTS = 5;
    var RETRY_DELAY_MS = 1000;
    var attempts = 0;

    function attempt() {
        attempts++;
        var options_req = {
            url : urlItem.imgUrl,
            encoding: null // deliver the body as a Buffer (binary data)
        };
        request(options_req, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                fs.writeFile(urlItem.directory+"/"+urlItem.name, body, 'binary', callback);
                return;
            }
            if (attempts < MAX_ATTEMPTS) {
                // Pause briefly so a struggling server is not hammered.
                setTimeout(attempt, RETRY_DELAY_MS);
                return;
            }
            console.log("image download failed after " + attempts + " attempts: " + urlItem.imgUrl);
            // `response` is defined whenever `error` is not set.
            callback(error || new Error("HTTP " + response.statusCode + " for " + urlItem.imgUrl));
        });
    }

    attempt();
}, 5);
// var reqQueue = { | |
// qLimit : 5, | |
// q: [], | |
// req: function (urlItem, callbackFn) { // entry point for external callers
// console.log("req start, q.Length : " + this.q.length); | |
// this.q.push({url: urlItem, callback: callbackFn}); | |
// while (this.q.length > this.qLimit) { | |
// console.log("req wait, q.Length : " + this.q.length); | |
// process.nextTick(); | |
// } | |
// this.procReq(); | |
// console.log("req end"); | |
// }, | |
// procReq: function () { // actually performs the request
// console.log("procReq start"); | |
// | |
// var r = this.q[0]; | |
// | |
// var reqEnd = false; | |
// | |
// request(r.url, function (error, response, body) { | |
// if (!error && response.statusCode == 200) { | |
// r.callback(body); | |
// reqEnd = true; | |
// } else { | |
// this.q.push(r); | |
// reqEnd = true; | |
// } | |
// this.q.shift(); // remove front | |
// }); | |
// | |
// //while(!reqEnd) { process.nextTick(); }; | |
// } | |
// } | |
// Request the index pages one by one.
/**
 * Fetch index page `i`, process every gallery on it, then move on to page
 * `i + 1`.
 *
 * The next page is requested only from checkBody's completion callback, which
 * fires once every gallery on the current page has been skipped or
 * downloaded. No extra waiting is needed here. A synchronous loop polling
 * imgDlQueue would block the event loop, so the queue it waits on could never
 * make progress.
 *
 * @param {number} i - 1-based index page number.
 */
function requestIndex(i) {
    console.log("page " + i + " loading...");
    reqQueue.push(url.resolve(hitomi, '/index-korean-' + i.toString() + '.html'), function (body) {
        checkBody(body, function () {
            console.log("requesting next index...");
            requestIndex(i + 1); // next request
        });
    });
}

// Start crawling from the first index page.
requestIndex(1);
// Handle an index page body. Invoked as the callback once the page request completes.
/**
 * Parse one index page and download every gallery on it that is not already
 * on disk.
 *
 * @param {string} body - HTML of the index page.
 * @param {function} callback - Called with no arguments once every gallery on
 *     the page has been skipped or downloaded. It is NOT called when the page
 *     lists no galleries, so the page-by-page crawl stops there.
 */
function checkBody(body, callback) {
    // Declared locally on purpose: an undeclared `$` would become an implicit
    // global (and a ReferenceError in strict mode).
    var $ = cheerio.load(body);
    var $galleriesElem = $('.gallery-content > div');
    var galleriesElemDone = 0;

    if ($galleriesElem.length === 0) {
        console.log("no galleries found on this page; stopping");
        return;
    }

    // Record one finished gallery, log progress, and report completion after the last one.
    function markDone(logPrefix, itemName, logSuffix) {
        galleriesElemDone++;
        console.log(logPrefix + "[" + galleriesElemDone + "/" + $galleriesElem.length + "] " + itemName + logSuffix);
        if (galleriesElemDone == $galleriesElem.length) {
            callback();
        }
    }

    $galleriesElem.each(function () {
        // Build the info record for this gallery entry.
        var targetItem = {
            id : $('div > a', this).attr('href').split('/')[2].split('.')[0],
            name : $('div h1 a', this).text().trim(),
            series : $('.dj-desc tr:nth-child(1) td:nth-child(2) a', this).html(),
            type : $('.dj-desc tr:nth-child(2) td:nth-child(2) a', this).html(),
            artist : $('div .artist-list li', this).text().trim().replace(/(?:\r\n|\r|\n)/g, ' _ ')
        };

        // Skip galleries that do not need downloading.
        if (!shoudDownload(targetItem)) {
            markDone("!!!", targetItem.name, " --- already downloaded");
            return;
        }

        // Download every image of the gallery; downloadGallery also handles
        // the temp-directory-to-final-directory move.
        downloadGallery($, targetItem, function () {
            markDone("", targetItem.name, " --- done ");
        });
    });
}
/** itemInfo schema (as built in checkBody)
{
id : string (taken from the gallery link's href),
name : string,
artist : string,
series : string, or null/undefined when the page lists no series,
type : string
}
*/
// Decide whether a gallery still has to be downloaded, cleaning up after an
// earlier attempt that was interrupted midway.
/**
 * @param {object} itemInfo - Gallery info record (see the itemInfo schema).
 * @returns {boolean} true when the gallery should be downloaded, false when a
 *     finished copy already exists.
 *
 * A leftover temp directory marks an unfinished download: it is deleted,
 * together with any final directory, and the gallery is fetched again.
 */
function shoudDownload(itemInfo) {
    var finalPath = makeItemPath(itemInfo);
    var stagingPath = makeItemTempPath(itemInfo);
    var hasFinal = isDirExist(finalPath);
    var hasStaging = isDirExist(stagingPath);

    // No leftovers: download only if there is no finished copy yet.
    if (!hasStaging) {
        return !hasFinal;
    }

    // Interrupted download: wipe everything and start over.
    fs.removeSync(stagingPath);
    if (hasFinal) {
        fs.removeSync(finalPath);
    }
    return true;
}
// Download a single gallery.
/**
 * Download every image of one gallery into a temp directory, then move that
 * directory to its final location.
 *
 * @param {*} $ - Unused; kept so existing callers keep working.
 * @param {object} itemInfo - Gallery info record (see the itemInfo schema).
 * @param {function} callback - Called with no arguments once the gallery has
 *     been fully handled, whether it succeeded or not, so the page-level
 *     bookkeeping can always advance.
 *
 * Outcomes:
 *  - all images saved: temp directory is moved to the final path;
 *  - image list empty or unparsable: temp directory is removed;
 *  - some image failed: temp directory is left in place and the gallery is
 *    not promoted, so shoudDownload treats it as unfinished on the next run.
 */
function downloadGallery($, itemInfo, callback) {
    var targetDir = makeItemPath(itemInfo);
    var tempDir = makeItemTempPath(itemInfo);

    // Create the temp directory that images are first written to.
    fs.mkdirsSync(tempDir);

    // Fetch the image list.
    var listUrl = url.resolve(hitomi, '/galleries/' + itemInfo.id.toString() + '.js');
    reqQueue.push(listUrl, function (body) {
        // `body` is the gallery's .js file: a variable assigned an array of
        // image descriptors. Slice from the first '[' and parse as JSON
        // instead of eval-ing it.
        var imageItems;
        try {
            imageItems = JSON.parse(body.substring(body.search(/\[/), body.length));
        } catch (e) {
            // Throwing here would escape from an async callback and kill the process.
            console.log("image list of " + itemInfo.name + " could not be parsed: " + e.message);
            imageItems = [];
        }

        // Nothing to download: no image callback would ever fire, so finish now.
        if (imageItems.length === 0) {
            fs.removeSync(tempDir);
            callback();
            return;
        }

        var imagesSettled = 0;
        var imagesFailed = 0;

        imageItems.forEach(function (elem) {
            var imgUrll = url.resolve(hitomi, '/galleries/' + itemInfo.id.toString() + "/" + elem.name );
            imgDlQueue.push({imgUrl:imgUrll, directory : tempDir, name: elem.name}, function (err) {
                imagesSettled++;
                if (err) {
                    imagesFailed++;
                    console.log("image " + elem.name + " of " + itemInfo.name + " failed: " + (err.message || err));
                }
                if (imagesSettled !== imageItems.length) {
                    return;
                }

                if (imagesFailed > 0) {
                    console.log(itemInfo.name + " is incomplete (" + imagesFailed + " of " + imageItems.length + " images failed); not finalized");
                    callback();
                    return;
                }

                fs.move(tempDir, targetDir, function (moveErr) {
                    if (moveErr) {
                        console.log("could not move " + tempDir + " to " + targetDir + ": " + (moveErr.message || moveErr));
                    }
                    // Remove whatever is left of the temp directory.
                    fs.removeSync(tempDir);
                    callback();
                });
            });
        });
    });
}
/**
 * Report whether `dirPath` exists and is a directory.
 *
 * Uses lstat, so a symbolic link is judged as the link itself rather than as
 * its target.
 *
 * @param {string} dirPath - Path to check.
 * @returns {boolean} true only for an existing directory; false for files,
 *     missing paths, and any other lstat failure.
 */
function isDirExist(dirPath) {
    try {
        // Use the result directly: an undeclared `stats` variable would become
        // an implicit global (and a ReferenceError in strict mode).
        return fs.lstatSync(dirPath).isDirectory();
    } catch (e) {
        // Deliberate best effort: a path that cannot be stat-ed counts as "not there".
        return false;
    }
}
/**
 * Build the final directory path for a gallery:
 *   <targetDirName>/<type>/<series>/<id>__<name, max 100 chars>[__ by <artist>]
 *
 * Side effect (relied on historically, so kept): a missing `series` on
 * `itemInfo` is replaced with "all". Both null and undefined count as
 * missing; passing undefined through would make path.join throw.
 *
 * NOTE(review): name, artist, type and series go into the path unsanitized;
 * confirm they cannot contain path separators.
 *
 * @param {object} itemInfo - Gallery info record (see the itemInfo schema).
 * @returns {string} Relative path of the gallery's final directory.
 */
function makeItemPath(itemInfo) {
    if (itemInfo.series == null) {
        itemInfo.series = "all";
    }

    var dirName = itemInfo.id.toString() + "__" + itemInfo.name.slice(0, 100);
    if (itemInfo.artist != "") {
        dirName += "__ by " + itemInfo.artist;
    }

    return path.join('./', targetDirName, itemInfo.type, itemInfo.series, dirName);
}
/**
 * Build the temp (staging) directory path for a gallery:
 *   <targetDirName>/temp/<id>
 *
 * Side effect: a null `series` on `itemInfo` is replaced with "all", although
 * the series is not part of the temp path itself.
 *
 * @param {object} itemInfo - Gallery info record (see the itemInfo schema).
 * @returns {string} Relative path of the gallery's temp directory.
 */
function makeItemTempPath(itemInfo) {
    if (itemInfo.series === null) {
        itemInfo.series = "all";
    }
    var idSegment = itemInfo.id.toString();
    return path.join('./', targetDirName, "/temp", idSegment);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "hitomiapp" | |
, "version": "0.0.1" | |
, "private": true | |
, "dependencies": { | |
"cheerio": ">= 0.19.0" | |
, "request": ">= 2.59.0" | |
, "fs-extra": ">= 0.22.1" | |
, "async": ">= 1.4.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment