Last active
March 15, 2017 04:24
-
-
Save akirattii/9051bb396d7141147810dd41ea58305e to your computer and use it in GitHub Desktop.
How to scrape a member page that requires login.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrapes a member-only page that sits behind a login form.
 *
 * Usage:
 *   $ node member-page-scraping.js <loginId> <password>
 */
// Every request made through this wrapper has cookie handling enabled.
const request = require("request").defaults({ jar: true });
const encoding = require('encoding-japanese');
const cheerio = require("cheerio");
const Step = require("step");
// NOTE(review): `config` is not referenced in the code visible here; the
// require is kept in case loading it is relied on elsewhere.
const config = require("config");

// Credentials are taken from the command line: <loginId> <password>.
const loginId = process.argv[2];
const password = process.argv[3];

let jar; // Cookie jar; assigned by login() and reused by memberPageRequest()
// NOTE(review): `uri` is never read or written in the code visible here.
let uri;

// TODO: Set page encoding
const ENCODE = "EUC-JP";
/**
 * Logs in with the command-line credentials, then fetches the member page
 * and prints its <body> HTML to stdout.
 *
 * The sequence runs as a Step chain:
 *   1. POST the login form once, storing cookies in a fresh jar;
 *   2. request the member page with that jar via memberPageRequest();
 *   3. log the resulting HTML.
 * An error at any stage is rethrown, so Step forwards it down the chain and
 * it finally surfaces as an uncaught exception.
 *
 * Reads the module-level `loginId` / `password` and (re)assigns the
 * module-level `jar`.
 *
 * @throws {Error} If `loginId` or `password` was not supplied.
 */
function login() {
  // Fail fast instead of POSTing an empty form when arguments are missing.
  if (!loginId || !password) {
    throw new Error("Usage: node member-page-scraping.js <loginId> <password>");
  }

  // Fresh jar per login; memberPageRequest() sends the cookies stored here.
  jar = request.jar();

  const options = {
    url: 'https://www.example',
    method: "POST",
    headers: {},
    json: true,
    form: {
      loginId,
      password
    },
    jar,
  };

  Step(
    function() {
      // Login. This is the only place the form is submitted; the credentials
      // must not be POSTed a second time by an outer request.
      request(options, this);
    },
    function(err, resp, body) {
      if (err) throw err;
      // Move member page
      memberPageRequest("https://mypage.example", this);
    },
    function(err, $body) {
      if (err) throw err;
      console.log("html:", $body.html());
    }
  ); // Step
}
/**
 * Fetches a page that requires a logged-in session and passes its parsed
 * <body> to the callback.
 *
 * The request carries the module-level cookie `jar`. The response is read
 * as raw bytes, converted from ENCODE to Unicode with encoding-japanese and
 * then parsed with cheerio.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {function(?Error, Object=)} [cb] - Node-style callback, invoked
 *   exactly once: with the error on failure (transport, decoding or parsing),
 *   or with `null` and the cheerio selection for "body" on success.
 */
function memberPageRequest(url, cb) {
  console.log("url:", url);

  const options = {
    url,
    method: "GET",
    headers: {
      // NOTE(review): this value looks like it was copied from an HTML
      // <meta> tag and is sent on a body-less GET; kept as-is, confirm
      // whether the target site actually needs it.
      "Content-Type": "content=text/html; charset=" + ENCODE
    },
    jar,
    // Raw Buffer body, so the bytes can be converted from ENCODE below.
    // `json: true` is deliberately not set: the page is HTML, and the body
    // must reach the converter untouched.
    encoding: null,
  };

  request(options, function(err, response, body) {
    if (err) {
      // Stop here: there is no body to decode, and the callback must not
      // be invoked a second time.
      if (cb) cb(err);
      return;
    }

    let $body;
    try {
      // converts EUC-JP to unicode
      const unicodeArr = encoding.convert(body, {
        from: ENCODE,
        to: 'UNICODE'
      });
      const html = encoding.codeToString(unicodeArr);
      $body = cheerio.load(html)("body");
    } catch (e) {
      if (cb) cb(e);
      return;
    }

    // Called outside the try block so an exception thrown by the callback
    // itself is not mistaken for a decoding failure.
    if (cb) cb(null, $body);
  });
}
// Entry point: run the login-then-scrape sequence.
login();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment