Last active
November 25, 2017 01:03
-
-
Save sykwer/33ea2492aaa8e1659d5559173e64ffb1 to your computer and use it in GitHub Desktop.
Airbnbのreviewデータをとってくる。使い方は下の方に。Airbnbのrooms#showページのhtml構造に深く依存しているコードなので2017/11/24現在以降正しく動く保証はありません。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const phantom = require("phantom")
const Papa = require("papaparse")
const fs = require("fs")
// input data
// URLs.csv is read from the current working directory. Each record is an
// array of cell strings; the first record is the header and is discarded.
// skipEmptyLines drops the blank record Papa Parse would otherwise emit for a
// trailing newline, which would later be fetched as ".../rooms/undefined".
const dataRows = Papa.parse(fs.readFileSync("URLs.csv", "utf8"), { skipEmptyLines: true }).data
dataRows.shift() // title row
// Total number of rooms to process; used only for the progress log.
const rowsCount = dataRows.length
// init model data
// modelData is the zero-filled template of per-month counters, keyed
// "YYYY/M" (month not zero-padded), in chronological insertion order. Its key
// order defines the month columns of the output CSV.
const modelData = {}
// The grid starts in 2008 and always covers at least 2008-2017; it extends to
// the current calendar year so that newer months have a counter instead of
// hitting a missing key (undefined + 1 === NaN).
const years = Array.from(
  { length: Math.max(2017, new Date().getFullYear()) - 2008 + 1 },
  function(v, i) { return 2008 + i }
)
const months = Array.from({ length: 12 }, function(v, i) { return 1 + i })
for (const year of years) {
  for (const month of months) {
    modelData[year + "/" + month] = 0
  }
}
// init output file
// The output file name is prefixed with the current epoch milliseconds.
const outputFileName = new Date().getTime() + "_fetch_airbnb.csv"
// Header row: fixed columns followed by one column per modelData key.
const content = "url,id,lat,lon," + Object.keys(modelData).join(",") + "\n"
// Written synchronously so the header is on disk before any data row is
// appended; an asynchronous write could complete late and truncate rows.
fs.writeFileSync(outputFileName, content)
console.log("Output file " + outputFileName + " created!")
// entry point
// Starts a PhantomJS instance and hands it to nextRow. The instance is kept in
// _ph so the error handler can shut it down.
let _ph
phantom.create().then(function(ph) {
  _ph = ph
  // Returned so that a rejection from nextRow reaches the catch below instead
  // of becoming an unhandled rejection.
  return nextRow(ph)
}).catch(function(e) {
  console.log(e)
  // _ph is still undefined when phantom.create() itself failed.
  if (_ph) {
    _ph.exit()
  }
})
// process one row
/**
 * Takes the next record off the shared dataRows queue, opens the matching
 * Airbnb room page, counts the review dates found on every review page into
 * per-month buckets, appends one CSV line to the output file and then
 * continues with the following record. When the queue is empty it shuts the
 * PhantomJS instance down and stops.
 *
 * Reads row[2] as the room id and row[3] / row[4] as latitude / longitude.
 *
 * @param {object} ph - PhantomJS instance obtained from phantom.create().
 * @returns {Promise<void>} Settles once every remaining row has been handled.
 */
async function nextRow(ph) {
  if (dataRows.length < 1) {
    console.log("Finished!")
    await ph.exit()
    // Stop here: without this return the code below would dereference an
    // undefined row.
    return
  }

  const row = dataRows.shift()
  const roomId = row[2]
  const lat = row[3]
  const lon = row[4]
  const url = "https://www.airbnb.jp/rooms/" + roomId
  console.log(rowsCount - dataRows.length + "/" + rowsCount + ": Fetching data from " + url)

  // Fresh copy of the zeroed counters so rows do not share state.
  const data = Object.assign({}, modelData)
  const page = await ph.createPage()
  try {
    const status = await page.open(url)
    console.log("Status: " + status)

    const pagesCount = await page.evaluate(checkPagesCount)
    for (let i = 1; i <= pagesCount; i++) {
      console.log("Scanning page: " + i + "/" + pagesCount)
      // scanAndClick returns the dates on the current review page and clicks
      // the last pagination control.
      const dates = await page.evaluate(scanAndClick)
      ;(dates || []).forEach(function(date) {
        if (Object.prototype.hasOwnProperty.call(data, date)) {
          data[date] += 1
        } else {
          // A key outside the model would turn into NaN and add a column the
          // header does not have, so it is reported and left out.
          console.log("Skipping date outside the model: " + date)
        }
      })
      // Pause after the click before scanning the page again.
      await waitWhile(2000)
    }
  } finally {
    // Release the page even when scanning failed, so pages do not pile up in
    // the PhantomJS process.
    await page.close()
  }

  // record data
  const content = [url, roomId, lat, lon].join(",") + "," + Object.values(data).join(",") + "\n"
  // Synchronous append keeps the lines in processing order and lets a write
  // error reject this function's promise.
  fs.appendFileSync(outputFileName, content)
  console.log("Add data to" + outputFileName)

  return nextRow(ph)
}
// functions | |
// | |
/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param {number} millisecond - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>}
 */
function waitWhile(millisecond) {
  return new Promise(function(resolve) {
    setTimeout(resolve, millisecond)
  })
}
/**
 * Reports how many review pages the current document has.
 *
 * Passed to page.evaluate, so it executes inside the loaded page and may only
 * use what that page's JavaScript engine understands; it is therefore written
 * with `var` and plain function syntax only.
 *
 * The count is read from the text of the first child of the second-to-last
 * child of the first ".list-unstyled" element.
 *
 * @returns {number} The parsed page count, or 1 when there is no
 *   ".list-unstyled" element or the text is not a number of at least 1.
 */
function checkPagesCount() {
  var list = document.getElementsByClassName("list-unstyled").item(0)
  if (!list) {
    return 1
  }
  var wrappers = list.childNodes
  // item() yields null for an out-of-range index (fewer than two children).
  var lastPageWrapper = wrappers.item(wrappers.length - 2)
  var label = lastPageWrapper && lastPageWrapper.childNodes.item(0)
  var count = label ? parseInt(label.innerText, 10) : NaN
  // NaN fails the comparison, so an unreadable label falls back to one page
  // instead of making the caller's loop run zero times.
  return count >= 1 ? count : 1
}
/**
 * Collects the review dates shown in the current document and then clicks the
 * last pagination control.
 *
 * Passed to page.evaluate, so it executes inside the loaded page and may only
 * use what that page's JavaScript engine understands; it is therefore written
 * with `var` and plain function syntax only.
 *
 * Dates are taken from elements with class "_150a3jym" whose text contains
 * "<digits>年<digits>月". Only the matched part is used, and both numbers are
 * parsed as integers, so surrounding text or a zero-padded month still yields
 * a "YYYY/M" string.
 *
 * @returns {string[]} One "YYYY/M" entry per matching element, in document
 *   order.
 */
function scanAndClick() {
  var DATE_PATTERN = /(\d+)年(\d+)月/
  var resultDates = []

  var nodes = document.getElementsByClassName("_150a3jym")
  Array.prototype.forEach.call(nodes, function(node) {
    var found = DATE_PATTERN.exec(String(node.innerText || ""))
    if (found) {
      resultDates.push(parseInt(found[1], 10) + "/" + parseInt(found[2], 10))
    }
  })

  // The click target is the first child of the last child of the first
  // ".list-unstyled" element; nothing is clicked when any of them is missing.
  var list = document.getElementsByClassName("list-unstyled").item(0)
  if (list) {
    var wrappers = list.childNodes
    var lastWrapper = wrappers.item(wrappers.length - 1)
    var button = lastWrapper && lastWrapper.childNodes.item(0)
    if (button && typeof button.click === "function") {
      button.click()
    }
  }

  return resultDates
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "fetch_airbnb_reviews",
  "version": "1.0.0",
  "main": "index.js",
  "license": "MIT",
  "dependencies": {
    "papaparse": "^4.3.6",
    "phantom": "^4.0.9"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nodeのversionは>=8を想定
Usage
入力CSVを URLs.csv という名前でprojectのrootディレクトリにおく。(A~E列のみでよい)
node ./index.js を実行する。(上のfetch_airbnb_reviews.jsをファイル名index.jsとしておく)
Output
review数を月ごとに集計する
