Skip to content

Instantly share code, notes, and snippets.

@dk00
Last active June 15, 2020 12:35
Show Gist options
  • Save dk00/b6b540c79ffb5d4e1586298d627d7fc5 to your computer and use it in GitHub Desktop.
Save dk00/b6b540c79ffb5d4e1586298d627d7fc5 to your computer and use it in GitHub Desktop.
Crawler for findchildcare.taipei
node_modules
sitters.dat

Usage

Crawl baby sitter list:

yarn
yarn start

Export CSV:

yarn export-csv
const fetch = require('node-fetch')
const delay = t => new Promise(resolve => setTimeout(resolve, t))
const queued = (fn, {concurrent, wait}) => {
const state = {
currentJob: Promise.resolve()
}
return (...args) => {
state.currentJob = state.currentJob
.then(() => delay(wait))
.then(() => fn(...args))
return state.currentJob
}
}
const queueFetch = queued(fetch, {
concurrent: 1,
wait: 500
})
const {parseList, parseDetail} = require('./parse')
const listUrl = 'https://findchildcare.taipei/baby_sitter/search?members_service_zip=&search_keyword=&location_distance=-1&search_location=&search_service_type=1&take_care_cat=-1&take_care_type=-1&photo_option=-1&is_cooperation=-1&only_have_childs_cnt=-1&only_can_reservation=-1&start_num=10'
const detailUrl = 'https://findchildcare.taipei/baby_sitter/info/14070'
const fetchList = ({ page = 1 }) => {
const url = new URL(
'https://findchildcare.taipei/baby_sitter/search' +
'?members_service_zip=&search_keyword=&location_distance=-1&' +
'search_location=&search_service_type=1&take_care_cat=-1&' +
'take_care_type=-1&photo_option=-1&is_cooperation=-1&' +
`only_have_childs_cnt=-1&only_can_reservation=-1&start_num=${(page - 1)*10}`
)
return queueFetch(url).then(it => it.text())
}
const fetchDetail = id => {
const url = `https://findchildcare.taipei/baby_sitter/info/${id.replace(/^0*/, '')}`
return queueFetch(url).then(it => it.text())
}
const syncDetail = (db, item) =>
fetchDetail(item.id).then(detail => {
console.log(item.id)
return new Promise(resolve =>
db.update(
{id: item.id},
{$set: {...item, ...parseDetail(detail)}},
{upsert: true},
resolve
)
)
})
const getDB = () => {
const Datastore = require('nedb')
const db = new Datastore({ filename: 'sitters.dat' })
return new Promise(resolve => db.loadDatabase(resolve)).then(() => db)
}
const fetchAll = ({start=1, length=390}={}) => {
getDB().then(db =>
Array.from({length: length - start}).reduce((last, _, i) =>
last.then(() => fetchList({ page: i + start }))
.then(content => {
console.log('page', i + start)
Promise.all(parseList(content).map(item => {
syncDetail(db, item)
}))
}),
Promise.resolve()
)
)
}
const main = () => {
fetchAll()
}
main()
const getDB = () => {
const Datastore = require('nedb')
const db = new Datastore({ filename: 'sitters.dat' })
return new Promise(resolve => db.loadDatabase(resolve)).then(() => db)
}
const exportData = query => getDB().then(db => {
return new Promise((resolve, reject) => {
db.find(query).limit(2000).exec((err, docs) => {
if (err) reject(err)
else resolve(docs)
})
})
})
module.exports = {getDB, exportData}
const {exportData} = require('./dataStore')
const main = () => {
const Papa = require('papaparse')
const query = {
location: {$regex: /南港/},
}
exportData(query).then(docs => {
console.log(docs)
console.log(Papa.unparse(docs))
})
.catch(e => {
console.error(e)
})
}
main()
{
"name": "data",
"version": "1.0.0",
"license": "MIT",
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"nedb": "^1.8.0",
"node-fetch": "^2.6.0",
"papaparse": "^5.1.1"
},
"scripts": {
"start": "node crawl.js",
"export-csv": "node exportCSV.js"
}
}
const cheerio = require('cheerio')
const parseList = text => {
const $ = cheerio.load(text)
return $('.list-imf-box')
.get()
.map(element => $(element))
.map(row => {
const content = row
.find('.imf-content')
.text()
.replace(/(在|到)宅:/g, '\n')
.replace(/\/月/, '/月\n')
.replace(/\/請洽詢保母/, '/請洽詢保母\n')
.replace(/\t+/g, '\n')
.replace(/\n+/g, '\n')
.trim()
const items = Object.assign({}, ...content
.split('\n')
.map(line => {
const [label,, value] = line.split(/\s*(:|:)\s*/)
return {[label]: value}
}))
return {
id: items['系統編號'],
name: row.find('.babysitter-name').text().trim(),
location: items['托育地點'],
babys: parseInt(items['目前已收托數']) || 0,
beds: parseInt(items['預計總收托數']) || 0,
phone: items['保母電話'],
fare: items['日間托育'] || '?',
admin: items['中心名稱'],
certificate: `${items['登記資格']} ${items['保母証號']}`,
}
})
}
const parseDetail = text => {
const $ = cheerio.load(text)
const items = Object.assign({}, ...$('.personal-data-row > div > .row')
.get()
.map(element => $(element))
.map(row => {
const label = row.find('label').text().replace(':', '')
const text = row
.find('.col-sm-9')
.text()
.replace(/\t+/g, '\n')
.replace(/\n+/g, '\n')
.trim()
return {[label]: text}
}))
return {
gender: items['保母性別'],
age: items['保母年齡'],
religion: items['信仰宗教']
}
}
module.exports = {parseList, parseDetail}
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
# yarn lockfile v1
"@types/node@*":
version "13.5.0"
resolved "https://registry.yarnpkg.com/@types/node/-/node-13.5.0.tgz#4e498dbf355795a611a87ae5ef811a8660d42662"
integrity sha512-Onhn+z72D2O2Pb2ql2xukJ55rglumsVo1H6Fmyi8mlU9SvKdBk/pUSUAiBY/d9bAOF7VVWajX3sths/+g6ZiAQ==
[email protected]:
version "0.2.10"
resolved "https://registry.yarnpkg.com/async/-/async-0.2.10.tgz#b6bbe0b0674b9d719708ca38de8c237cb526c3d1"
integrity sha1-trvgsGdLnXGXCMo43owjfLUmw9E=
[email protected]:
version "0.2.5"
resolved "https://registry.yarnpkg.com/binary-search-tree/-/binary-search-tree-0.2.5.tgz#7dbb3b210fdca082450dad2334c304af39bdc784"
integrity sha1-fbs7IQ/coIJFDa0jNMMErzm9x4Q=
dependencies:
underscore "~1.4.4"
boolbase@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24=
cheerio@^1.0.0-rc.3:
version "1.0.0-rc.3"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.3.tgz#094636d425b2e9c0f4eb91a46c05630c9a1a8bf6"
integrity sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==
dependencies:
css-select "~1.2.0"
dom-serializer "~0.1.1"
entities "~1.1.1"
htmlparser2 "^3.9.1"
lodash "^4.15.0"
parse5 "^3.0.1"
css-select@~1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-1.2.0.tgz#2b3a110539c5355f1cd8d314623e870b121ec858"
integrity sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=
dependencies:
boolbase "~1.0.0"
css-what "2.1"
domutils "1.5.1"
nth-check "~1.0.1"
[email protected]:
version "2.1.3"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-2.1.3.tgz#a6d7604573365fe74686c3f311c56513d88285f2"
integrity sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==
dom-serializer@0:
version "0.2.2"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.2.2.tgz#1afb81f533717175d478655debc5e332d9f9bb51"
integrity sha512-2/xPb3ORsQ42nHYiSunXkDjPLBaEj/xTwUO4B7XCZQTRk7EBtTOPaygh10YAAh2OI1Qrp6NWfpAhzswj0ydt9g==
dependencies:
domelementtype "^2.0.1"
entities "^2.0.0"
dom-serializer@~0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.1.tgz#1ec4059e284babed36eec2941d4a970a189ce7c0"
integrity sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==
dependencies:
domelementtype "^1.3.0"
entities "^1.1.1"
domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1:
version "1.3.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==
domelementtype@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.1.tgz#1f8bdfe91f5a78063274e803b4bdcedf6e94f94d"
integrity sha512-5HOHUDsYZWV8FGWN0Njbr/Rn7f/eWSQi1v7+HsUVwXgn8nWWlL64zKDkS0n8ZmQ3mlWOMuXOnR+7Nx/5tMO5AQ==
domhandler@^2.3.0:
version "2.4.2"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-2.4.2.tgz#8805097e933d65e85546f726d60f5eb88b44f803"
integrity sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==
dependencies:
domelementtype "1"
[email protected]:
version "1.5.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
integrity sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=
dependencies:
dom-serializer "0"
domelementtype "1"
domutils@^1.5.1:
version "1.7.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.7.0.tgz#56ea341e834e06e6748af7a1cb25da67ea9f8c2a"
integrity sha512-Lgd2XcJ/NjEw+7tFvfKxOzCYKZsdct5lczQ2ZaQY8Djz7pfAD3Gbp8ySJWtreII/vDlMVmxwa6pHmdxIYgttDg==
dependencies:
dom-serializer "0"
domelementtype "1"
entities@^1.1.1, entities@~1.1.1:
version "1.1.2"
resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56"
integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==
entities@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.0.tgz#68d6084cab1b079767540d80e56a39b423e4abf4"
integrity sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==
htmlparser2@^3.9.1:
version "3.10.1"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-3.10.1.tgz#bd679dc3f59897b6a34bb10749c855bb53a9392f"
integrity sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==
dependencies:
domelementtype "^1.3.1"
domhandler "^2.3.0"
domutils "^1.5.1"
entities "^1.1.1"
inherits "^2.0.1"
readable-stream "^3.1.1"
immediate@~3.0.5:
version "3.0.6"
resolved "https://registry.yarnpkg.com/immediate/-/immediate-3.0.6.tgz#9db1dbd0faf8de6fbe0f5dd5e56bb606280de69b"
integrity sha1-nbHb0Pr43m++D13V5Wu2BigN5ps=
inherits@^2.0.1, inherits@^2.0.3:
version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
[email protected]:
version "3.1.1"
resolved "https://registry.yarnpkg.com/lie/-/lie-3.1.1.tgz#9a436b2cc7746ca59de7a41fa469b3efb76bd87e"
integrity sha1-mkNrLMd0bKWd56QfpGmz77dr2H4=
dependencies:
immediate "~3.0.5"
localforage@^1.3.0:
version "1.7.3"
resolved "https://registry.yarnpkg.com/localforage/-/localforage-1.7.3.tgz#0082b3ca9734679e1bd534995bdd3b24cf10f204"
integrity sha512-1TulyYfc4udS7ECSBT2vwJksWbkwwTX8BzeUIiq8Y07Riy7bDAAnxDaPU/tWyOVmQAcWJIEIFP9lPfBGqVoPgQ==
dependencies:
lie "3.1.1"
lodash@^4.15.0:
version "4.17.15"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.15.tgz#b447f6670a0455bbfeedd11392eff330ea097548"
integrity sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==
[email protected]:
version "0.0.8"
resolved "https://registry.yarnpkg.com/minimist/-/minimist-0.0.8.tgz#857fcabfc3397d2625b8228262e86aa7a011b05d"
integrity sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=
mkdirp@~0.5.1:
version "0.5.1"
resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.1.tgz#30057438eac6cf7f8c4767f38648d6697d75c903"
integrity sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=
dependencies:
minimist "0.0.8"
nedb@^1.8.0:
version "1.8.0"
resolved "https://registry.yarnpkg.com/nedb/-/nedb-1.8.0.tgz#0e3502cd82c004d5355a43c9e55577bd7bd91d88"
integrity sha1-DjUCzYLABNU1WkPJ5VV3vXvZHYg=
dependencies:
async "0.2.10"
binary-search-tree "0.2.5"
localforage "^1.3.0"
mkdirp "~0.5.1"
underscore "~1.4.4"
node-fetch@^2.6.0:
version "2.6.0"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.0.tgz#e633456386d4aa55863f676a7ab0daa8fdecb0fd"
integrity sha512-8dG4H5ujfvFiqDmVu9fQ5bOHUC15JMjMY/Zumv26oOvvVJjM67KF8koCWIabKQ1GJIa9r2mMZscBq/TbdOcmNA==
nth-check@~1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-1.0.2.tgz#b2bd295c37e3dd58a3bf0700376663ba4d9cf05c"
integrity sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==
dependencies:
boolbase "~1.0.0"
papaparse@^5.1.1:
version "5.1.1"
resolved "https://registry.yarnpkg.com/papaparse/-/papaparse-5.1.1.tgz#1da66a039f80e2db43a1226b0bf44106451e9a2d"
integrity sha512-KPkW4GNQxunmYTeJIjHFrvilcNuHBWrfgbyvmagEmfGOA4hnP1WIkPbv4yABhj1Nam3as4w+7MBiI27BntwqVg==
parse5@^3.0.1:
version "3.0.3"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-3.0.3.tgz#042f792ffdd36851551cf4e9e066b3874ab45b5c"
integrity sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==
dependencies:
"@types/node" "*"
readable-stream@^3.1.1:
version "3.5.0"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.5.0.tgz#465d70e6d1087f6162d079cd0b5db7fbebfd1606"
integrity sha512-gSz026xs2LfxBPudDuI41V1lka8cxg64E66SGe78zJlsUofOg/yqwezdIcdfwik6B4h8LFmWPA9ef9X3FiNFLA==
dependencies:
inherits "^2.0.3"
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
safe-buffer@~5.2.0:
version "5.2.0"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.0.tgz#b74daec49b1148f88c64b68d49b1e815c1f2f519"
integrity sha512-fZEwUGbVl7kouZs1jCdMLdt95hdIv0ZeHg6L7qPeciMZhZ+/gdesW4wgTARkrFWEpspjEATAzUGPG8N2jJiwbg==
string_decoder@^1.1.1:
version "1.3.0"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
dependencies:
safe-buffer "~5.2.0"
underscore@~1.4.4:
version "1.4.4"
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.4.4.tgz#61a6a32010622afa07963bf325203cf12239d604"
integrity sha1-YaajIBBiKvoHljvzJSA88SI51gQ=
util-deprecate@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment