Last active
January 8, 2023 19:54
-
-
Save koru1130/615c4dd5649507dc6d2b893db8953ad0 to your computer and use it in GitHub Desktop.
NTU NOL crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import https from "https"; | |
import fs from "fs"; | |
import { readdir } from 'node:fs/promises'; | |
import pMap from 'p-map' | |
/**
 * Curried builder for the NTU NOL course-search URL.
 *
 * @param {string} sem - Semester code sent as `current_sem` (the downloader passes '111-1').
 * @returns {(pageNumber: number) => string} Function mapping a zero-based page
 *   number to the URL of that page; each page holds 150 records.
 */
const url = sem =>
  pageNumber => {
    // The same page size drives both the record offset and `page_cnt`.
    const pageSize = 150;
    // `&current_sem=` was previously mangled into `¤t_sem=` (HTML entity
    // `&curren` decoded to "¤"), which glued the semester onto `teachername`.
    return `https://nol.ntu.edu.tw/nol/coursesearch/search_for_02_dpt.php?alltime=yes&allproced=yes&selcode=-1&dptname=0&coursename=&teachername=&current_sem=${sem}&yearcode=0&op=&startrec=${pageNumber * pageSize}&week1=&week2=&week3=&week4=&week5=&week6=&proced0=&proced1=&proced2=&proced3=&proced4=&procedE=&proced5=&proced6=&proced7=&proced8=&proced9=&procedA=&procedB=&procedC=&procedD=&allsel=yes&selCode1=&selCode2=&selCode3=&page_cnt=${pageSize}`;
  };
/**
 * Downloads result page `n` of the course search and stores it as raw/<n>.html.
 *
 * The body is streamed to raw/<n>.html.part and only renamed to its final name
 * once it has been written completely. The caller skips pages whose
 * raw/<n>.html already exists, so a half-written file must never carry the
 * final name.
 *
 * @param {number} n - Zero-based page number.
 * @returns {Promise<void>} Resolves when the page is saved; rejects on a
 *   request/response/file error or a non-200 HTTP status.
 */
const getPage =
  n => new Promise((resolve, reject) => {
    console.log(`Page ${n} started...`);
    const path = `raw/${n}.html`;
    const partialPath = `${path}.part`;
    https.get(
      url('111-1')(n),
      // NOTE(review): TLS certificate verification is disabled here, so the
      // download is open to man-in-the-middle tampering. Confirm whether the
      // server's certificate chain still requires this.
      {rejectUnauthorized: false},
      res => {
        if (res.statusCode !== 200) {
          // Drain the body so the socket is released, then fail loudly
          // instead of saving an error page as course data.
          res.resume();
          reject(new Error(`Page ${n}: unexpected HTTP status ${res.statusCode}`));
          return;
        }
        const writeStream = fs.createWriteStream(partialPath);
        let failed = false;
        const fail = err => {
          if (failed) return;
          failed = true;
          res.destroy();
          writeStream.destroy();
          // Best-effort cleanup of the partial file; the original error wins.
          fs.unlink(partialPath, () => reject(err));
        };
        res.on("error", fail);
        writeStream.on("error", fail);
        // 'close' fires after the file descriptor is released, both on
        // success and after a failure; only the success path renames.
        writeStream.on("close", () => {
          if (failed) return;
          fs.rename(partialPath, path, err => {
            if (err) {
              reject(err);
              return;
            }
            console.log(`Page ${n} completed!`);
            resolve();
          });
        });
        res.pipe(writeStream);
      }).on("error", err => reject(err));
  });
// Returns the array [0, 1, ..., n - 1].
const toN = n => Array.from(Array(n), (_unused, index) => index);
// Entry point: downloads pages 0-99 into raw/, skipping pages that are already
// present, with at most two requests in flight.
(async () => {
  try {
    // `recursive: true` makes this a no-op when the directory already exists,
    // avoiding the check-then-create race of existsSync + mkdirSync.
    fs.mkdirSync("raw", {recursive: true});
    const downloaded = new Set(await readdir('raw'));
    const toDownload = toN(100).filter(n => !downloaded.has(`${n}.html`));
    await pMap(toDownload, getPage, {concurrency: 2});
  } catch (error) {
    // Report on stderr and signal failure to the shell; previously the error
    // went to stdout and the process still exited with status 0.
    console.error(error);
    process.exitCode = 1;
  }
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { createRequire } from "module"; | |
const require = createRequire(import.meta.url); | |
const courseDatas = require('./courses.json'); | |
/**
 * Builds a new object with the same keys as `obj`, where each value is
 * `fn(value, key, index)`; `index` is the entry's position in Object.entries(obj).
 */
const objectMap = (obj, fn) => {
  const mappedEntries = [];
  let position = 0;
  for (const [key, value] of Object.entries(obj)) {
    mappedEntries.push([key, fn(value, key, position)]);
    position += 1;
  }
  return Object.fromEntries(mappedEntries);
};
// Keeps only the serial number, course name, times and instructor of a course.
const pickSummaryFields = ({ 流水號, 課程名稱, times, 授課教師 }) => ({
  流水號,
  課程名稱,
  times,
  授課教師,
})
// Reduce every course in courses.json to its summary fields and print the
// result as a single JSON document on stdout.
const result = objectMap(courseDatas, pickSummaryFields)
console.log(JSON.stringify(result))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { createRequire } from "module"; | |
const require = createRequire(import.meta.url); | |
const courses = require('./temp2.json') | |
// One meeting entry such as "二3,4(共101)":
//   group 1 = weekday character (一..六),
//   group 2 = comma-separated period codes (digits or A-D),
//   group 3 = the text between the parentheses.
const reTime = /([一二三四五六])([\dABCD]+(?:,[\dABCD]+)*)\((.+?)\)/g
// Week restriction such as "第1,3,5 週"; group 1 = comma-separated week numbers.
// The first number must be allowed more than one digit, otherwise a list that
// starts at week 10 or later (e.g. "第10,12週") is not matched at all.
const reWeek = /第(\d+(?:,\d+)*)\s*週/
/**
 * Converts a Chinese weekday numeral to a zero-based index: 一 is 0, 六 is 5.
 *
 * @param {string} x - One of '一', '二', '三', '四', '五', '六'.
 * @returns {number|undefined} The index, or undefined (after logging to
 *   stderr) when `x` is not a recognised weekday.
 */
const dayOfWeekToNumber = x => {
  // A second, unreachable `case '一'` (returning 1) used to follow '六';
  // it contradicted the mapping and has been removed.
  switch (x) {
    case '一':
      return 0
    case '二':
      return 1
    case '三':
      return 2
    case '四':
      return 3
    case '五':
      return 4
    case '六':
      return 5
    default:
      // Include the offending value so the log is actionable.
      console.error("WTF", x)
      return undefined
  }
}
/**
 * Converts a period code to a number: 'A'..'D' map to 11..14, anything else is
 * parsed as a decimal integer.
 *
 * @param {string} x - Period code.
 * @returns {number} The period number (NaN if `x` is not numeric).
 */
const parseTimeToInt = x => {
  switch (x) {
    case 'A':
      return 11
    case 'B':
      return 12
    case 'C':
      return 13
    case 'D':
      return 14
    default:
      // Explicit radix: without it a "0x"-prefixed string is read as hex.
      return Number.parseInt(x, 10)
  }
}
//https://stackoverflow.com/a/47907583
// Splits `a` into runs of consecutive values (each element equals the previous
// one plus 1), e.g. [1,2,3,5,6,8] -> [[1,2,3],[5,6],[8]].
function groupArray(a) {
  const runs = [];
  let runStart = 0;
  while (runStart < a.length) {
    // Extend the run for as long as the next value continues the sequence.
    let runEnd = runStart + 1;
    while (runEnd < a.length && a[runEnd] === a[runEnd - 1] + 1) {
      runEnd += 1;
    }
    runs.push(a.slice(runStart, runEnd));
    runStart = runEnd;
  }
  return runs;
}
/**
 * Parses a "time / classroom" cell into its week restriction and meeting list.
 *
 * @param {string} str - Raw cell text, e.g. "第1,3 週 二3,4(共101)".
 * @returns {{weeks: number[]|"ALL", times: Array<{dayOfWeek: string, sections: string[], location: string}>|"N/A"}}
 *   `weeks` is "ALL" when no week list matches `reWeek`; `times` is "N/A" when
 *   nothing matches `reTime`. `dayOfWeek` and `sections` keep their raw text form.
 */
const parse時間教室 = str => {
  const weekMatch = str.match(reWeek)
  const weeks = weekMatch
    ? weekMatch[1].split(',').map(week => Number.parseInt(week, 10))
    : "ALL"
  // Each regex match yields exactly one meeting object, so no flattening is needed.
  const meetings = Array.from(
    str.matchAll(reTime),
    ([, dayOfWeek, sectionList, location]) => ({
      dayOfWeek,
      sections: sectionList.split(','),
      location,
    })
  )
  return {
    weeks,
    times: meetings.length !== 0 ? meetings : "N/A",
  }
}
// Adds one "0"/"1" column per (weekday, period) slot to every course record:
// "1" when the course meets in that slot, "0" otherwise. Column keys are the
// weekday character followed by the period code, e.g. "二3" or "五A".
const result = courses.map(x => {
  const {times} = parse時間教室(x.時間教室)
  const slots = {}
  for (const dayOfWeek of ['一','二','三','四','五','六']) {
    for (const section of ['0','1','2','3','4','5','6','7','8','9','10','A','B','C','D']) {
      slots[dayOfWeek + section] = "0"
    }
  }
  if (times !== "N/A") {
    for (const {dayOfWeek, sections} of times) {
      for (const section of sections) {
        const key = dayOfWeek + section
        if (key in slots) {
          slots[key] = "1"
        } else {
          // An unrecognised period token would otherwise add an extra key to
          // this row only, shifting its values against the CSV header.
          // Report on stderr so stdout stays clean for the CSV output.
          console.error(`Unknown time slot "${key}" for course ${x.流水號}; skipped`)
        }
      }
    }
  }
  return {
    ...x,
    ...slots
  }
})
/**
 * Serialises an array of flat records into delimiter-separated text with a
 * header row taken from the keys of the first record.
 *
 * @param {Object[]} arr - Records to serialise.
 * @param {string} [delimiter='='] - Field separator. The default stays '=' for
 *   backward compatibility (presumably chosen because field values contain
 *   commas — verify against the data). Values are not escaped.
 * @returns {string} Header line plus one line per record, joined by '\n';
 *   an empty string when `arr` is empty.
 */
function convertToCSV(arr, delimiter = '=') {
  // Previously an empty input crashed on Object.keys(undefined).
  if (arr.length === 0) return ''
  const header = Object.keys(arr[0])
  // Read each row by header key so a record with missing or reordered keys
  // still lines up with the header (missing values become empty fields).
  const rows = arr.map(record => header.map(key => record[key]))
  return [header, ...rows]
    .map(cells => cells.join(delimiter))
    .join('\n')
}
// Serialise the augmented course records and print them on stdout.
const csv = convertToCSV(result);
console.log(csv);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "nol-crawler", | |
"main": "index.js", | |
"author": "koru1130 <[email protected]>", | |
"type": "module", | |
"dependencies": { | |
"jsdom": "^20.0.0", | |
"p-map": "^5.5.0", | |
"rambda": "^7.2.1" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fs from 'fs'; | |
import { JSDOM } from 'jsdom' | |
import * as R from 'rambda' | |
// Page numbers that yielded fewer than 150 courses; appended to while parsing
// and reported once all pages have been processed.
const maybeMissing = [];
// Returns the UTF-8 text of the saved result page raw/<n>.html.
const readNthHTML = n => fs.readFileSync(`raw/${n}.html`, { encoding: 'utf8' });
/**
 * Parses every course row of saved page `n`.
 *
 * Pages that yield fewer than 150 courses, including pages whose course table
 * cannot be located at all, are recorded in `maybeMissing`.
 *
 * @param {number} n - Zero-based page number.
 * @returns {Object[]} One parsed record per table row after the first row.
 */
const parseNthHTML = n => {
  const table = getTable(readNthHTML(n))
  if (!table) {
    // Previously a null table crashed with an opaque TypeError on `.children`.
    console.log(`WARNING: Page ${n}: course table not found.`)
  }
  // The first row of the table is skipped (the header row).
  const result = table ? Array.from(table.children).slice(1).map(parseRow) : []
  console.log(`Page ${n}: ${result.length} courses`)
  if (result.length < 150) {
    maybeMissing.push(n);
  }
  return result
}
/**
 * Locates the <tbody> of the course table inside a saved result page.
 *
 * @param {string} html - Full HTML text of one result page.
 * @returns {Element|null} The matched <tbody>, or null when the selector
 *   matches nothing.
 */
const getTable = html => {
  const doc = (new JSDOM(html)).window.document
  // NOTE(review): this selector is purely positional, so any change to the
  // page layout will make it return null or a different table.
  return doc.querySelector("body > table:nth-child(2) > tbody > tr > td > font > table:nth-child(4) > tbody")
}
/**
 * Converts one table row into a course record keyed by column name.
 *
 * @param {{children: ArrayLike<{textContent: string}>}} row - A table row
 *   whose cells follow the column order listed below.
 * @returns {Object<string, string>} Trimmed cell text per column; a column
 *   whose cell is absent maps to ''.
 */
const parseRow = row => {
  // Column names in cell order (index 0..15).
  const columnNames = [
    '流水號',
    '授課對象',
    '課號',
    '班次',
    '課程名稱',
    '領域專長',
    '學分',
    '課程識別碼',
    '全半年',
    '必選修',
    '授課教師',
    '加選方式',
    '時間教室',
    '總人數',
    '選課限制條件',
    '備註',
  ]
  // The time/classroom cell used to be parsed here as well, but the parsed
  // weeks/times were never stored in the record, so that call was dropped.
  return Object.fromEntries(
    columnNames.map((name, index) => [
      name,
      // Rows with fewer cells than columns no longer throw.
      row.children[index]?.textContent.trim() ?? '',
    ])
  )
}
// One meeting entry such as "二3,4(共101)":
//   group 1 = weekday character (一..六),
//   group 2 = comma-separated period codes (digits or A-D),
//   group 3 = the text between the parentheses.
const reTime = /([一二三四五六])([\dABCD]+(?:,[\dABCD]+)*)\((.+?)\)/g
// Week restriction such as "第1,3,5 週"; group 1 = comma-separated week numbers.
// The first number must be allowed more than one digit, otherwise a list that
// starts at week 10 or later (e.g. "第10,12週") is not matched at all.
const reWeek = /第(\d+(?:,\d+)*)\s*週/
/**
 * Converts a Chinese weekday numeral to a zero-based index: 一 is 0, 六 is 5.
 *
 * @param {string} x - One of '一', '二', '三', '四', '五', '六'.
 * @returns {number|undefined} The index, or undefined (after logging to
 *   stderr) when `x` is not a recognised weekday.
 */
const dayOfWeekToNumber = x => {
  // A second, unreachable `case '一'` (returning 1) used to follow '六';
  // it contradicted the mapping and has been removed.
  switch (x) {
    case '一':
      return 0
    case '二':
      return 1
    case '三':
      return 2
    case '四':
      return 3
    case '五':
      return 4
    case '六':
      return 5
    default:
      // Include the offending value so the log is actionable.
      console.error("WTF", x)
      return undefined
  }
}
/**
 * Converts a period code to a number: 'A'..'D' map to 11..14, anything else is
 * parsed as a decimal integer.
 *
 * @param {string} x - Period code.
 * @returns {number} The period number (NaN if `x` is not numeric).
 */
const parseTimeToInt = x => {
  switch (x) {
    case 'A':
      return 11
    case 'B':
      return 12
    case 'C':
      return 13
    case 'D':
      return 14
    default:
      // Explicit radix: without it a "0x"-prefixed string is read as hex.
      return Number.parseInt(x, 10)
  }
}
//https://stackoverflow.com/a/47907583
// Splits `a` into runs of consecutive values (each element equals the previous
// one plus 1), e.g. [1,2,3,5,6,8] -> [[1,2,3],[5,6],[8]].
function groupArray(a) {
  const runs = [];
  let runStart = 0;
  while (runStart < a.length) {
    // Extend the run for as long as the next value continues the sequence.
    let runEnd = runStart + 1;
    while (runEnd < a.length && a[runEnd] === a[runEnd - 1] + 1) {
      runEnd += 1;
    }
    runs.push(a.slice(runStart, runEnd));
    runStart = runEnd;
  }
  return runs;
}
/**
 * Parses a "time / classroom" cell into its week restriction and a list of
 * contiguous time blocks.
 *
 * @param {string} str - Raw cell text, e.g. "第1,3 週 二3,4,6(共101)".
 * @returns {{weeks: number[]|"ALL", times: Array<{dayOfWeek: number, startTime: number, endTime: number, duration: number, location: string}>|"N/A"}}
 *   `weeks` is "ALL" when no week list matches `reWeek`; `times` is "N/A" when
 *   nothing matches `reTime`. Each meeting entry is split into one block per
 *   run of consecutive periods; `duration` counts the periods in the block.
 */
const parse時間教室 = str => {
  const weekMatch = str.match(reWeek)
  const weeks = weekMatch
    ? weekMatch[1].split(',').map(week => Number.parseInt(week, 10))
    : "ALL"
  const blocks = Array.from(str.matchAll(reTime)).flatMap(
    ([, day, sectionList, location]) => {
      const periods = sectionList.split(',').map(section => parseTimeToInt(section))
      // Non-adjacent periods on the same day become separate blocks.
      return groupArray(periods).map(run => ({
        dayOfWeek: dayOfWeekToNumber(day),
        startTime: run[0],
        endTime: run[run.length - 1],
        duration: run[run.length - 1] - run[0] + 1,
        location,
      }))
    }
  )
  return {
    weeks,
    times: blocks.length !== 0 ? blocks : "N/A",
  }
}
// console.log(parseRow(getTable(readNthHTML(1)).children[1])) | |
//console.log((getTable(readNthHTML(1)).children[1].children[0].innerHTML)) | |
// console.log(JSON.stringify(parseNthHTML(10))) | |
// Returns the array [0, 1, ..., n - 1].
const toN = n => Array.from(Array(n), (_unused, index) => index);
// Returns the distinct values of `arr`, keeping first-occurrence order.
const unique = arr => Array.from(new Set(arr))
/**
 * Parses saved pages 0..n-1 and returns every course that has a serial number.
 *
 * @param {number} n - Number of pages to parse.
 * @returns {Object[]} Course records whose 流水號 (serial number) is non-empty.
 */
const parseAll = n => {
  const parsedRows = toN(n).flatMap(x => parseNthHTML(x))
  // Rows without a serial number are dropped.
  const result = parsedRows.filter(x => x.流水號)
  // An array is always truthy, so the old `if(maybeMissing)` printed this
  // warning on every run; only warn when pages were actually flagged.
  if (maybeMissing.length > 0) {
    console.log(`WARNING: Page ${maybeMissing} may miss some courses.`)
    console.log(`Please check the pages and manually paste the full content into the page if missing.`)
  }
  return result
}
//console.log(JSON.stringify(parseAll(99))) | |
// Parse the 100 saved pages and store the combined course list as JSON.
const result = parseAll(100);
const serializedCourses = JSON.stringify(result);
fs.writeFileSync('./temp2.json', serializedCourses);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment