Skip to content

Instantly share code, notes, and snippets.

@koru1130
Last active January 8, 2023 19:54
Show Gist options
  • Save koru1130/615c4dd5649507dc6d2b893db8953ad0 to your computer and use it in GitHub Desktop.
Save koru1130/615c4dd5649507dc6d2b893db8953ad0 to your computer and use it in GitHub Desktop.
NTU NOL crawler
import https from "https";
import fs from "fs";
import { readdir } from 'node:fs/promises';
import pMap from 'p-map'
/**
 * Builds the NOL course-search URL for one page of results.
 *
 * Curried: `url(sem)(pageNumber)`. The page size is fixed at 150 records
 * (`page_cnt=150`), so page `pageNumber` starts at record `pageNumber * 150`.
 *
 * @param {string} sem - Value placed in the `current_sem` query parameter.
 * @returns {(pageNumber: number) => string} Function returning the page URL.
 */
const url = sem => pageNumber => {
  const startRecord = pageNumber * 150;
  return `https://nol.ntu.edu.tw/nol/coursesearch/search_for_02_dpt.php?alltime=yes&allproced=yes&selcode=-1&dptname=0&coursename=&teachername=&current_sem=${sem}&yearcode=0&op=&startrec=${startRecord}&week1=&week2=&week3=&week4=&week5=&week6=&proced0=&proced1=&proced2=&proced3=&proced4=&procedE=&proced5=&proced6=&proced7=&proced8=&proced9=&procedA=&procedB=&procedC=&procedD=&allsel=yes&selCode1=&selCode2=&selCode3=&page_cnt=150`;
};
/**
 * Downloads result page `n` of semester 111-1 (hard-coded) and stores the
 * response body in `raw/<n>.html`.
 *
 * The promise resolves once the file has been fully written. It rejects when
 * the request fails, when the server answers with a non-200 status, or when
 * reading the response / writing the file fails. On a mid-transfer failure
 * the partially written file is removed, so a later run (which skips pages
 * whose file already exists) will not mistake it for a complete download.
 *
 * @param {number} n - Zero-based page index.
 * @returns {Promise<void>}
 */
const getPage = n => new Promise((resolve, reject) => {
  console.log(`Page ${n} started...`);
  https.get(
    url('111-1')(n),
    // NOTE(review): certificate verification is disabled here; confirm this is
    // still required for the target host before keeping it.
    { rejectUnauthorized: false },
    res => {
      if (res.statusCode !== 200) {
        // Drain the body so the socket is released, then report the failure.
        res.resume();
        reject(new Error(`Page ${n} failed: HTTP ${res.statusCode}`));
        return;
      }
      const path = `raw/${n}.html`;
      const writeStream = fs.createWriteStream(path);
      const fail = err => {
        res.destroy();
        writeStream.destroy();
        // Best-effort cleanup; the original error is what the caller needs.
        fs.unlink(path, () => reject(err));
      };
      res.on("error", fail);
      writeStream.on("error", fail);
      writeStream.on("finish", () => {
        console.log(`Page ${n} completed!`);
        resolve();
      });
      res.pipe(writeStream);
    }
  ).on("error", reject);
});
/** Returns the array [0, 1, ..., n - 1]. */
const toN = n => Array.from(Array(n).keys());
// Entry point: download pages 0..99 into raw/, skipping pages whose file is
// already present, with at most two requests in flight at a time.
(async () => {
  try {
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync("raw", { recursive: true });
    const downloaded = new Set(await readdir('raw'));
    const toDownload = toN(100).filter(n => !downloaded.has(`${n}.html`));
    await pMap(toDownload, getPage, { concurrency: 2 });
  } catch (error) {
    // Report on stderr and signal failure to the caller of the script.
    console.error(error);
    process.exitCode = 1;
  }
})();
import { createRequire } from "module";
const require = createRequire(import.meta.url);
const courseDatas = require('./courses.json');
/**
 * Maps over the own enumerable entries of `obj`, keeping the keys.
 *
 * @param {object} obj - Source object.
 * @param {(value: any, key: string, index: number) => any} fn - Called once
 *   per entry, in `Object.entries` order.
 * @returns {object} New object with the same keys and the mapped values.
 */
const objectMap = (obj, fn) => {
  const pairs = [];
  let index = 0;
  for (const [key, value] of Object.entries(obj)) {
    pairs.push([key, fn(value, key, index)]);
    index += 1;
  }
  return Object.fromEntries(pairs);
};
// Reduce every course record to the four fields below and print the
// resulting object as a single line of JSON.
const result = objectMap(courseDatas, ({ 流水號, 課程名稱, times, 授課教師 }) => ({
  流水號,
  課程名稱,
  times,
  授課教師,
}));
console.log(JSON.stringify(result));
import { createRequire } from "module";
const require = createRequire(import.meta.url);
const courses = require('./temp2.json')
// One schedule entry: a weekday numeral (一..六), a comma-separated list of
// section codes (digits or A-D), and a parenthesised location,
// e.g. "一1,2(共101)".
const reTime = /([一二三四五六])([\dABCD]+(?:,[\dABCD]+)*)\((.+?)\)/g
// Week restriction such as "第1,3,5 週". Every number, including the first,
// may have several digits, so "第10,12週" is matched as well.
const reWeek = /第(\d+(?:,\d+)*)\s*週/
/**
 * Converts a weekday numeral to a zero-based index: 一 → 0, 二 → 1, ...,
 * 六 → 5.
 *
 * @param {string} x - One of 一, 二, 三, 四, 五, 六.
 * @returns {number|undefined} The index, or `undefined` (after logging an
 *   error) for any other input.
 */
const dayOfWeekToNumber = x => {
  const index = ['一', '二', '三', '四', '五', '六'].indexOf(x);
  if (index === -1) {
    console.error("WTF");
    return undefined;
  }
  return index;
};
/**
 * Converts a section code to a number. The letter codes A, B, C and D map to
 * 11, 12, 13 and 14; anything else is parsed as a base-10 integer.
 *
 * @param {string} x - Section code such as "3", "10" or "A".
 * @returns {number} The numeric section, or `NaN` if `x` is not a number.
 */
const parseTimeToInt = x => {
  const letterIndex = ['A', 'B', 'C', 'D'].indexOf(x);
  if (letterIndex !== -1) {
    return 11 + letterIndex;
  }
  // Explicit radix: never let a prefix such as "0x" change the base.
  return Number.parseInt(x, 10);
};
// Adapted from https://stackoverflow.com/a/47907583
/**
 * Splits `a` into runs of consecutive values: a new run starts wherever an
 * element is not exactly one greater than its predecessor.
 *
 * @param {number[]} a - Input sequence.
 * @returns {number[][]} The runs, in order; `[]` for empty input.
 */
function groupArray(a) {
  const runs = [];
  if (!a.length) {
    return runs;
  }
  let runStart = 0;
  let index = 1;
  while (index < a.length) {
    const continuesRun = a[index] === a[index - 1] + 1;
    if (!continuesRun) {
      runs.push(a.slice(runStart, index));
      runStart = index;
    }
    index += 1;
  }
  // Flush the run that is still open when the input ends.
  runs.push(a.slice(runStart, a.length));
  return runs;
}
/**
 * Parses a "時間教室" (time / classroom) cell.
 *
 * @param {string} str - Raw cell text.
 * @returns {{weeks: number[]|"ALL", times: Array<{dayOfWeek: string, sections: string[], location: string}>|"N/A"}}
 *   `weeks` is the list of week numbers matched by `reWeek`, or "ALL" when
 *   the text has no week restriction. `times` has one entry per `reTime`
 *   match, keeping the weekday numeral and the section codes as strings, or
 *   is "N/A" when nothing matched.
 */
const parse時間教室 = str => {
  const weekMatch = str.match(reWeek);
  const weeks = weekMatch
    ? weekMatch[1].split(',').map(week => Number.parseInt(week, 10))
    : "ALL";
  const timeMatches = Array.from(str.matchAll(reTime));
  const times = timeMatches.length !== 0
    ? timeMatches.map(([, dayOfWeek, sectionList, location]) => ({
        dayOfWeek,
        sections: sectionList.split(','),
        location,
      }))
    : "N/A";
  return { weeks, times };
};
// Extend every course with one flag per (weekday, section) pair. The flag key
// is the weekday numeral followed by the section code (e.g. "一3"); its value
// is the string "1" when the course occupies that slot and "0" otherwise.
const result = courses.map(course => {
  const { times } = parse時間教室(course.時間教室);
  const slots = {};
  for (const dayOfWeek of ['一', '二', '三', '四', '五', '六']) {
    for (const section of ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'A', 'B', 'C', 'D']) {
      slots[`${dayOfWeek}${section}`] = "0";
    }
  }
  // `times` is the string "N/A" when the cell contains no schedule entries.
  if (Array.isArray(times)) {
    for (const time of times) {
      for (const section of time.sections) {
        slots[`${time.dayOfWeek}${section}`] = "1";
      }
    }
  }
  return {
    ...course,
    ...slots,
  };
});
/**
 * Serialises an array of flat records as delimiter-separated text. The first
 * line holds the keys of the first record; each following line holds the
 * values of one record, in that record's own key order.
 *
 * NOTE: values are joined as-is. A value containing the delimiter or a
 * newline is not quoted or escaped.
 *
 * @param {object[]} arr - Records to serialise.
 * @param {string} [delimiter='='] - Field separator.
 * @returns {string} The serialised table, or '' when `arr` is empty.
 */
function convertToCSV(arr, delimiter = '=') {
  // Without a first record there is no header to derive.
  if (arr.length === 0) {
    return '';
  }
  const header = Object.keys(arr[0]);
  const rows = arr.map(record => Object.values(record));
  return [header, ...rows].map(cells => cells.join(delimiter)).join('\n');
}
// Serialise all rows and print the table on standard output.
const csv = convertToCSV(result);
console.log(csv);
{
"name": "nol-crawler",
"main": "index.js",
"author": "koru1130 <[email protected]>",
"type": "module",
"dependencies": {
"jsdom": "^20.0.0",
"p-map": "^5.5.0",
"rambda": "^7.2.1"
}
}
import fs from 'fs';
import { JSDOM } from 'jsdom'
import * as R from 'rambda'
// Indices of pages that yielded fewer rows than a full page; filled in while
// pages are parsed and reported afterwards.
const maybeMissing = [];
/**
 * Reads the saved page `raw/<n>.html` synchronously.
 *
 * @param {number} n - Page index.
 * @returns {string} File contents decoded as UTF-8.
 */
const readNthHTML = n => fs.readFileSync(`raw/${n}.html`, 'utf8');
/**
 * Parses every course row of saved page `n`.
 *
 * Logs the number of rows found and records `n` in `maybeMissing` when the
 * page holds fewer than 150 rows.
 *
 * @param {number} n - Page index.
 * @returns {object[]} One parsed record per table row after the first.
 */
const parseNthHTML = n => {
  const tableBody = getTable(readNthHTML(n));
  // The first child is skipped (presumably the header row).
  const [, ...dataRows] = Array.from(tableBody.children);
  const courses = dataRows.map(parseRow);
  console.log(`Page ${n}: ${courses.length} courses`);
  if (courses.length < 150) {
    maybeMissing.push(n);
  }
  return courses;
};
/**
 * Locates the `<tbody>` of the course table inside a result page.
 *
 * The table is addressed by its fixed position in the page layout, so the
 * selector has to be revisited if the page structure changes.
 *
 * @param {string} html - Full HTML source of a result page.
 * @returns {Element|null} The table body, or `null` when the selector does
 *   not match.
 */
const getTable = html => {
  const doc = (new JSDOM(html)).window.document;
  return doc.querySelector("body > table:nth-child(2) > tbody > tr > td > font > table:nth-child(4) > tbody");
};
// Field names of a course record, in table-column order (column 0 first).
const COURSE_COLUMNS = [
  '流水號',
  '授課對象',
  '課號',
  '班次',
  '課程名稱',
  '領域專長',
  '學分',
  '課程識別碼',
  '全半年',
  '必選修',
  '授課教師',
  '加選方式',
  '時間教室',
  '總人數',
  '選課限制條件',
  '備註',
];

/**
 * Converts one table row into a course record.
 *
 * Each of the first sixteen cells is read through `textContent`, trimmed and
 * stored under the matching name in `COURSE_COLUMNS`. The 時間教室 cell is
 * kept as raw text; it is not broken down into weeks or time slots here.
 *
 * @param {{children: ArrayLike<{textContent: string}>}} row - Table row.
 * @returns {Object<string, string>} Record keyed by column name.
 * @throws {TypeError} When the row has fewer than sixteen cells.
 */
const parseRow = row =>
  Object.fromEntries(
    COURSE_COLUMNS.map((name, n) => [name, row.children[n].textContent.trim()])
  );
// One schedule entry: a weekday numeral (一..六), a comma-separated list of
// section codes (digits or A-D), and a parenthesised location,
// e.g. "一1,2(共101)".
const reTime = /([一二三四五六])([\dABCD]+(?:,[\dABCD]+)*)\((.+?)\)/g
// Week restriction such as "第1,3,5 週". Every number, including the first,
// may have several digits, so "第10,12週" is matched as well.
const reWeek = /第(\d+(?:,\d+)*)\s*週/
/**
 * Converts a weekday numeral to a zero-based index: 一 → 0, 二 → 1, ...,
 * 六 → 5.
 *
 * @param {string} x - One of 一, 二, 三, 四, 五, 六.
 * @returns {number|undefined} The index, or `undefined` (after logging an
 *   error) for any other input.
 */
const dayOfWeekToNumber = x => {
  const index = ['一', '二', '三', '四', '五', '六'].indexOf(x);
  if (index === -1) {
    console.error("WTF");
    return undefined;
  }
  return index;
};
/**
 * Converts a section code to a number. The letter codes A, B, C and D map to
 * 11, 12, 13 and 14; anything else is parsed as a base-10 integer.
 *
 * @param {string} x - Section code such as "3", "10" or "A".
 * @returns {number} The numeric section, or `NaN` if `x` is not a number.
 */
const parseTimeToInt = x => {
  const letterIndex = ['A', 'B', 'C', 'D'].indexOf(x);
  if (letterIndex !== -1) {
    return 11 + letterIndex;
  }
  // Explicit radix: never let a prefix such as "0x" change the base.
  return Number.parseInt(x, 10);
};
// Adapted from https://stackoverflow.com/a/47907583
/**
 * Splits `a` into runs of consecutive values: a new run starts wherever an
 * element is not exactly one greater than its predecessor.
 *
 * @param {number[]} a - Input sequence.
 * @returns {number[][]} The runs, in order; `[]` for empty input.
 */
function groupArray(a) {
  const runs = [];
  if (!a.length) {
    return runs;
  }
  let runStart = 0;
  let index = 1;
  while (index < a.length) {
    const continuesRun = a[index] === a[index - 1] + 1;
    if (!continuesRun) {
      runs.push(a.slice(runStart, index));
      runStart = index;
    }
    index += 1;
  }
  // Flush the run that is still open when the input ends.
  runs.push(a.slice(runStart, a.length));
  return runs;
}
/**
 * Parses a "時間教室" (time / classroom) cell into week numbers and
 * contiguous time slots.
 *
 * Each `reTime` match contributes one slot per run of consecutive section
 * numbers (letter codes are converted by `parseTimeToInt` first), so
 * sections 3,4,6 on one day become two slots: 3-4 and 6-6.
 *
 * @param {string} str - Raw cell text.
 * @returns {{weeks: number[]|"ALL", times: Array<{dayOfWeek: number, startTime: number, endTime: number, duration: number, location: string}>|"N/A"}}
 *   `weeks` is "ALL" when the text has no week restriction; `times` is "N/A"
 *   when no schedule entry matched.
 */
const parse時間教室 = str => {
  const weekMatch = str.match(reWeek);
  const weeks = weekMatch
    ? weekMatch[1].split(',').map(week => Number.parseInt(week, 10))
    : "ALL";
  const timeMatches = Array.from(str.matchAll(reTime));
  const times = timeMatches.length !== 0
    ? timeMatches.flatMap(([, day, sectionList, location]) => {
        const dayOfWeek = dayOfWeekToNumber(day);
        const sections = sectionList.split(',').map(section => parseTimeToInt(section));
        return groupArray(sections).map(run => {
          const startTime = run[0];
          const endTime = run[run.length - 1];
          return {
            dayOfWeek,
            startTime,
            endTime,
            duration: endTime - startTime + 1,
            location,
          };
        });
      })
    : "N/A";
  return { weeks, times };
};
// console.log(parseRow(getTable(readNthHTML(1)).children[1]))
//console.log((getTable(readNthHTML(1)).children[1].children[0].innerHTML))
// console.log(JSON.stringify(parseNthHTML(10)))
/** Returns the array [0, 1, ..., n - 1]. */
const toN = n => Array.from(Array(n).keys());
/** Returns the distinct values of `arr`, in first-occurrence order. */
const unique = arr => Array.from(new Set(arr));
/**
 * Parses saved pages 0..n-1 and returns all course rows that have a
 * non-empty 流水號. Rows sharing the same 流水號 are kept as separate
 * entries; nothing is merged.
 *
 * After parsing, a warning is printed if any page was recorded in
 * `maybeMissing`.
 *
 * @param {number} n - Number of pages to parse.
 * @returns {object[]} Parsed course records.
 */
const parseAll = n => {
  const rows = toN(n).flatMap(page => parseNthHTML(page));
  const result = R.filter(row => row.流水號, rows);
  // An array is always truthy, so test its length to see whether any page
  // was actually flagged.
  if (maybeMissing.length > 0) {
    console.log(`WARNING: Page ${maybeMissing} may miss some courses.`);
    console.log(`Please check the pages and manually paste the full content into the page if missing.`);
  }
  return result;
};
//console.log(JSON.stringify(parseAll(99)))
// Parse pages 0..99 and store the combined records as JSON in temp2.json.
const result = parseAll(100);
const serialized = JSON.stringify(result);
fs.writeFileSync('./temp2.json', serialized);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment