# Stell sicher, dass du Node.js installiert hast
# Siehe https://nodejs.org/en/learn/getting-started/how-to-install-nodejs
# Installiere die npm-Abhängigkeiten
npm install
# Beispiel für die Ausführung des Skripts
LOGIN_USERNAME=foo LOGIN_PASSWORD=bar node index.mjs
# Die CSV-Datei wird im selben Verzeichnis erstellt, in dem das Skript liegt
###
###
# Weitere optionale Parameter
FROM="01.01.2024" TO="01.02.2024" # wenn nicht angegeben, liest das Skript die letzten 30 Tage bis zum aktuellen Datum aus
FILENAME="foo.csv" # falls du einen anderen Dateinamen willst; alternativ kannst du einfach die Zuweisung von `fileName` in `index.mjs` anpassen
DEBUG=true # falls irgendetwas nicht funktioniert, kannst du im Debug-Modus sehen, was der Crawler macht und wo er vielleicht hängen bleibt
# Zum Beispiel
DEBUG=true FROM="01.01.2024" TO="01.02.2024" LOGIN_USERNAME=foo LOGIN_PASSWORD=bar node index.mjs
Last active
May 7, 2024 11:36
-
-
Save julianburr/f253ea3536907f87af56c9252bc07269 to your computer and use it in GitHub Desktop.
IGNRW Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as fs from "node:fs"; | |
import * as url from "node:url"; | |
import * as path from "node:path"; | |
import puppeteer from "puppeteer"; | |
const __dirname = url.fileURLToPath(new URL(".", import.meta.url)); | |
/**
 * Entry point of the crawler.
 *
 * Logs in to the IG NRW portal, opens the report page, types the date range
 * into the two date fields and writes every row of the `#apliste` table to a
 * semicolon-separated CSV file in the directory of this script.
 *
 * Environment variables:
 * - `LOGIN_USERNAME`, `LOGIN_PASSWORD` (required): portal credentials.
 * - `FROM`, `TO` (optional): `DD.MM.YYYY` or `YYYY-MM-DD`. `TO` defaults to
 *   today's local calendar date, `FROM` to 30 days before `TO`.
 * - `FILENAME` (optional): output file name; defaults to
 *   `<fromYYYYMMDD>_<toYYYYMMDD>.csv`.
 * - `DEBUG` (optional): any value other than an empty string, `0` or `false`
 *   runs the browser with a visible window and leaves it open at the end.
 *
 * On failure the error is printed and the process exit code is set to 1.
 */
(async () => {
  const DAY_MS = 24 * 60 * 60 * 1000;

  // Parameters
  const USERNAME = process.env.LOGIN_USERNAME;
  const PASSWORD = process.env.LOGIN_PASSWORD;
  const FROM = process.env.FROM;
  const TO = process.env.TO;
  const FILENAME = process.env.FILENAME;
  // Environment variables are strings, so "false" and "0" must be treated as
  // "off" explicitly; a plain truthiness check would enable debug mode.
  const DEBUG = !["", "0", "false"].includes(
    (process.env.DEBUG ?? "").trim().toLowerCase()
  );

  // Fail before launching a browser instead of failing inside `page.type`.
  if (!USERNAME || !PASSWORD) {
    throw new Error("LOGIN_USERNAME and LOGIN_PASSWORD must both be set");
  }

  /**
   * Parses a `DD.MM.YYYY` or `YYYY-MM-DD` string into a Date at UTC midnight.
   *
   * `new Date(string)` is not used because parsing of non-ISO strings is
   * implementation-defined (V8 reads `01.02.2024` month-first) and produces
   * local time, which `toISOString()` may shift to the previous day.
   *
   * @param {string} name - Parameter name, used in error messages.
   * @param {string} value - Raw date string.
   * @returns {Date} The calendar date at 00:00 UTC.
   * @throws {Error} If the format is unknown or the date does not exist.
   */
  const parseDateParam = (name, value) => {
    const text = value.trim();
    const match =
      /^(?<day>\d{1,2})\.(?<month>\d{1,2})\.(?<year>\d{4})$/.exec(text) ??
      /^(?<year>\d{4})-(?<month>\d{1,2})-(?<day>\d{1,2})$/.exec(text);
    if (!match) {
      throw new Error(
        `${name} must be formatted as DD.MM.YYYY or YYYY-MM-DD, got "${value}"`
      );
    }
    const year = Number(match.groups.year);
    const month = Number(match.groups.month);
    const day = Number(match.groups.day);
    const date = new Date(Date.UTC(year, month - 1, day));
    // Date.UTC silently rolls over impossible dates (e.g. 31.02.), so verify
    // that the components survived the round trip.
    if (
      date.getUTCFullYear() !== year ||
      date.getUTCMonth() !== month - 1 ||
      date.getUTCDate() !== day
    ) {
      throw new Error(`${name} is not a valid calendar date: "${value}"`);
    }
    return date;
  };

  // Date and file name. All dates are kept at UTC midnight so that the
  // `toISOString()` split below yields exactly the intended calendar day.
  const now = new Date();
  const toDate = TO
    ? parseDateParam("TO", TO)
    : new Date(Date.UTC(now.getFullYear(), now.getMonth(), now.getDate()));
  const fromDate = FROM
    ? parseDateParam("FROM", FROM)
    : new Date(toDate.getTime() - 30 * DAY_MS);
  const [fromYear, fromMonth, fromDay] = fromDate.toISOString().split(/[-T]/);
  const [toYear, toMonth, toDay] = toDate.toISOString().split(/[-T]/);

  // Launch the browser and open a new blank page
  const browser = await puppeteer.launch({ headless: !DEBUG });
  try {
    const page = await browser.newPage();

    // Login
    await page.goto("https://www.ig.nrw.de/IGNRW-Internet/");
    await page.type('input[name="kennung"]', USERNAME);
    await page.type('input[name="kennwort"]', PASSWORD);
    await Promise.all([
      page.waitForNavigation(),
      page.click('input[type="submit"]'),
    ]);

    // Get data
    await page.goto(
      "https://www.ig.nrw.de/IGNRW-Internet/prepareAuswertungKHVerwalterMeldungen.action?ziel=h13m03u02"
    );
    await page.type("#txtfDatumStart", `${fromDay}.${fromMonth}.${fromYear}`);
    await page.type("#txtfDatumEnde", `${toDay}.${toMonth}.${toYear}`);

    // HACK: timeout needed because the page loads weirdly
    await page.waitForNetworkIdle({ idleTime: 4000 });

    // Find table rows and turn them into CSV lines. This callback is
    // serialized and executed inside the page, so it must not reference
    // anything from the surrounding Node.js scope.
    const csv = await page.evaluate(() => {
      // Quote fields that would otherwise break the semicolon-separated
      // layout; plain fields are emitted unchanged.
      const escapeField = (text) =>
        /[";\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;

      return [...document.querySelectorAll("#apliste tbody tr")]
        .map((row) => {
          const fields = [...row.querySelectorAll("td")].map((cell) =>
            // textContent rather than innerHTML: the CSV should hold the
            // visible text, not markup or HTML entities.
            escapeField(cell.textContent.trim())
          );
          return `${fields.join(";")}\n`;
        })
        .join("");
    });

    const fileName =
      FILENAME ||
      `${fromYear}${fromMonth}${fromDay}_${toYear}${toMonth}${toDay}.csv`;
    const filePath = path.resolve(__dirname, fileName);
    fs.writeFileSync(filePath, csv);
  } finally {
    // In debug mode the browser is deliberately left open for inspection.
    if (!DEBUG) {
      await browser.close();
    }
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "ignrw-crawler", | |
"version": "1.0.0", | |
"description": "", | |
"keywords": [], | |
"author": "", | |
"license": "ISC", | |
"dependencies": { | |
"puppeteer": "^22.7.1" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment