Skip to content

Instantly share code, notes, and snippets.

@yappo
Last active February 13, 2025 21:20
Show Gist options
  • Save yappo/3e2d77b3ddb9146eee7e49557151fa41 to your computer and use it in GitHub Desktop.
Save yappo/3e2d77b3ddb9146eee7e49557151fa41 to your computer and use it in GitHub Desktop.
KWebScraper
package jp.yappo.kwebscraper
import java.net.URI
import okhttp3.OkHttpClient
import okhttp3.Request
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
/**
 * Entry point of the scraping DSL.
 *
 * An instance wraps a block of [scraperCode] that is run with this scraper as its
 * receiver every time [scrape] has fetched and parsed a page. Inside that block,
 * [process] selects elements of the parsed page.
 *
 * @param scraperCode the DSL block executed against each fetched page.
 */
class KWebScraper(private val scraperCode: KWebScraper.() -> Unit) {
    private val ua = OkHttpClient()

    // Most recently parsed page; null until the first successful parse in scrape().
    private var doc: Element? = null

    /**
     * Fetches [uri] with a GET request, parses the response body as HTML and runs
     * the scraper block against it.
     *
     * If the response carries no body, nothing is parsed and the scraper block is
     * not run. The HTTP status code is not inspected, so error pages are parsed
     * like any other page.
     *
     * @param uri absolute URL of the page to fetch (converted with [URI.toURL]).
     */
    fun scrape(uri: URI) {
        val request = Request.Builder().url(uri.toURL()).build()
        ua.newCall(request).execute().use { response ->
            response.body?.let { body ->
                // Parse with the final URL (after any redirects) as base URI so that
                // relative links can be resolved, e.g. via attr("abs:href").
                doc = Jsoup.parse(body.string(), response.request.url.toString())
                scraperCode()
            }
        }
    }

    /**
     * Runs [code] once for every element of the current page matching [selector].
     * Delegates to [KWebScraperProcessor].
     *
     * @param selector CSS selector evaluated against the parsed page.
     * @param code block invoked per matching element; its receiver is a processor
     *   scoped to that element, so nested `process` calls select within it.
     * @throws IllegalStateException if no page has been parsed yet, i.e. this was
     *   called directly instead of from the scraper block run by [scrape].
     */
    fun process(selector: String, code: KWebScraperProcessor.(it: KWebScraperProcessor.KWebScraperElement) -> Unit) {
        val root = checkNotNull(doc) { "Can't call directly." }
        KWebScraperProcessor(root).execute(selector, code)
    }
}
/**
 * Applies scraper blocks to the elements found beneath [doc].
 *
 * @param doc the element that selectors are evaluated against.
 */
class KWebScraperProcessor(private val doc: Element) {
    /**
     * Runs [code] for each element under this processor's root that matches
     * [selector]. Public DSL entry point; forwards to [execute].
     */
    fun process(selector: String, code: KWebScraperProcessor.(it: KWebScraperElement) -> Unit) =
        execute(selector, code)

    /**
     * Selects every element matching [selector] and invokes [code] once per match.
     * Each invocation receives a new processor rooted at the matched element as
     * its receiver and a [KWebScraperElement] wrapping the same element as argument.
     */
    internal fun execute(selector: String, code: KWebScraperProcessor.(it: KWebScraperElement) -> Unit) {
        for (match in doc.select(selector)) {
            val scoped = KWebScraperProcessor(match)
            scoped.code(KWebScraperElement(match))
        }
    }

    /** Read-only view of a matched element, exposing its text, inner HTML and attributes. */
    class KWebScraperElement(private val element: Element) {
        /** Returns the result of `text()` on the wrapped element. */
        fun text(): String = element.text()

        /** Returns the result of `html()` on the wrapped element. */
        fun html(): String = element.html()

        /** Returns the result of `attr(name)` on the wrapped element. */
        fun attr(name: String): String = element.attr(name)
    }
}
/**
 * Builds a [KWebScraper] from the given DSL block.
 *
 * @param code block to run, with the scraper as receiver, on every scraped page.
 * @return a new scraper wrapping [code].
 */
fun scraper(code: KWebScraper.() -> Unit): KWebScraper = KWebScraper(code)
// MIT License
// Reference project https://metacpan.org/pod/Web::Scraper
package org.example.app
import jp.yappo.kwebscraper.scraper
import java.net.URI
/**
 * One link collected from the scraped page.
 *
 * @property title text of the link.
 * @property href value of the link's `href` attribute.
 */
data class Entry(
    val title: String,
    val href: String,
)
/**
 * Example: collects the text and `href` of every link inside `main li` elements
 * of https://blog.yappo.jp/ and prints one line per collected entry.
 */
fun main() {
    val entries = mutableListOf<Entry>()

    val myScraper = scraper {
        // "main li" matches several elements, so this block runs once per match,
        // much like forEach.
        process("main li") {
            process("a") { link ->
                entries.add(Entry(link.text(), link.attr("href")))
            }
        }
    }
    myScraper.scrape(URI("https://blog.yappo.jp/"))

    for (entry in entries) {
        println("title = ${entry.title}, path = ${entry.href}")
    }
    // Example output (titles are printed exactly as they appear on the site):
    // title = meta.toml を実装, path = /entry/2025/01/29/metafile/
    // title = index.html と rss feed も生成できるようにした, path = /entry/2025/01/27/support-index/
    // title = とりあえずmd2blogかいた, path = /entry/2025/01/27/pologenize/
    // title = そろそろ再開します〜, path = /entry/2025/01/26/reborn/
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment