Last active
February 13, 2025 21:20
-
-
Save yappo/3e2d77b3ddb9146eee7e49557151fa41 to your computer and use it in GitHub Desktop.
KWebScraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package jp.yappo.kwebscraper | |
import java.net.URI | |
import okhttp3.OkHttpClient | |
import okhttp3.Request | |
import org.jsoup.Jsoup | |
import org.jsoup.nodes.Element | |
/**
 * A small HTML scraping DSL modelled on Perl's Web::Scraper.
 *
 * The [scraperCode] block is stored at construction time and executed with this
 * instance as its receiver every time [scrape] has fetched and parsed a page.
 * Inside the block, call [process] to visit the elements matching a CSS selector.
 *
 * @param scraperCode the scraping rules to run against each fetched document.
 */
class KWebScraper(private val scraperCode: KWebScraper.() -> Unit) {
    private val ua = OkHttpClient()

    // Most recently parsed document; null until the first successful scrape().
    private var doc: Element? = null

    /**
     * Fetches [uri] over HTTP, parses the response body as HTML and runs the
     * scraper block against the parsed document.
     *
     * @param uri the page to download; it is also used as the document's base URI.
     * @throws java.io.IOException if the request fails or the server answers with
     *   a non-2xx status, so that an error page is never scraped by mistake.
     */
    fun scrape(uri: URI) {
        val request = Request.Builder().url(uri.toURL()).build()

        // Read the whole body inside use { } so the response is closed before any
        // user code runs; a missing body leaves the scraper untouched, as before.
        val html = ua.newCall(request).execute().use { response ->
            if (!response.isSuccessful) {
                throw java.io.IOException("GET $uri failed: HTTP ${response.code}")
            }
            response.body?.string()
        } ?: return

        // Supplying the base URI lets jsoup resolve relative links in the document.
        doc = Jsoup.parse(html, uri.toString())
        scraperCode()
    }

    /**
     * Runs [code] once for every element of the current document that matches
     * [selector]. The work is delegated to [KWebScraperProcessor].
     *
     * @param selector CSS selector evaluated against the parsed document.
     * @param code block invoked per matching element; its receiver is a processor
     *   rooted at that element, and its argument wraps the element.
     * @throws IllegalStateException if no document has been scraped yet.
     */
    fun process(selector: String, code: KWebScraperProcessor.(it: KWebScraperProcessor.KWebScraperElement) -> Unit) {
        val root = doc ?: error("Can't call directly: no document has been scraped yet.")
        KWebScraperProcessor(root).execute(selector, code)
    }
}
/**
 * Applies scraping blocks to the elements found beneath a root element.
 *
 * @param doc the element that selectors are evaluated against.
 */
class KWebScraperProcessor(private val doc: Element) {
    /**
     * Runs [code] for each element under this processor's root that matches
     * [selector]. Intended to be called from inside another scraping block to
     * descend into nested elements.
     */
    fun process(selector: String, code: KWebScraperProcessor.(it: KWebScraperElement) -> Unit) =
        execute(selector, code)

    /**
     * Selects the elements matching [selector] and invokes [code] once per match.
     */
    internal fun execute(selector: String, code: KWebScraperProcessor.(it: KWebScraperElement) -> Unit) {
        for (match in doc.select(selector)) {
            // Each match gets its own processor so nested process() calls are
            // evaluated relative to that match instead of the outer root.
            val scoped = KWebScraperProcessor(match)
            scoped.code(KWebScraperElement(match))
        }
    }

    /**
     * Read-only view of a matched element handed to scraping blocks.
     */
    class KWebScraperElement(private val element: Element) {
        /** Returns the wrapped element's `text()`. */
        fun text(): String = element.text()

        /** Returns the wrapped element's `html()`. */
        fun html(): String = element.html()

        /** Returns the wrapped element's value for the attribute [name]. */
        fun attr(name: String): String = element.attr(name)
    }
}
/**
 * DSL entry point: wraps [code] in a new [KWebScraper] and returns it.
 *
 * The block is not run here; it runs when the returned scraper's `scrape` is called.
 */
fun scraper(code: KWebScraper.() -> Unit): KWebScraper = KWebScraper(code)
// MIT License | |
// Reference project https://metacpan.org/pod/Web::Scraper |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.example.app | |
import jp.yappo.kwebscraper.scraper | |
import java.net.URI | |
/**
 * One scraped link.
 *
 * @property title the text of the link.
 * @property href the link's `href` attribute value.
 */
data class Entry(val title: String, val href: String)
/**
 * Demo: collects every link found inside the `main li` elements of the blog's
 * front page and prints each link's title and path.
 */
fun main() {
    val entries = mutableListOf<Entry>()

    scraper {
        // "main li" matches several elements, so this block repeats once per
        // match, much like forEach.
        process("main li") { _ ->
            process("a") { link ->
                entries += Entry(link.text(), link.attr("href"))
            }
        }
    }.scrape(URI("https://blog.yappo.jp/"))

    entries.forEach { println("title = ${it.title}, path = ${it.href}") }

    // Example output (verbatim):
    // title = meta.toml を実装, path = /entry/2025/01/29/metafile/
    // title = index.html と rss feed も生成できるようにした, path = /entry/2025/01/27/support-index/
    // title = とりあえずmd2blogかいた, path = /entry/2025/01/27/pologenize/
    // title = そろそろ再開します〜, path = /entry/2025/01/26/reborn/
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment