Skip to content

Instantly share code, notes, and snippets.

@GibsonRuitiari
Created December 8, 2023 20:26
Show Gist options
  • Save GibsonRuitiari/cb7d780616697573887e0fb85a6494e4 to your computer and use it in GitHub Desktop.
Save GibsonRuitiari/cb7d780616697573887e0fb85a6494e4 to your computer and use it in GitHub Desktop.
regex html parsing
import com.jakewharton.picnic.BorderStyle
import com.jakewharton.picnic.TextAlignment
import com.jakewharton.picnic.table
import java.net.URL
import kotlin.time.measureTime
/**
 * Entry point: downloads two League of Comic Geeks pages, then prints the
 * featured-publisher, community/indie-pick and latest-issue tables and
 * finally the time spent producing them.
 *
 * Both downloads happen before the timed section, so the reported duration
 * covers only regex matching and table printing.
 *
 * @param args command-line arguments; not read.
 */
fun main(args: Array<String>) {
    // Reads the whole response body for the given address as text; `use` closes the reader.
    val download: (String) -> String = { address ->
        URL(address).openStream().bufferedReader().use { reader -> reader.readText() }
    }

    val comicsPageHtml = download("https://leagueofcomicgeeks.com/comics")
    // The new-comics page lists the latest issues together with their publishers.
    val newComicsPageHtml = download("https://leagueofcomicgeeks.com/comics/new-comics/")

    val elapsed = measureTime {
        getFeaturedComicPublishers(comicsPageHtml)
        println(" ")
        getCommunityAndIndiePicks(comicsPageHtml)
        println(" ")
        latestIssues(newComicsPageHtml)
    }

    // Author's note: parsing both pages was observed to take under roughly 500 ms.
    println("time taken to parse in ms ${elapsed.inWholeMilliseconds}")
}
// based on publisher
/**
 * Prints a table of the latest comic issues (name, publisher, release date)
 * extracted from [html] with three regular expressions, followed by a footer
 * showing how many issues were actually matched.
 *
 * The three match streams are combined positionally with [zipIterator], so the
 * table stops at the shortest stream.
 * NOTE(review): this assumes the n-th match of every regex belongs to the same
 * issue; confirm against the page markup.
 *
 * @param html markup to scan; `main` passes the "new-comics" page.
 */
fun latestIssues(html: String) {
    // Cover block. Find a <div class="cover">, skip the whitespace after it, then inside the
    // following <a> tag capture:
    //   group 1: the href, provided it starts with "/comic/"
    //   group 2: the img data-src (thumbnail URL). The lazy match runs up to the whitespace
    //            before `alt`, so the value still carries its closing quote.
    //   group 3: the img alt text, i.e. the comic name
    //
    // Example of the fragment being matched:
    //   <div class="cover">
    //   <a href="/comic/1711767/birds-of-prey-4">
    //   <img class="lazy" src="data:image/gif;base64"
    //   data-src="https://s3.amazonaws.com/comicgeeks/comics/covers/medium-1711767.jpg?1701803920"
    //   alt="birds of prey #4">
    //   </a>
    //   </div>
    val coverMatches = "<div\\s+class=\"cover\">\\s+.*?<a.*?href=\"(/comic/.*?)\">\\s+.*?data-src=\"(.*?)\\s+alt=\"(.*?)\">"
        .toRegex()
        .findAll(html).asIterable()

    // Publisher. Find a div whose class starts with "publisher" (extra classes are allowed),
    // skip the whitespace after the opening tag and capture the text that follows (group 1,
    // which neither starts nor ends with a quote). Everything is lazy, so the match stops at
    // the next '<' and the closing tag is not consumed. Example:
    //   <div class="publisher color-offset">
    //   DC Comics </div>
    val publisherMatches = "div\\s+class=\"publisher\\s*?.*?>\\s+([^\"].*?[^\"])\\s.*?<"
        .toRegex()
        .findAll(html).asIterable()

    // Release date. Capture the text of <span class="date" ...> up to the next '<' (group 1).
    val releaseDateMatches = "<span\\s+class=\"date\"\\s*.*?>\\s*(.*?)<".toRegex()
        .findAll(html).asIterable()

    // Materialise the rows once so the footer can report the real number of issues instead of
    // a hard-coded figure. Only the comic name (group 3 of the cover regex) is displayed; the
    // link and thumbnail groups are available on the match but not shown.
    val issueRows = zipIterator(coverMatches, publisherMatches, releaseDateMatches)
        .asSequence()
        .map { (coverMatch, publisherMatch, releaseDateMatch) ->
            Triple(
                coverMatch.groupValues.getOrNull(3),
                publisherMatch.groupValues.getOrNull(1),
                releaseDateMatch.groupValues.getOrNull(1)
            )
        }
        .toList()

    val latestComicsTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("name", "publisher", "release-year")
        body {
            issueRows.forEach { (comicName, publisher, publishDate) ->
                row("$comicName", "$publisher", "$publishDate")
            }
        }
        footer {
            cellStyle {
                border = true
            }
            row {
                cell("total-issues") {
                    alignment = TextAlignment.BottomCenter
                }
                // Number of rows actually rendered above.
                cell("${issueRows.size}") {
                    alignment = TextAlignment.BottomCenter
                }
            }
        }
    }
    println("Latest comics")
    println(latestComicsTable)
}
/** One element from each of three sources, as produced by the three-argument [zipIterator]. */
data class IteratorContainer<A, B, C>(val param1: A, val param2: B, val param3: C)

/** One element from each of two sources, as produced by the two-argument [zipIterator]. */
data class BiIteratorContainer<A, B>(val param1: A, val param2: B)

/**
 * Lazily combines three iterables element by element.
 *
 * The standard library's `Iterable.zip` eagerly walks its inputs and returns a new `List`,
 * so chaining it to combine three sources builds an intermediate list per call. This
 * function instead returns an [Iterator] that pulls one element from each source only when
 * `next()` is called and allocates nothing besides the returned [IteratorContainer].
 *
 * Iteration ends as soon as any source is exhausted. Calling `next()` after that throws
 * [NoSuchElementException] without advancing any of the sources.
 *
 * @param firstIterable source of [IteratorContainer.param1]
 * @param secondIterable source of [IteratorContainer.param2]
 * @param thirdIterable source of [IteratorContainer.param3]
 * @return a single-use iterator over the combined elements
 */
fun <A, B, C> zipIterator(
    firstIterable: Iterable<A>,
    secondIterable: Iterable<B>,
    thirdIterable: Iterable<C>
): Iterator<IteratorContainer<A, B, C>> {
    val firstIterator = firstIterable.iterator()
    val secondIterator = secondIterable.iterator()
    val thirdIterator = thirdIterable.iterator()
    return object : Iterator<IteratorContainer<A, B, C>> {
        override fun hasNext(): Boolean =
            firstIterator.hasNext() && secondIterator.hasNext() && thirdIterator.hasNext()

        override fun next(): IteratorContainer<A, B, C> {
            // Check first so a shorter source cannot make us consume (and lose) elements
            // from the earlier sources before the failure surfaces.
            if (!hasNext()) throw NoSuchElementException("At least one zipped source is exhausted")
            return IteratorContainer(firstIterator.next(), secondIterator.next(), thirdIterator.next())
        }
    }
}

/**
 * Lazily combines two iterables element by element; the two-source counterpart of the
 * three-argument overload.
 *
 * Iteration ends as soon as either source is exhausted. Calling `next()` after that throws
 * [NoSuchElementException] without advancing either source.
 *
 * @param firstIterable source of [BiIteratorContainer.param1]
 * @param secondIterable source of [BiIteratorContainer.param2]
 * @return a single-use iterator over the combined elements
 */
fun <A, B> zipIterator(
    firstIterable: Iterable<A>,
    secondIterable: Iterable<B>
): Iterator<BiIteratorContainer<A, B>> {
    val firstIterator = firstIterable.iterator()
    val secondIterator = secondIterable.iterator()
    return object : Iterator<BiIteratorContainer<A, B>> {
        override fun hasNext(): Boolean = firstIterator.hasNext() && secondIterator.hasNext()

        override fun next(): BiIteratorContainer<A, B> {
            // Same guard as the three-source overload: fail before touching either source.
            if (!hasNext()) throw NoSuchElementException("At least one zipped source is exhausted")
            return BiIteratorContainer(firstIterator.next(), secondIterator.next())
        }
    }
}
/**
 * Prints a two-column table (publisher name, publisher link) built from every
 * featured-publisher match found in [html].
 *
 * @param html markup to scan; `main` passes the comics landing page.
 */
fun getFeaturedComicPublishers(html: String) {
    // One regex with two capture groups:
    //   group 1: href starting with "/comics/" (the publisher link)
    //   group 2: the data-title attribute (the publisher name)
    val publisherPattern =
        Regex("div\\s*.*?=\"col-xxxxl-2.*?\">\\s+.*?href=\"(/comics/.*?[^\"])\".*?data-title=\"(.*?)\"")
    val publisherMatches = publisherPattern.findAll(html)

    val publishersTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("publisher name", " publisher link")
        body {
            for (publisherMatch in publisherMatches.asIterable()) {
                val captured = publisherMatch.groupValues
                val publisherName = captured.getOrNull(2)?.trim()
                val publisherLink = captured.getOrNull(1)?.trim()
                // Name first, link second, matching the header row above.
                row("$publisherName", "$publisherLink")
            }
        }
    }

    println("featured comic publishers")
    println()
    println(publishersTable)
}
/**
 * Prints a two-column table (comic name, publisher) for the community and
 * indie picks found in [html].
 *
 * Comic details and publishers come from two separate regexes whose matches
 * are paired positionally with [zipIterator].
 *
 * @param html markup to scan; `main` passes the comics landing page.
 */
fun getCommunityAndIndiePicks(html: String) {
    // Publisher text: group 1 is whatever follows the "copy-really-small" element up to a '<'.
    val publisherMatches =
        Regex("div.*=\"pl-3\".?\\s*.*?=\"copy-really-small.*\".*?\\s+(.+)<")
            .findAll(html)
            .asIterable()

    // Card details: group 1 = data-src (thumbnail), group 2 = alt (issue name),
    // group 3 = href of the form /comic/<digits>/..., group 4 = the text before the next '<'.
    val cardMatches =
        Regex("<div.+=\"card.*\".*\\s+.*data-src=\"(.*?)\"\\s+alt=\"([^\"].*?[^\"])\".*?>\\s+.*href=\"(/comic/\\d+/.*)\"\\s+.+\">(.+)<")
            .findAll(html)
            .asIterable()

    val communityAndIndiePicksTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("name", "publisher")
        body {
            for (pairing in zipIterator(cardMatches, publisherMatches)) {
                val cardMatch = pairing.param1
                val publisherMatch = pairing.param2
                val issueName = cardMatch.groupValues.getOrNull(2)
                val publisherName = publisherMatch.groupValues.getOrNull(1)?.trim()
                row("$issueName", "$publisherName")
            }
        }
    }

    println("Community and Indie Picks")
    println(communityAndIndiePicksTable)
    // Debug output kept for reference: dumps the raw captured groups of both regexes.
    // println(featuredComicsDetailsRegex.asIterable().joinToString(separator = ", ") {
    // val groupValues = it.groupValues
    // "thumbnailLink→ ${groupValues.getOrNull(1)} comicIssueName→ ${groupValues.getOrNull(2)} " +
    // "comicLink→ https://leagueofcomicgeeks.com/${groupValues.getOrNull(3)}"
    // })
    //
    // println(comicPublishersRegexSequence.asIterable().joinToString(", ") { matchResult ->
    // val groupValues = matchResult.groupValues
    // "publisher→ ${groupValues.getOrNull(1)?.trim()}"
    // })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment