Created
December 8, 2023 20:26
-
-
Save GibsonRuitiari/cb7d780616697573887e0fb85a6494e4 to your computer and use it in GitHub Desktop.
regex html parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.jakewharton.picnic.BorderStyle | |
import com.jakewharton.picnic.TextAlignment | |
import com.jakewharton.picnic.table | |
import java.net.URL | |
import kotlin.time.measureTime | |
/**
 * Entry point: downloads two League of Comic Geeks pages, then parses and prints
 * the featured publishers, the community/indie picks and the latest issues.
 *
 * Only the parsing/printing phase is timed; both downloads finish before the
 * measurement starts.
 */
fun main(args: Array<String>) {
    // Reads the complete response body of the given address as text; the reader is closed afterwards.
    val fetchHtml: (String) -> String = { address ->
        URL(address).openStream().bufferedReader().use { reader -> reader.readText() }
    }

    val baseUrlHtml = fetchHtml("https://leagueofcomicgeeks.com/comics")
    // New issues, grouped by publisher.
    val publishersNewTitlesHtml = fetchHtml("https://leagueofcomicgeeks.com/comics/new-comics/")

    val timeTaken = measureTime {
        getFeaturedComicPublishers(baseUrlHtml)
        println(" ")
        getCommunityAndIndiePicks(baseUrlHtml)
        println(" ")
        latestIssues(publishersNewTitlesHtml)
    }

    // Author's observation: parsing both HTML documents took under 500 ms on their machine.
    println("time taken to parse in ms ${timeTaken.inWholeMilliseconds}")
}
// based on publisher | |
/**
 * Extracts the latest issues from the "new comics" page and prints them as a table.
 *
 * Three independent regex scans are run over [html] (cover/details, publisher, release date)
 * and their matches are paired up positionally with [zipIterator]: the n-th cover match is
 * shown with the n-th publisher match and the n-th date match. Pairing stops as soon as any
 * of the three scans runs out of matches.
 *
 * The footer reports the number of rows that were actually produced.
 *
 * @param html raw HTML of the page to scan.
 */
fun latestIssues(html: String) {
    // Cover block. Example of the markup being matched:
    //   <div class="cover">
    //     <a href="/comic/1711767/birds-of-prey-4">
    //       <img class="lazy" src="data:image/gif;base64"
    //            data-src="https://s3.amazonaws.com/comicgeeks/comics/covers/medium-1711767.jpg?1701803920"
    //            alt="birds of prey #4">
    //     </a>
    //   </div>
    // Group 1: href starting with "/comic/".
    // Group 2: data-src value (the capture runs up to the whitespace before alt, so it
    //          still carries the closing quote).
    // Group 3: alt text, i.e. the comic name.
    val latestTitlesDetails =
        "<div\\s+class=\"cover\">\\s+.*?<a.*?href=\"(/comic/.*?)\">\\s+.*?data-src=\"(.*?)\\s+alt=\"(.*?)\">"
            .toRegex()
            .findAll(html)
            .asIterable()

    // Publisher block. Example:
    //   <div class="publisher color-offset">
    //   DC Comics </div>
    // Group 1: the publisher text; the lazy quantifiers stop before the closing tag.
    val latestTitlesPublisherIterable =
        "div\\s+class=\"publisher\\s*?.*?>\\s+([^\"].*?[^\"])\\s.*?<"
            .toRegex()
            .findAll(html)
            .asIterable()

    // Release date. Group 1: text inside <span class="date" ...>.
    val latestTitlesReleaseDateIterable =
        "<span\\s+class=\"date\"\\s*.*?>\\s*(.*?)<"
            .toRegex()
            .findAll(html)
            .asIterable()

    // Materialise the rows before building the table so the footer can report the real
    // number of issues instead of a hard-coded figure.
    val issueRows = zipIterator(
        latestTitlesDetails,
        latestTitlesPublisherIterable,
        latestTitlesReleaseDateIterable
    )
        .asSequence()
        .map { (basicComicDetails, publisherDetails, releaseDateDetails) ->
            // groupValues[0] is the whole match; capture groups start at index 1.
            // A missing group is rendered as the text "null", as before.
            Triple(
                "${basicComicDetails.groupValues.getOrNull(3)}",
                "${publisherDetails.groupValues.getOrNull(1)}",
                "${releaseDateDetails.groupValues.getOrNull(1)}"
            )
        }
        .toList()

    val latestComicsTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("name", "publisher", "release-year")
        body {
            issueRows.forEach { (comicName, publisher, publishDate) ->
                row(comicName, publisher, publishDate)
            }
        }
        footer {
            cellStyle {
                border = true
            }
            row {
                cell("total-issues") {
                    alignment = TextAlignment.BottomCenter
                }
                cell("${issueRows.size}") {
                    alignment = TextAlignment.BottomCenter
                }
            }
        }
    }
    println("Latest comics")
    println(latestComicsTable)
}
/**
 * Groups one element from each of three sources; this is the element type
 * yielded by the three-argument [zipIterator].
 *
 * @property param1 element taken from the first source.
 * @property param2 element taken from the second source.
 * @property param3 element taken from the third source.
 */
data class IteratorContainer<A, B, C>(
    val param1: A,
    val param2: B,
    val param3: C
)
/**
 * Groups one element from each of two sources; this is the element type
 * yielded by the two-argument [zipIterator].
 *
 * @property param1 element taken from the first source.
 * @property param2 element taken from the second source.
 */
data class BiIteratorContainer<A, B>(
    val param1: A,
    val param2: B
)
/**
 * Walks three iterables in lock-step and yields their elements grouped in an
 * [IteratorContainer].
 *
 * Unlike chaining the standard library's `zip`, which eagerly builds a new list for every
 * call, this only wraps the three underlying iterators: no intermediate collection is
 * created and an element is pulled from each source only when [Iterator.next] is called
 * on the returned iterator (directly, or through something like `forEach`).
 *
 * Iteration ends as soon as any one of the sources is exhausted.
 *
 * @param firstIterable source of the first component.
 * @param secondIterable source of the second component.
 * @param thirdIterable source of the third component.
 * @return a single-pass iterator over the grouped elements.
 */
fun <A, B, C> zipIterator(
    firstIterable: Iterable<A>,
    secondIterable: Iterable<B>,
    thirdIterable: Iterable<C>
): Iterator<IteratorContainer<A, B, C>> {
    val sourceA = firstIterable.iterator()
    val sourceB = secondIterable.iterator()
    val sourceC = thirdIterable.iterator()

    return object : Iterator<IteratorContainer<A, B, C>> {
        // There is a next group only while every source still has an element.
        override fun hasNext(): Boolean =
            sourceA.hasNext() && sourceB.hasNext() && sourceC.hasNext()

        override fun next(): IteratorContainer<A, B, C> {
            // Sources are advanced in first, second, third order.
            val elementA = sourceA.next()
            val elementB = sourceB.next()
            val elementC = sourceC.next()
            return IteratorContainer(elementA, elementB, elementC)
        }
    }
}
/**
 * Walks two iterables in lock-step and yields their elements grouped in a
 * [BiIteratorContainer].
 *
 * Only the two underlying iterators are wrapped; no intermediate collection is built and
 * elements are pulled from the sources only when [Iterator.next] is called on the result.
 * Iteration ends as soon as either source is exhausted.
 *
 * @param firstIterable source of the first component.
 * @param secondIterable source of the second component.
 * @return a single-pass iterator over the grouped elements.
 */
fun <A, B> zipIterator(
    firstIterable: Iterable<A>,
    secondIterable: Iterable<B>
): Iterator<BiIteratorContainer<A, B>> {
    val sourceA = firstIterable.iterator()
    val sourceB = secondIterable.iterator()

    return object : Iterator<BiIteratorContainer<A, B>> {
        // There is a next pair only while both sources still have an element.
        override fun hasNext(): Boolean = sourceA.hasNext() && sourceB.hasNext()

        override fun next(): BiIteratorContainer<A, B> {
            // The first source is advanced before the second.
            val elementA = sourceA.next()
            val elementB = sourceB.next()
            return BiIteratorContainer(elementA, elementB)
        }
    }
}
/**
 * Scans [html] for featured-publisher entries and prints them as a two-column table
 * (publisher name, publisher link).
 *
 * @param html raw HTML of the page to scan.
 */
fun getFeaturedComicPublishers(html: String) {
    // One regex, two capture groups:
    //   group 1 - the href value, which must start with "/comics/"
    //   group 2 - the data-title value, used as the publisher name
    val publisherPattern =
        Regex("div\\s*.*?=\"col-xxxxl-2.*?\">\\s+.*?href=\"(/comics/.*?[^\"])\".*?data-title=\"(.*?)\"")
    val publisherMatches = publisherPattern.findAll(html)

    val publishersTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("publisher name", " publisher link")
        body {
            for (match in publisherMatches) {
                val captured = match.groupValues
                val publisherName = captured.getOrNull(2)?.trim()
                val publisherLink = captured.getOrNull(1)?.trim()
                // A missing group is rendered as the text "null".
                row("$publisherName", "$publisherLink")
            }
        }
    }

    println("featured comic publishers")
    println()
    println(publishersTable)
}
/**
 * Scans [html] for the community/indie pick cards and prints a two-column table
 * (comic name, publisher).
 *
 * Comic details and publishers are found by two separate regex scans and paired up
 * positionally with [zipIterator]; pairing stops when either scan runs out of matches.
 *
 * @param html raw HTML of the page to scan.
 */
fun getCommunityAndIndiePicks(html: String) {
    // Group 1: publisher text (trimmed before display).
    val publisherMatches =
        Regex("div.*=\"pl-3\".?\\s*.*?=\"copy-really-small.*\".*?\\s+(.+)<")
            .findAll(html)
            .asIterable()

    // Group 1: data-src (thumbnail link), group 2: alt text (comic issue name),
    // group 3: "/comic/<id>/..." link, group 4: text of the link element.
    val comicDetailMatches =
        Regex("<div.+=\"card.*\".*\\s+.*data-src=\"(.*?)\"\\s+alt=\"([^\"].*?[^\"])\".*?>\\s+.*href=\"(/comic/\\d+/.*)\"\\s+.+\">(.+)<")
            .findAll(html)
            .asIterable()

    val communityAndIndiePicksTable = table {
        style {
            borderStyle = BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border = true
            borderBottom = true
            borderLeft = true
            borderRight = true
            borderTop = true
            paddingRight = 1
        }
        row("name", "publisher")
        body {
            for ((details, publishers) in zipIterator(comicDetailMatches, publisherMatches)) {
                val comicName = details.groupValues.getOrNull(2)
                val publisherName = publishers.groupValues.getOrNull(1)?.trim()
                // A missing group is rendered as the text "null".
                row("$comicName", "$publisherName")
            }
        }
    }

    println("Community and Indie Picks")
    println(communityAndIndiePicksTable)

    // Debug output kept for reference:
    // println(comicDetailMatches.joinToString(separator = ", ") {
    //     val groupValues = it.groupValues
    //     "thumbnailLink→ ${groupValues.getOrNull(1)} comicIssueName→ ${groupValues.getOrNull(2)} " +
    //         "comicLink→ https://leagueofcomicgeeks.com/${groupValues.getOrNull(3)}"
    // })
    //
    // println(publisherMatches.joinToString(", ") { matchResult ->
    //     val groupValues = matchResult.groupValues
    //     "publisher→ ${groupValues.getOrNull(1)?.trim()}"
    // })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment