GibsonRuitiari · December 8, 2023 20:26
diff --git a/Main.kt b/Main.kt
 import com.jakewharton.picnic.BorderStyle
 import com.jakewharton.picnic.TextAlignment
 import com.jakewharton.picnic.table
 import java.net.URL
 import kotlin.time.measureTime

 /**
  * Program entry point.
  *
  * Downloads two pages from leagueofcomicgeeks.com, then prints the featured-publishers
  * table, the community/indie picks table and the latest-issues table, followed by the
  * time spent inside those three calls. Both downloads complete before the timer starts,
  * so network time is not part of the reported figure.
  *
  * @param args command-line arguments; not read.
  */
 fun main(args: Array<String>){
    // Treats the receiver as a URL and returns the whole response body as text.
    fun String.toHtml(): String {
        val pageStream = URL(this).openStream()
        return pageStream.bufferedReader().use { reader -> reader.readText() }
    }

    val comicsPage = "https://leagueofcomicgeeks.com/comics".toHtml()
    // Input for latestIssues().
    val newComicsPage = "https://leagueofcomicgeeks.com/comics/new-comics/".toHtml()

    val elapsed = measureTime {
        getFeaturedComicPublishers(comicsPage)
        println(" ")
        getCommunityAndIndiePicks(comicsPage)
        println(" ")
        latestIssues(newComicsPage)
    }
    println("time taken to parse in ms ${elapsed.inWholeMilliseconds}")
 }
 // based on publisher
 /**
  * Prints a table of the issues found in [html]: one row per issue with its name,
  * publisher and release date, plus a footer holding the number of rows printed.
  *
  * Three independent regex scans run over [html] (cover blocks, publisher divs and date
  * spans) and their matches are paired up by position, so the n-th cover is assumed to
  * belong to the n-th publisher and the n-th date. Pairing stops as soon as any of the
  * three scans runs out of matches.
  *
  * @param html markup of the page to scrape.
  */
 fun latestIssues(html:String){
    /*
     * Cover block of one issue. Shape of the markup being matched:
     *
     * <div class="cover">
     *     <a href="/comic/1711767/birds-of-prey-4">
     *         <img class="lazy" src="data:image/gif;base64"
     *         data-src="https://s3.amazonaws.com/comicgeeks/comics/covers/medium-1711767.jpg?1701803920"
     *         alt="birds of prey #4">
     *     </a>
     * </div>
     *
     * Group 1 is the href (it has to start with /comic/), group 2 the data-src value
     * (still carrying its closing quote) and group 3 the alt text, i.e. the issue name.
     * Only group 3 ends up in the table.
     */
    val coverMatches = "<div\\s+class=\"cover\">\\s+.*?<a.*?href=\"(/comic/.*?)\">\\s+.*?data-src=\"(.*?)\\s+alt=\"(.*?)\">"
        .toRegex()
        .findAll(html).asIterable()

    /*
     * Publisher div, for example:
     *
     *     <div class="publisher color-offset">
     *             DC Comics        </div>
     *
     * Group 1 is the text that follows the opening tag and its trailing whitespace.
     * The quantifiers are lazy so the match ends at the first '<' that can close it.
     */
    val publisherMatches = "div\\s+class=\"publisher\\s*?.*?>\\s+([^\"].*?[^\"])\\s.*?<"
        .toRegex()
        .findAll(html).asIterable()

    // Release-date span; group 1 is the text between the tag and the next '<',
    // without leading whitespace.
    val releaseDateMatches = "<span\\s+class=\"date\"\\s*.*?>\\s*(.*?)<".toRegex()
        .findAll(html).asIterable()

    // Collected up front so the footer can report how many rows were really paired
    // instead of a hard-coded figure.
    val issueRows = zipIterator(coverMatches, publisherMatches, releaseDateMatches)
        .asSequence()
        .map { (coverMatch, publisherMatch, dateMatch) ->
            Triple(
                coverMatch.groupValues.getOrNull(3),
                publisherMatch.groupValues.getOrNull(1),
                dateMatch.groupValues.getOrNull(1)
            )
        }
        .toList()

    val latestComicsTable=table {
        style {
            borderStyle=BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border=true
            borderBottom=true
            borderLeft=true
            borderRight=true
            borderTop=true
            paddingRight = 1
        }
        row("name","publisher","release-year")
        body {
            issueRows.forEach { (comicName, publisher, publishDate) ->
                row("$comicName","$publisher","$publishDate")
            }
        }
        footer {
            cellStyle {
                border=true
            }
            row {
                cell("total-issues"){
                    alignment=TextAlignment.BottomCenter
                }
                // Number of rows in the body above.
                cell("${issueRows.size}"){
                    alignment=TextAlignment.BottomCenter
                }
            }
        }
    }
    println("Latest comics")
    println(latestComicsTable)
 }
 /**
  * Immutable holder for three values of independent types.
  *
  * As a data class it supports destructuring: `val (a, b, c) = container`.
  */
 data class IteratorContainer<A,B,C>(
    val param1: A,
    val param2: B,
    val param3: C
 )
 /**
  * Immutable holder for two values of independent types.
  *
  * As a data class it supports destructuring: `val (a, b) = container`.
  */
 data class BiIteratorContainer<A,B>(
    val param1: A,
    val param2: B
 )

 /**
  * Walks three iterables in lock-step and exposes the result as a single [Iterator].
  *
  * Nothing is copied into an intermediate collection: one iterator is obtained from each
  * argument when this function is called, and an element is pulled from each of them only
  * when `next()` is invoked on the returned iterator. `hasNext()` reports `false` as soon
  * as any of the three sources has no more elements, so the surplus elements of longer
  * sources are never handed out.
  *
  * @param firstIterable source of `param1` of every container.
  * @param secondIterable source of `param2` of every container.
  * @param thirdIterable source of `param3` of every container.
  * @return an iterator over [IteratorContainer]s, each holding the elements found at the
  * same position in the three sources.
  */
 fun<A,B,C> zipIterator(
    firstIterable:Iterable<A>,
    secondIterable:Iterable<B>,
    thirdIterable:Iterable<C>
 ):Iterator<IteratorContainer<A,B,C>>{
    val sourceA = firstIterable.iterator()
    val sourceB = secondIterable.iterator()
    val sourceC = thirdIterable.iterator()

    return object :Iterator<IteratorContainer<A,B,C>>{
        override fun hasNext(): Boolean =
            sourceA.hasNext() && sourceB.hasNext() && sourceC.hasNext()

        override fun next():IteratorContainer<A,B,C>{
            // Pulled in argument order: first, second, third.
            val elementA = sourceA.next()
            val elementB = sourceB.next()
            val elementC = sourceC.next()
            return IteratorContainer(elementA, elementB, elementC)
        }
    }
 }
 /**
  * Two-source variant: walks both iterables in lock-step and exposes the pairs as a
  * single [Iterator].
  *
  * One iterator is obtained from each argument when this function is called; elements
  * are pulled from them only when `next()` is invoked on the returned iterator.
  * `hasNext()` reports `false` as soon as either source has no more elements.
  *
  * @param firstIterable source of `param1` of every container.
  * @param secondIterable source of `param2` of every container.
  * @return an iterator over [BiIteratorContainer]s, each holding the elements found at
  * the same position in the two sources.
  */
 fun<A,B> zipIterator(
    firstIterable:Iterable<A>,
    secondIterable:Iterable<B>
 ):Iterator<BiIteratorContainer<A,B>>{
    val sourceA = firstIterable.iterator()
    val sourceB = secondIterable.iterator()

    return object :Iterator<BiIteratorContainer<A,B>>{
        override fun hasNext(): Boolean = sourceA.hasNext() && sourceB.hasNext()

        override fun next():BiIteratorContainer<A,B>{
            // Pulled in argument order: first, then second.
            val elementA = sourceA.next()
            val elementB = sourceB.next()
            return BiIteratorContainer(elementA, elementB)
        }
    }
 }

 /**
  * Prints a two-column table (publisher name, publisher link) with one row for every
  * match of the publisher regex in [html].
  *
  * @param html markup of the page to scrape.
  */
 fun getFeaturedComicPublishers(html:String){
    // Two capture groups: 1 = href beginning with /comics/, 2 = value of data-title.
    val publisherPattern = "div\\s*.*?=\"col-xxxxl-2.*?\">\\s+.*?href=\"(/comics/.*?[^\"])\".*?data-title=\"(.*?)\""
        .toRegex()
    val publisherMatches = publisherPattern.findAll(html)

    val publishersTable=table {
        style {
            borderStyle=BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border=true
            borderBottom=true
            borderLeft=true
            borderRight=true
            borderTop=true
            paddingRight = 1
        }
        row("publisher name"," publisher link")
        body {
            for (publisherMatch in publisherMatches) {
                val captured = publisherMatch.groupValues
                val publisherName = captured.getOrNull(2)?.trim()
                val publisherLink = captured.getOrNull(1)?.trim()
                row("$publisherName","$publisherLink")
            }
        }
    }

    println("featured comic publishers")
    println()
    println(publishersTable)
 }
 /**
  * Prints a two-column table (name, publisher) of the picks found in [html].
  *
  * Two regex scans run over [html], one for the pick cards and one for the publisher
  * labels, and their matches are paired up by position. Pairing stops when either scan
  * runs out of matches.
  *
  * @param html markup of the page to scrape.
  */
 fun getCommunityAndIndiePicks(html:String){
    // One capture group: the text preceding a '<' after the "copy-really-small" element.
    val publisherMatches ="div.*=\"pl-3\".?\\s*.*?=\"copy-really-small.*\".*?\\s+(.+)<"
        .toRegex()
        .findAll(html).asIterable()

    // Four capture groups: 1 = data-src (thumbnail link), 2 = alt text (issue name),
    // 3 = href starting with /comic/<digits>/ (relative to https://leagueofcomicgeeks.com/),
    // 4 = the text preceding the final '<'. Only group 2 is printed.
    val pickMatches = "<div.+=\"card.*\".*\\s+.*data-src=\"(.*?)\"\\s+alt=\"([^\"].*?[^\"])\".*?>\\s+.*href=\"(/comic/\\d+/.*)\"\\s+.+\">(.+)<"
        .toRegex()
        .findAll(html).asIterable()

    val communityAndIndiePicksTable=table {
        style {
            borderStyle=BorderStyle.Solid
        }
        cellStyle {
            paddingLeft = 1
            border=true
            borderBottom=true
            borderLeft=true
            borderRight=true
            borderTop=true
            paddingRight = 1
        }
        row("name","publisher")
        body {
            val pairedMatches = zipIterator(pickMatches,publisherMatches)
            while (pairedMatches.hasNext()) {
                val pairing = pairedMatches.next()
                val issueName = pairing.param1.groupValues.getOrNull(2)
                val publisherName = pairing.param2.groupValues.getOrNull(1)?.trim()
                row("$issueName","$publisherName")
            }
        }
    }
    println("Community and Indie Picks")
    println(communityAndIndiePicksTable)
 }
	import com.jakewharton.picnic.BorderStyle
	import com.jakewharton.picnic.TextAlignment
	import com.jakewharton.picnic.table
	import java.net.URL
	import kotlin.time.measureTime

	/**
	 * Program entry point.
	 *
	 * Downloads two pages from leagueofcomicgeeks.com, then prints the featured-publishers
	 * table, the community/indie picks table and the latest-issues table, followed by the
	 * time spent inside those three calls. Both downloads complete before the timer starts,
	 * so network time is not part of the reported figure.
	 *
	 * @param args command-line arguments; not read.
	 */
	fun main(args: Array<String>){
	    // Treats the receiver as a URL and returns the whole response body as text.
	    fun String.toHtml(): String {
	        val pageStream = URL(this).openStream()
	        return pageStream.bufferedReader().use { reader -> reader.readText() }
	    }

	    val comicsPage = "https://leagueofcomicgeeks.com/comics".toHtml()
	    // Input for latestIssues().
	    val newComicsPage = "https://leagueofcomicgeeks.com/comics/new-comics/".toHtml()

	    val elapsed = measureTime {
	        getFeaturedComicPublishers(comicsPage)
	        println(" ")
	        getCommunityAndIndiePicks(comicsPage)
	        println(" ")
	        latestIssues(newComicsPage)
	    }
	    println("time taken to parse in ms ${elapsed.inWholeMilliseconds}")
	}
	// based on publisher
	/**
	 * Prints a table of the issues found in [html]: one row per issue with its name,
	 * publisher and release date, plus a footer holding the number of rows printed.
	 *
	 * Three independent regex scans run over [html] (cover blocks, publisher divs and date
	 * spans) and their matches are paired up by position, so the n-th cover is assumed to
	 * belong to the n-th publisher and the n-th date. Pairing stops as soon as any of the
	 * three scans runs out of matches.
	 *
	 * @param html markup of the page to scrape.
	 */
	fun latestIssues(html:String){
	    /*
	     * Cover block of one issue. Shape of the markup being matched:
	     *
	     * <div class="cover">
	     *     <a href="/comic/1711767/birds-of-prey-4">
	     *         <img class="lazy" src="data:image/gif;base64"
	     *         data-src="https://s3.amazonaws.com/comicgeeks/comics/covers/medium-1711767.jpg?1701803920"
	     *         alt="birds of prey #4">
	     *     </a>
	     * </div>
	     *
	     * Group 1 is the href (it has to start with /comic/), group 2 the data-src value
	     * (still carrying its closing quote) and group 3 the alt text, i.e. the issue name.
	     * Only group 3 ends up in the table. Every "any characters" gap is a lazy
	     * zero-or-more run; a single optional character there would make the captures
	     * at most one character long.
	     */
	    val coverMatches = "<div\\s+class=\"cover\">\\s+.*?<a.*?href=\"(/comic/.*?)\">\\s+.*?data-src=\"(.*?)\\s+alt=\"(.*?)\">"
	        .toRegex()
	        .findAll(html).asIterable()

	    /*
	     * Publisher div, for example:
	     *
	     *     <div class="publisher color-offset">
	     *             DC Comics        </div>
	     *
	     * Group 1 is the text that follows the opening tag and its trailing whitespace.
	     * The quantifiers are lazy so the match ends at the first '<' that can close it.
	     */
	    val publisherMatches = "div\\s+class=\"publisher\\s*?.*?>\\s+([^\"].*?[^\"])\\s.*?<"
	        .toRegex()
	        .findAll(html).asIterable()

	    // Release-date span; group 1 is the text between the tag and the next '<',
	    // without leading whitespace.
	    val releaseDateMatches = "<span\\s+class=\"date\"\\s*.*?>\\s*(.*?)<".toRegex()
	        .findAll(html).asIterable()

	    // Collected up front so the footer can report how many rows were really paired
	    // instead of a hard-coded figure.
	    val issueRows = zipIterator(coverMatches, publisherMatches, releaseDateMatches)
	        .asSequence()
	        .map { (coverMatch, publisherMatch, dateMatch) ->
	            Triple(
	                coverMatch.groupValues.getOrNull(3),
	                publisherMatch.groupValues.getOrNull(1),
	                dateMatch.groupValues.getOrNull(1)
	            )
	        }
	        .toList()

	    val latestComicsTable=table {
	        style {
	            borderStyle=BorderStyle.Solid
	        }
	        cellStyle {
	            paddingLeft = 1
	            border=true
	            borderBottom=true
	            borderLeft=true
	            borderRight=true
	            borderTop=true
	            paddingRight = 1
	        }
	        row("name","publisher","release-year")
	        body {
	            issueRows.forEach { (comicName, publisher, publishDate) ->
	                row("$comicName","$publisher","$publishDate")
	            }
	        }
	        footer {
	            cellStyle {
	                border=true
	            }
	            row {
	                cell("total-issues"){
	                    alignment=TextAlignment.BottomCenter
	                }
	                // Number of rows in the body above.
	                cell("${issueRows.size}"){
	                    alignment=TextAlignment.BottomCenter
	                }
	            }
	        }
	    }
	    println("Latest comics")
	    println(latestComicsTable)
	}
	/**
	 * Immutable holder for three values of independent types.
	 *
	 * As a data class it supports destructuring: `val (a, b, c) = container`.
	 */
	data class IteratorContainer<A,B,C>(
	    val param1: A,
	    val param2: B,
	    val param3: C
	)
	/**
	 * Immutable holder for two values of independent types.
	 *
	 * As a data class it supports destructuring: `val (a, b) = container`.
	 */
	data class BiIteratorContainer<A,B>(
	    val param1: A,
	    val param2: B
	)

	/**
	 * Walks three iterables in lock-step and exposes the result as a single [Iterator].
	 *
	 * Nothing is copied into an intermediate collection: one iterator is obtained from each
	 * argument when this function is called, and an element is pulled from each of them only
	 * when `next()` is invoked on the returned iterator. `hasNext()` reports `false` as soon
	 * as any of the three sources has no more elements, so the surplus elements of longer
	 * sources are never handed out.
	 *
	 * @param firstIterable source of `param1` of every container.
	 * @param secondIterable source of `param2` of every container.
	 * @param thirdIterable source of `param3` of every container.
	 * @return an iterator over [IteratorContainer]s, each holding the elements found at the
	 * same position in the three sources.
	 */
	fun<A,B,C> zipIterator(
	    firstIterable:Iterable<A>,
	    secondIterable:Iterable<B>,
	    thirdIterable:Iterable<C>
	):Iterator<IteratorContainer<A,B,C>>{
	    val sourceA = firstIterable.iterator()
	    val sourceB = secondIterable.iterator()
	    val sourceC = thirdIterable.iterator()

	    return object :Iterator<IteratorContainer<A,B,C>>{
	        override fun hasNext(): Boolean =
	            sourceA.hasNext() && sourceB.hasNext() && sourceC.hasNext()

	        override fun next():IteratorContainer<A,B,C>{
	            // Pulled in argument order: first, second, third.
	            val elementA = sourceA.next()
	            val elementB = sourceB.next()
	            val elementC = sourceC.next()
	            return IteratorContainer(elementA, elementB, elementC)
	        }
	    }
	}
	/**
	 * Two-source variant: walks both iterables in lock-step and exposes the pairs as a
	 * single [Iterator].
	 *
	 * One iterator is obtained from each argument when this function is called; elements
	 * are pulled from them only when `next()` is invoked on the returned iterator.
	 * `hasNext()` reports `false` as soon as either source has no more elements.
	 *
	 * @param firstIterable source of `param1` of every container.
	 * @param secondIterable source of `param2` of every container.
	 * @return an iterator over [BiIteratorContainer]s, each holding the elements found at
	 * the same position in the two sources.
	 */
	fun<A,B> zipIterator(
	    firstIterable:Iterable<A>,
	    secondIterable:Iterable<B>
	):Iterator<BiIteratorContainer<A,B>>{
	    val sourceA = firstIterable.iterator()
	    val sourceB = secondIterable.iterator()

	    return object :Iterator<BiIteratorContainer<A,B>>{
	        override fun hasNext(): Boolean = sourceA.hasNext() && sourceB.hasNext()

	        override fun next():BiIteratorContainer<A,B>{
	            // Pulled in argument order: first, then second.
	            val elementA = sourceA.next()
	            val elementB = sourceB.next()
	            return BiIteratorContainer(elementA, elementB)
	        }
	    }
	}

	/**
	 * Prints a two-column table (publisher name, publisher link) with one row for every
	 * match of the publisher regex in [html].
	 *
	 * @param html markup of the page to scrape.
	 */
	fun getFeaturedComicPublishers(html:String){
	    // Two capture groups: 1 = href beginning with /comics/, 2 = value of data-title.
	    // The gaps between the anchors are lazy zero-or-more runs; with a single optional
	    // character in their place the href capture could hold at most two characters
	    // after "/comics/".
	    val publisherPattern = "div\\s*.*?=\"col-xxxxl-2.*?\">\\s+.*?href=\"(/comics/.*?[^\"])\".*?data-title=\"(.*?)\""
	        .toRegex()
	    val publisherMatches = publisherPattern.findAll(html)

	    val publishersTable=table {
	        style {
	            borderStyle=BorderStyle.Solid
	        }
	        cellStyle {
	            paddingLeft = 1
	            border=true
	            borderBottom=true
	            borderLeft=true
	            borderRight=true
	            borderTop=true
	            paddingRight = 1
	        }
	        row("publisher name"," publisher link")
	        body {
	            for (publisherMatch in publisherMatches) {
	                val captured = publisherMatch.groupValues
	                val publisherName = captured.getOrNull(2)?.trim()
	                val publisherLink = captured.getOrNull(1)?.trim()
	                row("$publisherName","$publisherLink")
	            }
	        }
	    }

	    println("featured comic publishers")
	    println()
	    println(publishersTable)
	}
	/**
	 * Prints a two-column table (name, publisher) of the picks found in [html].
	 *
	 * Two regex scans run over [html], one for the pick cards and one for the publisher
	 * labels, and their matches are paired up by position. Pairing stops when either scan
	 * runs out of matches.
	 *
	 * @param html markup of the page to scrape.
	 */
	fun getCommunityAndIndiePicks(html:String){
	    // One capture group: the text preceding a '<' after the "copy-really-small" element.
	    // The gaps between the anchors are zero-or-more runs, not single characters.
	    val publisherMatches ="div.*=\"pl-3\".?\\s*.*?=\"copy-really-small.*\".*?\\s+(.+)<"
	        .toRegex()
	        .findAll(html).asIterable()

	    // Four capture groups: 1 = data-src (thumbnail link), 2 = alt text (issue name),
	    // 3 = href starting with /comic/<digits>/ (relative to https://leagueofcomicgeeks.com/),
	    // 4 = the text preceding the final '<'. Only group 2 is printed.
	    // Groups 1-3 rely on zero-or-more runs; a single optional character would limit
	    // them to a character or two.
	    val pickMatches = "<div.+=\"card.*\".*\\s+.*data-src=\"(.*?)\"\\s+alt=\"([^\"].*?[^\"])\".*?>\\s+.*href=\"(/comic/\\d+/.*)\"\\s+.+\">(.+)<"
	        .toRegex()
	        .findAll(html).asIterable()

	    val communityAndIndiePicksTable=table {
	        style {
	            borderStyle=BorderStyle.Solid
	        }
	        cellStyle {
	            paddingLeft = 1
	            border=true
	            borderBottom=true
	            borderLeft=true
	            borderRight=true
	            borderTop=true
	            paddingRight = 1
	        }
	        row("name","publisher")
	        body {
	            val pairedMatches = zipIterator(pickMatches,publisherMatches)
	            while (pairedMatches.hasNext()) {
	                val pairing = pairedMatches.next()
	                val issueName = pairing.param1.groupValues.getOrNull(2)
	                val publisherName = pairing.param2.groupValues.getOrNull(1)?.trim()
	                row("$issueName","$publisherName")
	            }
	        }
	    }
	    println("Community and Indie Picks")
	    println(communityAndIndiePicksTable)
	}