Skip to content

Instantly share code, notes, and snippets.

@carlosrogue
Last active July 9, 2024 17:51
Show Gist options
  • Save carlosrogue/065ad066d474bd05d6c3a2123fbc42dc to your computer and use it in GitHub Desktop.
Save carlosrogue/065ad066d474bd05d6c3a2123fbc42dc to your computer and use it in GitHub Desktop.
Export RSS feed from Scala - www.noticias3d.com - with JSoup and Rome

RSS feed in Scala 3

I have followed Noticias3d news since the early 2000s and always wanted an RSS feed, but there is no official support. In 2024 I finally took some inspiration from this old post https://foro.noticias3d.com/vbulletin/showthread.php?t=358685 and decided to write my own using Scala 3. To run this example, first install Scala CLI; then you are good to go with Rss.sc:

scala-cli --watch Rss.sc

Then open the feed in a browser tab, or from your smartphone using your machine's IP (e.g. http://192.168.2.58:8080/feed), or fetch it from a terminal:

curl http://localhost:8080/feed

Or run the slightly better IO-based program, RssIO.sc:

scala-cli --watch RssIO.sc

Or even the direct-style version that uses context functions, RssContext.sc:

scala-cli --watch RssContext.sc

You can try the JSoup or Rome APIs using the powerful Ammonite REPL:

scala-cli --power repl --amm -S 3.3.3 --dep com.rometools:rome:2.1.0 --dep org.jsoup:jsoup:1.17.2
@ import org.jsoup._ 
@ val urlNoticia = "https://www.noticias3d.com/noticia/95206/samsung-galaxy-s25-ultra-cambio-importante.html"
@ val docNoticia = Jsoup.connect(urlNoticia).get()
@ val author = docNoticia.select("#content-data:has(i.fa-user) a[href~=^mailto: .+]").first

Resources

Here are some useful resources related to how to read/write RSS feeds in Scala:

Some links from the official documentation:

//> using dep "org.http4s::http4s-dsl:0.23.27"
//> using dep "org.jsoup:jsoup:1.17.2"
//> using dep "com.rometools:rome:2.1.0"
//> using dep "org.http4s::http4s-ember-server:0.23.27"
import cats.effect._
import cats.syntax.all._
import org.http4s._, org.http4s.dsl.io._, org.http4s.implicits._
import org.http4s.headers.*
import cats.data.NonEmptyList
import com.comcast.ip4s._
import org.http4s.ember.server._
import org.http4s.server.Router
import org.jsoup._
import com.rometools.rome.feed.synd._
import com.rometools.rome.io._
import scala.jdk.CollectionConverters._
import java.util.{Date, TimeZone, Locale}
import cats.effect.unsafe.IORuntime
// https://foro.noticias3d.com/vbulletin/showthread.php?t=358685
// Site coordinates and the element ids used to pick date headers and news items
// out of the "latest news" page.
val domain = "https://www.noticias3d.com"
val url = s"$domain/ultimas-noticias"
val dateId = "wrap-title-news-dia"
val contentId = "content-list-noticias"
val locale = Locale.forLanguageTag("es-ES")
// Date headers look like: domingo, 30 de junio de 2024
val formatter = {
  val fmt = new java.text.SimpleDateFormat("EEEEE, dd 'de' MMM 'de' yyyy", locale)
  fmt.setTimeZone(TimeZone.getTimeZone("UTC"))
  fmt
}
// Download the listing once and keep only the date headers and news items inside <aside>.
val doc = Jsoup.connect(url).followRedirects(true).get()
val news = doc.select("aside").select(s"[id~=$dateId|$contentId]")
val articles = news.asScala
// Feed header; entries are attached later.
val feed: SyndFeed = new SyndFeedImpl()
feed.setTitle("Noticias3d")
feed.setLink(s"$domain/noticias.asp")
feed.setDescription("Noticias3D")
feed.setLanguage(locale.toLanguageTag())
// rss_0.90, rss_0.91, rss_0.92, rss_0.93, rss_0.94, rss_1.0 rss_2.0 or atom_0.3
feed.setFeedType("rss_2.0")
/** Prefixes site-relative references ("/path") with the site domain; anything else is returned as is. */
def absolute(ref: String): String =
  if (ref.startsWith("/") && !ref.startsWith("//")) s"$domain$ref" else ref

/** Builds the feed entry for one news item by downloading its article page.
  *
  * @param article   listing element whose id is `contentId`
  * @param published date taken from the closest preceding date header
  * @return the entry, or None when the listing element carries no link
  */
def buildEntry(article: org.jsoup.nodes.Element, published: Date): Option[SyndEntry] = {
  val anchors = article.select("a[href]")
  Option.unless(anchors.isEmpty) {
    val title = anchors.text
    val link = s"$domain${anchors.attr("href")}"
    val entry = new SyndEntryImpl
    entry.setTitle(title)
    entry.setLink(link)
    entry.setPublishedDate(published)
    val docDesc = Jsoup.connect(link).get()
    val paragraphs = docDesc.select("#content-article p")
    // `first` is null when the page has no paragraph; fall back to an empty summary.
    val desc = Option(paragraphs.first).fold("") { lead =>
      lead.select("a[href]").forEach(a => a.attr("href", absolute(a.attr("href"))))
      lead.html
    }
    val images = paragraphs.select("img[src]")
    // Only site-relative sources need the domain; absolute URLs must stay untouched.
    images.forEach(i => i.attr("src", absolute(i.attr("src"))))
    val img = Option(images.first).fold("")(_.outerHtml)
    val foot = s"""<p>Seguir leyendo <a href="$link"><b>$title</b>→</a>.</p>"""
    val description = new SyndContentImpl
    description.setType("text/html")
    description.setValue(s"<p>$desc</p>$img<br>$foot")
    entry.setDescription(description)
    // Some articles may lack the author mailto link; leave the author unset in that case.
    Option(docDesc.select("#content-data:has(i.fa-user) a[href~=^mailto: .+$]").first).foreach { author =>
      entry.setAuthor(s"${author.attr("href").replace("mailto: ", "")} (${author.text})")
    }
    entry
  }
}

// Walks the listing in page order: a date header updates the running publication date,
// a news item becomes an entry stamped with that date. Entries are accumulated newest-last
// reversed (prepended), so callers reverse `_2` to restore page order.
val entries = articles.foldLeft((new Date(), List[SyndEntry]())) {
  case ((published, all), article) =>
    article.attr("id") match {
      case `dateId` =>
        (formatter.parse(article.select("h2").text), all)
      case `contentId` =>
        (published, buildEntry(article, published).fold(all)(_ :: all))
      case _ => (published, all)
    }
}
feed.setEntries(entries._2.reverse.asJava)
val output = new SyndFeedOutput()
val out = new java.io.ByteArrayOutputStream()
// Encode explicitly as UTF-8: the response below declares UTF-8, while a bare
// OutputStreamWriter would use the platform default charset.
val writer = new java.io.OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8)
output.output(feed, writer)
writer.flush()
val mimeType = "application/xml; charset=UTF-8"
// Serves the feed rendered above; the bytes are computed once at start-up.
val rssService = HttpRoutes.of[IO] {
  case GET -> Root / "feed" =>
    Ok(out.toByteArray, `Content-Type`(MediaType.application.`xml`, Charset.`UTF-8`))
}
implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
// val getRss = Request[IO](Method.GET, uri"/feed")
// val responseRss = rssService.orNotFound.run(getRss).unsafeRunSync()
// println(responseRss)
val httpApp = Router("/" -> rssService).orNotFound
val server = EmberServerBuilder
  .default[IO]
  .withHost(ipv4"0.0.0.0")
  .withPort(port"8080")
  .withHttpApp(httpApp)
  .build
val shutdown = server.allocated.unsafeRunSync()._2
// Release the server even if reading from stdin fails.
try scala.io.StdIn.readLine("Press any key to shutdown\n")
finally shutdown.unsafeRunSync()
//> using dep "org.http4s::http4s-dsl:0.23.27"
//> using dep "org.jsoup:jsoup:1.17.2"
//> using dep "com.rometools:rome:2.1.0"
//> using dep "org.http4s::http4s-ember-server:0.23.27"
import cats.effect._
import cats.effect.std.Console
import cats.syntax.all._
import org.http4s._, org.http4s.dsl.io._, org.http4s.implicits._
import org.http4s.headers.*
import cats.data.NonEmptyList
import com.comcast.ip4s._
import org.http4s.ember.server._
import org.http4s.server.Router
import org.jsoup._
import org.jsoup.select._
import com.rometools.rome.feed.synd._
import com.rometools.rome.io._
import scala.jdk.CollectionConverters._
import scala.concurrent.duration._
import java.util.{Date, TimeZone, Locale}
import java.io.*
import cats.effect.unsafe.IORuntime
// https://foro.noticias3d.com/vbulletin/showthread.php?t=358685
// Site coordinates and the element ids used to pick date headers and news items
// out of the "latest news" page.
val domain = "https://www.noticias3d.com"
val url = s"$domain/ultimas-noticias"
val dateId = "wrap-title-news-dia"
val contentId = "content-list-noticias"
val locale = Locale.forLanguageTag("es-ES")
// Date headers look like: domingo, 30 de junio de 2024
val formatter = {
  val fmt = new java.text.SimpleDateFormat("EEEEE, dd 'de' MMM 'de' yyyy", locale)
  fmt.setTimeZone(TimeZone.getTimeZone("UTC"))
  fmt
}
import Caches.*
import Caches.Feed.*
type Feed = SyndFeed

/** Creates a feed that carries only the channel metadata (title, link, description,
  * language, RSS 2.0 type) and no entries. Construction is suspended in IO because
  * the underlying object is mutable.
  */
def emptyFeed(): IO[Feed] = IO.delay {
  val blank: Feed = new SyndFeedImpl()
  blank.setTitle("Noticias3d")
  blank.setLink(s"$domain/noticias.asp")
  blank.setDescription("Noticias3D")
  blank.setLanguage(locale.toLanguageTag())
  // rss_0.90, rss_0.91, rss_0.92, rss_0.93, rss_0.94, rss_1.0 rss_2.0 or atom_0.3
  blank.setFeedType("rss_2.0")
  blank
}
/** Turns the listing elements into feed entries, stopping at the first news item that is
  * already present in the cached feed.
  *
  * Each new item triggers a download of its article page, hence the whole computation
  * runs inside `IO.blocking`.
  *
  * @param news  date headers and news items in page order (newest first)
  * @param cache the previously served feed, or `EmptyCache`
  * @return the new entries in page order
  */
def getNewEntries(news: Elements)(using cache: Cache[Feed]): IO[List[SyndEntry]] = {
  // Prefixes site-relative references ("/path") with the site domain; anything else is returned as is.
  def absolute(ref: String): String =
    if (ref.startsWith("/") && !ref.startsWith("//")) s"$domain$ref" else ref

  // Builds one entry from a listing item; None when the item carries no link.
  def buildEntry(article: org.jsoup.nodes.Element, published: Date): Option[SyndEntry] = {
    val anchors = article.select("a[href]")
    Option.unless(anchors.isEmpty) {
      val title = anchors.text
      val link = s"$domain${anchors.attr("href")}"
      val entry = new SyndEntryImpl
      entry.setTitle(title)
      entry.setLink(link)
      entry.setPublishedDate(published)
      val docDesc = Jsoup.connect(link).get()
      val paragraphs = docDesc.select("#content-article p")
      // `first` is null when the page has no paragraph; fall back to an empty summary.
      val desc = Option(paragraphs.first).fold("") { lead =>
        lead.select("a[href]").forEach(a => a.attr("href", absolute(a.attr("href"))))
        lead.html
      }
      val images = paragraphs.select("img[src]")
      // Only site-relative sources need the domain; absolute URLs must stay untouched.
      images.forEach(i => i.attr("src", absolute(i.attr("src"))))
      val img = Option(images.first).fold("")(_.outerHtml)
      val foot = s"""<p>Seguir leyendo <a href="$link"><b>$title</b>→</a>.</p>"""
      val description = new SyndContentImpl
      description.setType("text/html")
      description.setValue(s"<p>$desc</p>$img<br>$foot")
      entry.setDescription(description)
      // Some articles may lack the author mailto link; leave the author unset in that case.
      Option(docDesc.select("#content-data:has(i.fa-user) a[href~=^mailto: .+$]").first).foreach { author =>
        entry.setAuthor(s"${author.attr("href").replace("mailto: ", "")} (${author.text})")
      }
      entry
    }
  }

  IO.blocking {
    // 0 means "nothing cached": every item on the page is new.
    val latestId = cache match {
      case EmptyCache => 0
      case present: Feed => maybeId(present).getOrElse(0)
    }
    // Keep date headers and every news item whose id is greater than the cached one.
    // An item without a link says nothing about ids, so it does not stop the scan.
    val fresh = news.asScala.takeWhile { item =>
      latestId == 0 || item.attr("id") != contentId ||
        Option(item.select("a[href]").first).forall(a => id(a.attr("href")) > latestId)
    }
    val (_, collected) = fresh.foldLeft((new Date(), List[SyndEntry]())) {
      case ((published, all), article) =>
        article.attr("id") match {
          case `dateId` =>
            (formatter.parse(article.select("h2").text), all)
          case `contentId` =>
            (published, buildEntry(article, published).fold(all)(_ :: all))
          case _ => (published, all)
        }
    }
    collected.reverse
  }
}
/** Downloads the listing page and produces a feed made of the new entries followed,
  * when `merge` is set and a feed is cached, by the cached entries.
  *
  * The oldest cached entries are dropped so that the merged feed keeps the size of the
  * cached one.
  *
  * @param merge whether to append the cached entries after the new ones
  * @param cache the previously served feed, or `EmptyCache`
  */
def getFeed(merge: Boolean = true)(using cache: Cache[Feed]): IO[Feed] =
  for {
    doc <- IO.blocking(Jsoup.connect(url).followRedirects(true).get())
    news = doc.select("aside").select(s"[id~=$dateId|$contentId]")
    feed <- emptyFeed()
    newEntries <- getNewEntries(news)
    updated = cache match {
      case previous: Feed if merge =>
        newEntries ++ previous.getEntries().asScala.dropRight(newEntries.size)
      case _ => newEntries
    }
    // Mutating the feed is a side effect, so it is suspended rather than run in a pure binding.
    _ <- IO.delay(feed.setEntries(updated.asJava))
  } yield feed
val output = new SyndFeedOutput()

/** Serialises a feed to XML bytes encoded as UTF-8, matching the charset announced in the response. */
def renderFeed(feed: Feed): Array[Byte] = {
  val out = new ByteArrayOutputStream()
  // A bare OutputStreamWriter would use the platform default charset.
  val writer = new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8)
  output.output(feed, writer)
  writer.flush()
  out.toByteArray
}

/** Wraps a rendered feed into a 200 response with an XML / UTF-8 content type. */
def feedResponse(feed: Feed): IO[Response[IO]] =
  IO.delay(renderFeed(feed)).flatMap { bytes =>
    Ok(bytes, `Content-Type`(MediaType.application.`xml`, Charset.`UTF-8`))
  }

/** HTTP routes of the feed server.
  *
  *  - `GET /feed`        serves the feed produced by `expirable` under the given cache state
  *  - `GET /feed/hot`    same, but with the deadline replaced by `invalidated`
  *  - `GET /feed/latest` answers with the id returned by `latestCachedId`
  */
def rssService(expirable: ExpirableFeed)(using strategy: CacheStrategy, state: FeedCacheState) = HttpRoutes.of[IO] {
  case GET -> Root / "feed" =>
    Feed.run(expirable).flatMap { case (_, feed) => feedResponse(feed) }
  case GET -> Root / "feed" / "hot" =>
    // This local given takes precedence over the `state` parameter for the call below.
    given FeedCacheState = state.copy(deadline = invalidated)
    Feed.run(expirable).flatMap { case (_, feed) => feedResponse(feed) }
  case GET -> Root / "feed" / "latest" =>
    latestCachedId.flatMap(id => Ok(id.toString))
}
/** def fromCache(ref: FeedRef, ttl: FiniteDuration) = for {
feedRef <- ref.get
(deadline, maybeFeed) = feedRef
feed <- maybeFeed match {
case Some(valid) if (deadline.hasTimeLeft()) => valid.pure[IO]
case _ => getFeed(maybeFeed).flatMap { updated =>
ref.set(ttl.fromNow, Some(updated)).as(updated)
}
}
} yield feed */
/** Produces the feed to serve together with the cache state that goes with it.
  *
  * When a feed is cached and the state's deadline still has time left, the cached feed is
  * returned as is. Otherwise a feed is fetched (with the current cache content passed as
  * context), stored through `state.set`, and returned alongside a copy of the state whose
  * deadline is the strategy's TTL counted from now.
  */
def getNextState(using strategy: CacheStrategy, state: FeedCacheState): IO[(FeedCacheState, Feed)] =
  state.get.flatMap { maybeCache =>
    state.deadline.flatMap { expiresAt =>
      maybeCache match {
        case fresh: Feed if expiresAt.hasTimeLeft() =>
          IO.pure((state, fresh))
        case stale =>
          getFeed()(using stale).flatMap { latest =>
            val next = state.copy(deadline = strategy.fromNow.pure[IO])
            state.set(latest, maybeCache).as((next, latest))
          }
      }
    }
  }
// Small caching vocabulary: an empty-or-present cache value, a TTL strategy, a record of
// cache effects, and a context-function type for computations that need both in scope.
object Caches {
// Marker value for "nothing cached yet".
case object EmptyCache
type EmptyCache = EmptyCache.type
// Time-to-live wrapper around a finite duration.
case class TTL(duration: FiniteDuration)
// A cache slot holds either the EmptyCache marker or a value of type A.
type Cache[A] = EmptyCache | A
// The only strategy defined here is a plain TTL.
type CacheStrategy = TTL
// case class BaseCacheStrategy[F[A], A](ttl: TTL, eval: Cache[A] => F[Option[A]])
// Bundles three effects: reading the deadline, reading the cached value, and `set`, which
// receives a value of type A plus a Cache[A] and yields a Cache[A].
case class CacheState[F[A], A](deadline: F[Deadline], get: F[Cache[A]], set: (A, Cache[A]) => F[Cache[A]])
// Syntax: `1.minute.ttl` builds a TTL.
extension (d: FiniteDuration)
def ttl = TTL(d)
// Syntax: the deadline obtained by counting the strategy's duration from now.
extension (s: CacheStrategy)
def fromNow = s.duration.fromNow
// Effect that yields `Deadline.now` each time it is evaluated.
val invalidated = IO.delay(Deadline.now)
// A computation that, given a strategy and a cache state as context, yields a cache state and a value.
type Expirable[F[A], A] = CacheStrategy ?=> CacheState[F, A] ?=> F[(CacheState[F, A], A)]
// Returns `body` unchanged, typed as an Expirable.
def apply[F[_], A](body: CacheStrategy ?=> CacheState[F, A] ?=> F[(CacheState[F, A], A)]): Expirable[F,A] =
body
// Aliases and helpers specialised to IO and the feed type.
object Feed {
// NOTE(review): FeedCache is not referenced anywhere in the visible source — confirm it is still needed.
opaque type FeedCache = Feed
type FeedCacheState = CacheState[IO, Feed]
type ExpirableFeed = Expirable[IO, Feed]
// Same as Caches.apply, fixed to IO and Feed.
inline def apply(inline body: CacheStrategy ?=> CacheState[IO, Feed] ?=> IO[(CacheState[IO, Feed], Feed)]): ExpirableFeed =
Caches(body)
// Evaluates `expirable` with the strategy and cache state found in the caller's context.
def run(expirable: ExpirableFeed)(using CacheStrategy, CacheState[IO, Feed]): IO[(CacheState[IO, Feed], Feed)] = {
expirable
}
}
}
// Extracts the numeric id from a news URL such as ".../noticia/95206/some-title.html".
// The capture group is digits only, so a non-numeric segment never reaches the integer
// conversion and one-digit ids are matched too.
val newsId = "^.*/noticia/(\\d+)/.+\\.html$".r
/** def latestId(ref: FeedRef): IO[Int] =
for(feedRef <- ref.get)
yield
feedRef._2.flatMap(maybeId).getOrElse(-1)*/
/** Reads the cache and yields the id of its first entry, or -1 when the cache is empty
  * or no id can be derived from it.
  */
def latestCachedId(using state: FeedCacheState): IO[Int] =
  for (cached <- state.get) yield {
    cached match {
      case feed: Feed => maybeId(feed).getOrElse(-1)
      case EmptyCache => -1
    }
  }
/** Id derived from the link of the feed's first entry; None when the feed has no entries
  * or the link yields -1.
  */
def maybeId(feed: SyndFeed): Option[Int] =
  feed.getEntries().asScala.headOption
    .map(first => id(first.getLink()))
    .filter(_ != -1)
/** Numeric id embedded in a news link, or -1 when the link does not match `newsId`
  * or the captured text is not a valid Int (non-numeric or out of range).
  */
def id(link: String): Int =
  link match {
    // toIntOption instead of toInt: a malformed capture must not throw.
    case newsId(digits) => digits.toIntOption.getOrElse(-1)
    case _ => -1
  }
implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
//val getRss = Request[IO](Method.GET, uri"/feed")
//val responseRss = rssService.orNotFound.run(getRss).unsafeRunSync()
//println(responseRss)
//val getLatest = Request[IO](Method.GET, uri"/feed/latest")
//val responseLatest = rssService.orNotFound.run(getLatest).unsafeRunSync()
//println(responseLatest.as[String].unsafeRunSync())
given ttl: TTL = 1.minute.ttl
// Wires the cache, the routes and the server, then serves until a line is read from stdin.
val app =
  for {
    ref: Ref[IO, Cache[Feed]] <- Ref[IO].of(EmptyCache)
    // The deadline lives in its own Ref so that storing a fresh feed can push it forward.
    // With a deadline that always evaluates to "now" the TTL would never be honoured and
    // every request would hit the site.
    expiry <- Ref[IO].of(Deadline.now)
    given CacheState[IO,Feed] = CacheState[IO,Feed](expiry.get, ref.get, (feed, _) =>
      expiry.set(ttl.fromNow) *> ref.modify(_ => (feed, feed))
    )
    expirable = Feed(getNextState)
    httpApp = Router("/" -> rssService(expirable)).orNotFound
    server = EmberServerBuilder
      .default[IO]
      .withHost(ipv4"0.0.0.0")
      .withPort(port"8080")
      .withHttpApp(httpApp)
      .build
    // `use` releases the server even when the console interaction fails or is cancelled.
    _ <- server.use { _ =>
      Console[IO].println("Press any key to shutdown\n") *> Console[IO].readLine
    }
  } yield ()
app.unsafeRunSync()
//> using dep "org.http4s::http4s-dsl:0.23.27"
//> using dep "org.jsoup:jsoup:1.17.2"
//> using dep "com.rometools:rome:2.1.0"
//> using dep "org.http4s::http4s-ember-server:0.23.27"
import cats.effect._
import cats.effect.std.Console
import cats.syntax.all._
import org.http4s._, org.http4s.dsl.io._, org.http4s.implicits._
import org.http4s.headers.*
import cats.data.NonEmptyList
import com.comcast.ip4s._
import org.http4s.ember.server._
import org.http4s.server.Router
import org.jsoup._
import com.rometools.rome.feed.synd._
import com.rometools.rome.io._
import scala.jdk.CollectionConverters._
import scala.concurrent.duration._
import java.util.{Date, TimeZone, Locale}
import java.io.*
import cats.effect.unsafe.IORuntime
// https://foro.noticias3d.com/vbulletin/showthread.php?t=358685
// Site coordinates and the element ids used to pick date headers and news items
// out of the "latest news" page.
val domain = "https://www.noticias3d.com"
val url = s"$domain/ultimas-noticias"
val dateId = "wrap-title-news-dia"
val contentId = "content-list-noticias"
val locale = Locale.forLanguageTag("es-ES")
// Date headers look like: domingo, 30 de junio de 2024
val formatter = {
  val fmt = new java.text.SimpleDateFormat("EEEEE, dd 'de' MMM 'de' yyyy", locale)
  fmt.setTimeZone(TimeZone.getTimeZone("UTC"))
  fmt
}
/** Downloads the listing page and builds the feed.
  *
  * Only news items newer than the first entry of `latest` are fetched; when `merge` is set
  * they are followed by the entries of `latest`, dropping the oldest ones so the feed keeps
  * its size. The listing download and every per-article download run inside `IO.blocking`.
  *
  * @param latest previously served feed, if any
  * @param merge  whether to append the entries of `latest` after the new ones
  */
def getFeed(latest: Option[SyndFeed] = None, merge: Boolean = true): IO[SyndFeed] = {
  // Prefixes site-relative references ("/path") with the site domain; anything else is returned as is.
  def absolute(ref: String): String =
    if (ref.startsWith("/") && !ref.startsWith("//")) s"$domain$ref" else ref

  // Builds one entry from a listing item; None when the item carries no link.
  def buildEntry(article: org.jsoup.nodes.Element, published: Date): Option[SyndEntry] = {
    val anchors = article.select("a[href]")
    Option.unless(anchors.isEmpty) {
      val title = anchors.text
      val link = s"$domain${anchors.attr("href")}"
      val entry = new SyndEntryImpl
      entry.setTitle(title)
      entry.setLink(link)
      entry.setPublishedDate(published)
      val docDesc = Jsoup.connect(link).get()
      val paragraphs = docDesc.select("#content-article p")
      // `first` is null when the page has no paragraph; fall back to an empty summary.
      val desc = Option(paragraphs.first).fold("") { lead =>
        lead.select("a[href]").forEach(a => a.attr("href", absolute(a.attr("href"))))
        lead.html
      }
      val images = paragraphs.select("img[src]")
      // Only site-relative sources need the domain; absolute URLs must stay untouched.
      images.forEach(i => i.attr("src", absolute(i.attr("src"))))
      val img = Option(images.first).fold("")(_.outerHtml)
      val foot = s"""<p>Seguir leyendo <a href="$link"><b>$title</b>→</a>.</p>"""
      val description = new SyndContentImpl
      description.setType("text/html")
      description.setValue(s"<p>$desc</p>$img<br>$foot")
      entry.setDescription(description)
      // Some articles may lack the author mailto link; leave the author unset in that case.
      Option(docDesc.select("#content-data:has(i.fa-user) a[href~=^mailto: .+$]").first).foreach { author =>
        entry.setAuthor(s"${author.attr("href").replace("mailto: ", "")} (${author.text})")
      }
      entry
    }
  }

  // Everything below performs network I/O (listing plus one request per new article),
  // so it must run on the blocking pool rather than inside a plain `map`.
  IO.blocking {
    val doc = Jsoup.connect(url).followRedirects(true).get()
    val news = doc.select("aside").select(s"[id~=$dateId|$contentId]")
    val feed: SyndFeed = new SyndFeedImpl()
    feed.setTitle("Noticias3d")
    feed.setLink(s"$domain/noticias.asp")
    feed.setDescription("Noticias3D")
    feed.setLanguage(locale.toLanguageTag())
    // rss_0.90, rss_0.91, rss_0.92, rss_0.93, rss_0.94, rss_1.0 rss_2.0 or atom_0.3
    feed.setFeedType("rss_2.0")
    // 0 means "nothing cached": every item on the page is new.
    val latestId = latest.flatMap(maybeId).getOrElse(0)
    // Keep date headers and every news item whose id is greater than the cached one.
    // An item without a link says nothing about ids, so it does not stop the scan.
    val fresh = news.asScala.takeWhile { item =>
      latestId == 0 || item.attr("id") != contentId ||
        Option(item.select("a[href]").first).forall(a => id(a.attr("href")) > latestId)
    }
    val (_, collected) = fresh.foldLeft((new Date(), List[SyndEntry]())) {
      case ((published, all), article) =>
        article.attr("id") match {
          case `dateId` =>
            (formatter.parse(article.select("h2").text), all)
          case `contentId` =>
            (published, buildEntry(article, published).fold(all)(_ :: all))
          case _ => (published, all)
        }
    }
    val newEntries = collected.reverse
    val updated = latest match {
      case Some(previous) if merge =>
        newEntries ++ previous.getEntries().asScala.dropRight(newEntries.size)
      case _ => newEntries
    }
    feed.setEntries(updated.asJava)
    feed
  }
}
val output = new SyndFeedOutput()
// Cache cell: the deadline until which the cached feed is considered valid, plus the feed itself.
type FeedRef = Ref[IO, (Deadline, Option[SyndFeed])]

/** Serialises a feed to XML bytes encoded as UTF-8, matching the charset announced in the response. */
def renderFeed(feed: SyndFeed): Array[Byte] = {
  val out = new ByteArrayOutputStream()
  // A bare OutputStreamWriter would use the platform default charset.
  val writer = new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8)
  output.output(feed, writer)
  writer.flush()
  out.toByteArray
}

/** Wraps a rendered feed into a 200 response with an XML / UTF-8 content type. */
def feedResponse(feed: SyndFeed): IO[Response[IO]] =
  IO.delay(renderFeed(feed)).flatMap { bytes =>
    Ok(bytes, `Content-Type`(MediaType.application.`xml`, Charset.`UTF-8`))
  }

/** HTTP routes of the feed server.
  *
  *  - `GET /feed`        serves the feed obtained through `fromCache`
  *  - `GET /feed/hot`    expires the cached deadline first, then does the same
  *  - `GET /feed/latest` answers with the id returned by `latestId`
  */
def rssService(ref: FeedRef, ttl: FiniteDuration) = HttpRoutes.of[IO] {
  case GET -> Root / "feed" =>
    fromCache(ref, ttl).flatMap(feedResponse)
  case GET -> Root / "feed" / "hot" =>
    // Atomic update: only the deadline is reset, the cached feed is kept for merging.
    ref.update { case (_, cached) => (Deadline.now, cached) } *>
      fromCache(ref, ttl).flatMap(feedResponse)
  case GET -> Root / "feed" / "latest" =>
    latestId(ref).flatMap(id => Ok(id.toString))
}
/** Yields the cached feed while its deadline has time left; otherwise fetches a feed
  * (passing whatever was cached to `getFeed`), stores it with a deadline of `ttl` from
  * now, and yields it.
  */
def fromCache(ref: FeedRef, ttl: FiniteDuration) =
  ref.get.flatMap {
    case (deadline, Some(cached)) if deadline.hasTimeLeft() =>
      IO.pure(cached)
    case (_, stale) =>
      getFeed(stale).flatMap { refreshed =>
        ref.set((ttl.fromNow, Some(refreshed))).as(refreshed)
      }
  }
// Extracts the numeric id from a news URL such as ".../noticia/95206/some-title.html".
// The capture group is digits only, so a non-numeric segment never reaches the integer
// conversion and one-digit ids are matched too.
val newsId = "^.*/noticia/(\\d+)/.+\\.html$".r
/** Id of the first entry of the cached feed, or -1 when nothing is cached or no id can be derived. */
def latestId(ref: FeedRef): IO[Int] =
  ref.get.map { case (_, cached) =>
    cached.flatMap(maybeId).getOrElse(-1)
  }
/** Id derived from the link of the feed's first entry; None when the feed has no entries
  * or the link yields -1.
  */
def maybeId(feed: SyndFeed): Option[Int] =
  feed.getEntries().asScala.headOption
    .map(first => id(first.getLink()))
    .filter(_ != -1)
/** Numeric id embedded in a news link, or -1 when the link does not match `newsId`
  * or the captured text is not a valid Int (non-numeric or out of range).
  */
def id(link: String): Int =
  link match {
    // toIntOption instead of toInt: a malformed capture must not throw.
    case newsId(digits) => digits.toIntOption.getOrElse(-1)
    case _ => -1
  }
implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
//val getRss = Request[IO](Method.GET, uri"/feed")
//val responseRss = rssService.orNotFound.run(getRss).unsafeRunSync()
//println(responseRss)
//val getLatest = Request[IO](Method.GET, uri"/feed/latest")
//val responseLatest = rssService.orNotFound.run(getLatest).unsafeRunSync()
//println(responseLatest.as[String].unsafeRunSync())
// Wires the cache cell, the routes and the server, then serves until a line is read from stdin.
val app =
  for {
    ref: FeedRef <- Ref[IO].of((Deadline.now, None))
    httpApp = Router("/" -> rssService(ref, 1.minute)).orNotFound
    server = EmberServerBuilder
      .default[IO]
      .withHost(ipv4"0.0.0.0")
      .withPort(port"8080")
      .withHttpApp(httpApp)
      .build
    // `use` releases the server even when the console interaction fails or is cancelled.
    _ <- server.use { _ =>
      Console[IO].println("Press any key to shutdown\n") *> Console[IO].readLine
    }
  } yield ()
app.unsafeRunSync()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment