Created
February 21, 2012 20:37
-
-
Save devnoo/1878752 to your computer and use it in GitHub Desktop.
Seven Languages in Seven Weeks: Scala, Day 3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import collection.mutable.HashSet | |
| import dbc.result.Tuple | |
| import scala.io._ | |
| import scala.actors._ | |
| import Actor._ | |
| import scala.util.matching.Regex | |
/** Matches an anchor tag whose first attribute is a double-quoted `href`; the URL is captured as group "link". */
val linkPattern = new Regex("""<a +href=\"([^\"]+)\"[^>]*>""", "link")

/**
 * An HTML document held in memory as a string.
 *
 * @param content the raw text of the page
 */
case class Page(content: String) {

  /** Number of characters in the page text. */
  def size = content.length

  /**
   * The distinct link targets found in the page.
   *
   * Only the captured `href` value is kept, so two anchors that point at the
   * same URL but carry different attributes count as a single link.
   *
   * @return the set of unique `href` values matched by `linkPattern`
   */
  def links(): Set[String] =
    linkPattern.findAllIn(content).matchData.map(_.group("link")).toSet

  /**
   * Counts every anchor matched by `linkPattern`, duplicates included.
   *
   * @return the total number of matches in the page text
   */
  def numberOfLinks(): Int =
    linkPattern.findAllIn(content).size
}
/** Downloads pages over the network. */
object PageLoader {

  /**
   * Fetches `url` and wraps the response body in a `Page`.
   *
   * Any exception raised while opening or reading the URL propagates to the caller.
   *
   * @param url address of the page to download
   * @return the downloaded page
   */
  def getPage(url: String): Page = {
    val source = Source.fromURL(url)
    try {
      Page(source.mkString)
    } finally {
      // Release the underlying stream even if reading fails part-way through.
      source.close()
    }
  }
}
// amazon.com was removed from this list because of an encoding bug seen with Scala 2.9.1.
val urls =
  "http://www.twitter.com/" ::
  "http://www.google.com/" ::
  "http://www.cnn.com/" ::
  Nil
/** Downloads every entry of `urls` one after another and prints its size and unique-link count. */
def getPageSizeSequentially() = {
  urls.foreach { address =>
    val fetched = PageLoader.getPage(address)
    println("Size for " + address + ": " + fetched.size)
    println("Number of links for " + address + ": " + fetched.links().size)
  }
}
/**
 * Downloads every entry of `urls` in its own actor and prints the size and
 * unique-link count of each page as the results arrive.
 *
 * Each worker always replies exactly once: with the page on success, or with
 * the exception on failure. Without the failure reply a single broken URL
 * would leave the receive loop below waiting forever for a message that
 * never comes.
 */
def getPageSizeConcurrently() = {
  val caller = self
  for (url <- urls) {
    actor {
      val outcome: Any =
        try {
          PageLoader.getPage(url)
        } catch {
          case e: Exception => e
        }
      caller ! ((url, outcome))
    }
  }
  // One reply is expected per URL, in whatever order the workers finish.
  for (i <- 1 to urls.size) {
    receive {
      case (url, page: Page) =>
        println("Size for " + url + ": " + page.size)
        println("Number of links for " + url + ": " + page.links().size)
      case (url, failure: Exception) =>
        println("Failed to fetch " + url + ": " + failure)
    }
  }
}
/**
 * Runs `method` once and prints how long it took, in seconds.
 *
 * @param method the action to time
 */
def timeMethod(method: () => Unit) = {
  val startedAt = System.nanoTime
  method()
  val elapsedNanos = System.nanoTime - startedAt
  val elapsedSeconds = elapsedNanos / 1000000000.0
  println("Method took " + elapsedSeconds + " seconds.")
}
// Run both strategies back to back so their timings can be compared.
println("Sequential run:")
timeMethod(() => getPageSizeSequentially())

println("Concurrent run")
timeMethod(() => getPageSizeConcurrently())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import collection.mutable.HashSet | |
| import dbc.result.Tuple | |
| import java.util.regex.Pattern | |
| import scala.io._ | |
| import scala.actors._ | |
| import Actor._ | |
| import scala.util.matching.Regex | |
/** Matches an anchor tag whose first attribute is a double-quoted `href`; the URL is captured as group "link". */
val linkPattern = new Regex("""<a +href=\"([^\"]+)\"[^>]*>""", "link")

/** Matches a protocol-relative URL such as `//host/path`. */
val protocolLessUrl = "//.*".r

/**
 * An HTML document together with the address it was loaded from.
 *
 * @param url     address the page was fetched from; used as the base for relative links
 * @param content the raw text of the page
 */
case class Page(url: String, content: String) {

  /** Number of characters in the page text. */
  def size = content.length

  /**
   * The distinct `href` values found in the page, exactly as written in the markup.
   *
   * @return the set of unique link targets matched by `linkPattern`
   */
  def links(): Set[String] =
    linkPattern.findAllIn(content).matchData.map(_.group("link")).toSet

  /**
   * The page's links converted to absolute URLs.
   *
   * Links starting with "http" are kept as they are and protocol-relative
   * links (`//host/...`) get an `http:` prefix. Everything else is resolved
   * against this page's `url`, so `/x` is looked up from the host root and
   * `x` from the page's directory.
   *
   * @return the set of absolute link targets
   */
  def absoluteLink(): Set[String] = {
    links().map {
      case absolute if absolute.startsWith("http")      => absolute
      case protocolLess if protocolLess.startsWith("//") => "http:" + protocolLess
      case relative                                      => resolveAgainstPage(relative)
    }
  }

  /**
   * Resolves `link` relative to `url` using `java.net.URL`.
   *
   * When either string cannot be parsed as a URL (for example a
   * `javascript:` pseudo-link) plain concatenation is used instead, so the
   * method never throws.
   */
  private def resolveAgainstPage(link: String): String =
    try {
      new java.net.URL(new java.net.URL(url), link).toString
    } catch {
      case _: java.net.MalformedURLException =>
        if (link.startsWith("/")) url + link.substring(1) else url + link
    }

  /**
   * Counts every anchor matched by `linkPattern`, duplicates included.
   *
   * @return the total number of matches in the page text
   */
  def numberOfLinks(): Int =
    linkPattern.findAllIn(content).size
}
/** Downloads pages over the network. */
object PageLoader {

  /**
   * Fetches `url` as UTF-8 text and wraps it in a `Page`.
   *
   * The URL is printed before the download starts. If the download fails with
   * an exception, a message is printed and an empty page for the same URL is
   * returned, so callers can keep summing sizes. Fatal JVM errors are not
   * intercepted.
   *
   * @param url address of the page to download
   * @return the downloaded page, or `Page(url, "")` when the fetch failed
   */
  def getPage(url: String): Page = {
    println(url)
    try {
      val source = Source.fromURL(url, "UTF-8")
      try {
        Page(url, source.mkString)
      } finally {
        // Release the underlying stream even if reading fails part-way through.
        source.close()
      }
    } catch {
      case _: Exception =>
        println("error fetching url :" + url)
        Page(url, "")
    }
  }
}
// amazon.com was removed from this list because of an encoding bug seen with Scala 2.9.1.
// "http://www.cnn.com/" is currently disabled as well.
val urls =
  "http://www.twitter.com/" ::
  "http://www.google.com/" ::
  Nil
/**
 * For every entry of `urls`, downloads the page and each page it links to,
 * one at a time, and prints the combined size.
 */
def getPageSizeSequentially() = {
  urls.foreach { address =>
    val root: Page = PageLoader.getPage(address)
    // Go through an iterator so equal sizes are not collapsed by Set semantics.
    val linkedSizes = root.absoluteLink().iterator.map(target => PageLoader.getPage(target).size)
    val combined = root.size + linkedSizes.sum
    println("Size for " + address + ": " + combined)
  }
}
/**
 * Downloads `url` and, in parallel, every page it links to.
 *
 * `foldLeft` on a parallel collection is executed sequentially, so the linked
 * pages are fetched with a parallel `map` and summed afterwards. The links are
 * converted to a list first: mapping the set directly would produce a set of
 * sizes and silently drop pages that happen to have the same length.
 *
 * @param url address of the root page
 * @return size of the root page plus the sizes of all pages it links to
 */
def getTotalPageSizeConcurrently(url: String): Int = {
  val page: Page = PageLoader.getPage(url)
  val linkedSizes = page.absoluteLink().toList.par.map(link => PageLoader.getPage(link).size)
  page.size + linkedSizes.sum
}
/** Processes every entry of `urls` in parallel and prints the combined size reported for each. */
def getPageSizeConcurrently() = {
  for (address <- urls.par) {
    val combined = getTotalPageSizeConcurrently(address)
    println("Size for " + address + ": " + combined)
  }
}
/**
 * Runs `method` once and prints how long it took, in seconds.
 *
 * @param method the action to time
 */
def timeMethod(method: () => Unit) = {
  val startedAt = System.nanoTime
  method()
  val elapsedNanos = System.nanoTime - startedAt
  val elapsedSeconds = elapsedNanos / 1000000000.0
  println("Method took " + elapsedSeconds + " seconds.")
}
// Run both strategies back to back so their timings can be compared.
println("Sequential run:")
timeMethod(() => getPageSizeSequentially())

println("Concurrent run")
timeMethod(() => getPageSizeConcurrently())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dbc.result.Tuple | |
| import scala.io._ | |
| import scala.actors._ | |
| import Actor._ | |
| import scala.util.matching.Regex | |
/** Matches an anchor tag whose first attribute is a double-quoted `href`; the URL is captured as group "link". */
val linkPattern = new Regex("""<a +href=\"([^\"]+)\"[^>]*>""", "link")

/** Downloads pages and reports basic statistics about them. */
object PageLoader {

  /**
   * Downloads `url`, prints each distinct link found in it as `url => link`,
   * and reports the page length together with the number of distinct links.
   *
   * Any exception raised while opening or reading the URL propagates to the caller.
   *
   * @param url address of the page to download
   * @return a pair of (number of characters in the page, number of unique links)
   */
  def getPageSizeAndLinkCount(url: String): (Int, Int) = {
    val source = Source.fromURL(url)
    val text =
      try {
        source.mkString
      } finally {
        // Release the underlying stream even if reading fails part-way through.
        source.close()
      }
    val links = linkPattern.findAllIn(text).matchData.map(_.group("link")).toSet
    links.foreach(link => println(url + " => " + link))
    (text.length, links.size)
  }
}
// amazon.com was removed from this list because of an encoding bug seen with Scala 2.9.1.
val urls =
  "http://www.twitter.com/" ::
  "http://www.google.com/" ::
  "http://www.cnn.com/" ::
  Nil
/**
 * Runs `method` once and prints how long it took, in seconds.
 *
 * @param method the action to time
 */
def timeMethod(method: () => Unit) = {
  val startedAt = System.nanoTime
  method()
  val elapsedNanos = System.nanoTime - startedAt
  val elapsedSeconds = elapsedNanos / 1000000000.0
  println("Method took " + elapsedSeconds + " seconds.")
}
/**
 * Downloads every entry of `urls` one after another and prints its size and
 * unique-link count.
 *
 * Each page is fetched once and both figures are taken from that single
 * result; asking the loader separately for each figure would download and
 * parse the page twice and print its link listing twice.
 */
def getPageSizeSequentially() = {
  for (url <- urls) {
    val (size, linkCount) = PageLoader.getPageSizeAndLinkCount(url)
    println("Size for " + url + ": " + size)
    println("Number of links for " + url + ": " + linkCount)
  }
}
/**
 * Downloads every entry of `urls` in its own actor and prints the size and
 * unique-link count of each page as the results arrive.
 *
 * Each worker always replies exactly once: with the (size, link count) pair
 * on success, or with the exception on failure. Without the failure reply a
 * single broken URL would leave the receive loop below waiting forever.
 */
def getPageSizeConcurrently() = {
  val caller = self
  for (url <- urls) {
    actor {
      val outcome: Any =
        try {
          PageLoader.getPageSizeAndLinkCount(url)
        } catch {
          case e: Exception => e
        }
      caller ! ((url, outcome))
    }
  }
  // One reply is expected per URL, in whatever order the workers finish.
  for (i <- 1 to urls.size) {
    receive {
      // Match the tuple's elements rather than Tuple2[Int, Int]: the type
      // arguments are erased at runtime, so that pattern cannot be checked.
      case (url, (size: Int, linkCount: Int)) =>
        println("Size for " + url + ": " + size)
        println("Number of links for " + url + ": " + linkCount)
      case (url, failure: Exception) =>
        println("Failed to fetch " + url + ": " + failure)
    }
  }
}
// Run both strategies back to back so their timings can be compared.
println("Sequential run:")
timeMethod(() => getPageSizeSequentially())

println("Concurrent run")
timeMethod(() => getPageSizeConcurrently())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment