Created
March 23, 2015 14:52
-
-
Save camman3d/bc8e3ee9b3eefb0871d6 to your computer and use it in GitHub Desktop.
Extracts data from http://www.nuforc.org and writes it to a .CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{File, PrintWriter} | |
import org.jsoup.Jsoup | |
import org.jsoup.nodes.Element | |
import collection.JavaConversions._ | |
/**
 * Created by josh on 3/20/15.
 *
 * Extracts data from http://www.nuforc.org and writes it to a .CSV file
 */
object Driver {

  /** Index page whose table links lead to the individual report listing pages. */
  val indexURL: String = "http://www.nuforc.org/webreports/ndxevent.html"

  /** Upper bound on how many listing pages (taken from the front of the index) are scraped. */
  val numPages: Int = 50

  /**
   * Downloads the index page and collects the `abs:href` attribute of every
   * anchor found inside a table.
   *
   * Relies on the implicit Java-to-Scala collection conversions imported at the
   * top of the file to call `map` on jsoup's `Elements`.
   *
   * @return the link targets in document order
   */
  def getPageURLs: Vector[String] = {
    Jsoup.connect(indexURL).get()
      .select("table a")
      .map(_.attr("abs:href"))
      .toVector
  }

  /** One scraped table row; the fields are the texts of the row's first six `td` cells. */
  case class RowData(date: String, city: String, state: String, shape: String, duration: String, summary: String)

  /**
   * Converts a table row into a [[RowData]] using the text of its `td` cells.
   *
   * @param row a `tr` element
   * @return the first six cell texts; cells the row does not have become `""`
   */
  def getRowData(row: Element): RowData = {
    val cells = row
      .select("td")
      .map(_.text())
      .toVector

    // A row with fewer than six cells (spacer/malformed rows) previously threw
    // IndexOutOfBoundsException and aborted the whole extraction; pad instead.
    def cell(index: Int): String = cells.lift(index).getOrElse("")

    RowData(cell(0), cell(1), cell(2), cell(3), cell(4), cell(5))
  }

  /**
   * Downloads one listing page and converts its rows.
   *
   * The selector skips every `tr` that is the first child of its parent
   * (presumably the header row - confirm against the page markup).
   *
   * @param url absolute URL of a listing page
   * @return one [[RowData]] per selected row, in document order
   */
  def getData(url: String): Vector[RowData] = {
    println(s"Extracting data from $url")
    Jsoup.connect(url).get()
      .select("tr:not(:first-child)")
      .map(getRowData)
      .toVector
  }

  /**
   * Writes the rows to a text file, one row per line, fields separated by tabs.
   *
   * @param data rows to write
   * @param path destination file; defaults to the location the original code used
   * @throws java.io.IOException if the writer reports a failed write
   */
  def writeCsv(data: Vector[RowData], path: String = "./data.csv"): Unit = {
    println("Writing .CSV file")
    // Explicit UTF-8 so scraped non-ASCII text does not depend on the platform default charset.
    val writer = new PrintWriter(new File(path), "UTF-8")
    try {
      data.foreach(row => writer.println(row.productIterator.mkString("\t")))
      // PrintWriter never throws on write failure; checkError() flushes and reports it.
      if (writer.checkError()) {
        throw new java.io.IOException(s"Failed writing to $path")
      }
    } finally {
      // Always release the file handle, even when a write fails part-way through.
      writer.close()
    }
  }

  /** Entry point: scrapes up to `numPages` listing pages and writes all rows to disk. */
  def main(args: Array[String]): Unit = {
    val urls = getPageURLs
    val data = urls
      .take(numPages)
      .flatMap(getData)
    println(s"${data.size} entries extracted")
    writeCsv(data)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment