Created
June 25, 2016 14:43
-
-
Save josdirksen/98de8ae7ff5751b939ed16cab7010628 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.smartjava.dw | |
import java.io.{File, FileInputStream} | |
import com.github.mtailor.srtdissector.SrtDissector | |
import scala.annotation.tailrec | |
import scala.collection.immutable.ListMap | |
import scala.io.Source | |
import scala.util.{Failure, Success} | |
/**
 * Counts how often the words listed in `dwList.txt` occur in a subtitle file and prints
 * several reports to stdout: two substring counts, the per-word totals, a `word,second`
 * CSV and a per-bucket CSV (bucket width is [[groupoPerSeconds]] seconds).
 *
 * Word timestamps are an approximation: the duration of a subtitle block is spread over
 * its words proportionally to the number of characters in each word.
 */
object DWCounter extends App {

  // Read the word list eagerly so the underlying file handle can be released right away.
  private val dwSource =
    Source.fromFile(getClass.getClassLoader.getResource("dwList.txt").toURI)
  val dws = try dwSource.getLines().toList finally dwSource.close()

  // Set view of `dws`: membership tests run once per subtitle word.
  private val dwSet = dws.toSet

  val subs = new File(getClass.getClassLoader.getResource("srt/wolfofwallstreet.srt").toURI)

  // NOTE(review): assumes SrtDissector parses the stream eagerly (it returns a Try), so the
  // stream can be closed as soon as the call returns -- confirm against the library.
  private val subsStream = new FileInputStream(subs)
  val dissector = try SrtDissector(subsStream) finally subsStream.close()

  /** A lower-cased word together with its approximate start time (same unit as the srt block times). */
  case class TimedWord(word: String, time: Double)

  val n = dissector match {
    case Success(srt) =>
      // add timing information to the individual words
      // approximation, just calculating total time of block, divided by each word
      srt.flatMap { srtBlock =>
        val totalTime = srtBlock.end - srtBlock.start
        val asWords = toWords(srtBlock.lines)
        val totalWordSize = asWords.map(_.length).sum
        // Only used when the block has at least one word, so the divisor is never 0 in practice.
        val timePerCharacter = totalTime.toDouble / totalWordSize

        @tailrec
        def toTimedWords(words: List[String], currentTime: Double, totalWords: List[TimedWord]): List[TimedWord] =
          words match {
            case head :: tail =>
              val endTime = currentTime + head.size * timePerCharacter
              toTimedWords(tail, endTime, TimedWord(head, currentTime) :: totalWords)
            // The accumulator is built by prepending; reverse it to restore chronological order.
            case Nil => totalWords.reverse
          }

        toTimedWords(asWords.toList, srtBlock.start.toDouble, Nil)
      }
    case Failure(e) =>
      println(s"Failed to parse srt: ${e.getMessage}.")
      throw e
  }

  // Latest word start time seen; 0 when there are no words.
  val maxTime = n.foldLeft(0d)((latest, timedWord) => math.max(latest, timedWord.time))

  println("Words with 'fuck' in them: " + n.count(_.word.contains("fuck")))
  println("Words with 'bitch' in them: " + n.count(_.word.contains("bitch")))

  // Number of occurrences per listed word.
  val counted = n.foldLeft(Map[String, Int]()) { (totals, timedWord) =>
    if (dwSet.contains(timedWord.word)) {
      totals + (timedWord.word -> (totals.getOrElse(timedWord.word, 0) + 1))
    } else {
      totals
    }
  }

  // Same totals, ordered from most to least frequent.
  val sorted = ListMap(counted.toSeq.sortBy(_._2).reverse: _*)
  println(sorted)

  // Only the words that appear in the list, computed once for both reports below.
  private val listedWords = n.filter(timedWord => dwSet.contains(timedWord.word))

  // output as a simple csv
  // (time / 1000 is presumably milliseconds -> seconds; verify against SrtDissector)
  listedWords.foreach { timedWord =>
    println(s"${timedWord.word},${(timedWord.time / 1000).toLong}")
  }

  val groupoPerSeconds = 60
  val groupedMinute = listedWords.groupBy { w =>
    Math.floor((w.time / 1000) / groupoPerSeconds).toLong
  }
  println(groupedMinute)

  val countedPerMinute = groupedMinute.map {
    case (bucket, words) => bucket -> words.length
  }

  // Print every bucket up to two past the last one containing a word, so that empty
  // buckets show up as 0. Uses the same bucket width as the grouping above.
  private val lastBucket = (maxTime / (1000 * groupoPerSeconds)).toLong + 2
  (0L to lastBucket).foreach { i =>
    val toPrint = countedPerMinute.getOrElse(i, 0)
    println(s"$i,$toPrint")
  }

  /**
   * Adds the occurrences of listed words found in `words` to `currentCount`.
   *
   * @param words        words to inspect
   * @param currentTime  unused; kept so existing callers keep compiling
   * @param currentCount counts accumulated so far
   * @return `currentCount` updated with one increment per listed word in `words`
   */
  def countSwears(words: Seq[String], currentTime: Long, currentCount: Map[String, Int]): Map[String, Int] =
    words.foldLeft(currentCount) { (totals, word) =>
      if (dwSet.contains(word)) totals + (word -> (totals.getOrElse(word, 0) + 1))
      else totals
    }

  /** Splits every line into its runs of ASCII letters, lower-cased. */
  def toWords(lines: Seq[String]) = lines flatMap { line =>
    "[a-zA-Z]+".r findAllIn line map (_.toLowerCase)
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<head> | |
<style> | |
#chart { | |
margin-left: 50px; | |
} | |
rect.bordered { | |
stroke: #E6E6E6; | |
stroke-width:1px; | |
rx: 2px; | |
ry: 2px; | |
} | |
text.mono { | |
font-size: 9pt; | |
font-family: Consolas, courier; | |
fill: #aaa; | |
} | |
.title { | |
font-size: 20px; | |
font-family: sans-serif; | |
} | |
</style> | |
<script src="//d3js.org/d3.v3.min.js"></script> | |
<script src="js/colorbrewer.min.js"></script> | |
</head> | |
<body> | |
<div style="margin: 0px; column-count: 3"> | |
<div id="chart"></div> | |
</div> | |
<script type="text/javascript"> | |
// Layout and colour configuration shared by every heat map on the page.
var margin = { top: 60, right: 0, bottom: 100, left: 50 },
    width = 700 - margin.left - margin.right,
    height = 330 - margin.top - margin.bottom,
    // Side length of one heat-map cell, in pixels.
    gridSize = Math.floor(width / 34),
    legendElementWidth = gridSize * 2,
    // Number of colour classes; also selects the ColorBrewer palette size.
    buckets = 9,
    colors = colorbrewer.OrRd[buckets];

// Declared explicitly: the original assignment followed a ';' and therefore
// created an implicit global instead of being part of the var statement.
var datasets = ["../resources/results/swearsPerMinuteLoebowski.csv"];
/**
 * Appends one heat map to #chart. Each row of the CSV becomes one cell whose
 * colour encodes the `swears` column; the `time` column selects the cell
 * (24 cells per row).
 *
 * @param {string} tsvFile URL of the CSV file (columns read: `time`, `swears`).
 * @param {string} title   Caption drawn in the top-left corner of the SVG.
 */
var addDiagram = function(tsvFile, title) {
  var svg = d3.select("#chart").append("svg")
      .attr("width", width + margin.left + margin.right)
      .attr("height", height + margin.top + margin.bottom);

  var chartGroup = svg.append("g")
      .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
  var textGroup = svg.append("g");

  d3.csv(tsvFile,
    // Row accessor: turn the time index into a (row, column) grid position.
    function(d) {
      var minute = +d.time;
      var row = Math.floor(minute / 24);
      var column = minute % 24;
      return {
        day: row,
        minute: column,
        value: +d.swears
      };
    },
    function(error, data) {
      // Without this guard a failed request surfaces as an unrelated
      // TypeError on `data.length` below.
      if (error) throw error;

      // we could use max to create a more specific range.
      // var max = d3.max(data, function (d) { return d.value; });
      var totalHeight = (Math.ceil(data.length / 30)) * (gridSize) + margin.top + margin.bottom;
      svg.attr('height', totalHeight);

      // a colorscale for grouping the colors
      var colorScale = d3.scale.quantile().domain([0, buckets - 1, 30]).range(colors);

      var spm = chartGroup.selectAll(".minute")
          .data(data, function(d) { return d.day + ':' + d.minute; });

      // New cells start in the lightest colour and are faded to their value below.
      spm.enter().append("rect")
          .attr("x", function(d) { return (d.minute - 1) * gridSize; })
          .attr("y", function(d) { return (d.day - 1) * gridSize; })
          .attr("rx", 4)
          .attr("ry", 4)
          .attr("class", "minute bordered")
          .attr("width", gridSize)
          .attr("height", gridSize)
          .style("fill", colors[0]);

      spm.transition().duration(1000)
          .style("fill", function(d) { return colorScale(d.value); });

      spm.exit().remove();

      // One legend entry per colour class: 0 plus the quantile thresholds.
      var legend = chartGroup.selectAll(".legend")
          .data([0].concat(colorScale.quantiles()), function(d) { return d; });

      legend.enter().append("g")
          .attr("class", "legend");

      var legendPos = ((Math.ceil(data.length / 30) + 2) * (gridSize));

      legend.append("rect")
          .attr("x", function(d, i) { return legendElementWidth * i; })
          .attr("y", legendPos)
          .attr("width", legendElementWidth)
          .attr("height", gridSize / 2)
          .style("fill", function(d, i) { return colors[i]; });

      legend.append("text")
          .attr("class", "mono")
          .text(function(d) { return "≥ " + Math.round(d); })
          .attr("x", function(d, i) { return legendElementWidth * i; })
          .attr("y", legendPos + gridSize);

      legend.exit().remove();

      textGroup.append("text")
          .attr("x", 0)
          .attr("y", 20)
          .attr("class", "title")
          .text(title);
    });
};
// Heat maps to render, in display order: [csv path, caption].
[
  ["../resources/results/swearsPerMinuteStraightOutOfCompton.csv", "Straight out of Compton"],
  ["../resources/results/deadpool.csv", "Deadpool"],
  ["../resources/results/swearsPerMinuteLoebowski.csv", "The Big Lebowski"],
  ["../resources/results/swaersPerMinuteSwearnet.csv", "Swearnet: the Movie"],
  ["../resources/results/swearsPerTwoMinutesWolf.csv", "Wolf of Wallstreet"],
  ["../resources/results/swearsPerMinuteStarwarsForce.csv", "Star-Wars: The Force Awakens"],
  ["../resources/results/swearsPerMinuteJurrasicWorld.csv", "Jurassic World"]
].forEach(function(entry) {
  addDiagram(entry[0], entry[1]);
});
</script> | |
</body> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<style> | |
.bar { | |
fill: gray; | |
stroke: black; | |
} | |
.bar:hover { | |
fill: brown; | |
} | |
.axis { | |
font: 10px sans-serif; | |
} | |
.axis path, | |
.axis line { | |
fill: none; | |
stroke: #000; | |
shape-rendering: crispEdges; | |
} | |
.x.axis path { | |
display: none; | |
} | |
</style> | |
<body> | |
<script src="//d3js.org/d3.v3.min.js"></script> | |
<script> | |
/**
 * Appends one bar chart to <body>: one bar per CSV row, positioned by the
 * `time` column and sized by the `swears` column (y axis fixed to 0..30).
 *
 * @param {string} source URL of the CSV file (columns read: `time`, `swears`).
 * @param {string} title  Caption drawn near the top-right corner of the chart.
 */
function addDiagram(source, title) {
  var margin = {top: 20, right: 20, bottom: 30, left: 40},
      width = 960 - margin.left - margin.right,
      height = 150 - margin.top - margin.bottom;

  // rangeBands defines the output range; a preceding range() call would be overwritten.
  var x = d3.scale.ordinal()
      .rangeBands([0, width], .1);

  var y = d3.scale.linear()
      .range([height, 0]);

  // Only label every fifth tick to keep the axis readable.
  var xAxis = d3.svg.axis()
      .scale(x)
      .tickFormat(function (d) { return (+d % 5 === 0) ? d : ''; })
      .orient("bottom");

  var yAxis = d3.svg.axis()
      .scale(y)
      .orient("left");

  var svg = d3.select("body").append("svg")
      .attr("width", width + margin.left + margin.right)
      .attr("height", height + margin.top + margin.bottom)
    .append("g")
      .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

  d3.csv(source, type, function(error, data) {
    if (error) throw error;

    x.domain(data.map(function(d) { return +d.time; }));
    y.domain([0, 30]);

    svg.append("text")
        .attr("x", (width - 150))
        .attr("y", 15)
        .attr("class", "axis")
        .style("font-size", "12px")
        .text(title);

    svg.append("g")
        .attr("class", "x axis")
        .attr("transform", "translate(0," + height + ")")
        .call(xAxis);

    svg.append("g")
        .attr("class", "y axis")
        .call(yAxis)
      .append("text")
        .attr("transform", "rotate(-90)")
        .attr("y", 6)
        .attr("dy", ".71em")
        .style("text-anchor", "end")
        .text("Swears / minute");

    svg.selectAll(".bar")
        .data(data)
      .enter().append("rect")
        .attr("class", "bar")
        // Coerce exactly as the domain does so the lookup key always matches.
        .attr("x", function(d) { return x(+d.time); })
        .attr("width", x.rangeBand())
        .attr("y", function(d) { return y(d.swears); })
        .attr("height", function(d) { return height - y(d.swears); });
  });
}
/**
 * Row accessor for d3.csv: converts the `swears` field of a parsed row to a
 * number in place and hands the same row object back.
 */
function type(row) {
  var swearCount = Number(row.swears);
  row.swears = swearCount;
  return row;
}
// Bar charts to render, in display order: [csv path, caption].
[
  ["../resources/results/swearsPerTwoMinutesWolf.csv", "Wolf of Wallstreet"],
  ["../resources/results/swearsPerMinuteStraightOutOfCompton.csv", "Straight out of Compton"],
  ["../resources/results/swaersPerMinuteSwearnet.csv", "Swearnet: the Movie"],
  ["../resources/results/swearsPerMinuteLoebowski.csv", "The Big Lebowski"],
  ["../resources/results/swearsPerMinuteStarwarsForce.csv", "Star-Wars: The Force Awakens"],
  ["../resources/results/swearsPerMinuteJurrasicWorld.csv", "Jurassic World"]
].forEach(function(entry) {
  addDiagram(entry[0], entry[1]);
});
</script> | |
</body> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment