Skip to content

Instantly share code, notes, and snippets.

@felipehummel
Created February 6, 2012 20:40
Show Gist options
  • Save felipehummel/1754692 to your computer and use it in GitHub Desktop.
Save felipehummel/1754692 to your computer and use it in GitHub Desktop.
wiki cleaning
/**
 * Opens the file named by `wikiTitlesDumpFile` and returns an iterator over its lines.
 *
 * The iterator comes straight from `Source.getLines()`, so the file is read as the
 * caller consumes it.
 */
private def wikiTitles() = {
  val dumpSource = io.Source.fromFile(wikiTitlesDumpFile)
  // NOTE(review): `dumpSource` is never closed in this method, so the underlying file
  // handle stays open after the iterator is exhausted. Closing it here would require
  // materialising all lines first; confirm whether that is acceptable for the dump size.
  dumpSource.getLines()
}
/**
 * Runs one phrase search per distinct usable Wikipedia title and prints the hit count.
 *
 * Titles are read via `wikiTitles()`, filtered with `titleIsUsable`, normalised with
 * `cleanLine` and de-duplicated into a `Set` (so the whole filtered title list is held
 * in memory and the print order is unspecified). Each title is then searched against
 * the `_all` field of `buskIndex`, blocking on `actionGet()` for every request.
 *
 * Despite the name, the visible code does not index anything: it only searches and
 * prints `<title> = <totalHits>` to standard output.
 */
def retrieveAndIndex(): Unit = {
  val titles = wikiTitles().filter( titleIsUsable(_) ).map( cleanLine(_) ).toSet
  for (title <- titles) {
    val query = textPhraseQuery("_all", title)
    // execute().actionGet() yields the search *response*, hence the name.
    val response = esClient.prepareSearch(buskIndex)
      .setQuery(query)
      .execute()
      .actionGet()
    val totalHits = response.hits().totalHits()
    println(title + " = " + totalHits)
  }
}
/**
 * Converts a raw dump title into plain text.
 *
 * Underscores become spaces, then everything from the first " (" up to the last ")"
 * is dropped. The regex is greedy, so "A (b) c (d)" becomes "A".
 *
 * @param line raw title, e.g. "Mercury_(planet)"
 * @return the cleaned title, e.g. "Mercury"
 */
def cleanLine(line: String) = {
  val spaced = line.replaceAll("_", " ")
  // removes disambiguation parenthesis
  val withoutDisambiguation = spaced.replaceAll(""" \(.*\)""", "")
  withoutDisambiguation
}
/**
 * Decides whether a raw (underscore-separated) Wikipedia title should be kept.
 *
 * A title is rejected when any of the following holds:
 *  - it starts with a character that is neither a letter nor an ASCII digit;
 *  - it starts with "List_" (a "List of ..." page);
 *  - it starts with "Category:" (a category page);
 *  - it contains more digits than letters.
 *
 * Previously the combined condition was evaluated and then discarded, because a
 * trailing `true` was the block's result, so every title was accepted.
 *
 * NOTE(review): the digits-vs-letters clause is negated here on the assumption that
 * digit-heavy titles are meant to be rejected like the other three cases; confirm.
 *
 * @param wikiTitle title exactly as it appears in the dump (before `cleanLine`)
 * @return `true` when none of the rejection criteria apply
 */
def titleIsUsable(wikiTitle: String) : Boolean =
  !wikiTitle.matches("^[^\\p{L}0-9]+.*") && // starts with something that is neither a letter nor a digit
  !wikiTitle.matches("^List_.*") && // is a page List of something...
  !wikiTitle.matches("^Category:.*") && // is a Category page
  !hasMoreDigitsThanLetters(wikiTitle) // is mostly digits
/**
 * Tells whether the title contains strictly more digit characters than letter characters.
 *
 * @param wikiTitle the title to inspect
 * @return `true` only when the digit count exceeds the letter count; ties and the
 *         empty string give `false`
 */
def hasMoreDigitsThanLetters(wikiTitle: String) = {
  val digitCount = wikiTitle.count( _.isDigit )
  val letterCount = wikiTitle.count( _.isLetter )
  digitCount > letterCount
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment