Last active
February 12, 2017 12:28
-
-
Save gmenard/8424118 to your computer and use it in GitHub Desktop.
Given an arbitrary text document written in English, this solution will generate an alphabetical list of all word occurrences with their frequencies and the sentence numbers in which they appear.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.nio.charset.Charset; | |
import java.nio.charset.StandardCharsets; | |
import java.nio.file.Files; | |
import java.nio.file.Paths; | |
import java.text.BreakIterator; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.Locale; | |
import java.util.Map; | |
import java.util.TreeMap; | |
public class WordFrequencies { | |
private Locale locale; | |
private Map<String, List<Integer>> concordanceMap; // Key = Word, Value = list of sentence numbers in which word appears | |
public WordFrequencies(Locale locale) { | |
this.locale = locale; | |
this.concordanceMap = new TreeMap<String, List<Integer>>(); | |
} | |
public void build(String text) { | |
BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(this.locale); | |
BreakIterator wordIterator = BreakIterator.getWordInstance(this.locale); | |
int sentenceCounter = 0; | |
int sentenceIdx = 0; | |
sentenceIterator.setText(text); | |
// Go through sentences | |
while (BreakIterator.DONE != sentenceIterator.next()) { | |
String sentence = text.substring(sentenceIdx, sentenceIterator.current()); | |
sentenceCounter++; | |
int wordIdx = 0; | |
wordIterator.setText(sentence); | |
// Go through words | |
while (BreakIterator.DONE != wordIterator.next()) { | |
String word = sentence.substring(wordIdx, wordIterator.current()).toLowerCase(); | |
if (Character.isLetterOrDigit(word.charAt(0))) { | |
if (this.concordanceMap.containsKey(word)) { | |
this.concordanceMap.get(word).add(sentenceCounter); | |
} else { | |
List<Integer> sentenceNumbers = new ArrayList<Integer>(); | |
sentenceNumbers.add(sentenceCounter); | |
this.concordanceMap.put(word, sentenceNumbers); | |
} | |
} | |
wordIdx = wordIterator.current(); | |
} | |
sentenceIdx = sentenceIterator.current(); | |
} | |
} | |
public void print() { | |
for (String word : this.concordanceMap.keySet()) { | |
List<Integer> sentenceNumbers = this.concordanceMap.get(word); | |
System.out.println(word + " {" + sentenceNumbers.size() + ":" + sentenceNumbers.toString() + "}"); | |
} | |
} | |
public static void main(String[] args) { | |
String fileName = (args.length == 0 ? "input.txt" : args[0]); | |
Charset inputEncoding = StandardCharsets.UTF_8; | |
try { | |
String text = new String(Files.readAllBytes(Paths.get(fileName)), inputEncoding); | |
if (null != text && !text.trim().isEmpty()) { | |
WordFrequencies solution = new WordFrequencies(Locale.US); | |
// Build concordance Map | |
solution.build(text); | |
// Print Result | |
solution.print(); | |
} else { | |
throw new IllegalArgumentException(); | |
} | |
} catch (IOException e) { | |
System.out.println("Error: Could not load file " + fileName); | |
} catch (IllegalArgumentException e) { | |
System.out.println("Text file '" + fileName + "' is empty!"); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment