Created
September 29, 2015 00:31
-
-
Save dhagan/091fbdadb597f20b7571 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.nio.file.*; | |
import java.util.*; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Builds and prints a concordance for a text file: for every distinct word
 * (case-insensitive) it reports how many times the word occurs and the
 * 1-based number of each sentence it occurs in.
 *
 * <p>Usage: {@code java concordance <filename>}
 */
public class concordance {

    /**
     * Matches one word: a run of non-whitespace characters trimmed to word
     * boundaries, optionally followed by a single period when the run ends
     * in ".x" (so abbreviations such as "i.e." keep their final period).
     * Only a literal period is re-attached; any other following character
     * (space, comma, ...) is left out of the token.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("\\b[^\\s]+\\b(?:(?<=\\.\\w)\\.)?");

    /**
     * Matches the whitespace between two sentences: preceded by '.', '!' or
     * '?' and followed by an upper-case ASCII letter.
     */
    private static final Pattern SENTENCE_BOUNDARY =
            Pattern.compile("(?<=[.!?])\\s+(?=[A-Z])");

    /**
     * Program entry point. Prints a usage message and exits with a non-zero
     * status when no file name is given; otherwise delegates to
     * {@link #_main(String[])}.
     *
     * @param args command-line arguments; {@code args[0]} is the input file
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.err.println("Proper Usage is: java concordance <<filename>>");
            // Missing argument is an error, so signal failure to the caller.
            System.exit(1);
        }
        _main(args);
    }

    /**
     * Reads the file named by {@code args[0]}, builds the concordance and
     * prints it to standard output in alphabetical word order. If the file
     * cannot be read, a message is written to standard error and the method
     * returns without printing a concordance.
     *
     * @param args arguments as passed to {@link #main(String[])};
     *             {@code args[0]} must be present
     */
    static void _main(String[] args) {
        String myText;
        try {
            // deficiency - the whole file is loaded in memory; very large
            // inputs would need a streaming strategy.
            // Decode explicitly as UTF-8 rather than the platform default.
            myText = new String(
                    Files.readAllBytes(Paths.get(args[0])),
                    java.nio.charset.StandardCharsets.UTF_8);
        } catch (NoSuchFileException e) {
            System.err.println(args[0] + " File not found! Please check that the file exists.");
            return;
        } catch (IOException e) {
            System.err.println("Could not read " + args[0] + ": " + e.getMessage());
            return;
        }
        printMap(buildConcordance(myText));
    }

    /**
     * Builds the word-to-occurrence map for a block of text. Keys are the
     * lower-cased words; the returned map iterates in natural key order.
     */
    private static Map<String, WordInfo> buildConcordance(String text) {
        Map<String, WordInfo> concordanceMap = new TreeMap<>();
        int sentenceNumber = 1;
        for (String sentence : splitSentences(text)) {
            for (String word : splitWords(sentence)) {
                // Locale.ROOT keeps case folding independent of the JVM locale.
                String key = word.toLowerCase(Locale.ROOT);
                WordInfo info = concordanceMap.get(key);
                if (info == null) {
                    concordanceMap.put(key, new WordInfo(key, sentenceNumber));
                } else {
                    info.WordCount++;
                    // A word repeated within one sentence lists that sentence
                    // once per occurrence.
                    info.SentenceNumbers.add(sentenceNumber);
                }
            }
            sentenceNumber++;
        }
        return concordanceMap;
    }

    /**
     * Prints one line per map entry to standard output, in the map's
     * iteration order. Each line is the key left-justified in a 20-character
     * column followed by <code>{count:s1,s2,...}</code>.
     *
     * @param map word to occurrence info
     */
    public static void printMap(Map<String, WordInfo> map) {
        for (Map.Entry<String, WordInfo> entry : map.entrySet()) {
            WordInfo wordInfo = entry.getValue();
            StringJoiner sentenceNumbers = new StringJoiner(",");
            for (Integer number : wordInfo.SentenceNumbers) {
                sentenceNumbers.add(String.valueOf(number));
            }
            String value = "{" + wordInfo.WordCount + ":" + sentenceNumbers + "}";
            System.out.println(String.format("%-20s %s", entry.getKey(), value));
        }
    }

    /**
     * Splits a sentence into words. Leading and trailing punctuation is
     * dropped, except that a period directly following a ".x" ending is kept
     * so that abbreviations like "i.e." survive intact.
     *
     * <p>Known limitation: a token such as "1.2" at the very end of a
     * sentence also keeps the sentence-final period ("1.2."), because it is
     * indistinguishable from an abbreviation.
     *
     * @param sentence text of a single sentence
     * @return the words in order of appearance; empty if there are none
     */
    static List<String> splitWords(String sentence) {
        List<String> allMatches = new ArrayList<>();
        Matcher matcher = WORD_PATTERN.matcher(sentence);
        while (matcher.find()) {
            allMatches.add(matcher.group());
        }
        return allMatches;
    }

    /**
     * Splits a block of text into sentences. A boundary is whitespace that
     * follows '.', '!' or '?' and precedes an upper-case ASCII letter; the
     * whitespace itself is discarded.
     *
     * @param text the full text
     * @return the sentences in order; a text with no boundary is returned as
     *         a single element
     */
    static String[] splitSentences(String text) {
        return SENTENCE_BOUNDARY.split(text);
    }

    /**
     * Holder for the occurrence data of one word.
     * deficiency - next iteration: hide the public mutable members behind
     * accessors (kept public here so existing callers keep working).
     */
    public static class WordInfo {
        /**
         * Creates the info for a word seen for the first time.
         *
         * @param word           the word
         * @param sentenceNumber number of the sentence of the first occurrence
         */
        public WordInfo(String word, int sentenceNumber) {
            Word = word;
            WordCount = 1;
            SentenceNumbers.add(sentenceNumber);
        }

        /** The word this entry describes. */
        public String Word;
        /** Total number of occurrences recorded so far. */
        public int WordCount;
        /** Sentence number of each occurrence, in order of appearance. */
        public List<Integer> SentenceNumbers = new ArrayList<>();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks for sharing this.