Created
July 8, 2009 17:15
-
-
Save arosien/142978 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package asr.nestedsets; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import java.util.HashSet; | |
import java.util.Set; | |
import java.util.SortedSet; | |
import java.util.TreeSet; | |
/** | |
* Run with -Xmx512m for the standard wordlist.txt, i.e., "java -Xmx512m asr.nestedsets.LargestNestedSet wordlist.txt". | |
*/ | |
public class LargestNestedSet | |
{ | |
// Hold the words in a trie (prefix tree), using O(unique prefixes) space. | |
private final TrieNode root = new TrieNode(null); | |
// wordDepth of the TrieNode(s) with the greatest wordDepth, potentially updated on each insert(). | |
private int largestWordDepth = -1; | |
// There may be more than one largest nested set, potentially updated on each insert(). | |
private final Set<SortedSet<String>> largestNestedSets = new HashSet<SortedSet<String>>(); | |
/** | |
* Read a wordlist and print the largest nested set(s). | |
* | |
* @param args args[0] is the filename to read to compute the largest nested set, other args ignored | |
* @throws IOException | |
*/ | |
public static void main(String[] args) throws IOException | |
{ | |
LargestNestedSet lns = new LargestNestedSet(); | |
BufferedReader reader = new BufferedReader(new FileReader(args[0])); | |
String word; | |
int count = 0; | |
// Insert words, showing progress as we go. Incremental memory use should decrease as more trie nodes are shared. | |
long prevUsedMemory = 0; | |
System.err.println(String.format("One '.' per 1000 words processed, used memory is %s", usedMemory())); | |
while ((word = reader.readLine()) != null) { | |
lns.insert(word); | |
if (count++ % 1000 == 0) { | |
System.err.print("."); | |
} | |
if (count % 100000 == 0) { | |
long usedMemory = usedMemory(); | |
System.err.println(String.format(" %dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory)); | |
prevUsedMemory = usedMemory; | |
} | |
} | |
long usedMemory = usedMemory(); | |
System.err.println(String.format("%dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory)); | |
System.err.println(String.format("%d words processed", count)); | |
// System.err.println(lns); | |
System.out.println(String.format("Largest nested set(s) are: %s", lns.getLargestNestedSets())); | |
} | |
private static long usedMemory() | |
{ | |
Runtime runtime = Runtime.getRuntime(); | |
long mb = 1024 * 1024; | |
return (runtime.totalMemory() - runtime.freeMemory()) / mb; | |
} | |
/** | |
* Insert a word, updating the largest nested set(s) as a side-effect. | |
* | |
* @param word | |
*/ | |
public void insert(String word) | |
{ | |
// Assume word is normalized, i.e., lowercased, etc. | |
if (word.isEmpty()) { | |
return; | |
} | |
// Iterative addition of word, letter by letter, to trie, O(word length) time. | |
TrieNode node = root; | |
int wordDepth = 0; | |
for (int i = 0; i < word.length(); i++) { | |
node = node.getChild(word.charAt(i)); | |
if (node.isWord()) { | |
wordDepth++; // Keep track of how many words are above the word we're inserting. | |
} | |
} | |
// Mark the final node as a word in the trie. | |
node.setWord(word, wordDepth); | |
} | |
public Set<SortedSet<String>> getLargestNestedSets() | |
{ | |
return largestNestedSets; | |
} | |
private class TrieNode | |
{ | |
private TrieNode[] children = new TrieNode[26]; // One for each English letter; could possibly compress since most children will be null. | |
private TrieNode parent = null; | |
private String word = null; // The word this node represents, null if an intermediate non-word node. | |
private int wordDepth = 0; // If a word, how many words (exclusive) are above this in the trie. | |
public TrieNode(TrieNode parent) | |
{ | |
this.parent = parent; | |
} | |
public TrieNode getChild(char letter) | |
{ | |
int offset = letter - 'a'; // 'a' is at 0, etc. | |
TrieNode child = children[offset]; | |
// Lazily create child. | |
if (child == null) { | |
child = new TrieNode(this); | |
children[offset] = child; | |
} | |
return child; | |
} | |
public void setWord(String word, int wordDepth) | |
{ | |
this.word = word; | |
this.wordDepth = wordDepth; | |
// This node may be the deepest wordDepth-wise and thus have the largest nested set. | |
updateLargestNestedSets(); | |
// We may be adding a word that is already a substring of another, so adjust wordDepth (and largest nested sets) for sub-nodes. | |
incrementWordDepth(); | |
} | |
private Collection<TrieNode> nonNullChildren() | |
{ | |
ArrayList<TrieNode> c = new ArrayList<TrieNode>(); | |
for (int i = 0; i < children.length; i++) { | |
if (children[i] != null) { | |
c.add(children[i]); | |
} | |
} | |
return c; | |
} | |
/* | |
* O(number of sub-nodes) ~= O(number of words that are substring of current word) | |
*/ | |
private void incrementWordDepth() | |
{ | |
Collection<TrieNode> nodes = nonNullChildren(); | |
// Iteratively | |
while (!nodes.isEmpty()) { | |
ArrayList<TrieNode> newNodes = new ArrayList<TrieNode>(); | |
for (TrieNode node : nodes) { | |
if (node.isWord()) { | |
node.wordDepth++; | |
node.updateLargestNestedSets(); // This word may now be a largest nested set. | |
} | |
newNodes.addAll(node.nonNullChildren()); | |
} | |
nodes = newNodes; | |
} | |
} | |
/* | |
* Potentially set or add this word set as the largest. | |
*/ | |
private void updateLargestNestedSets() | |
{ | |
if (wordDepth >= largestWordDepth) { | |
if (wordDepth > largestWordDepth) { | |
// System.err.println(String.format("Clearing old sets of length %d: %s", largestWordDepth, largestNestedSets)); | |
largestNestedSets.clear(); | |
largestWordDepth = wordDepth; | |
} | |
largestNestedSets.add(getNestedSet()); | |
} | |
} | |
/* | |
* O(depth of node) ~= O(wordDepth) | |
*/ | |
public SortedSet<String> getNestedSet() | |
{ | |
SortedSet<String> set = new TreeSet<String>(); | |
TrieNode node = this; | |
while (node.parent != null) { | |
if (node.isWord()) { | |
set.add(node.getWord()); | |
} | |
node = node.parent; | |
} | |
return set; | |
} | |
public String getWord() | |
{ | |
return word; | |
} | |
public boolean isWord() | |
{ | |
return word != null; | |
} | |
@Override | |
public String toString() | |
{ | |
StringBuilder c = new StringBuilder(); | |
c.append('['); | |
for (int i = 0; i < children.length; i++) { | |
if (children[i] != null) { | |
c.append((char) (i + 'a')); | |
c.append("="); | |
c.append(children[i]); | |
} | |
} | |
c.append(']'); | |
return String.format("[%schildren=%s]", word == null ? "" : String.format("word=%s,wordDepth=%d,", word, wordDepth), c); | |
} | |
} | |
@Override | |
public String toString() | |
{ | |
return root.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment