Skip to content

Instantly share code, notes, and snippets.

@mormegil-cz
Created August 6, 2019 15:56
Show Gist options
  • Save mormegil-cz/fbfa7a50ae50b3ada3488840d34efc50 to your computer and use it in GitHub Desktop.
Save mormegil-cz/fbfa7a50ae50b3ada3488840d34efc50 to your computer and use it in GitHub Desktop.
Random word n-gram generator
import java.nio.file.Files;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
public class NGrams {
public static void main(String[] args) throws Exception {
String data = new String(Files.readAllBytes(FileSystems.getDefault().getPath(args[0]))).replace('\n', ' ').replace("\r", "").replace(" ", "");
NGrams impl = new NGrams(data, Integer.parseInt(args[1]));
System.out.println(impl.getNGramsText());
}
private String words[]; // List of words in Supplied String
private HashMap<String, List<String>> nGramsMap; // Mapping of the Ngrams
private int value; // Value for NGram
NGrams(String text, int n) {
if (text == null) throw new NullPointerException("Check the input parameters: String should not be null");
if (n < 1 || text.length() == 0)
throw new IllegalArgumentException("Check the input parameters : String should not empty and Ngram value should be > 1");
words = text.split(" ");
nGramsMap = new HashMap<String, List<String>>();
this.value = n;
}
public Map<String, List<String>> generateNgrams() {
// Filling the HashMap based on N value and using WOrds Array
/*
* Look at each set of N adjacent words in a document. Use the first N-1 words of the set as a key, and remember the fact that the nth word
* followed that key. Once you’ve finished, you know the list of individual words that can follow each N-1 word sequence in the document.
*/
for (int i = 0; i <= words.length - value; i++) {
StringBuilder sb = new StringBuilder();
int j = 0;
while (j < value - 1) {
sb.append(words[i + j].trim());
j++;
if (j < value - 1) sb.append(" ");
}
String key = sb.toString();
if (!nGramsMap.containsKey(key)) {
ArrayList<String> list = new ArrayList<>();
list.add(words[i + j]);
nGramsMap.put(key, list);
} else {
List<String> list = nGramsMap.get(key);
list.add(words[i + j]);
}
// System.out.println("Key: "+key+" Value : "+words[i+j]);
}
return nGramsMap;
}
public String getNGramsText() {
/*
* To generate new text from this analysis, choose an arbitrary word pair as a starting point. Use these to look up a random next word (using
* the table above) and append this new word to the text so far. This now gives you a new word pair at the end of the text, so look up a
* potential next word based on these. Add this to the list, and so on.
*/
if (nGramsMap.size() == 0) generateNgrams();
StringBuilder result = new StringBuilder();
Random gen = new Random();
int st = gen.nextInt(words.length - value); // arbitrary word pair
StringBuilder startSb = new StringBuilder();
for (int i = 0; i < value - 1; i++) {
startSb.append(words[st + i]);
if (i + 1 < value - 1) startSb.append(" ");
}
String start = startSb.toString();
result.append(start);
while (nGramsMap.containsKey(start)) {
result.append(' ');
int size = nGramsMap.get(start).size();
String next;
if (size > 1) {
st = gen.nextInt(size - 1);
next = nGramsMap.get(start).remove(st);
} else {
st = 0;
next = nGramsMap.get(start).get(st);
}
String start_split[] = start.split(" ");
String nextKey;
if (start_split.length > 1) {
nextKey = start.substring(start_split[0].length() + 1);
start = nextKey + " " + next;
} else start = next;
result.append(next); // append this new word to the text so far
}
return result.toString();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment