Last active
August 29, 2015 14:18
-
-
Save john-h-kastner/27e1777d50cb8e28cd7e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package unpackingsentence; | |
import java.util.Comparator; | |
import java.util.Arrays; | |
import java.util.List; | |
import java.lang.StringBuilder; | |
public class MarkovChain { | |
private final int[][] characterFrequencies; | |
private int dataPoints; | |
public final static int WORD_DELIMITER = -1; | |
public MarkovChain(){ | |
characterFrequencies = new int[26][27]; | |
dataPoints = 0; | |
} | |
private int inRange(int c){ | |
if(c==WORD_DELIMITER){ | |
return 26; | |
} | |
return Character.toLowerCase(c)-'a'; | |
} | |
public void train(List<String> words){ | |
words.stream().forEach(this :: addWord); | |
} | |
public void addWord(String word){ | |
char[] chars = word.toCharArray(); | |
for(int i = 0; i< chars.length-1;i++){ | |
characterFrequencies[inRange(chars[i])][inRange(chars[i+1])]++; | |
} | |
characterFrequencies[inRange(chars[chars.length-1])][inRange(WORD_DELIMITER)]++; | |
dataPoints+=word.length()+1; | |
} | |
public double chanceOf(int from, int to){ | |
return (double)characterFrequencies[inRange(from)][inRange(to)]/dataPoints; | |
} | |
public Comparator<Character> comparingFrequency(char from){ | |
return Comparator.comparingDouble(c->(double)characterFrequencies[inRange(from)][inRange(c)]/dataPoints); | |
} | |
@Override | |
public String toString(){ | |
StringBuilder builder = new StringBuilder(); | |
builder.append(dataPoints).append("\n"); | |
for(int[] ns : characterFrequencies){ | |
for(int n:ns){ | |
builder.append(n).append(" "); | |
} | |
builder.append("\n"); | |
} | |
return builder.toString(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package unpackingsentence; | |
import java.awt.Point; | |
public class PackedFile { | |
public final char[][] packedSentence; | |
public final Point startingPosition; | |
public PackedFile(char[][] sentence,int r, int c){ | |
packedSentence = sentence; | |
startingPosition = new Point(r,c); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package unpackingsentence; | |
import java.awt.Point; | |
import java.util.List; | |
import java.util.ArrayList; | |
import java.util.stream.Stream; | |
import java.util.Comparator; | |
import java.lang.StringBuilder; | |
import java.util.Optional; | |
import java.util.Collections; | |
import java.io.IOException; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.nio.file.Paths; | |
import java.nio.file.Files; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
public class Unpacker { | |
private final MarkovChain characterProbabilities; | |
public Unpacker(MarkovChain chain){ | |
characterProbabilities = chain; | |
} | |
public List<String> unpack(PackedFile packed){ | |
return unpack(packed.packedSentence,packed.startingPosition); | |
} | |
public List<String> unpack(char[][] packed, Point start){ | |
return unpack(packed,start, new ArrayList<>(packed.length*packed[0].length)); | |
} | |
public List<String> unpack(char[][] packed, Point start, List<Point> usedPoints){ | |
usedPoints.add(start); | |
char startChar = packed[start.x][start.y]; | |
Point[] moves = {new Point(start.x,start.y-1), | |
new Point(start.x,start.y+1), | |
new Point(start.x-1,start.y), | |
new Point(start.x+1,start.y)}; | |
Comparator<Character> frequencyComparator = characterProbabilities.comparingFrequency(startChar); | |
Point[] vistOrder = Stream | |
.of(moves) | |
.filter(e -> !usedPoints.contains(e)) | |
.filter(p -> p.x >= 0 && p.x < packed.length && p.y >= 0 && p.y < packed[0].length) | |
.sorted((p0,p1) -> frequencyComparator.compare(packed[p1.x][p1.y],packed[p0.x][p0.y])) | |
.toArray(Point[] :: new); | |
if(vistOrder.length == 0) { | |
if(usedPoints.size() != packed.length*packed[0].length){ | |
return Collections.emptyList(); | |
} else { | |
List<String> unpacked = new ArrayList<>(); | |
unpacked.add(buildString(packed,usedPoints)); | |
return unpacked; | |
} | |
} | |
List<String> allUnpacked = new ArrayList<>(); | |
for(Point toVisit : vistOrder){ | |
List<String> unpacked = unpack(packed,toVisit,new ArrayList<Point>(usedPoints)); | |
if(!unpacked.isEmpty()){ | |
allUnpacked.addAll(unpacked); | |
} | |
} | |
return allUnpacked; | |
} | |
public String buildString(char[][] packed,List<Point> usedPoints){ | |
return usedPoints | |
.stream() | |
.map(p -> packed[p.x][p.y]) | |
.reduce("",((str,c) -> str + c),String :: concat); | |
} | |
public static PackedFile parsePacked(String path){ | |
try( | |
BufferedReader read = new BufferedReader(new FileReader(new File(path))); | |
){ | |
String[] header = read.readLine().split(" "); | |
int rows = Integer.parseInt(header[0]); | |
char[][] packedSentence = new char[rows][0]; | |
int startRow = Integer.parseInt(header[1]); | |
int startColumn = Integer.parseInt(header[2]); | |
for(int i=0;i<rows;i++){ | |
int[] chars = read.readLine() | |
.chars() | |
.filter(Character :: isLetter) | |
.toArray(); | |
packedSentence[i] = new char[chars.length]; | |
for(int j =0;j<chars.length;j++){ | |
packedSentence[i][j] = (char) chars[j]; | |
} | |
} | |
return new PackedFile(packedSentence,startRow-1,startColumn-1); | |
} catch (IOException ioe) { | |
System.err.println("Exception Occurred while reading packed sentence!"); | |
return null; | |
} | |
} | |
public static void main(String[] args){ | |
String packedFile = args[0]; | |
String trainingFile = args[1]; | |
PackedFile packed = parsePacked(packedFile); | |
MarkovChain chain = new MarkovChain(); | |
List<String> trainingWords; | |
try { | |
trainingWords = Files.readAllLines(Paths.get(trainingFile)); | |
} catch (IOException ioe) { | |
trainingWords = new ArrayList<>(); | |
} | |
chain.train(trainingWords); | |
Unpacker unpacker = new Unpacker(chain); | |
unpacker.unpack(packed).stream().forEach(System.out :: println); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment