Created
November 27, 2017 20:17
-
-
Save anonymous/7dd63c5e5127f89b4b55a2f36a68bdc6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.nio.charset.Charset; | |
import java.nio.file.Files; | |
import java.nio.file.Paths; | |
import java.util.*; | |
/**
 * Command-line tool that builds a positional word index of a UTF-8 text file.
 *
 * <p>Usage: {@code java WordStatLineIndex <input-file> <output-file>}.
 *
 * <p>A word is a maximal run of characters that are letters, apostrophes
 * ({@code '}) or dash punctuation. Words are lower-cased and written to the
 * output file in sorted (natural {@link String}) order, one word per line:
 *
 * <pre>
 *   word count line:index line:index ...
 * </pre>
 *
 * where {@code line} is the 1-based line number and {@code index} is the
 * 1-based position of the occurrence among the words of that line.
 *
 * <p>Characters are examined one UTF-16 code unit at a time, so letters
 * outside the Basic Multilingual Plane are treated as separators.
 */
public class WordStatLineIndex {
    /**
     * Shared read buffer. Kept public for backward compatibility; because it is
     * shared mutable state, this class must not be used from several threads at once.
     */
    public static final char[] buf = new char[256];

    /** Charset used for both the input and the output file. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Reads {@code args[0]}, indexes its words and writes the report to {@code args[1]}.
     *
     * <p>Prints {@code "Wrong input"} to standard error and returns if the number of
     * arguments is not exactly two. I/O failures are reported on standard error.
     *
     * @param args input file path followed by output file path
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Wrong input");
            return;
        }
        try {
            Map<String, List<String>> stat;
            try (InputStreamReader in = new InputStreamReader(new FileInputStream(args[0]), UTF8)) {
                stat = collectPositions(in);
            }
            // The input is fully read and closed before the output is opened,
            // so using the same path for both arguments is safe.
            Files.write(Paths.get(args[1]), formatLines(stat), UTF8);
        } catch (IOException e) {
            System.err.println("I/O error: " + e);
        }
    }

    /**
     * Scans the whole stream and records every word occurrence as {@code "line:index"}.
     *
     * @param in character source, read until end of stream
     * @return sorted map from lower-cased word to its occurrences in reading order
     * @throws IOException if reading fails
     */
    private static Map<String, List<String>> collectPositions(InputStreamReader in) throws IOException {
        Map<String, List<String>> stat = new TreeMap<>();
        // Lives across buffer refills so a word split between two reads stays whole.
        StringBuilder word = new StringBuilder();
        int lineId = 1;
        int wordId = 1;
        int read;
        while ((read = in.read(buf)) > 0) {
            for (int i = 0; i < read; i++) {
                char c = buf[i];
                if (isWordChar(c)) {
                    word.append(c);
                    continue;
                }
                if (word.length() > 0) {
                    record(stat, word, lineId, wordId);
                    wordId++;
                }
                if (c == '\n') {
                    lineId++;
                    wordId = 1;
                }
            }
        }
        // The file may end without a trailing separator.
        if (word.length() > 0) {
            record(stat, word, lineId, wordId);
        }
        return stat;
    }

    /** Returns whether {@code c} can be part of a word: a letter, an apostrophe or dash punctuation. */
    private static boolean isWordChar(char c) {
        return Character.isLetter(c) || c == '\'' || Character.getType(c) == Character.DASH_PUNCTUATION;
    }

    /** Adds one occurrence of the accumulated word to the index and clears the accumulator. */
    private static void record(Map<String, List<String>> stat, StringBuilder word, int lineId, int wordId) {
        // Locale.ROOT keeps the index independent of the JVM's default locale
        // (e.g. under a Turkish locale "I" would otherwise become a dotless "ı").
        String key = word.toString().toLowerCase(Locale.ROOT);
        stat.computeIfAbsent(key, k -> new ArrayList<>()).add(lineId + ":" + wordId);
        word.setLength(0);
    }

    /** Renders each index entry as {@code "word count pos pos ..."}, in map iteration order. */
    private static List<String> formatLines(Map<String, List<String>> stat) {
        List<String> lines = new ArrayList<>(stat.size());
        for (Map.Entry<String, List<String>> entry : stat.entrySet()) {
            List<String> positions = entry.getValue();
            lines.add(entry.getKey() + " " + positions.size() + " " + String.join(" ", positions));
        }
        return lines;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment