Forked from stuart-marks/ReadFileJavaApplicationBufferedReader7.java
Created
January 12, 2019 18:13
-
-
Save commonquail/e680ac0cb78a09c5caa1c8483a6c0d6b to your computer and use it in GitHub Desktop.
Processing Large Files in Java, Variation 7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.class | |
indiv18.zip | |
itcont.txt | |
sample.txt |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds ReadFileJavaApplicationBufferedReader7 and runs it against either a
# small sample or the full itcont.txt extracted from indiv18.zip.
CLASS_NAME = ReadFileJavaApplicationBufferedReader7

# Compile a single-file Java class next to its source.
%.class: %.java
	javac $<

# Run against the sample file. Building sample.txt pulls in itcont.txt and,
# transitively, the indiv18.zip download.
.PHONY: test
test: $(CLASS_NAME).class sample.txt
	java -Xmx4G $(CLASS_NAME) sample.txt

# Run against the full file. itcont.txt is not a prerequisite here, so it must
# already exist (see download-large-file).
.PHONY: run
run: $(CLASS_NAME).class
	java -Xmx4G $(CLASS_NAME) itcont.txt

.PHONY: download-large-file
download-large-file: itcont.txt

indiv18.zip:
	wget https://www.fec.gov/files/bulk-downloads/2018/indiv18.zip

# unzip restores the modification time stored in the archive, which is
# typically older than the zip file itself. Without the touch, make would treat
# itcont.txt as out of date and re-extract it on every invocation. -o overwrites
# without an interactive prompt, since this rule only runs when the extracted
# file is missing or stale.
itcont.txt: indiv18.zip
	unzip -o indiv18.zip itcont.txt
	touch $@

# The program looks up the record at index 43243, so the sample has to contain
# more lines than that.
sample.txt: itcont.txt
	head -n 44000 $< > $@
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.nio.file.Files; | |
import java.nio.file.Path; | |
import java.time.Duration; | |
import java.time.Instant; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import java.util.regex.Pattern; | |
import static java.util.stream.Collectors.counting; | |
import static java.util.stream.Collectors.groupingBy; | |
public class ReadFileJavaApplicationBufferedReader7 { | |
// returns the time between startTime and now in milliseconds | |
static long between(Instant startTime) { | |
return Duration.between(startTime, Instant.now()).toMillis(); | |
} | |
public static void main(String[] args) throws IOException { | |
Instant startTime = Instant.now(); | |
try (BufferedReader b = Files.newBufferedReader(Path.of(args[0]))) { | |
int[] indexes = {0, 432, 43243}; | |
List<String> names = new ArrayList<>(); | |
List<String> dates = new ArrayList<>(); | |
List<String> firstNames = new ArrayList<>(); | |
var namePat = Pattern.compile(", \\s*([^, ]+)"); | |
StringBuilder sb = new StringBuilder(7); | |
System.out.println("Reading file using " + Caller.getName()); | |
/* | |
* Line format: | |
* | |
* 0 | 1 | 2 | 3 | DATE | 5 | 6 | NAME | 8 | ... | |
* ^ ^ ^ ^ ^ ^ ^ ^ | |
* 1 2 3 4 5 6 7 8 | |
*/ | |
String readLine; | |
while ((readLine = b.readLine()) != null) { | |
// There are at least 3 separators before the first separator | |
// we're interested in so we don't need to check the first 3 | |
// characters. | |
int startFieldIdx = 1 + nthIndexOf(readLine, '|', 4, 3); | |
int endFieldIdx = readLine.indexOf('|', startFieldIdx); | |
// extract dates | |
String rawDate = readLine.substring(startFieldIdx, endFieldIdx).strip(); | |
sb.setLength(0); | |
sb.append(rawDate, 0, 4) | |
.append('-') | |
.append(rawDate, 4, 6); | |
dates.add(sb.toString()); | |
// get all the names | |
startFieldIdx = 1 + nthIndexOf(readLine, '|', 2, endFieldIdx); | |
endFieldIdx = readLine.indexOf('|', startFieldIdx); | |
String name = readLine.substring(startFieldIdx, endFieldIdx).strip(); | |
names.add(name); | |
// extract first names | |
var matcher = namePat.matcher(name); | |
if (matcher.find()) { | |
firstNames.add(matcher.group(1)); | |
} | |
} | |
for (int i : indexes) { | |
System.out.println("Name: " + names.get(i) + " at index: " + i); | |
} | |
System.out.println("Name time: " + between(startTime) + "ms"); | |
System.out.println("Total file line count: " + names.size()); | |
System.out.println("Line count time: " + between(startTime) + "ms"); | |
Map<String, Long> dateMap = dates.stream() | |
.collect(groupingBy(date -> date, counting())); | |
dateMap.forEach((date, count) | |
-> System.out.println("Donations per month and year: " + date + " and donation count: " + count)); | |
System.out.println("Donations time: " + between(startTime) + "ms"); | |
Map<String, Long> nameMap = firstNames.stream() | |
.collect(groupingBy(name -> name, counting())); | |
Entry<String, Long> common = Collections.max(nameMap.entrySet(), Entry.comparingByValue()); | |
System.out.println("The most common first name is: " + common.getKey() + " and it occurs: " + common.getValue() + " times."); | |
System.out.println("Most common name time: " + between(startTime) + "ms"); | |
} | |
} | |
/** | |
* Finds the nth occurrence of {@code c} in {@code s} <em>after</em> index | |
* {@code start}. To find a character at index 0, call with {@code start} | |
* equal to -1. | |
*/ | |
static int nthIndexOf(String s, char c, int n, int start) { | |
int x = start; | |
while (n-- > 0) { | |
x = s.indexOf(c, x + 1); | |
} | |
return x; | |
} | |
} | |
class Caller {
    /**
     * Reports which class invoked this method.
     *
     * @return the calling class's name with everything up to and including
     *         the last '.' removed, or an empty string when the stack holds
     *         no frame beyond this method
     */
    public static String getName() {
        // The walk starts at getName() itself, so the frame right after it
        // belongs to whoever called us.
        String qualifiedName = StackWalker.getInstance().walk(frames -> frames
                .skip(1)
                .map(StackWalker.StackFrame::getClassName)
                .findFirst()
                .orElse(null));

        if (qualifiedName == null) {
            return "";
        }
        return qualifiedName.replaceFirst("^.*\\.", "");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment