Skip to content

Instantly share code, notes, and snippets.

@birjj
Created April 4, 2018 09:25
Show Gist options
  • Save birjj/9c2d2fe3cdcd5c7a77507ba5ffe377a7 to your computer and use it in GitHub Desktop.
Save birjj/9c2d2fe3cdcd5c7a77507ba5ffe377a7 to your computer and use it in GitHub Desktop.
Data generator
package data_generator;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.stream.Stream;
/**
 * Generates a tree of directories filled with {@code .dat} and {@code .txt}
 * files that contain random non-negative integers.
 *
 * <p>How many subdirectories and files each directory gets, and how large each
 * file may become, is decided by the {@link Config} passed to the constructor.
 * Every file holds comma-separated decimal integers; {@code .dat} files are
 * additionally broken into lines at random positions.
 *
 * @author me
 */
public class DataGenerator {

    /** Decides the shape of the generated tree and the size of its files. */
    public interface Config {
        /** Get how many subdirectories should be generated in the dir
         * (which is at the specified depth) */
        public int getNumSubdirs(Path dir, int depth);

        /** Get the number of .dat files that should be generated in the dir */
        public int getNumDatFiles(Path dir, int depth);

        /** Get the number of .txt files that should be generated in the dir */
        public int getNumTxtFiles(Path dir, int depth);

        /** Get the size a file should have in bytes */
        public long getFileSize(Path file);
    }

    /** Summary of one {@link #generate(Path)} run. */
    public class Stats {
        /** Smallest number written to each generated file, keyed by the file's path. */
        public Map<Path, Integer> minNumbers = new HashMap<>();
        /** Number of .txt files that were generated. */
        public int numTxts = 0;
        /** Number of .dat files that were generated. */
        public int numDats = 0;

        @Override
        public String toString() {
            return "Stats("+numTxts+":"+numDats+")";
        }
    }

    /** Deepest directory level that may be generated; guards against runaway configs. */
    private static final int MAX_DEPTH = 128;

    private final Config config;

    /**
     * @param config Decides how many directories/files are generated and how big files are
     */
    public DataGenerator(Config config) {
        this.config = config;
    }

    /**
     * Generate data in a specific directory
     *
     * @param root The directory to generate data in (treated as depth 1)
     * @return Counts of the generated files and the smallest number in each of them
     * @throws IllegalArgumentException if {@code root} already contains entries
     * @throws IndexOutOfBoundsException if the config asks for more than 128 levels
     */
    public Stats generate(Path root) {
        // Files.list keeps a directory handle open until the stream is closed,
        // so it has to live in a try-with-resources.
        try (Stream<Path> entries = Files.list(root)) {
            if (entries.findAny().isPresent()) {
                throw new IllegalArgumentException("Cannot generate in "+root+" - directory must be empty");
            }
        } catch (IOException ex) {
            // The emptiness check is best-effort: generation is still attempted,
            // but the failure is reported instead of being silently dropped.
            System.err.println("Could not check whether "+root+" is empty: "+ex);
        }
        Stats stats = new Stats();
        _generate(root, 1, stats);
        return stats;
    }

    /**
     * Fills {@code dir} with the subdirectories and files the config asks for,
     * recursing into every subdirectory that could be created.
     */
    private void _generate(Path dir, int depth, Stats stats) {
        if (depth > MAX_DEPTH) {
            throw new IndexOutOfBoundsException("Don't you think you've had enough?");
        }
        int numSubdirs = this.config.getNumSubdirs(dir, depth);
        int numDats = this.config.getNumDatFiles(dir, depth);
        int numTxts = this.config.getNumTxtFiles(dir, depth);

        for (int i = 0; i < numSubdirs; ++i) {
            Path newDir = dir.resolve("dir"+i);
            try {
                Files.createDirectory(newDir);
            } catch (IOException ex) {
                // A directory that cannot be created is skipped; its siblings are still generated.
                System.err.println("Failed to create dir "+newDir.toAbsolutePath()+": "+ex);
                continue;
            }
            _generate(newDir, depth + 1, stats);
        }

        for (int i = 0; i < numDats; ++i) {
            Path newDat = dir.resolve("dat"+i+".dat");
            stats.minNumbers.put(newDat, _generateDat(newDat));
            ++stats.numDats;
        }

        for (int i = 0; i < numTxts; ++i) {
            Path newTxt = dir.resolve("txt"+i+".txt");
            stats.minNumbers.put(newTxt, _generateTxt(newTxt));
            ++stats.numTxts;
        }

        System.out.println("Finished generation of "+dir);
    }

    /** @return The smallest number in the dat */
    private int _generateDat(Path dat) {
        return _writeNumbers(dat, true);
    }

    /** @return The smallest number in the txt */
    private int _generateTxt(Path txt) {
        return _writeNumbers(txt, false);
    }

    /**
     * Writes comma-separated random integers to {@code file} until the next
     * number would reach the size the config allows for that file.
     *
     * @param file           The file to create
     * @param withLineBreaks Whether to end the current line after a number with
     *                       a 1/100 chance (used for .dat files)
     * @return The smallest number that was written, or {@link Integer#MAX_VALUE}
     *         if no number was written
     */
    private int _writeNumbers(Path file, boolean withLineBreaks) {
        Random rand = new Random();
        long maxSize = this.config.getFileSize(file);
        long size = 0;
        boolean isFirst = true;
        int min = Integer.MAX_VALUE;

        try (BufferedWriter writer = Files.newBufferedWriter(file)) {
            while (true) {
                int num = rand.nextInt(Integer.MAX_VALUE);
                String nextVal = Integer.toString(num);
                // Budget one extra byte per number for its separator.
                int sizeChange = 1 + nextVal.length();
                if (size + sizeChange >= maxSize) {
                    break;
                }

                if (!isFirst) { writer.write(","); }
                isFirst = false;
                writer.write(nextVal);
                size += sizeChange;
                // Only numbers that actually ended up in the file may count
                // towards the minimum; the one that did not fit is discarded.
                if (num < min) { min = num; }

                // 1/100 chance that we'll add a newline
                // this is done after adding a number, so that lines are never empty
                if (withLineBreaks && rand.nextInt(100) == 0) {
                    writer.write("\n");
                    size += 1;
                    isFirst = true;
                }
            }
        } catch (IOException ex) {
            System.err.println("Failed to write to file "+file);
            ex.printStackTrace();
        }
        return min;
    }
}
package data_generator;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map.Entry;
import java.util.Random;
/**
 * Generates a ~334MB data directory at {@code ../test_data}, consisting of:
 * 2^8 - 1 = 255 directories (the root plus 7 further levels with 2 subdirectories each)
 * 10 .dat files in each directory
 * 10 .txt files in each directory
 * up to 65536 bytes per file
 * (total of 5100 files, up to ~334MB)
 *
 * <p>After generation a {@code meta.generated} file is written to the root.
 * Its first line is the number of .txt files, its second line the number of
 * .dat files, followed by one {@code <absolute path>:<smallest number>} line
 * per generated file.
 *
 * @author me
 */
public class Main {
    public static void main(String[] args) {
        DataGenerator.Config config = new DataGenerator.Config() {
            @Override
            public int getNumSubdirs(Path dir, int depth) {
                // The root is depth 1, so directories exist on depths 1 through 8.
                if (depth < 8) {
                    return 2;
                }
                return 0;
            }

            @Override
            public int getNumDatFiles(Path dir, int depth) {
                return 10;
            }

            @Override
            public int getNumTxtFiles(Path dir, int depth) {
                return 10;
            }

            @Override
            public long getFileSize(Path file) {
                return 65536;
            }
        };

        DataGenerator generator = new DataGenerator(config);
        Path root = Paths.get("../test_data");
        try {
            // No-op when the directory already exists; the generator itself
            // rejects a root that is not empty.
            Files.createDirectories(root);
        } catch (IOException ex) {
            // Without a root directory every later write would fail, so stop here.
            System.err.println("Failed to create output directory "+root.toAbsolutePath());
            ex.printStackTrace();
            return;
        }

        DataGenerator.Stats stats = generator.generate(root);

        Path meta = root.resolve("meta.generated");
        System.out.println("Writing meta to "+meta);
        try (BufferedWriter writer = Files.newBufferedWriter(meta)) {
            writer.write(stats.numTxts+"\n");
            writer.write(stats.numDats+"\n");
            for (Entry<Path, Integer> entry : stats.minNumbers.entrySet()) {
                writer.write(entry.getKey().toAbsolutePath()+":"+entry.getValue()+"\n");
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment