Created
April 4, 2018 09:25
-
-
Save birjj/9c2d2fe3cdcd5c7a77507ba5ffe377a7 to your computer and use it in GitHub Desktop.
Data generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package data_generator; | |
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.stream.Stream;
/**
 * Generates a directory tree filled with {@code .dat} and {@code .txt} files
 * containing comma-separated random non-negative integers, and records the
 * smallest number written to each file.
 *
 * <p>The shape of the tree (subdirectories per directory, files per directory,
 * file size) is decided by a caller-supplied {@link Config}.
 *
 * @author me
 */
public class DataGenerator {
    /** Deepest directory level that will be generated before giving up. */
    private static final int MAX_DEPTH = 128;

    /** Callbacks that decide the shape and size of the generated data. */
    public interface Config {
        /** Get how many subdirectories should be generated in the dir
         * (which is at the specified depth; the root is at depth 1) */
        public int getNumSubdirs(Path dir, int depth);
        /** Get the number of .dat files that should be generated in the dir */
        public int getNumDatFiles(Path dir, int depth);
        /** Get the number of .txt files that should be generated in the dir */
        public int getNumTxtFiles(Path dir, int depth);
        /** Get the size a file should have in bytes; generated files stay below this size */
        public long getFileSize(Path file);
    }

    /** Summary of one generation run. */
    public class Stats {
        /**
         * Smallest number written to each generated file, keyed by file path.
         * A file that ended up with no numbers maps to {@link Integer#MAX_VALUE}.
         */
        public Map<Path, Integer> minNumbers = new HashMap<>();
        /** Number of .txt files generated. */
        public int numTxts = 0;
        /** Number of .dat files generated. */
        public int numDats = 0;

        @Override
        public String toString() {
            return "Stats("+numTxts+":"+numDats+")";
        }
    }

    private final Config config;

    /**
     * @param config Decides how many directories/files are generated and how large files are
     */
    public DataGenerator(Config config) {
        this.config = config;
    }

    /**
     * Generate data in a specific directory
     * @param root The directory to generate data in; must be empty
     * @return Counts of the generated files and the smallest number in each of them
     * @throws IllegalArgumentException If root already contains entries
     * @throws IndexOutOfBoundsException If the config asks for more than 128 levels of directories
     */
    public Stats generate(Path root) {
        // Files.list holds an open directory handle until the stream is closed.
        try (Stream<Path> entries = Files.list(root)) {
            if (entries.findAny().isPresent()) {
                throw new IllegalArgumentException("Cannot generate in "+root+" - directory must be empty");
            }
        } catch (IOException ex) {
            // Best effort: generation is still attempted, but the failed check is no longer silent.
            System.err.println("Could not check whether "+root+" is empty: "+ex);
        }
        Stats stats = new Stats();
        _generate(root, 1, stats);
        return stats;
    }

    /**
     * Recursively fills dir: first its subdirectories (depth-first), then its
     * .dat files, then its .txt files. Results are accumulated into stats.
     */
    private void _generate(Path dir, int depth, Stats stats) {
        if (depth > MAX_DEPTH) {
            throw new IndexOutOfBoundsException("Don't you think you've had enough?");
        }
        int numSubdirs = this.config.getNumSubdirs(dir, depth);
        int numDats = this.config.getNumDatFiles(dir, depth);
        int numTxts = this.config.getNumTxtFiles(dir, depth);

        for (int i = 0; i < numSubdirs; ++i) {
            Path newDir = dir.resolve("dir"+i);
            if (!newDir.toFile().mkdir()) {
                // A subdirectory that cannot be created is skipped, not fatal.
                System.err.println("Failed to create dir "+newDir.toAbsolutePath());
            } else {
                _generate(newDir, depth + 1, stats);
            }
        }
        for (int i = 0; i < numDats; ++i) {
            Path newDat = dir.resolve("dat"+i+".dat");
            stats.minNumbers.put(newDat, _generateDat(newDat));
            ++stats.numDats;
        }
        for (int i = 0; i < numTxts; ++i) {
            Path newTxt = dir.resolve("txt"+i+".txt");
            stats.minNumbers.put(newTxt, _generateTxt(newTxt));
            ++stats.numTxts;
        }
        System.out.println("Finished generation of "+dir);
    }

    /** @return The smallest number in the dat (numbers are spread over multiple lines) */
    private int _generateDat(Path dat) {
        return _writeNumbers(dat, true);
    }

    /** @return The smallest number in the txt (all numbers are on a single line) */
    private int _generateTxt(Path txt) {
        return _writeNumbers(txt, false);
    }

    /**
     * Fills a file with comma-separated random numbers until the next number
     * would reach the configured file size.
     *
     * @param file The file to write
     * @param insertNewlines Whether to randomly break the numbers into lines
     * @return The smallest number actually written, or Integer.MAX_VALUE if none was written
     */
    private int _writeNumbers(Path file, boolean insertNewlines) {
        Random rand = new Random();
        long maxSize = this.config.getFileSize(file);
        long size = 0;
        boolean isFirst = true;
        int min = Integer.MAX_VALUE;
        try (BufferedWriter writer = Files.newBufferedWriter(file)) {
            while (true) {
                int num = rand.nextInt(Integer.MAX_VALUE);
                String nextVal = Integer.toString(num);
                // one byte is always budgeted for the separator, even when none is written
                int sizeChange = 1 + nextVal.length();
                if (size + sizeChange >= maxSize) {
                    break;
                }
                if (!isFirst) { writer.write(","); }
                isFirst = false;
                writer.write(nextVal);
                size += sizeChange;
                // only numbers that made it into the file may count towards the minimum
                if (num < min) { min = num; }

                // 1/100 chance that we'll add a newline
                // this is done after adding a number, so that a newline never directly follows another
                if (insertNewlines && rand.nextInt(100) == 0) {
                    writer.write("\n");
                    size += 1;
                    isFirst = true;
                }
            }
        } catch (IOException ex) {
            System.err.println("Failed to write to file "+file);
            ex.printStackTrace();
        }
        return min;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package data_generator; | |
import java.io.BufferedWriter; | |
import java.io.IOException; | |
import java.nio.file.Files; | |
import java.nio.file.Path; | |
import java.nio.file.Paths; | |
import java.util.Map.Entry; | |
import java.util.Random; | |
/** | |
* Generates a ~300MB data directory, consisting of: | |
* 2^8 = 256 directories | |
* 10 .dat files in each directory | |
* 10 .txt files in each directory | |
* ~65536 bytes per file | |
* (total of 5120 files, ~335.5MB) | |
* @author me | |
*/ | |
public class Main { | |
public static void main(String[] args) { | |
Random rand = new Random(); | |
DataGenerator.Config config = new DataGenerator.Config() { | |
@Override | |
public int getNumSubdirs(Path dir, int depth) { | |
if (depth < 8) { | |
return 2; | |
} | |
return 0; | |
} | |
@Override | |
public int getNumDatFiles(Path dir, int depth) { | |
return 10; | |
} | |
@Override | |
public int getNumTxtFiles(Path dir, int depth) { | |
return 10; | |
} | |
@Override | |
public long getFileSize(Path file) { | |
return 65536; | |
} | |
}; | |
DataGenerator generator = new DataGenerator(config); | |
Path root = Paths.get("../test_data"); | |
root.toFile().mkdir(); | |
DataGenerator.Stats stats = generator.generate(root); | |
Path meta = root.resolve("meta.generated"); | |
System.out.println("Writing meta to "+meta); | |
try (BufferedWriter writer = Files.newBufferedWriter(meta)) { | |
writer.write(stats.numTxts+"\n"); | |
writer.write(stats.numDats+"\n"); | |
for (Entry<Path, Integer> entry : stats.minNumbers.entrySet()) { | |
writer.write(entry.getKey().toAbsolutePath()+":"+entry.getValue()+"\n"); | |
} | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment