Skip to content

Instantly share code, notes, and snippets.

@RealNeGate
Last active November 27, 2023 11:41

Revisions

  1. RealNeGate revised this gist Nov 27, 2023. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions csv.java
    Original file line number Diff line number Diff line change
    @@ -60,8 +60,7 @@ void parse(byte data[]) {
    } else if (c == ',') {
    parsed.put(i);
    } else if (c == '"') {
    -while (data[i] != '"') { i++; }
    -i += 1;
    +while (data[i++] != '"') {}
    }
    } while (i < data.length);
    }
  2. RealNeGate created this gist Nov 25, 2023.
    115 changes: 115 additions & 0 deletions csv.java
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,115 @@
    import java.nio.file.*;
    import java.io.*;
    import java.util.*;
    import java.nio.charset.StandardCharsets;

    /**
     * Offset-based CSV indexer with a small throughput benchmark driver.
     *
     * <p>The parser never copies field contents: it records byte offsets into the
     * original buffer and materializes a {@link String} only when
     * {@link CSV#entry(int, int)} is called.
     */
    class csv {

        /**
         * Growable list of primitive {@code int}s, used instead of
         * {@code List<Integer>} so that elements are not boxed.
         */
        static class IntArray {
            /** Largest capacity the backing array is ever grown to. */
            private static final int MAX_CAPACITY = Integer.MAX_VALUE - 8;

            /** Number of valid elements in {@link #data}. */
            int cnt;
            /** Backing storage; only indices {@code [0, cnt)} are meaningful. */
            int[] data;

            /**
             * @param init_cap initial capacity of the backing array (may be 0)
             */
            IntArray(int init_cap) {
                data = new int[init_cap];
                cnt = 0;
            }

            /** Forgets all elements while keeping the backing array for reuse. */
            void reset() {
                cnt = 0;
            }

            /**
             * Returns the element at index {@code i}. The index is checked against
             * the backing array only, not against {@link #cnt}.
             */
            int get(int i) {
                return data[i];
            }

            /** Appends {@code v}, growing the backing array when it is full. */
            void put(int v) {
                if (cnt == data.length) {
                    data = Arrays.copyOf(data, grownCapacity(cnt));
                }
                data[cnt++] = v;
            }

            /**
             * Computes the next capacity: doubles the current one, but starts from a
             * non-zero size for an empty array and clamps instead of overflowing.
             */
            private static int grownCapacity(int current) {
                if (current == 0) {
                    return 16;
                }
                if (current >= MAX_CAPACITY) {
                    throw new OutOfMemoryError(
                            "IntArray cannot grow beyond " + MAX_CAPACITY + " elements");
                }
                return current > MAX_CAPACITY / 2 ? MAX_CAPACITY : current * 2;
            }
        }

        /**
         * Index over a CSV byte buffer.
         *
         * <p>After {@link #parse(byte[])}:
         * <ul>
         *   <li>{@code parsed} holds the start offset of every field, in file order,
         *       followed by one sentinel equal to {@code data.length};</li>
         *   <li>{@code rows} holds, for every row, the index into {@code parsed} of
         *       that row's first field, followed by one sentinel pointing at the
         *       {@code parsed} sentinel. The number of rows is therefore
         *       {@code rows.cnt - 1}.</li>
         * </ul>
         * Rows end at {@code \n}, {@code \r\n} or a lone {@code \r}. Commas and line
         * breaks between double quotes do not split fields. Quotes are kept in the
         * text returned by {@link #entry(int, int)}; no unescaping is performed.
         */
        static class CSV {
            /** Per-row index into {@link #parsed}, plus a trailing sentinel. */
            IntArray rows;
            /** Start offset of each field in {@link #data}, plus a trailing sentinel. */
            IntArray parsed;
            /** The bytes handed to the most recent {@link #parse(byte[])} call. */
            byte[] data;

            /** Creates an empty index with preallocated offset tables. */
            CSV() {
                rows = new IntArray(2 * 1024 * 1024);
                parsed = new IntArray(2 * 1024 * 1024);
            }

            /**
             * Indexes {@code data}, replacing any previous index. The array is
             * retained, not copied.
             *
             * @param data raw CSV bytes
             * @throws NullPointerException if {@code data} is null
             */
            void parse(byte data[]) {
                Objects.requireNonNull(data, "data");
                this.data = data;
                rows.reset();
                parsed.reset();

                final int n = data.length;
                int i = 0;
                // One iteration per row.
                while (i < n) {
                    rows.put(parsed.cnt);
                    parsed.put(i);

                    // One iteration per byte (or per quoted run) within the row.
                    while (i < n) {
                        byte c = data[i++];
                        if (c == '\n') {
                            break;
                        }
                        if (c == '\r') {
                            // Treat CRLF as a single line break; only this exact
                            // order counts, so "\n\r" is two separate breaks.
                            if (i < n && data[i] == '\n') {
                                i++;
                            }
                            break;
                        }
                        if (c == ',') {
                            parsed.put(i);
                        } else if (c == '"') {
                            // Skip to just past the closing quote. An unterminated
                            // quote runs to the end of the buffer instead of
                            // reading out of bounds.
                            while (i < n && data[i] != '"') {
                                i++;
                            }
                            if (i < n) {
                                i++;
                            }
                        }
                    }
                }

                // Sentinels so that "next start" lookups work for the last row/field.
                rows.put(parsed.cnt);
                parsed.put(n);
            }

            /**
             * Returns the number of fields in row {@code i}.
             *
             * @throws IndexOutOfBoundsException if {@code i} is not a valid row index
             */
            int row_len(int i) {
                Objects.checkIndex(i, rows.cnt - 1);
                return rows.get(i + 1) - rows.get(i);
            }

            /**
             * Returns field {@code j} of row {@code i} decoded as UTF-8, without the
             * separating comma or the row's line terminator.
             *
             * @throws IndexOutOfBoundsException if {@code i} is not a valid row index
             *         or {@code j} is not a valid field index within that row
             */
            String entry(int i, int j) {
                Objects.checkIndex(i, rows.cnt - 1);
                int k = rows.get(i);
                int fields = rows.get(i + 1) - k;
                Objects.checkIndex(j, fields);

                int start = parsed.get(k + j);
                int end = parsed.get(k + j + 1);
                if (j + 1 < fields) {
                    // The next field starts right after the comma that ends this one.
                    end -= 1;
                } else {
                    // Last field of the row: the next offset is the next row's start
                    // (or end of data), so strip whichever terminator is present --
                    // "\n", "\r\n", "\r", or nothing at end of input.
                    if (end > start && data[end - 1] == '\n') {
                        end--;
                    }
                    if (end > start && data[end - 1] == '\r') {
                        end--;
                    }
                }
                return new String(data, start, end - start, StandardCharsets.UTF_8);
            }
        }

        /**
         * Entry point.
         *
         * <p>{@code -single <file>} parses the file once and prints the third field
         * of its last row. {@code <file>} alone parses the file ten times and prints
         * the throughput of each run and of the average.
         */
        public static void main(String[] args) throws IOException {
            boolean single = args.length > 0 && args[0].equals("-single");
            if (args.length == 0 || (single && args.length < 2)) {
                System.err.println("usage: csv [-single] <file>");
                return;
            }

            if (single) {
                // single test
                byte[] data = Files.readAllBytes(Paths.get(args[1]));
                var csv = new CSV();
                csv.parse(data);
                System.out.println(csv.entry(csv.rows.cnt - 2, 2));
            } else {
                byte[] data = Files.readAllBytes(Paths.get(args[0]));
                var csv = new CSV();

                final int runs = 10;
                long total = 0;
                for (int i = 0; i < runs; i++) {
                    long start = System.nanoTime();
                    csv.parse(data);
                    long elapsed = System.nanoTime() - start;
                    total += elapsed;

                    double sec = elapsed / 1000000000.0;
                    double bw = (double) data.length / sec;
                    bw /= 1048576.0f;

                    // The trailing entry(...) argument is not referenced by the
                    // format string; it only forces a read of the parse result.
                    System.out.printf("Run %d: %f MB/s (avg %f seconds)\n", 1+i, bw, sec, csv.entry(csv.rows.cnt - 2, 2));
                }

                long avg = total / runs;
                double sec = avg / 1000000000.0;
                double bw = ((double) data.length / sec);
                bw /= 1048576.0f;

                System.out.printf("Average nanos: %f MB/s (avg %f seconds)", bw, sec);
            }
        }
    }