Created
September 17, 2023 20:20
-
-
Save maxsei/f97216d03b0e665674fdd40c1402ee5c to your computer and use it in GitHub Desktop.
Two CSV parser implementations in Zig
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const std = @import("std"); | |
| const mem = std.mem; | |
| const Allocator = mem.Allocator; | |
| const log = std.log; | |
/// A parsed CSV table together with the memory that backs its contents.
///
/// Call `deinit` exactly once when the table is no longer needed.
const Csv = struct {
    /// One entry per parsed line; each entry is that line's slice of field
    /// strings. The row slices themselves are allocated with `allocator`.
    fields: std.ArrayList([][]const u8),
    /// Optional buffer released with `allocator` in `deinit`; null when unused.
    backing: ?[]const u8,
    /// Allocator used to free the row slices and `backing`.
    allocator: Allocator,
    /// Optional arena torn down in `deinit`; null when unused.
    arena: ?std.heap.ArenaAllocator,

    const Self = @This();

    /// Releases every row slice, the row list, the arena (if any) and the
    /// backing buffer (if any).
    pub fn deinit(self: *Self) void {
        // Each row was produced by `toOwnedSlice` on a list using `allocator`,
        // so it has to be freed individually before the outer list goes away.
        for (self.fields.items) |row| self.allocator.free(row);
        self.fields.deinit();
        if (self.arena) |arena| arena.deinit();
        if (self.backing) |backing| self.allocator.free(backing);
    }
};
/// Parses the CSV file at absolute `path` by reading it fully into memory.
///
/// Fields are split on ',' and rows on '\n'; there is no quoting or escaping
/// support. Field strings are views into the file contents, which are kept
/// alive through the returned value's `backing` buffer.
///
/// A final line that is not terminated by '\n' is still returned as a row.
///
/// The caller owns the result and must call `deinit` on it.
/// Errors: anything from opening or reading the file, or `error.OutOfMemory`.
pub fn GetCsv1(path: []const u8, allocator: Allocator) !Csv {
    const file = try std.fs.openFileAbsolute(path, .{});
    defer file.close();

    const buf = try file.readToEndAllocOptions(allocator, 0xefffffff, 0x4000, @alignOf(u8), null);
    errdefer allocator.free(buf);

    var rows = std.ArrayList([][]const u8).init(allocator);
    errdefer {
        for (rows.items) |owned_row| allocator.free(owned_row);
        rows.deinit();
    }

    // Fields of the line currently being scanned.
    var pending = std.ArrayList([]const u8).init(allocator);
    defer pending.deinit();

    var field_start: usize = 0;
    for (buf, 0..) |byte, i| {
        if (byte != ',' and byte != '\n') continue;

        try pending.append(buf[field_start..i]);
        const field_len = i - field_start;
        field_start = i + 1;

        if (byte == '\n') {
            const row = try pending.toOwnedSlice();
            // Until the append succeeds nobody else owns `row`.
            errdefer allocator.free(row);
            try rows.append(row);

            if (rows.items.len % 10000 == 0) {
                log.info("lines: {d} cur.len: {d} i: {d}", .{ rows.items.len, buf.len - field_start, field_len });
            }
        }
    }

    // Input that does not end in '\n' leaves a last field (possibly empty,
    // after a trailing ',') and its row unflushed.
    if (field_start < buf.len or pending.items.len > 0) {
        try pending.append(buf[field_start..]);
        const row = try pending.toOwnedSlice();
        errdefer allocator.free(row);
        try rows.append(row);
    }

    return Csv{
        .backing = buf,
        .fields = rows,
        .allocator = allocator,
        .arena = null,
    };
}
/// Parses the CSV file at absolute `path` by streaming it through a fixed
/// 4096-byte buffer.
///
/// Fields are split on ',' and rows on '\n'; there is no quoting or escaping
/// support. Every field string is copied into an arena that is handed over in
/// the returned value's `arena`; the row slices are allocated with `allocator`.
///
/// A final line that is not terminated by '\n' is still returned as a row.
///
/// The caller owns the result and must call `deinit` on it.
/// Errors: anything from opening or reading the file, `error.OutOfMemory`, or
/// `error.NotEnoughBufferSpaceToFindAnyTokens` when a read fills the whole
/// buffer without containing a single delimiter.
pub fn GetCsv2(path: []const u8, allocator: Allocator) !Csv {
    const file = try std.fs.openFileAbsolute(path, .{});
    defer file.close();

    var rows = std.ArrayList([][]const u8).init(allocator);
    errdefer {
        for (rows.items) |owned_row| allocator.free(owned_row);
        rows.deinit();
    }

    var arena = std.heap.ArenaAllocator.init(allocator);
    errdefer arena.deinit();

    // Fields of the line currently being scanned.
    var line = std.ArrayList([]const u8).init(allocator);
    defer line.deinit();

    // Leading bytes of a field that continues into the next chunk.
    var partial = std.ArrayList(u8).init(allocator);
    defer partial.deinit();

    const delimiters = ",\n";
    var buf: [4096]u8 = undefined;

    while (true) {
        const n = try file.read(&buf);
        if (n == 0) break;

        // Only the bytes actually read are valid; the rest of `buf` is stale.
        var rest: []const u8 = buf[0..n];
        var found_delimiter = false;

        while (mem.indexOfAny(u8, rest, delimiters)) |end| {
            found_delimiter = true;

            // Join whatever was carried over from earlier chunks with the
            // remainder of the field found in this one.
            const field = try mem.concat(arena.allocator(), u8, &.{ partial.items, rest[0..end] });
            partial.clearRetainingCapacity();
            try line.append(field);

            if (rest[end] == '\n') {
                const row = try line.toOwnedSlice();
                // Until the append succeeds nobody else owns `row`.
                errdefer allocator.free(row);
                try rows.append(row);
            }
            rest = rest[end + 1 ..];
        }

        if (!found_delimiter and n == buf.len) {
            return error.NotEnoughBufferSpaceToFindAnyTokens;
        }

        // Keep the unterminated tail; appending (rather than replacing)
        // preserves data when several chunks in a row have no delimiter.
        try partial.appendSlice(rest);
    }

    // Input that does not end in '\n' leaves a last field (possibly empty,
    // after a trailing ',') and its row unflushed.
    if (partial.items.len > 0 or line.items.len > 0) {
        try line.append(try arena.allocator().dupe(u8, partial.items));
        const row = try line.toOwnedSlice();
        errdefer allocator.free(row);
        try rows.append(row);
    }

    return Csv{
        .fields = rows,
        .allocator = allocator,
        .arena = arena,
        .backing = null,
    };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment