Created
July 23, 2021 08:56
-
-
Save madsbuch/9af44d172444ba8d1f0eae0b8effac98 to your computer and use it in GitHub Desktop.
Performance test on 2 different parsing strategies for JSON lines files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * This is a small, informal performance test to see how best to read
 * JSON-lines files. The JSONL format is a file with one JSON object on each
 * line, as opposed to a JSON file with a top-level array containing each
 * object.
 *
 * I test two evaluation strategies:
 *
 * 1. Split the string on newlines and parse each line individually.
 * 2. Massage the string into a stringified JSON array and parse it in one go.
 *
 * Note: I use c1.split("\n").join(",") rather than
 * c1.replace(new RegExp('\n','g'), ",") as it has been shown to be faster.
 */
const fs = require("fs") | |
// Build the benchmark fixture: 5,000,000 small JSON objects, one per line
// (JSONL, no trailing newline), and write the same text to two separate files
// so that each strategy reads its own copy.
const content = Array.from({ length: 5000000 }, (_, i) =>
  JSON.stringify({
    index: i,
    text: `some text goes here ${i}`,
    ending: "yep"
  })
).join("\n")
for (const target of ["test1.jsonl", "test2.jsonl"]) {
  fs.writeFileSync(target, content)
}
// Strategy 1: split the file on newlines and JSON.parse every line on its own.
// The read + split + parse cycle is repeated 10 times; the mean wall-clock
// time per cycle (in milliseconds) is printed.
const t0 = performance.now()
for (let round = 0; round < 10; round += 1) {
  const text = fs.readFileSync("test1.jsonl").toString()
  const lines = text.split("\n")
  // The array of parsed objects is built and then discarded; only the time matters.
  lines.map(line => JSON.parse(line))
}
const t1 = performance.now()
console.log(`Parsing individually took ${(t1 - t0) / 10} milliseconds.`)
// Strategy 2: rewrite the JSONL text into a single JSON array literal
// (newlines replaced by commas, wrapped in "[" ... "]") and parse it with one
// JSON.parse call. The read + rewrite + parse cycle is repeated 10 times; the
// mean wall-clock time per cycle (in milliseconds) is printed.
//
// split("\n").join(",") is used for the newline-to-comma rewrite; the
// previously declared global RegExp for a replace()-based variant was never
// referenced and has been removed.
const t10 = performance.now()
for (let i = 0; i < 10; i++) {
  const c1 = fs.readFileSync("test2.jsonl").toString()
  const asJsonArray = "[" + c1.split("\n").join(",") + "]"
  // The parsed array is discarded; only the time matters.
  JSON.parse(asJsonArray)
}
const t11 = performance.now()
console.log("Rewrite and parse took " + (t11 - t10) / 10 + " milliseconds.")
/**
 * With result:
 *
 * Parsing individually took 5124.440068499744 milliseconds.
 * Rewrite and parse took 5645.711843699962 milliseconds.
 *
 * It seems that parsing individually wins by around 10%, which is significant.
 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment