Created
November 10, 2021 17:50
-
-
Save jamesseanwright/33b51a5bba2280cb9b74f72e9febe3cd to your computer and use it in GitHub Desktop.
Serialising an HTML table to CSV format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';
const { parseHTML } = require('linkedom');
/**
 * Builds a function that serialises an element tree into a single string.
 *
 * The tree is walked depth-first, visiting each element before its children
 * (as listed by `el.children`).
 *
 * @param {(el: object, length: number) => *} serialiser - Called once per
 *   visited element with the element and the number of characters
 *   accumulated so far. A truthy return value is appended to the output;
 *   a falsy one (e.g. `false` or `''`) is skipped.
 * @returns {(element: object) => string} Function that serialises `element`
 *   and its descendants, returning the accumulated text followed by `'\n'`.
 */
const createSerialiser = serialiser =>
  element => {
    // Fresh accumulator per invocation so the returned function is reusable.
    let output = '';

    const visit = node => {
      const fragment = serialiser(node, output.length);

      if (fragment) {
        output += fragment;
      }

      // Snapshot the children so the walk iterates over a plain array.
      Array.from(node.children).forEach(child => visit(child));
    };

    visit(element);

    // The terminating newline is added once, at the root, instead of
    // building (and discarding) a copy of the output at every level.
    return `${output}\n`;
  };
/**
 * Creates a serialiser that renders the `th`/`td` cells found beneath an
 * element as one comma-separated line.
 *
 * Fields containing a comma, a double quote or a line break are wrapped in
 * double quotes, with embedded quotes doubled, so that values such as
 * `Washington, D.C.` stay in a single column.
 *
 * @returns {(element: object) => string} Function returning the CSV line for
 *   `element`, terminated by the newline that `createSerialiser` appends.
 */
const createLineSerialiser = () =>
  element => {
    // Cells are counted explicitly rather than inferred from the output
    // length: an empty leading cell adds no characters, which would
    // otherwise make the following cell look like the first and drop its
    // separating comma.
    let cellCount = 0;

    const escapeField = text => (
      /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
    );

    return createSerialiser(el => {
      if (!['th', 'td'].includes(el.tagName.toLowerCase())) {
        return false;
      }

      const separator = cellCount > 0 ? ',' : '';
      cellCount += 1;

      return `${separator}${escapeField(el.textContent)}`;
    })(element);
  };
/**
 * Serialises every `tr` element found in a tree to one line of text each.
 *
 * Each row is handed to a freshly created line serialiser; all other
 * elements contribute nothing themselves, but their descendants are still
 * visited.
 */
const serialiseTree = createSerialiser(el => {
  const isRow = el.tagName.toLowerCase() === 'tr';

  if (!isRow) {
    return false;
  }

  return createLineSerialiser()(el);
});
// Sample markup used to demonstrate the serialiser.
const table = `
  <table>
    <colgroup span="4"></colgroup>
    <tr>
      <th>Countries</th>
      <th>Capitals</th>
      <th>Population</th>
      <th>Language</th>
    </tr>
    <tr>
      <td>USA</td>
      <td>Washington, D.C.</td>
      <td>309 million</td>
      <td>English</td>
    </tr>
    <tr>
      <td>Sweden</td>
      <td>Stockholm</td>
      <td>9 million</td>
      <td>Swedish</td>
    </tr>
  </table>
`;

// Parse the markup with linkedom, locate the table and print its
// serialised form.
const { document } = parseHTML(table);
const tableElement = document.querySelector('table');

console.log(serialiseTree(tableElement));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment