-
-
Save designeng/f34970ef16f6d2ebcb58e0da90e1c3b9 to your computer and use it in GitHub Desktop.
Recursion through a Cheerio.js object and writing to .CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* A parser object for walking through a large number of HTML blocks
   quickly. Intended for use with cheerio.js. Create an instance and start
   parsing with:
     var parse = new Parse();
     parse.run($('some-div')[0]);
*/
// Node's file-system module is used to append parsed rows to the CSV file.
// cheerio is loaded here as well, although nothing in the visible code calls
// it directly (callers hand already-parsed nodes to Parse#run).
var fs = require('fs'),
    cheerio = require('cheerio');
/**
 * Recursively walks a DOM-like node tree (nodes exposing `type`, `data`,
 * `children` and `parent`), collects the text it finds, and appends one CSV
 * row per parsed block to `test.csv`.
 *
 * @constructor
 * @param {Object} [block] - Accepted for backward compatibility but not read
 *     by the constructor; the node to parse is passed to `run` instead.
 */
var Parse = function(block) {
  this.block = {};  // top-level node, replaced by run()
  this.depth = 0;   // number of first-generation children still to finish
  this.stats = [];  // text fragments collected so far, in document order
};

Parse.prototype = {
  /**
   * Resets the parser state and parses `block`.
   *
   * @param {Object} block - Top-level node; must have a `children` array.
   */
  run: function(block) {
    this.block = block;
    this.depth = block.children.length;
    this.stats = [];
    this.parseRecursion(this.block);
  },

  /**
   * Processes all children of `node`, then writes the output row once every
   * first-generation child of the top-level node has been completed.
   *
   * @param {Object} node - Node whose `children` are to be processed.
   */
  parseRecursion: function(node) {
    this.parseLoopAllChildren(node.children);
    // depth only reaches 0 after the last top-level child has been handled,
    // so nested calls fall through here without writing anything.
    if (this.depth === 0) {
      this.outputParse();
    }
  },

  /**
   * Visits each node in `nodes`: records text nodes, recurses into nodes
   * that have children, and counts down `depth` for direct children of the
   * top-level node.
   *
   * @param {Array<Object>} [nodes] - Child nodes; nothing happens if falsy.
   */
  parseLoopAllChildren: function(nodes) {
    if (!nodes) {
      return;
    }
    // Index loop rather than for...in: only real array elements are visited,
    // never stray enumerable properties on the array or its prototype.
    for (var i = 0; i < nodes.length; i++) {
      var node = nodes[i];
      if (!node) {
        continue; // skip holes, which for...in would also have skipped
      }
      // If || Switch blocks here depending on what you are parsing for.
      if (node.type === 'text') {
        this.cleanLine(node.data);
      }
      // Continue recursion on node if it has child nodes
      if (node.children) {
        this.parseRecursion(node);
      }
      // If the node's parent is the top-level node, this child is complete
      if (node.parent === this.block) {
        this.depth--;
      }
    }
  },

  // Output and cleaning functions

  /**
   * Builds the CSV line for the collected `stats`.
   *
   * Each recognised label selects the value found a fixed number of entries
   * after it ('First Name' reads two entries ahead, the other labels one)
   * and stores it in a fixed column. Columns with no value stay empty.
   *
   * NOTE(review): `this.num` is not assigned anywhere in this block, so it is
   * written as "undefined" unless the caller sets it first; confirm intent.
   *
   * @returns {string} The row, terminated by a newline.
   */
  buildRow: function() {
    var result = [];
    for (var i = 0; i < this.stats.length; i++) {
      switch (this.stats[i]) {
        case 'First Name':
          result[0] = this.cleanComma(this.stats[i + 2]);
          break;
        case 'Last Name:':
          result[1] = this.cleanComma(this.stats[i + 1]);
          break;
        case 'Email:':
          result[6] = this.cleanComma(this.stats[i + 1]);
          break;
        case 'Website:':
          result[7] = this.cleanComma(this.stats[i + 1]);
          break;
      }
    }
    return this.num + ', ' + result.join(', ') + '\n';
  },

  /**
   * Appends the row for the current `stats` to `test.csv`.
   * The output function will vary depending on parsing goals.
   * A write failure is rethrown from the asynchronous callback.
   */
  outputParse: function() {
    fs.appendFile('test.csv', this.buildRow(), function(err) {
      if (err) throw err;
    });
  },

  /**
   * Replaces every comma in `str` with a space so the value cannot split a
   * CSV column. (A string pattern passed to `replace` would only replace the
   * first occurrence, hence the global regular expression.)
   *
   * @param {string} [str] - Value to clean.
   * @returns {string|undefined} Cleaned value; falsy input is returned as is.
   */
  cleanComma: function(str) {
    return str ? str.replace(/,/g, ' ') : str;
  },

  /**
   * Records a text fragment in `stats`.
   *
   * A line that does not start with a carriage return or a space is stored
   * unchanged. Any other line has all CR, LF, tab, space and U+00C2
   * characters stripped and is stored only if something remains.
   *
   * @param {string} line - Raw text of a text node.
   */
  cleanLine: function(line) {
    if (line[0] !== '\r' && line[0] !== ' ') {
      this.stats.push(line);
      return;
    }
    // presumably U+00C2 is residue of a mis-decoded non-breaking space; verify
    var stray = String.fromCharCode(0xC2);
    var newLine = '';
    for (var i = 0; i < line.length; i++) {
      var ch = line[i];
      if (ch !== '\r' && ch !== '\n' && ch !== '\t' && ch !== ' ' && ch !== stray) {
        newLine += ch;
      }
    }
    if (newLine !== '') {
      this.stats.push(newLine);
    }
  }
};
// Expose the Parse constructor as this module's sole export.
module.exports = Parse;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment