Last active
January 10, 2017 04:09
-
-
Save daformat/dd07d781286b812500cb77519d9a9bf1 to your computer and use it in GitHub Desktop.
[data.gouv.fr] - Extract JSON formatted table of content for any given french legal code (Légifrance's Codes en vigueur)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Extract JSON table of content from any given french Legal code
// ==============================================================
// author: @daformat <mat.jouhet[at][google's mail service].com>
// lastmod: 2017/01/10
//
// Usage:
// ------
// Execute in javascript console while browsing the toc you're interested in
// see comments if you need to tweak anything.
//
// Notes:
// ------
// • Légifrance gives you access to their entire dataset via XML files.
// So, **if you need to frequently grab data**, you should definitely take
// a look at the french government open-data sets. The one we're interested in:
// http://www.data.gouv.fr/fr/datasets/legi-codes-lois-et-reglements-consolides/
// Should tell you to go to ftp://echanges.dila.gouv.fr/LEGI/
//
// • Obviously, this script will break if Légifrance changes their layout
// It was done as a quick way to get data from Légifrance without the pain
// of dealing with the XMLs. Given the very short delay for the first release
// of the project we're working on, we chose the quick and easy way.
//
// • The JSON data is formatted so you can use it ootb with react-treebeard
// https://github.com/alexcurtis/react-treebeard/
//
// Buy me a coffee?
// ----------------
// Want to buy me a coffee? http://ko-fi.com/A581HY1
//
// Feel free to ping me on your favorite social network
// Set this to false if you just want the TOC (sections only),
// without the articles fetched and added as children.
var fetchArticleList = true;
// Format-to-JSON helper.
//
// Params:
//   what - any JSON-serializable value (here: the TOC array)
// Returns:
//   the value serialized as an indented JSON string
//
// Change the stringify params to (what, null, '\t') if you prefer tabs for
// indent, or to (what) if you want the output minified.
var format2JSON = function(what) {
  return JSON.stringify(what, null, ' ');
};
// Copy-result helper.
//
// Tries to put `what` in the clipboard with the console's `copy()` command.
// If that fails, the data is logged instead so the user can always copy it
// manually, and we suggest running the copy command by hand.
//
// Params:
//   what - the data (here: a JSON string) to copy
var copyResult = function(what) {
  try {
    copy(what);
    console.info('JSON data was copied to clipboard');
  } catch (e) {
    var lines = [
      'Sorry about that, but the above data couldn’t be copied to clipboard',
      'Maybe your browser doesn’t support the copy command line API?',
      'You should try manually running `copy(jsonResult)`'
    ];
    console.log(what);
    console.error(lines[0] + '\n' + lines[1]);
    console.info(lines[2]);
    // The flag is set by the script after this helper is defined; the typeof
    // guard keeps this fallback path from throwing when it does not exist.
    if (typeof copyCommandWasCopiedToClipboard !== 'undefined' &&
        copyCommandWasCopiedToClipboard === true) {
      console.log('(ironically, this command was copied to your clipboard)');
    }
  }
};
// It seems the copy command line API doesn't like when we're using it
// against big datasets in the script, so, as a fallback we first copy
// the copy command to clipboard (how ironic).
// The flag records whether that worked; copyResult reads it to tell the user.
var copyCommandWasCopiedToClipboard = false;
try {
  copy('try {copy(jsonResult); console.info(\'Data was sucessfully copied to your clipboard\')} catch (e) {console.error(\'No, really, we tried, but you have to manually copy the data\')}');
  copyCommandWasCopiedToClipboard = true;
} catch (e) {
  // copy() is not available in this console: the flag stays false.
}
// Just so we don't have to do this all the time.
// `host` (unlike `hostname`) keeps the port when the page uses a non-default one.
var base = window.location.protocol + '//' + window.location.host + '/';
// =======================================================
// STEP 1 - Get the table of content from the current page
// =======================================================
//
// Recursive function to walk through the tree.
//
// Params:
//   node             - a TOC <li> element (or anything $() accepts for it)
//   destinationArray - array receiving exactly one entry for `node`
//
// The pushed entry is {name, section: true}, plus {children, toggled: true}
// when the node has nested <ul>/<li> sections, plus `source` when the node
// has a `span.codeLienArt` link with an href. Relies on the global `base`.
var traverse = function(node, destinationArray) {
  var $node = $(node);

  // Extract data
  var nodeName = $.trim($node.children('span').eq(0).text());
  var $legiLink = $node.children('span.codeLienArt');
  // attr() returns undefined when there is no anchor; strip the session id
  // that the site embeds in its urls otherwise.
  var href = $legiLink.eq(0).children('a').attr('href');
  var legiLinkCleanUrl = href ? href.replace(/;jsessionid=[^\?]+\?/, '?') : '';

  // Traverse inner sections (each call pushes exactly one entry)
  var children = [];
  $node.children('ul').children('li').each(function() {
    traverse(this, children);
  });

  // Store extracted node data
  var currentNodeData = {name: nodeName, section: true};
  if (children.length > 0) {
    currentNodeData.children = children;
    currentNodeData.toggled = true;
  }

  // Add article range and source url if present
  if ($legiLink.length !== 0) {
    currentNodeData.name += ' ' + $.trim($legiLink.eq(0).text().replace(/\s+\)/, ')'));
    // Without an href there is nothing to fetch: leaving `source` unset keeps
    // the node from being queued as a request to the bare site root.
    if (legiLinkCleanUrl !== '') {
      currentNodeData.source = base + legiLinkCleanUrl;
    }
  }

  destinationArray.push(currentNodeData);
};
// Initialize our main variable: the table of content, one entry per root section
var legiToc = [];
// Start traversing the current page TOC
$('#content_left>.data>ul.noType>li').each(function() {
  traverse(this, legiToc);
});
// Nothing matched: most likely we are not on a TOC page, or the page layout
// no longer matches the selector above. Say so instead of silently
// producing an empty result.
if (legiToc.length === 0) {
  console.warn('No table of content was found on this page');
}
// Format To JSON
var jsonResult = format2JSON(legiToc);
// If fetchArticleList is set to false, we'll try to copy the data to clipboard | |
// and STOP right after doing so. | |
// ------------------------------ | |
if ( fetchArticleList === false ) { | |
// Copy data to clipboard | |
copyResult(jsonResult); | |
// If not, we keep going | |
} else { | |
// =========================== | |
// STEP 2 - fetch article list | |
// =========================== | |
// If the fetchArticleList variable is set to true we'll go fetch articles titles | |
// from sections when their source prop is defined and not empty | |
// | |
// We'll use this to get to know when jquery finishes fetching everything. | |
var pendingRequests = []; | |
var succeededRequests = 0; | |
var errors = []; | |
// This function loads the section page found at `url` via a get request.
// It extracts article names + shortnames (+ links) from the page and stores
// them as objects in the `children` array of the destination object, before
// any child sections already listed there.
//
// Params:
//   url     - url of the section page to fetch
//   destObj - TOC node receiving the articles
//
// Relies on the surrounding script state: `base`, `pendingRequests`,
// `succeededRequests`, `errors` and `nextRequest`. On success as on failure,
// nextRequest() is called once so the queue keeps moving.
var fetchSectionArticleList = function(url, destObj) {
  // Turn one `.titreArt` element into an article entry, or return null when
  // the element holds no usable title. Name and link are read from the same
  // element so they can never get out of step with each other.
  var readArticle = function(titleElement) {
    var $title = $(titleElement);
    var lines = $title.text()
      .replace(/\s\(abrogé(.*)/g, '')
      .replace(/\sEn savoir plus(.*)/g, '')
      .split('\n');
    // The title is the first line with actual text in it
    var articleName = '';
    for (var i = 0; i < lines.length && articleName === ''; i++) {
      articleName = $.trim(lines[i]);
    }
    if (articleName === '') {
      return null;
    }
    var article = {
      name: articleName,
      shortname: articleName.replace(/Article /g, '')
    };
    // attr() gives undefined when the title has no link; only add `source`
    // when there is one (session id stripped from the url).
    var href = $title.children('a').attr('href');
    if (href) {
      article.source = base + href.replace(/;jsessionid=[^\?]+\?/, '?');
    }
    return article;
  };

  // Our succeed handler
  var onSucceed = function(d) {
    // Wrap the response in a container element: jQuery then always treats
    // the string as markup, and find() can reach every node of the response
    // (find() only looks at descendants of the matched elements).
    var $d = $('<div>' + d + '</div>');

    // Grab every article of the page
    var sectionChildren = [];
    $d.find('.titreArt').each(function() {
      var article = readArticle(this);
      if (article !== null) {
        sectionChildren.push(article);
      }
    });

    // Articles come before any other children of the section
    if (sectionChildren.length > 0) {
      destObj.children = sectionChildren.concat(destObj.children || []);
    }

    // Let's increment our succeeded counter
    succeededRequests++;

    // Display a progress recap in the console. The request being handled has
    // already left the queue, so: total = succeeded + failed + still pending.
    var total = succeededRequests + errors.length + pendingRequests.length;
    var percentage = Math.floor(succeededRequests / total * 100);
    var errorMsg = '';
    if (errors.length > 0) {
      errorMsg = errors.length + ' error' + (errors.length > 1 ? 's' : '') + ' ';
    }
    var msg = '[ ' + percentage + '% ] ' + errorMsg + succeededRequests + '/' + total + ' - fetched';
    console.info(msg, url);
    console.log(destObj.name);
    if (typeof console.table === 'function') {
      console.table(destObj.children);
      console.log('');
    } else {
      console.log(destObj.children);
    }

    // Trigger the next request
    nextRequest();
  };

  // Error handler
  var onFail = function(d) {
    console.error('An error occured during the get request', d);
    errors.push({href: url, response: d});
    nextRequest();
  };

  // jQuery promise
  $.get(url, null, onSucceed).fail(onFail);
};
// Our second recursive function will push pending requests.
// We'll then use another function to sequentially trigger the requests.
//
// Params:
//   obj - a TOC node ({name, source?, children?})
//
// Every node with a non-empty `source` is queued in the global
// `pendingRequests` as {url, node}, parents before their children.
// A node with neither children nor source is reported as an empty section.
var traverse2 = function(obj) {
  // If we have a source, queue it so its article list gets fetched
  if (obj.source) {
    pendingRequests.push({url: obj.source, node: obj});
    console.log("Will fetch articles in", obj.name);
  }

  // No children: nothing to walk through
  if (!obj.children) {
    if (!obj.source) {
      console.info("Empty section detected!", obj.name, obj);
    }
    return;
  }

  // Recursively walk through each of the current node children.
  // Done without a try/catch so that unexpected errors are not swallowed.
  for (var i = 0; i < obj.children.length; i++) {
    traverse2(obj.children[i]);
  }
};
// We now have to call the recursive function on each root entry of legiToc.
// legiToc is an array: walk it by index (forEach) rather than with for...in,
// which would also visit any enumerable property added to arrays.
legiToc.forEach(function(rootNode) {
  traverse2(rootNode);
});
// Setup the nextRequest function.
//
// Sends the next queued request, if any. Once the queue (global
// `pendingRequests`) is empty, the final output is produced: legiToc is
// serialized into the global `jsonResult`, copied to the clipboard, and a
// recap of succeeded / failed requests is displayed.
var nextRequest = function() {
  // Are there any pending request?
  if (pendingRequests.length > 0) {
    var request = pendingRequests.shift();
    fetchSectionArticleList(request.url, request.node);
    return;
  }

  // If not, let's display the final output
  // Format To JSON
  jsonResult = format2JSON(legiToc);
  // Copy data to clipboard
  copyResult(jsonResult);
  console.log('');

  // Display errors, if any
  if (errors.length > 0) {
    console.info('\n' + (succeededRequests === 0 ? 'No' : succeededRequests) + ' request' + (succeededRequests > 1 ? 's' : '') + ' succeeded');
    console.error(errors.length + ' failed request' + (errors.length > 1 ? 's' : ''), errors);
  } else {
    console.info('Done, no error occured.');
  }
};
// Finally, trigger the first request | |
nextRequest(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment