Skip to content

Instantly share code, notes, and snippets.

@daformat
Last active January 10, 2017 04:09
Show Gist options
  • Save daformat/dd07d781286b812500cb77519d9a9bf1 to your computer and use it in GitHub Desktop.
Save daformat/dd07d781286b812500cb77519d9a9bf1 to your computer and use it in GitHub Desktop.
[data.gouv.fr] - Extract JSON formatted table of content for any given french legal code (Légifrance's Codes en vigueur)
// Extract JSON table of content from any given french Legal code
// ==============================================================
// author: @daformat <mat.jouhet[at][google's mail service].com>
// lastmod: 2017/01/10
//
// Usage:
// ------
// Execute in javascript console while browsing the toc you're interested in
// see comments if you need to tweak anything.
//
// Notes:
// ------
// • Légifrance gives you access to their entire dataset via XML files.
// So, **if you need to frequently grab data**, you should definitely take
// a look at the French government open-data sets. The one we're interested in:
// http://www.data.gouv.fr/fr/datasets/legi-codes-lois-et-reglements-consolides/
// Should tell you to go to ftp://echanges.dila.gouv.fr/LEGI/
//
// • Obviously, this script will break if Légifrance changes their layout.
// It was done as a quick way to get data from Légifrance without the pain
// of dealing with the XMLs. Given the very short delay for the first release
// of the project we're working on, we chose the quick and easy way.
//
// • The JSON data is formatted so you can use it ootb with react-treebeard
// https://github.com/alexcurtis/react-treebeard/
//
// Buy me a coffee?
// ----------------
// Want to buy me a coffee? http://ko-fi.com/A581HY1
//
// Feel free to ping me on your favorite social network
// Configuration flag.
// true  -> after the TOC is extracted, the section pages are fetched too and
//          their articles are added as children of the matching section.
// false -> only the TOC is extracted and copied (change this to false if you
//          just want the TOC without the articles as children).
var fetchArticleList = true;
// Format to JSON helper
//
// Serializes `what` as indented JSON.
//
// @param  {*} what  - value to serialize
// @return {string}  - JSON text
//
// Change the JSON.stringify params to (what, null, '\t') if you prefer tabs for indent
// Change the JSON.stringify params to (what) if you want it minified
//
// Declared as a real function (instead of assigning to an undeclared name) so
// the script does not depend on implicit globals and also runs in strict mode.
function format2JSON(what) {
  return JSON.stringify(what, null, ' ');
}
// Copy result helper
//
// Tries to put `what` in the clipboard with the console's `copy()` command.
// If that fails, the data is logged so the user can always copy it manually,
// and we suggest running the copy command by hand.
//
// @param {string} what - the text to copy (the JSON result)
//
// Declared as a real function (instead of assigning to an undeclared name) so
// the script does not depend on implicit globals and also runs in strict mode.
function copyResult(what) {
  try {
    copy(what);
    console.info('JSON data was copied to clipboard');
  } catch (e) {
    var lines = [
      'Sorry about that, but the above data couldn’t be copied to clipboard',
      'Maybe your browser doesn’t support the copy command line API?',
      'You should try manually running `copy(jsonResult)`'
    ];
    console.log(what);
    console.error(lines[0] + '\n' + lines[1]);
    console.info(lines[2]);
    // The script tries to put a ready-made copy command in the clipboard
    // beforehand and records the outcome in this flag. The `typeof` guard
    // keeps this error path from throwing if the flag was never set.
    if (typeof copyCommandWasCopiedToClipboard !== 'undefined' &&
        copyCommandWasCopiedToClipboard === true) {
      console.log('(ironically, this command was copied to your clipboard)');
    }
  }
}
// The console's copy command does not seem to cope with big datasets when it
// is called from within the script. As a fallback, a ready-to-paste copy
// command is put in the clipboard first (how ironic), and the flag below
// records whether that worked.
var copyCommandWasCopiedToClipboard = false;
try {
  copy('try {copy(jsonResult); console.info(\'Data was sucessfully copied to your clipboard\')} catch (e) {console.error(\'No, really, we tried, but you have to manually copy the data\')}');
  copyCommandWasCopiedToClipboard = true;
} catch (e) {
  // copy() is unavailable or failed: the flag stays false
}
// URL prefix of the current site: protocol, '//', hostname and a trailing
// slash. Computed once so it can be prepended to the links read from the page.
var base = [window.location.protocol, '', window.location.hostname, ''].join('/');
// =======================================================
// STEP 1 - Get the table of content from the current page
// =======================================================
//
// Recursive function to walk through the tree
//
// Reads one TOC list item, builds the matching entry and pushes it onto
// `destinationArray`. Inner `>ul>li` items are walked recursively and stored
// under the entry's `children` property.
//
// @param {Element} node             - the list item to read
// @param {Array}   destinationArray - receives exactly one entry for `node`
//
// Resulting entry: {name, section: true[, children, toggled: true][, source]}
//
// Relies on jQuery (`$`) and on the `base` URL prefix defined elsewhere in
// this script.
function traverse(node, destinationArray) {
  var main = $(node);

  // Extract data
  var nodeName = $.trim(main.children('span').eq(0).text());
  var legiLink = main.children('span.codeLienArt');

  // Traverse inner sections
  var children = [];
  $('>ul>li', main).each(function() {
    var section = [];
    traverse(this, section);
    children.push(section[0]);
  });

  // Store extracted node data
  var currentNodeData = {name: nodeName, section: true};
  if (children.length !== 0) {
    currentNodeData.children = children;
    currentNodeData.toggled = true;
  }

  // Add source url and article range if present
  if (legiLink.length !== 0) {
    var firstLink = legiLink.eq(0);
    currentNodeData.name += ' ' + $.trim(firstLink.text().replace(/\s+\)/, ')'));

    // Only set `source` when the link really has an href. Otherwise it would
    // be the bare `base` url, which is never empty, and the site root would be
    // queued as if it were a section page.
    var href = firstLink.children('a').attr('href');
    if (href) {
      // Drop the session id embedded in the link
      currentNodeData.source = base + href.replace(/;jsessionid=[^\?]+\?/, '?');
    }
  }

  destinationArray.push(currentNodeData);
}
// Main result: one entry per top-level item of the page's table of content
var legiToc = [];
// Walk the TOC of the current page, starting from each top-level list item
$('#content_left>.data>ul.noType>li').each(function(index, topLevelItem) {
  traverse(topLevelItem, legiToc);
});
// Serialize what was collected so far
var jsonResult = format2JSON(legiToc);
// If fetchArticleList is set to false, we'll try to copy the data to clipboard
// and STOP right after doing so.
// ------------------------------
if ( fetchArticleList === false ) {
// Copy data to clipboard
copyResult(jsonResult);
// If not, we keep going
} else {
// ===========================
// STEP 2 - fetch article list
// ===========================
// If the fetchArticleList variable is set to true we'll go fetch articles titles
// from sections when their source prop is defined and not empty
//
// We'll use this to get to know when jquery finishes fetching everything.
// Queue of {url, node} requests still to run: filled by traverse2, consumed
// one at a time by nextRequest.
var pendingRequests = [];
// Number of section pages fetched successfully so far
var succeededRequests = 0;
// One {href, response} entry per failed request
var errors = [];
// This function loads the section page found at `url` via a get request.
// It extracts article names + shortnames from the page and stores them as
// objects at the front of the `children` array of the destination object,
// then triggers the next queued request.
//
// @param {string} url     - url of the section page to fetch
// @param {Object} destObj - TOC entry that receives the articles as children
//
// Relies on jQuery (`$`) and on `base`, `errors`, `pendingRequests`,
// `succeededRequests` and `nextRequest`, defined elsewhere in this script.
var fetchSectionArticleList = function(url, destObj) {
  // Builds the {name, shortname[, source]} entry for one `.titreArt` element.
  // Returns null when the element holds no usable title.
  var readArticle = function(titleElement) {
    var $title = $(titleElement);
    var lines = $title.text()
      .replace(/\s\(abrogé(.*)/g, '')
      .replace(/\sEn savoir plus(.*)/g, '')
      .split('\n');

    // The article name is the first non-blank line left after the clean-up
    var articleName = '';
    for (var i = 0; i < lines.length && articleName === ''; i++) {
      articleName = $.trim(lines[i]);
    }
    if (articleName === '') {
      return null;
    }

    var article = {
      name: articleName,
      shortname: articleName.replace(/Article /g, '')
    };
    // A title without a link must not break the whole page (and with it the
    // request queue): in that case the entry simply has no `source`.
    var href = $title.children('a').attr('href');
    if (href) {
      // Drop the session id embedded in the link
      article.source = base + href.replace(/;jsessionid=[^\?]+\?/, '?');
    }
    return article;
  };

  // Our succeed handler
  var onSucceed = function(d) {
    // Wrap the fetched markup in a container element so jQuery always treats
    // it as HTML and `.find()` can reach every element in it.
    var $d = $('<div>' + d + '</div>');

    // Read each title element on its own, so that a name and its link always
    // come from the same element.
    var sectionChildren = [];
    $d.find('.titreArt').each(function() {
      var article = readArticle(this);
      if (article !== null) {
        sectionChildren.push(article);
      }
    });

    // Articles go before any child section already stored
    if (sectionChildren.length > 0) {
      destObj.children = sectionChildren.concat(destObj.children || []);
    }

    // Let's increment our succeeded counter
    succeededRequests++;

    // Calculate stats and display a recap in the console
    var total = succeededRequests + pendingRequests.length;
    var percentage = Math.floor(succeededRequests / total * 100);
    var errorCount = errors.length;
    var errorMsg = errorCount > 0
      ? errorCount + ' error' + (errorCount > 1 ? 's' : '') + ' '
      : '';
    var msg = '[ ' + percentage + '% ] ' + errorMsg + succeededRequests + '/' + total + ' - fetched';
    console.info(msg, url);
    console.log(destObj.name);
    try {
      console.table(destObj.children);
      console.log('');
    } catch (e) {
      // console.table is not available everywhere
      console.log(destObj.children);
    }

    // Trigger the next request
    nextRequest();
  };

  // Error handler: record the failure and keep the queue going
  var onFail = function(d) {
    console.error('An error occured during the get request', d);
    errors.push({href: url, response: d});
    nextRequest();
  };

  $.get(url, null, onSucceed).fail(onFail);
};
// Our second recursive function will push pending requests
// We'll then use another function to sequentially trigger the requests
//
// Walks `obj` and all of its descendants; every entry with a non-empty
// `source` gets a {url, node} request pushed onto `pendingRequests`.
// Entries with neither children nor source are reported as empty sections.
//
// @param {Object} obj - TOC entry ({name[, source][, children]})
var traverse2 = function (obj) {
  var currentNode = obj;

  // If we have a source, queue the fetch of its article list
  if (currentNode.source) {
    pendingRequests.push({url: currentNode.source, node: currentNode});
    console.log("Will fetch articles in", currentNode.name);
  }

  // Leaf entry: nothing to walk. Checked explicitly rather than through a
  // try/catch, which would also hide real errors raised deeper in the tree
  // and silently skip the remaining siblings.
  var children = currentNode.children;
  if (!children) {
    if (!currentNode.source) {
      console.info("Empty section detected!", currentNode.name, currentNode);
    }
    return;
  }

  // Recursively walk through each of the current node children
  for (var i = 0; i < children.length; i++) {
    traverse2(children[i]);
  }
};
// We now have to call the recursive function on each top-level legiToc entry.
// legiToc is an array, so it is walked by index (in order, own elements only)
// rather than with for...in.
for (var tocIndex = 0; tocIndex < legiToc.length; tocIndex++) {
  traverse2(legiToc[tocIndex]);
}
// Setup the nextRequest function
//
// Runs the next queued request if there is one. Once the queue is empty it
// produces the final output: the TOC is serialized again, copied to the
// clipboard and a recap of the failed requests (if any) is displayed.
//
// Relies on `pendingRequests`, `errors`, `succeededRequests`, `legiToc`,
// `jsonResult`, `format2JSON`, `copyResult` and `fetchSectionArticleList`,
// defined elsewhere in this script.
var nextRequest = function() {
  // Are there any pending request?
  if (pendingRequests.length > 0) {
    var request = pendingRequests.shift();
    fetchSectionArticleList(request.url, request.node);
    return;
  }

  // If not, let's display the final output
  // Format To JSON
  jsonResult = format2JSON(legiToc);
  // Copy data to clipboard
  copyResult(jsonResult);
  console.log('');

  // Display errors, if any
  if (errors.length > 0) {
    console.info('\n' + (succeededRequests === 0 ? 'No' : succeededRequests) + ' request' + (succeededRequests > 1 ? 's' : '') + ' succeeded');
    console.error(errors.length + ' failed request' + (errors.length > 1 ? 's' : ''), errors);
  } else {
    console.info('Done, no error occured.');
  }
};
// Finally, trigger the first request
nextRequest();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment