daformat · January 10, 2017 04:09
diff --git a/extract-LegifranceToc-Json.js b/extract-LegifranceToc-Json.js
 // Extract JSON table of content from any given french Legal code
 // ==============================================================
 // author: @daformat <mat.jouhet[at][google's mail service].com>
 // lastmod: 2017/01/10
 //
 // Usage:
 // ------
 // Execute in javascript console while browsing the toc you're interested in
 // see comments if you need to tweak anything.
 //
 // Notes:
 // ------
 // • Légifrance gives you access to their entire dataset via XML files.
 //   So, **if you need to frequently grab data**, you should definitely take
 //   a look at the French government open-data sets. The one we're interested in:
 //   http://www.data.gouv.fr/fr/datasets/legi-codes-lois-et-reglements-consolides/
 //   Should tell you to go to ftp://echanges.dila.gouv.fr/LEGI/
 //
 // • Obviously, this script will break if Légifrance changes their layout
 //   It was done as a quick way to get data from Légifrance without the pain
 //   of dealing with the XMLs. Given the very short delay for the first release
 //   of the project we're working on, we chose the quick and easy way.
 //
 // • The JSON data is formatted so you can use it ootb with react-treebeard
 //   https://github.com/alexcurtis/react-treebeard/
 //
 // Buy me a coffee?
 // ----------------
 // Want to buy me a coffee? http://ko-fi.com/A581HY1
 //
 // Feel free to ping me on your favorite social network

 // Configuration switch:
 //   true  - after building the TOC, also fetch each section page and add
 //           its articles as children (STEP 2 below)
 //   false - only output the TOC of the current page, without the articles
 var fetchArticleList = true;

 // format to JSON helper
 // ---------------------
 // Serializes `what` with JSON.stringify, indenting with a single space.
 // Change the last argument to '\t' if you prefer tabs for indent, or drop
 // the last two arguments if you want the output minified.
 //
 // what: the value to serialize (the TOC tree in this script)
 // returns: whatever JSON.stringify returns for `what`
 //
 // Declared with `var`: assigning to an undeclared name creates an implicit
 // global and throws a ReferenceError in strict mode.
 var format2JSON = function(what) {
   return JSON.stringify(what, null, ' ');
 };

 // Copy result helper
 // ------------------
 // Tries to put `what` on the clipboard with the console's `copy()` command
 // line API. If that throws, `what` is logged instead so the user can always
 // copy it manually, and we suggest running the copy command by hand.
 //
 // what: the value to copy (the JSON string in this script)
 //
 // Declared with `var`: assigning to an undeclared name creates an implicit
 // global and throws a ReferenceError in strict mode.
 var copyResult = function(what) {
   try {
     copy(what);
     console.info('JSON data was copied to clipboard');
   } catch(e) {
     var lines = [
       'Sorry about that, but the above data couldn’t be copied to clipboard',
       'Maybe your browser doesn’t support the copy command line API?',
       'You should try manually running `copy(jsonResult)`'
     ];

     console.log(what);
     console.error(lines[0] + '\n' + lines[1]);
     console.info(lines[2]);

     // The flag tells whether the fallback copy command made it to the
     // clipboard. The typeof guard keeps this error path from throwing
     // itself when the flag was never declared.
     if (typeof copyCommandWasCopiedToClipboard !== 'undefined' &&
         copyCommandWasCopiedToClipboard === true) {
       console.log('(ironically, this command was copied to your clipboard)');
     }
   }
 };

 // The copy command line API doesn't seem to like big datasets when it is
 // called from within the script. As a fallback, the command that performs
 // the copy is itself copied to the clipboard first (how ironic), and the
 // flag below records whether that worked.
 var copyCommandWasCopiedToClipboard = false;
 try {
   copy('try {copy(jsonResult); console.info(\'Data was sucessfully copied to your clipboard\')} catch (e) {console.error(\'No, really, we tried, but you have to manually copy the data\')}');
   copyCommandWasCopiedToClipboard = true;
 } catch(e) {
   // copy() is unavailable or failed: the flag stays false
 }

 // Prefix built once from the current page's protocol and host name; it is
 // prepended to the hrefs extracted from the pages
 var base = [window.location.protocol, '//', window.location.hostname, '/'].join('');

 // =======================================================
 // STEP 1 - Get the table of content from the current page
 // =======================================================
 //
 // Recursive function to walk through the tree.
 //
 // node:             a TOC list item (anything jQuery can wrap)
 // destinationArray: array that receives the object built for `node`
 //
 // The pushed object always has `name` and `section: true`. When the node has
 // nested `ul > li` items it also gets `children` (built recursively) and
 // `toggled: true`. When the node has a `span.codeLienArt` child, the span's
 // text is appended to the name and `source` is set to `base` + the link's
 // href with its `;jsessionid=...` part removed.
 //
 // Declared with `var`: assigning to an undeclared name creates an implicit
 // global and throws a ReferenceError in strict mode.
 var traverse = function(node, destinationArray) {
   var main = $(node);
   var children = [];

   // Extract data
   var nodeName = $.trim(main.children('span').eq(0).text());
   var legiLink = main.children('span.codeLienArt');

   // attr() gives undefined when there is no link: fall back to '' explicitly
   // instead of relying on an exception being thrown and caught
   var href = legiLink.eq(0).children('a').attr('href');
   var legiLinkCleanUrl = href ? href.replace(/;jsessionid=[^\?]+\?/, '?') : '';

   // Traverse inner sections
   $('>ul>li', main).each(function() {
     var section = [];
     traverse(this, section);
     children.push(section[0]);
   });

   // Store extracted node data
   var currentNodeData = {name: nodeName, section: true};
   if (children.length !== 0) {
     currentNodeData.children = children;
     currentNodeData.toggled = true;
   }

   // Add source url and article range if present
   // NOTE(review): when the span holds no link, `source` ends up being just
   // `base` - confirm whether such nodes should get a source at all.
   if (legiLink.length !== 0) {
     currentNodeData.name += ' ' + $.trim(legiLink.eq(0).text().replace(/\s+\)/, ')'));
     currentNodeData.source = base + legiLinkCleanUrl;
   }

   destinationArray.push(currentNodeData);
 };

 // The TOC tree of the current page, filled in by traverse()
 var legiToc = [];

 // Hand every list item matched on the current page over to traverse()
 $('#content_left>.data>ul.noType>li').each(function(index, tocItem) {
   traverse(tocItem, legiToc);
 });

 // Serialize the tree
 var jsonResult = format2JSON(legiToc);

 // If fetchArticleList is set to false, we'll try to copy the data to clipboard
 // and STOP right after doing so.
 // ------------------------------
 if ( fetchArticleList === false ) {
  // Copy data to clipboard
  copyResult(jsonResult);

 // If not, we keep going
 } else {

  // ===========================
  // STEP 2 - fetch article list
  // ===========================
  // If the fetchArticleList variable is set to true we'll go fetch articles titles
  // from sections when their source prop is defined and not empty
  //
  // Bookkeeping shared by the functions below:
  //   pendingRequests   - queue of {url, node} entries still to be fetched
  //   succeededRequests - number of section pages handled successfully
  //   errors            - one {href, response} entry per failed request
  var pendingRequests = [];
  var succeededRequests = 0;
  var errors = [];

  // This function loads the section page found at `url` via a get request.
  // It extracts article names + shortnames + links from the page and stores
  // them at the beginning of `destObj.children` (before any child section
  // already there). Whatever the outcome, nextRequest() is called afterwards
  // so the queue keeps moving.
  //
  // url:     address of the section page
  // destObj: TOC node that receives the articles
  //
  // Declared with `var`: assigning to an undeclared name creates an implicit
  // global and throws a ReferenceError in strict mode.
  var fetchSectionArticleList = function(url, destObj) {

    // Builds the list of {name, shortname, source} from the fetched markup
    var extractArticles = function(markup) {
      // The response is padded with whitespace on both sides: according to
      // the original author this was needed to get something jQuery could
      // work with.
      var $d = $('        ' + markup + '        ');

      // Grab every article title, dropping the "(abrogé ...)" and
      // "En savoir plus ..." suffixes up to the end of their line
      var $titreArt = $d.find('.titreArt');
      var articlesNames = $titreArt.text().replace(/\s\(abrogé(.*)/g, '').replace(/\s\En savoir plus(.*)/g, '').split('\n');
      var articles = [];

      // Loop through each article
      for (var i = 0; i < articlesNames.length; i++) {
        if (articlesNames[i] != '') {
          // Extract link / name / shortname
          var articleLink = base + $titreArt.eq(i).children('a').attr('href').replace(/;jsessionid=[^\?]+\?/,'?');
          var articleName = $.trim(articlesNames[i]);
          var shortname = articleName.replace(/Article /g,'');

          articles.push({name: articleName, 'shortname': shortname, source: articleLink});
        } else {
          // NOTE(review): removing the empty entry without stepping `i` back
          // means the entry sliding into slot `i` is never examined. Left
          // untouched because the pairing of articlesNames[i] with
          // $titreArt.eq(i) may depend on it - verify against a live page.
          articlesNames.splice(i, 1);
        }
      }

      return articles;
    };

    // Our succeed handler
    var onSucceed = function(d) {
      try {
        var sectionChildren = extractArticles(d);

        if (sectionChildren.length > 0) {
          if (destObj.children) {
            // Concat articles with other child sections
            destObj.children = sectionChildren.concat(destObj.children);
          } else {
            // Store articles
            destObj.children = sectionChildren;
          }
        }
      } catch(e) {
        // A page we cannot parse must not stall the queue: record the
        // failure and move on to the next request
        console.error('An error occured while parsing the section page', url, e);
        errors.push({href: url, response: e});
        nextRequest();
        return;
      }

      // Let's increment our succeeded counter
      succeededRequests++;

      // Calculate stats and display a recap in the console
      var total = succeededRequests + pendingRequests.length;
      var percentage = Math.floor(succeededRequests / total * 100);
      var errorCount = errors.length;
      var errorMsg = (errorCount > 0 ? (errorCount + ' error' + (errorCount > 1 ? 's' : '') + ' ') : '');
      var msg = '[ ' + percentage + '% ] ' + errorMsg + succeededRequests + '/' + total + ' - fetched';

      console.info(msg, url);
      console.log(destObj.name);
      try {
        console.table(destObj.children);
        console.log('');
      } catch(e) {
        // No console.table available: fall back to a plain log
        console.log(destObj.children);
      }

      // Trigger the next request
      nextRequest();
    };

    // Error handler
    var onFail = function(d) {
      console.error('An error occured during the get request', d);
      errors.push({href: url, response: d});
      nextRequest();
    };

    $.get(url, null, onSucceed).fail(onFail);
  };

  // Our second recursive function queues one pending request for every node
  // that has a non-empty `source`, then walks through the node's children.
  // Another function triggers the queued requests sequentially.
  //
  // obj: a TOC node, possibly with `source` and `children`
  //
  // Declared with `var`: assigning to an undeclared name creates an implicit
  // global and throws a ReferenceError in strict mode.
  var traverse2 = function (obj) {
    var currentNode = obj;

    // If we have a source, queue the fetch of its article list
    if (currentNode.source) {
      pendingRequests.push({url: currentNode.source, node: currentNode});
      console.log("Will fetch articles in", currentNode.name);
    }

    // Check for children explicitly rather than catching the TypeError that
    // reading `.length` on a missing array would raise
    var children = currentNode.children;
    if (!children) {
      if (!currentNode.source) {
        console.info("Empty section detected!", currentNode.name, currentNode);
      }
      return;
    }

    // Recursively walk through each of the current node children
    for (var i = 0; i < children.length; i++) {
      traverse2(children[i]);
    }
  };

  // We now have to call the recursive function on each top-level legiToc
  // entry. legiToc is an array, so it is iterated as one: for...in is meant
  // for object keys and would also visit any extra enumerable property.
  legiToc.forEach(function(tocNode) {
    traverse2(tocNode);
  });

  // Setup the nextRequest function, which drives the request queue.
  // While requests are pending it takes the first one out of the queue and
  // fetches it. Once the queue is empty it serializes legiToc into
  // jsonResult, tries to copy it to the clipboard and reports the outcome.
  //
  // Declared with `var`: assigning to an undeclared name creates an implicit
  // global and throws a ReferenceError in strict mode.
  var nextRequest = function() {
    // Are there any pending request?
    if (pendingRequests.length > 0) {
      var request = pendingRequests.shift();
      fetchSectionArticleList(request.url, request.node);
      return;
    }

    // If not, let's display the final output
    jsonResult = format2JSON(legiToc);

    // Copy data to clipboard
    copyResult(jsonResult);

    console.log('');

    // Display errors, if any
    if (errors.length > 0) {
      console.info('\n' + (succeededRequests === 0 ? 'No' : succeededRequests) + ' request' + (succeededRequests > 1 ? 's' : '') + ' succeeded');
      console.error(errors.length + ' failed request' + (errors.length > 1 ? 's' : ''), errors);
    } else {
      console.info('Done, no error occured.');
    }
  };

  // Finally, trigger the first request. When nothing was queued, this goes
  // straight to the final output.
  nextRequest();

 }
	// Extract JSON table of content from any given french Legal code
	// ==============================================================
	// author: @daformat <mat.jouhet[at][google's mail service].com>
	// lastmod: 2017/01/10
	//
	// Usage:
	// ------
	// Execute in javascript console while browsing the toc you're interested in
	// see comments if you need to tweak anything.
	//
	// Notes:
	// ------
	// • Légifrance gives you access to their entire dataset via XML files.
	// So, if you need to frequently grab data, you should definitely take
	// a look at the French government open-data sets. The one we're interested in:
	// http://www.data.gouv.fr/fr/datasets/legi-codes-lois-et-reglements-consolides/
	// Should tell you to go to ftp://echanges.dila.gouv.fr/LEGI/
	//
	// • Obviously, this script will break if Légifrance changes their layout
	// It was done as a quick way to get data from Légifrance without the pain
	// of dealing with the XMLs. Given the very short delay for the first release
	// of the project we're working on, we chose the quick and easy way.
	//
	// • The JSON data is formatted so you can use it ootb with react-treebeard
	// https://github.com/alexcurtis/react-treebeard/
	//
	// Buy me a coffee?
	// ----------------
	// Want to buy me a coffee? http://ko-fi.com/A581HY1
	//
	// Feel free to ping me on your favorite social network

	// Configuration switch:
	//   true  - after building the TOC, also fetch each section page and add
	//           its articles as children (STEP 2 below)
	//   false - only output the TOC of the current page, without the articles
	var fetchArticleList = true;

	// format to JSON helper
	// ---------------------
	// Serializes `what` with JSON.stringify, indenting with a single space.
	// Change the last argument to '\t' if you prefer tabs for indent, or drop
	// the last two arguments if you want the output minified.
	//
	// what: the value to serialize (the TOC tree in this script)
	// returns: whatever JSON.stringify returns for `what`
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var format2JSON = function(what) {
	  return JSON.stringify(what, null, ' ');
	};

	// Copy result helper
	// ------------------
	// Tries to put `what` on the clipboard with the console's `copy()` command
	// line API. If that throws, `what` is logged instead so the user can always
	// copy it manually, and we suggest running the copy command by hand.
	//
	// what: the value to copy (the JSON string in this script)
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var copyResult = function(what) {
	  try {
	    copy(what);
	    console.info('JSON data was copied to clipboard');
	  } catch(e) {
	    var lines = [
	      'Sorry about that, but the above data couldn’t be copied to clipboard',
	      'Maybe your browser doesn’t support the copy command line API?',
	      'You should try manually running `copy(jsonResult)`'
	    ];

	    console.log(what);
	    console.error(lines[0] + '\n' + lines[1]);
	    console.info(lines[2]);

	    // The flag tells whether the fallback copy command made it to the
	    // clipboard. The typeof guard keeps this error path from throwing
	    // itself when the flag was never declared.
	    if (typeof copyCommandWasCopiedToClipboard !== 'undefined' &&
	        copyCommandWasCopiedToClipboard === true) {
	      console.log('(ironically, this command was copied to your clipboard)');
	    }
	  }
	};

	// The copy command line API doesn't seem to like big datasets when it is
	// called from within the script. As a fallback, the command that performs
	// the copy is itself copied to the clipboard first (how ironic), and the
	// flag below records whether that worked.
	var copyCommandWasCopiedToClipboard = false;
	try {
	  copy('try {copy(jsonResult); console.info(\'Data was sucessfully copied to your clipboard\')} catch (e) {console.error(\'No, really, we tried, but you have to manually copy the data\')}');
	  copyCommandWasCopiedToClipboard = true;
	} catch(e) {
	  // copy() is unavailable or failed: the flag stays false
	}

	// Prefix built once from the current page's protocol and host name; it is
	// prepended to the hrefs extracted from the pages
	var base = [window.location.protocol, '//', window.location.hostname, '/'].join('');

	// =======================================================
	// STEP 1 - Get the table of content from the current page
	// =======================================================
	//
	// Recursive function to walk through the tree.
	//
	// node:             a TOC list item (anything jQuery can wrap)
	// destinationArray: array that receives the object built for `node`
	//
	// The pushed object always has `name` and `section: true`. When the node has
	// nested `ul > li` items it also gets `children` (built recursively) and
	// `toggled: true`. When the node has a `span.codeLienArt` child, the span's
	// text is appended to the name and `source` is set to `base` + the link's
	// href with its `;jsessionid=...` part removed.
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var traverse = function(node, destinationArray) {
	  var main = $(node);
	  var children = [];

	  // Extract data
	  var nodeName = $.trim(main.children('span').eq(0).text());
	  var legiLink = main.children('span.codeLienArt');

	  // attr() gives undefined when there is no link: fall back to '' explicitly
	  // instead of relying on an exception being thrown and caught
	  var href = legiLink.eq(0).children('a').attr('href');
	  var legiLinkCleanUrl = href ? href.replace(/;jsessionid=[^\?]+\?/, '?') : '';

	  // Traverse inner sections
	  $('>ul>li', main).each(function() {
	    var section = [];
	    traverse(this, section);
	    children.push(section[0]);
	  });

	  // Store extracted node data
	  var currentNodeData = {name: nodeName, section: true};
	  if (children.length !== 0) {
	    currentNodeData.children = children;
	    currentNodeData.toggled = true;
	  }

	  // Add source url and article range if present
	  // NOTE(review): when the span holds no link, `source` ends up being just
	  // `base` - confirm whether such nodes should get a source at all.
	  if (legiLink.length !== 0) {
	    currentNodeData.name += ' ' + $.trim(legiLink.eq(0).text().replace(/\s+\)/, ')'));
	    currentNodeData.source = base + legiLinkCleanUrl;
	  }

	  destinationArray.push(currentNodeData);
	};

	// The TOC tree of the current page, filled in by traverse()
	var legiToc = [];

	// Hand every list item matched on the current page over to traverse()
	$('#content_left>.data>ul.noType>li').each(function(index, tocItem) {
	  traverse(tocItem, legiToc);
	});

	// Serialize the tree
	var jsonResult = format2JSON(legiToc);

	// If fetchArticleList is set to false, we'll try to copy the data to clipboard
	// and STOP right after doing so.
	// ------------------------------
	if ( fetchArticleList === false ) {
	// Copy data to clipboard
	copyResult(jsonResult);

	// If not, we keep going
	} else {

	// ===========================
	// STEP 2 - fetch article list
	// ===========================
	// If the fetchArticleList variable is set to true we'll go fetch articles titles
	// from sections when their source prop is defined and not empty
	//
	// Bookkeeping shared by the functions below:
	//   pendingRequests   - queue of {url, node} entries still to be fetched
	//   succeededRequests - number of section pages handled successfully
	//   errors            - one {href, response} entry per failed request
	var pendingRequests = [];
	var succeededRequests = 0;
	var errors = [];

	// This function loads the section page found at `url` via a get request.
	// It extracts article names + shortnames + links from the page and stores
	// them at the beginning of `destObj.children` (before any child section
	// already there). Whatever the outcome, nextRequest() is called afterwards
	// so the queue keeps moving.
	//
	// url:     address of the section page
	// destObj: TOC node that receives the articles
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var fetchSectionArticleList = function(url, destObj) {

	  // Builds the list of {name, shortname, source} from the fetched markup
	  var extractArticles = function(markup) {
	    // The response is padded with whitespace on both sides: according to
	    // the original author this was needed to get something jQuery could
	    // work with.
	    var $d = $('\t' + markup + '\t');

	    // Grab every article title, dropping the "(abrogé ...)" and
	    // "En savoir plus ..." suffixes up to the end of their line. The groups
	    // must be `(.*)`: a bare `(.)` strips one character only and leaves the
	    // rest of the suffix in the article name.
	    var $titreArt = $d.find('.titreArt');
	    var articlesNames = $titreArt.text().replace(/\s\(abrogé(.*)/g, '').replace(/\s\En savoir plus(.*)/g, '').split('\n');
	    var articles = [];

	    // Loop through each article
	    for (var i = 0; i < articlesNames.length; i++) {
	      if (articlesNames[i] != '') {
	        // Extract link / name / shortname
	        var articleLink = base + $titreArt.eq(i).children('a').attr('href').replace(/;jsessionid=[^\?]+\?/,'?');
	        var articleName = $.trim(articlesNames[i]);
	        var shortname = articleName.replace(/Article /g,'');

	        articles.push({name: articleName, 'shortname': shortname, source: articleLink});
	      } else {
	        // NOTE(review): removing the empty entry without stepping `i` back
	        // means the entry sliding into slot `i` is never examined. Left
	        // untouched because the pairing of articlesNames[i] with
	        // $titreArt.eq(i) may depend on it - verify against a live page.
	        articlesNames.splice(i, 1);
	      }
	    }

	    return articles;
	  };

	  // Our succeed handler
	  var onSucceed = function(d) {
	    try {
	      var sectionChildren = extractArticles(d);

	      if (sectionChildren.length > 0) {
	        if (destObj.children) {
	          // Concat articles with other child sections
	          destObj.children = sectionChildren.concat(destObj.children);
	        } else {
	          // Store articles
	          destObj.children = sectionChildren;
	        }
	      }
	    } catch(e) {
	      // A page we cannot parse must not stall the queue: record the
	      // failure and move on to the next request
	      console.error('An error occured while parsing the section page', url, e);
	      errors.push({href: url, response: e});
	      nextRequest();
	      return;
	    }

	    // Let's increment our succeeded counter
	    succeededRequests++;

	    // Calculate stats and display a recap in the console
	    var total = succeededRequests + pendingRequests.length;
	    var percentage = Math.floor(succeededRequests / total * 100);
	    var errorCount = errors.length;
	    var errorMsg = (errorCount > 0 ? (errorCount + ' error' + (errorCount > 1 ? 's' : '') + ' ') : '');
	    var msg = '[ ' + percentage + '% ] ' + errorMsg + succeededRequests + '/' + total + ' - fetched';

	    console.info(msg, url);
	    console.log(destObj.name);
	    try {
	      console.table(destObj.children);
	      console.log('');
	    } catch(e) {
	      // No console.table available: fall back to a plain log
	      console.log(destObj.children);
	    }

	    // Trigger the next request
	    nextRequest();
	  };

	  // Error handler
	  var onFail = function(d) {
	    console.error('An error occured during the get request', d);
	    errors.push({href: url, response: d});
	    nextRequest();
	  };

	  $.get(url, null, onSucceed).fail(onFail);
	};

	// Our second recursive function queues one pending request for every node
	// that has a non-empty `source`, then walks through the node's children.
	// Another function triggers the queued requests sequentially.
	//
	// obj: a TOC node, possibly with `source` and `children`
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var traverse2 = function (obj) {
	  var currentNode = obj;

	  // If we have a source, queue the fetch of its article list
	  if (currentNode.source) {
	    pendingRequests.push({url: currentNode.source, node: currentNode});
	    console.log("Will fetch articles in", currentNode.name);
	  }

	  // Check for children explicitly rather than catching the TypeError that
	  // reading `.length` on a missing array would raise
	  var children = currentNode.children;
	  if (!children) {
	    if (!currentNode.source) {
	      console.info("Empty section detected!", currentNode.name, currentNode);
	    }
	    return;
	  }

	  // Recursively walk through each of the current node children
	  for (var i = 0; i < children.length; i++) {
	    traverse2(children[i]);
	  }
	};

	// We now have to call the recursive function on each top-level legiToc
	// entry. legiToc is an array, so it is iterated as one: for...in is meant
	// for object keys and would also visit any extra enumerable property.
	legiToc.forEach(function(tocNode) {
	  traverse2(tocNode);
	});

	// Setup the nextRequest function, which drives the request queue.
	// While requests are pending it takes the first one out of the queue and
	// fetches it. Once the queue is empty it serializes legiToc into
	// jsonResult, tries to copy it to the clipboard and reports the outcome.
	//
	// Declared with `var`: assigning to an undeclared name creates an implicit
	// global and throws a ReferenceError in strict mode.
	var nextRequest = function() {
	  // Are there any pending request?
	  if (pendingRequests.length > 0) {
	    var request = pendingRequests.shift();
	    fetchSectionArticleList(request.url, request.node);
	    return;
	  }

	  // If not, let's display the final output
	  jsonResult = format2JSON(legiToc);

	  // Copy data to clipboard
	  copyResult(jsonResult);

	  console.log('');

	  // Display errors, if any
	  if (errors.length > 0) {
	    console.info('\n' + (succeededRequests === 0 ? 'No' : succeededRequests) + ' request' + (succeededRequests > 1 ? 's' : '') + ' succeeded');
	    console.error(errors.length + ' failed request' + (errors.length > 1 ? 's' : ''), errors);
	  } else {
	    console.info('Done, no error occured.');
	  }
	};

	// Finally, trigger the first request. When nothing was queued, this goes
	// straight to the final output.
	nextRequest();

	}