IanSimpson · April 25, 2019 23:08
diff --git a/converter.php b/converter.php
 <?php
 /**
 * converter.php
 *
 * LightTag to AWS Comprehend annotation converter
 *
 * @author      Ian Simpson <[email protected]>
 * @copyright   2019 The Logic Studio Limited
 * @license     GPL 3
 * @license     https://opensource.org/licenses/GPL-3.0 GNU General Public License version 3
 *
 * To use, drop this file in a folder, along with a folder called "input". Put the JSON files exported from LightTag into
 * the "input" folder, then run this script. As output you will get a file "out.csv", and a folder "output" containing all
 * source texts. Put this on S3, point AWS Comprehend at it, and it should magically work.
 *
 */

 // Helper is guarded so that loading this script twice cannot redeclare it
 if (!function_exists('annotationSegments')) {
 	/**
 	 * Split one annotation into per-line segments.
 	 *
 	 * The CSV written below addresses text by line number plus character offsets within that
 	 * line, so an annotation spanning several lines is reported as one segment per line.
 	 * Offsets are counted in UTF-8 characters. $end is treated as exclusive (the emitted end
 	 * offset is the column of character index $end) - NOTE(review): this assumes LightTag's
 	 * "end" is exclusive; confirm against an export.
 	 *
 	 * Only newlines strictly before $start count towards the starting line, so an annotation
 	 * that begins on a newline character is not shifted down by an extra line.
 	 *
 	 * @param string $content Full text the annotation refers to
 	 * @param int    $start   Character index of the first annotated character
 	 * @param int    $end     Character index one past the last annotated character
 	 *
 	 * @return array List of [line, beginOffset, endOffset] triples; empty (zero-width) line
 	 *               segments are omitted, as is anything outside the text
 	 */
 	function annotationSegments($content, $start, $end)
 	{
 		$start = (int) $start;
 		$end = (int) $end;

 		// Nothing to report for empty, inverted or negative ranges
 		if ($start < 0 || $end <= $start) {
 			return [];
 		}

 		// Everything before the annotation tells us the starting line and column
 		$before = mb_substr($content, 0, $start, 'UTF-8');
 		$line = substr_count($before, "\n");
 		$lastBreak = strrpos($before, "\n");
 		$column = mb_strlen($lastBreak === false ? $before : substr($before, $lastBreak + 1), 'UTF-8');

 		// Each newline inside the annotated text closes one segment and starts the next at column 0
 		$segments = [];
 		foreach (explode("\n", mb_substr($content, $start, $end - $start, 'UTF-8')) as $piece) {
 			$width = mb_strlen($piece, 'UTF-8');
 			if ($width > 0) {
 				$segments[] = [$line, $column, $column + $width];
 			}
 			$line++;
 			$column = 0;
 		}

 		return $segments;
 	}
 }

 // Open our output file
 $fh = fopen('out.csv', 'w');
 if ($fh === false) {
 	throw new RuntimeException('Unable to open out.csv for writing');
 }

 // Add a header line
 fputcsv($fh, ['File', 'Line', 'Begin Offset', 'End Offset', 'Type']);

 // Make sure the input and output folders are ready
 foreach (['output', 'input'] as $dir) {
 	if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
 		throw new RuntimeException('Unable to create folder: ' . $dir);
 	}
 }

 // We'll keep track of completed texts to ensure no duplicates.
 // Hashes are stored as array keys: the lookup is exact (no loose comparison of
 // numeric-looking hashes such as "0e...") and does not scan the whole list.
 $done = [];

 // Find all the files in the input folder, and loop through them
 $files = scandir('input');
 if ($files === false) {
 	throw new RuntimeException('Unable to list the input folder');
 }

 foreach ($files as $f) {
 	$path = 'input/' . $f;

 	// Skip hidden entries ("." and ".." included) and anything that is not a regular file
 	if (substr($f, 0, 1) === '.' || !is_file($path)) {
 		continue;
 	}

 	// Get and parse the input file; a broken file is reported and skipped, not fatal
 	$raw = file_get_contents($path);
 	$in = ($raw === false) ? null : json_decode($raw);
 	if (!is_object($in) || !isset($in->annotations_and_examples)) {
 		trigger_error('Skipping ' . $path . ': not a readable LightTag JSON export', E_USER_WARNING);
 		continue;
 	}

 	// Loop through each text
 	foreach ($in->annotations_and_examples as $a) {

 		// Get a hash of the text to serve as a unique ID and as the output file name
 		$hash = md5($a->content);

 		// Make sure we only process each text once
 		if (isset($done[$hash])) {
 			continue;
 		}
 		$done[$hash] = true;

 		// Output the text to a file ready for upload to S3. The CSV rows below refer to
 		// this file, so failing to write it must not go unnoticed.
 		if (file_put_contents('output/' . $hash, $a->content) === false) {
 			throw new RuntimeException('Unable to write output/' . $hash);
 		}

 		// Loop through each annotation (a text without annotations simply yields no rows)
 		$annotations = isset($a->annotations) ? $a->annotations : [];
 		foreach ($annotations as $an) {
 			// Type is the tag name upper-cased with spaces replaced by underscores
 			$type = str_replace(' ', '_', strtoupper($an->tag));

 			// One CSV row per line the annotation touches
 			foreach (annotationSegments($a->content, $an->start, $an->end) as $segment) {
 				list($line, $begin, $finish) = $segment;
 				fputcsv($fh, [
 					$hash,		// File
 					$line,		// Line
 					$begin,		// Begin Offset
 					$finish,	// End Offset
 					$type,		// Type
 				]);
 			}
 		}
 	}
 }

 // Clean up behind ourselves
 fclose($fh);
	<?php
	/**
	* converter.php
	*
	* LightTag to AWS Comprehend annotation converter
	*
	* @author Ian Simpson <[email protected]>
	* @copyright 2019 The Logic Studio Limited
	* @license GPL 3
	* @license https://opensource.org/licenses/GPL-3.0 GNU General Public License version 3
	*
	* To use, drop this file in a folder, along with a folder called "input". Put the JSON files exported from LightTag into
	* the "input" folder, then run this script. As output you will get a file "out.csv", and a folder "output" containing all
	* source texts. Put this on S3, point AWS Comprehend at it, and it should magically work.
	*
	*/

	// Helper is guarded so that loading this script twice cannot redeclare it
	if (!function_exists('annotationSegments')) {
		/**
		 * Split one annotation into per-line segments.
		 *
		 * The CSV written below addresses text by line number plus character offsets within that
		 * line, so an annotation spanning several lines is reported as one segment per line.
		 * Offsets are counted in UTF-8 characters. $end is treated as exclusive (the emitted end
		 * offset is the column of character index $end) - NOTE(review): this assumes LightTag's
		 * "end" is exclusive; confirm against an export.
		 *
		 * Only newlines strictly before $start count towards the starting line, so an annotation
		 * that begins on a newline character is not shifted down by an extra line.
		 *
		 * @param string $content Full text the annotation refers to
		 * @param int    $start   Character index of the first annotated character
		 * @param int    $end     Character index one past the last annotated character
		 *
		 * @return array List of [line, beginOffset, endOffset] triples; empty (zero-width) line
		 *               segments are omitted, as is anything outside the text
		 */
		function annotationSegments($content, $start, $end)
		{
			$start = (int) $start;
			$end = (int) $end;

			// Nothing to report for empty, inverted or negative ranges
			if ($start < 0 || $end <= $start) {
				return [];
			}

			// Everything before the annotation tells us the starting line and column
			$before = mb_substr($content, 0, $start, 'UTF-8');
			$line = substr_count($before, "\n");
			$lastBreak = strrpos($before, "\n");
			$column = mb_strlen($lastBreak === false ? $before : substr($before, $lastBreak + 1), 'UTF-8');

			// Each newline inside the annotated text closes one segment and starts the next at column 0
			$segments = [];
			foreach (explode("\n", mb_substr($content, $start, $end - $start, 'UTF-8')) as $piece) {
				$width = mb_strlen($piece, 'UTF-8');
				if ($width > 0) {
					$segments[] = [$line, $column, $column + $width];
				}
				$line++;
				$column = 0;
			}

			return $segments;
		}
	}

	// Open our output file
	$fh = fopen('out.csv', 'w');
	if ($fh === false) {
		throw new RuntimeException('Unable to open out.csv for writing');
	}

	// Add a header line
	fputcsv($fh, ['File', 'Line', 'Begin Offset', 'End Offset', 'Type']);

	// Make sure the input and output folders are ready
	foreach (['output', 'input'] as $dir) {
		if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
			throw new RuntimeException('Unable to create folder: ' . $dir);
		}
	}

	// We'll keep track of completed texts to ensure no duplicates.
	// Hashes are stored as array keys: the lookup is exact (no loose comparison of
	// numeric-looking hashes such as "0e...") and does not scan the whole list.
	$done = [];

	// Find all the files in the input folder, and loop through them
	$files = scandir('input');
	if ($files === false) {
		throw new RuntimeException('Unable to list the input folder');
	}

	foreach ($files as $f) {
		$path = 'input/' . $f;

		// Skip hidden entries ("." and ".." included) and anything that is not a regular file
		if (substr($f, 0, 1) === '.' || !is_file($path)) {
			continue;
		}

		// Get and parse the input file; a broken file is reported and skipped, not fatal
		$raw = file_get_contents($path);
		$in = ($raw === false) ? null : json_decode($raw);
		if (!is_object($in) || !isset($in->annotations_and_examples)) {
			trigger_error('Skipping ' . $path . ': not a readable LightTag JSON export', E_USER_WARNING);
			continue;
		}

		// Loop through each text
		foreach ($in->annotations_and_examples as $a) {

			// Get a hash of the text to serve as a unique ID and as the output file name
			$hash = md5($a->content);

			// Make sure we only process each text once
			if (isset($done[$hash])) {
				continue;
			}
			$done[$hash] = true;

			// Output the text to a file ready for upload to S3. The CSV rows below refer to
			// this file, so failing to write it must not go unnoticed.
			if (file_put_contents('output/' . $hash, $a->content) === false) {
				throw new RuntimeException('Unable to write output/' . $hash);
			}

			// Loop through each annotation (a text without annotations simply yields no rows)
			$annotations = isset($a->annotations) ? $a->annotations : [];
			foreach ($annotations as $an) {
				// Type is the tag name upper-cased with spaces replaced by underscores
				$type = str_replace(' ', '_', strtoupper($an->tag));

				// One CSV row per line the annotation touches
				foreach (annotationSegments($a->content, $an->start, $an->end) as $segment) {
					list($line, $begin, $finish) = $segment;
					fputcsv($fh, [
						$hash,		// File
						$line,		// Line
						$begin,		// Begin Offset
						$finish,	// End Offset
						$type,		// Type
					]);
				}
			}
		}
	}

	// Clean up behind ourselves
	fclose($fh);