Last active
April 25, 2019 23:08
-
-
Save IanSimpson/bc9505a650d65251f5ec5faf0eed37ce to your computer and use it in GitHub Desktop.
LightTag to AWS Comprehend annotation conversion tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * index.php
 *
 * LightTag to AWS Comprehend annotation converter
 *
 * @author Ian Simpson <[email protected]>
 * @copyright 2019 The Logic Studio Limited
 * @license GPL 3
 * @license https://opensource.org/licenses/GPL-3.0 GNU General Public License version 3
 *
 * To use, drop this file in a folder, along with a folder called "input". Put the JSON files exported from LightTag into
 * the "input" folder, then run this script. As output you will get a file "out.csv", and a folder "output" containing all
 * source texts. Put this on S3, point AWS Comprehend at it, and it should magically work.
 *
 */
// Converts every LightTag JSON export found in "input/" into:
//   - out.csv  : one row per annotated line segment (File, Line, Begin Offset, End Offset, Type)
//   - output/* : one file per distinct source text, named by the md5 of its content
// Offsets are counted in Unicode code points, as the original mb_substr-based scan did.

/**
 * Compute the code-point offset at which each line of a text begins.
 *
 * Splitting on the "\n" byte is safe for UTF-8 because that byte never occurs
 * inside a multi-byte sequence. The encoding is passed explicitly since
 * json_decode() always yields UTF-8, whatever mbstring.internal_encoding says.
 *
 * @param string $content Full source text.
 * @return int[] Zero-based list: element N is the offset of the first character of line N.
 */
function lightTagLineStarts($content)
{
    $starts = [];
    $position = 0;
    foreach (explode("\n", $content) as $lineText) {
        $starts[] = $position;
        // +1 skips over the "\n" that terminated this line
        $position += mb_strlen($lineText, 'UTF-8') + 1;
    }
    return $starts;
}

/**
 * Split one annotation span into per-line segments.
 *
 * A span that crosses a newline is reported as one segment per line; the
 * newline character itself is never part of a segment and empty segments are
 * dropped. An annotation that starts exactly on a "\n" therefore yields its
 * first segment on the following line (the previous implementation counted
 * that newline twice and reported the segment one line too far down).
 *
 * Offsets beyond the end of the text are not clamped; they are passed through
 * relative to the last line, as before.
 *
 * @param int[] $lineStarts Result of lightTagLineStarts() for the text.
 * @param int   $start      Annotation start offset within the whole text.
 * @param int   $end        Annotation end offset within the whole text.
 * @return array[] List of [line, beginOffset, endOffset], offsets relative to the line start.
 */
function lightTagAnnotationSegments(array $lineStarts, $start, $end)
{
    $start = max(0, (int) $start);
    $end = (int) $end;
    $segments = [];
    $lastLine = count($lineStarts) - 1;

    foreach ($lineStarts as $line => $lineStart) {
        if ($lineStart >= $end) {
            // This line and all following ones begin after the annotation ends
            break;
        }
        // Index of the "\n" closing this line; the final line is open-ended
        $lineEnd = $line < $lastLine ? $lineStarts[$line + 1] - 1 : PHP_INT_MAX;
        $segmentBegin = max($start, $lineStart);
        $segmentEnd = min($end, $lineEnd);
        if ($segmentEnd > $segmentBegin) {
            $segments[] = [$line, $segmentBegin - $lineStart, $segmentEnd - $lineStart];
        }
    }

    return $segments;
}

// Open our output file
$fh = fopen('out.csv', 'w');
if ($fh === false) {
    throw new RuntimeException('Unable to open out.csv for writing');
}

// Add a header line
fputcsv($fh, ['File', 'Line', 'Begin Offset', 'End Offset', 'Type']);

// Make sure the input and output folders are ready
foreach (['output', 'input'] as $dir) {
    if (!file_exists($dir) && !mkdir($dir) && !is_dir($dir)) {
        throw new RuntimeException("Unable to create the \"{$dir}\" folder");
    }
}

// Completed texts, keyed by hash. Keys give O(1) lookup and exact string
// matching; a loose in_array() can treat two "0e123..."-style digests as equal.
$done = [];

// Find all the files in the input folder, and loop through them
$files = scandir('input');
if ($files === false) {
    throw new RuntimeException('Unable to list the "input" folder');
}

foreach ($files as $f) {
    $path = 'input/' . $f;

    // Skip dotfiles (including "." and "..") and anything that is not a regular file
    if (substr($f, 0, 1) === '.' || !is_file($path)) {
        continue;
    }

    // Get and parse the input file
    $raw = file_get_contents($path);
    $in = $raw === false ? null : json_decode($raw);
    if (!is_object($in) || !isset($in->annotations_and_examples)) {
        trigger_error("Skipping {$path}: not a readable LightTag JSON export", E_USER_WARNING);
        continue;
    }

    // Loop through each text
    foreach ((array) $in->annotations_and_examples as $a) {
        if (!is_object($a) || !isset($a->content) || !is_string($a->content)) {
            continue;
        }

        // Get a unique hash of the text to serve as a unique ID
        $hash = md5($a->content);

        // Make sure we only process each text once
        if (isset($done[$hash])) {
            continue;
        }
        $done[$hash] = true;

        // Output the text to a file ready for upload to S3
        if (file_put_contents('output/' . $hash, $a->content) === false) {
            throw new RuntimeException("Unable to write output/{$hash}");
        }

        // Index the line boundaries once per text instead of rescanning per annotation
        $lineStarts = lightTagLineStarts($a->content);
        $annotations = isset($a->annotations) ? (array) $a->annotations : [];

        // Loop through each annotation
        foreach ($annotations as $an) {
            if (!is_object($an) || !isset($an->start, $an->end, $an->tag)) {
                continue;
            }

            $type = str_replace(' ', '_', strtoupper($an->tag));

            // Multi-line annotations are logged as one CSV row per line
            foreach (lightTagAnnotationSegments($lineStarts, $an->start, $an->end) as $segment) {
                fputcsv($fh, [
                    $hash,       // File
                    $segment[0], // Line
                    $segment[1], // Begin Offset
                    $segment[2], // End Offset
                    $type,       // Type
                ]);
            }
        }
    }
}

// Clean up behind ourselves
fclose($fh);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment