Skip to content

Instantly share code, notes, and snippets.

@Planeshifter
Last active July 20, 2020 03:33
Show Gist options
  • Save Planeshifter/53a3357eb312ba25ad536ca2b6006c6a to your computer and use it in GitHub Desktop.
Save Planeshifter/53a3357eb312ba25ad536ca2b6006c6a to your computer and use it in GitHub Desktop.
Accuracy Evaluation for deidentify outputs on i2b2 corpus
// MODULES //
const fastXmlParser = require( 'fast-xml-parser' );
const readDir = require( '@stdlib/fs/read-dir' );
const endsWith = require( '@stdlib/string/ends-with' );
const isUppercase = require( '@stdlib/assert/is-uppercase' );
const contains = require( '@stdlib/assert/contains' );
const fs = require( 'fs' );
const join = require( 'path' ).join;
// FUNCTIONS //
/**
* Counts the case-insensitive, non-overlapping occurrences of a literal substring.
*
* @param {string} str - text to search
* @param {string} value - literal text to look for; regular expression metacharacters are matched verbatim
* @returns {number} number of matches (`0` when `value` is the empty string)
*/
function occurrences(str, value) {
	if ( value === '' ) {
		// An empty pattern matches at every position, which is not a meaningful count:
		return 0;
	}
	// Escape metacharacters so that `value` is treated as plain text rather than as a pattern:
	const escaped = value.replace( /[.*+?^${}()|[\]\\]/g, '\\$&' );
	const regExp = new RegExp( escaped, 'gi' );
	return ( str.match( regExp ) || [] ).length;
}
// VARIABLES //
// Names of the entries in the `testing-PHI-Gold-fixed` directory next to this script:
const files = readDir.sync( join( __dirname, 'testing-PHI-Gold-fixed' ) );
if ( files instanceof Error ) {
	// `readDir.sync` returns the error instead of throwing it; without this check a missing directory would silently produce empty tallies.
	throw files;
}
// Hit/miss counters keyed by tag category; every category starts at zero. The order of the list determines the order in which categories are iterated and printed.
const TALLY_IDENTIFIED = [
	'DATE',
	'DOCTOR',
	'PATIENT',
	'USERNAME',
	'HOSPITAL',
	'STREET',
	'CITY',
	'STATE',
	'COUNTRY',
	'ZIP',
	'ORGANIZATION',
	'LOCATION-OTHER',
	'AGE',
	'NAME',
	'LOCATION',
	'OVERALL'
].reduce( ( tally, category ) => {
	// Each category gets its own counter object so that increments never alias:
	tally[ category ] = { missed: 0, hits: 0 };
	return tally;
}, {} );
// Counters for the placeholder tags counted in the processed files, keyed by category; all start at zero.
const TALLY_FLAGGED = {};
for ( const category of [ 'NAME', 'LOCATION', 'DATE', 'PHONE', 'ORGANIZATION', 'FAX', 'OVERALL' ] ) {
	TALLY_FLAGGED[ category ] = 0;
}
// Parser settings shared by every file: attributes are kept and exposed without a name prefix.
const PARSER_OPTIONS = {
	attributeNamePrefix: '',
	ignoreAttributes: false,
	cdataTagName: false,
	cdataPositionChar: '',
};

/**
* Normalizes the value found under a tag name in the parsed document to an array.
*
* The parser yields a single object when a tag occurs exactly once and an array only when it repeats; iterating by `.length` would silently skip the single-occurrence case.
*
* @private
* @param {*} nodes - value stored under a tag name (may be absent)
* @returns {Array} list of tag objects; empty when `nodes` is falsy
*/
function toTagArray( nodes ) {
	if ( !nodes ) {
		return [];
	}
	return Array.isArray( nodes ) ? nodes : [ nodes ];
}

// For each annotated XML file, count the placeholders in its processed counterpart and check whether each annotated text still appears there.
for ( let i = 0; i < files.length; i++ ) {
	const file = files[ i ];
	if ( !endsWith( file, 'xml' ) ) {
		continue;
	}
	const contents = fs.readFileSync( join( __dirname, 'testing-PHI-Gold-fixed', file ) ).toString();

	// The processed counterpart of `<name>.xml` is read from `deidentified/$<name>.txt`:
	const processedFile = '$'+file.substring( 0, file.length - 3 )+'txt';
	const processed = fs.readFileSync( join( __dirname, 'deidentified', processedFile ) ).toString();

	TALLY_FLAGGED.DATE += occurrences( processed, '<dates>' );
	TALLY_FLAGGED.NAME += occurrences( processed, '<names>' );
	TALLY_FLAGGED.LOCATION += occurrences( processed, '<locations>' );
	TALLY_FLAGGED.PHONE += occurrences( processed, '<phone>' );
	TALLY_FLAGGED.ORGANIZATION += occurrences( processed, '<organizations>' );
	TALLY_FLAGGED.FAX += occurrences( processed, '<fax>' );

	const tObj = fastXmlParser.getTraversalObj( contents, PARSER_OPTIONS );
	const jsonObj = fastXmlParser.convertToJson( tObj, PARSER_OPTIONS );
	const tags = jsonObj.deIdi2b2.TAGS;

	// DATE: a hit when the annotated text no longer appears in the processed output.
	for ( const tag of toTagArray( tags.DATE ) ) {
		const text = tag.text || '';
		const outcome = contains( processed, text ) ? 'missed' : 'hits';
		TALLY_IDENTIFIED.DATE[ outcome ] += 1;
		TALLY_IDENTIFIED.OVERALL[ outcome ] += 1;
	}

	// NAME: USERNAME tags and all-uppercase texts are always counted as hits; otherwise a text still present in the output is a miss.
	for ( const tag of toTagArray( tags.NAME ) ) {
		const leaked = tag.TYPE !== 'USERNAME' && contains( processed, tag.text ) && !isUppercase( tag.text );
		const outcome = leaked ? 'missed' : 'hits';
		TALLY_IDENTIFIED.NAME[ outcome ] += 1;
		TALLY_IDENTIFIED[ tag.TYPE ][ outcome ] += 1;
		TALLY_IDENTIFIED.OVERALL[ outcome ] += 1;
	}

	// LOCATION: same rule as NAME, without the USERNAME exemption.
	for ( const tag of toTagArray( tags.LOCATION ) ) {
		const leaked = contains( processed, tag.text ) && !isUppercase( tag.text );
		const outcome = leaked ? 'missed' : 'hits';
		TALLY_IDENTIFIED.LOCATION[ outcome ] += 1;
		TALLY_IDENTIFIED[ tag.TYPE ][ outcome ] += 1;
		TALLY_IDENTIFIED.OVERALL[ outcome ] += 1;
	}

	// AGE: only the per-type counter is updated; AGE tags do not contribute to OVERALL.
	for ( const tag of toTagArray( tags.AGE ) ) {
		const leaked = contains( processed, tag.text ) && !isUppercase( tag.text );
		TALLY_IDENTIFIED[ tag.TYPE ][ leaked ? 'missed' : 'hits' ] += 1;
	}
}
// Recall per category: hits / ( hits + missed ). A category without any tags yields NaN.
for ( const key of Object.keys( TALLY_IDENTIFIED ) ) {
	const tally = TALLY_IDENTIFIED[ key ];
	tally.recall = tally.hits / ( tally.hits + tally.missed );
}

// Total number of flagged placeholders; the OVERALL slot itself is excluded so the sum does not depend on its prior value.
let overall = 0;
for ( const key of Object.keys( TALLY_FLAGGED ) ) {
	if ( key !== 'OVERALL' ) {
		overall += TALLY_FLAGGED[ key ];
	}
}
TALLY_FLAGGED.OVERALL = overall;

const COMBINED_KEY = 'NAME / LOCATION / ORGANIZATION';

// The ORGANIZATION counter is only ever incremented through a tag's TYPE, alongside the parent NAME/LOCATION counter, so the parent totals already include those tags; adding ORGANIZATION again would count them twice.
const combined = {
	hits: TALLY_IDENTIFIED.NAME.hits + TALLY_IDENTIFIED.LOCATION.hits,
	missed: TALLY_IDENTIFIED.NAME.missed + TALLY_IDENTIFIED.LOCATION.missed
};
combined.recall = combined.hits / ( combined.hits + combined.missed );
TALLY_IDENTIFIED[ COMBINED_KEY ] = combined;

// Flagged placeholders are counted per placeholder kind, so all three are summed here:
TALLY_FLAGGED[ COMBINED_KEY ] = TALLY_FLAGGED.NAME + TALLY_FLAGGED.LOCATION + TALLY_FLAGGED.ORGANIZATION;

console.log( "IDENTIFIED" );
console.log( TALLY_IDENTIFIED );
console.log( "FLAGGED" );
console.log( TALLY_FLAGGED );

// Precision: identified hits divided by the number of flagged placeholders.
// NOTE(review): the OVERALL denominator includes PHONE and FAX placeholders, which have no identified counterpart — confirm this is intended.
const PRECISIONS = {};
PRECISIONS[ COMBINED_KEY ] = TALLY_IDENTIFIED[ COMBINED_KEY ].hits / TALLY_FLAGGED[ COMBINED_KEY ];
PRECISIONS[ 'DATE' ] = TALLY_IDENTIFIED[ 'DATE' ].hits / TALLY_FLAGGED[ 'DATE' ];
PRECISIONS[ 'OVERALL' ] = TALLY_IDENTIFIED[ 'OVERALL' ].hits / TALLY_FLAGGED[ 'OVERALL' ];
console.log( 'PRECISIONS' );
console.log( PRECISIONS );

// F1: harmonic mean of overall precision and overall recall.
const F1_SCORE = 2.0 * ( PRECISIONS.OVERALL * TALLY_IDENTIFIED.OVERALL.recall ) / ( PRECISIONS.OVERALL + TALLY_IDENTIFIED.OVERALL.recall );
console.log( "F1 Score" );
console.log( F1_SCORE );
{
"name": "i2b2-deidentify-evaluation",
"version": "1.0.0",
"description": "",
"main": "accuracy.js",
"author": "",
"license": "MIT",
"dependencies": {
"@stdlib/stdlib": "0.0.92",
"fast-xml-parser": "^3.12.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment