Last active
April 18, 2016 17:37
-
-
Save blahah/6ef1c7b79bbcd8a585e987a70c371910 to your computer and use it in GitHub Desktop.
europe pubmed central bibJSON -> search index optimised term array
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
// This script optimises the EuPMC metadata for inclusion in a full-text
// search index.
var Transform = require("stream").Transform | |
var util = require("util") | |
var natural = require('natural') | |
var tokenize = (new natural.TreebankWordTokenizer()).tokenize | |
var stem = natural.PorterStemmer.stem | |
var inflector = new natural.NounInflector() | |
var _ = require('lodash') | |
var fs = require('fs') | |
var JSONStream = require('JSONStream') | |
// Command-line arguments: the input file to read and the output file to write.
var inputPath = process.argv[2]
var outputPath = process.argv[3]
if (!inputPath || !outputPath) {
  // Fail with an explicit message rather than an opaque TypeError from fs.
  throw new Error('expected arguments: INPUT_JSON OUTPUT_FILE')
}
// Raw input stream over the file named by the first argument.
var read = fs.createReadStream(inputPath)
// UTF-8 output stream over the file named by the second argument.
var write = fs.createWriteStream(outputPath, 'utf8')
// Streaming JSON parser placed between `read` and the optimiser.
// NOTE(review): no path is passed to parse(); presumably it emits each
// top-level JSON value in the input - confirm against the JSONStream docs.
var json = JSONStream.parse()
/**
 * Transform stream that runs in object mode. Its per-record logic is
 * attached separately as `optimiser.prototype._transform`.
 *
 * @constructor
 */
function optimiser () {
  Transform.call(this, { objectMode: true })
}
// Wire up the prototype chain so instances are stream.Transform objects.
util.inherits(optimiser, Transform)
// Data files handed to the Brill part-of-speech tagger.
// NOTE(review): these are relative paths - presumably resolved against the
// current working directory, not this script's directory; confirm.
var rules = "./tr_from_posjs.txt"
var lexicon = "./lexicon_from_posjs.json"
// Build the tagger; 'N' is passed as its third argument.
// NOTE(review): looks like the default tag for unknown words - confirm
// against natural's BrillPOSTagger documentation.
var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N', (err) => {
  if (err) throw err
  // Start streaming only once the tagger has called back without an error:
  // file -> JSON parser -> optimiser -> output file.
  read.pipe(json).pipe(new optimiser()).pipe(write)
})
/**
 * Convert one parsed record into a single line of JSON of the form
 * `{ "id": <number>, "terms": [<string>, ...] }` followed by a newline.
 *
 * Steps: look up the pmcid, reduce the title to stemmed terms, append author
 * surnames and year, lowercase, strip punctuation, then de-duplicate.
 *
 * @param {Object} entry - parsed record emitted by the JSON parser
 * @param {string} encoding - unused (the stream runs in object mode)
 * @param {Function} done - stream callback, called as done(err) or done(null, line)
 */
optimiser.prototype._transform = function (entry, encoding, done) {
  var line
  try {
    // NOTE(review): Number() yields NaN (serialised as null) if the id is not
    // purely numeric, e.g. carries a "PMC" prefix - confirm the input format.
    var pmcid = Number(getid(entry))
    var record = preprocessTitle(entry)
    var cleaned = stripPunctuation(lowercase(flattenRecord(record)))
    // De-duplicate AFTER stripping punctuation: stripping can make two terms
    // identical or leave a term empty, so doing it earlier misses those cases.
    var terms = _.uniq(cleaned).filter((term) => { return term !== '' })
    line = JSON.stringify({ id: pmcid, terms: terms }) + '\n'
  } catch (err) {
    // Report a bad record through the stream instead of throwing from inside
    // the stream machinery.
    done(err)
    return
  }
  done(null, line)
}
/**
 * Return the id of the record's first identifier whose `type` is 'pmcid'.
 *
 * @param {Object} e - record carrying an `identifier` array of `{ type, id }` objects
 * @returns {*} the matching identifier's `id`
 * @throws {Error} when the record has no identifier of type 'pmcid'
 */
function getid (e) {
  var identifiers = (e && Array.isArray(e.identifier)) ? e.identifier : []
  var match = identifiers.find((candidate) => {
    return Boolean(candidate) && candidate.type === 'pmcid'
  })
  if (!match) {
    // Previously this surfaced as an anonymous "cannot read property 'id'".
    throw new Error('record has no identifier of type "pmcid"')
  }
  return match.id
}
/**
 * Flatten a record into one list of terms: title terms first, then author
 * surnames, then the year.
 *
 * Missing values (no author list, an author without a surname, no year) are
 * skipped, and every remaining value is converted to a string so later
 * string-only steps do not fail on, for example, a numeric year.
 *
 * @param {Object} r - record whose `title` is already an array of terms
 * @returns {string[]} flat list of terms
 */
function flattenRecord (r) {
  var surnames = (r.author || []).map((a) => {
    return a ? a.surname : undefined
  })
  return [].concat(r.title, surnames, [r.year])
    .filter((term) => { return term != null })
    .map((term) => { return String(term) })
}
/**
 * Replace the record's title string with an array of index terms.
 *
 * The title is tokenized and POS-tagged; tokens rejected by `filterPOS` are
 * dropped, plural nouns are singularized, the tags are removed and each
 * remaining word is stemmed. The record is updated in place and returned.
 *
 * @param {Object} r - record with a string `title`
 * @returns {Object} the same record, with `title` now an array of terms
 */
function preprocessTitle (r) {
  if (r.title == null) {
    // A record without a title contributes no title terms.
    r.title = []
    return r
  }
  // Replace EVERY slash: a string pattern only replaces the first match.
  var text = r.title.replace(/\//g, ' ')
  var tagged = tagger.tag(tokenize(text))
  r.title = tagged
    .filter(filterPOS)
    .map(singularize)
    .map(stripTags)
    // Wrap stem so it only ever receives the word, not map's index/array.
    .map((word) => { return stem(word) })
  return r
}
/**
 * Lowercase every term in a list.
 *
 * Non-string entries (e.g. a numeric year) are converted to strings first;
 * null and undefined entries are dropped rather than causing a TypeError.
 *
 * @param {Array} a - list of terms
 * @returns {string[]} lowercased terms
 */
function lowercase (a) {
  return a
    .filter((s) => { return s != null })
    .map((s) => { return String(s).toLowerCase() })
}
/**
 * Split a string into tokens with natural's Treebank word tokenizer.
 *
 * NOTE: the file-level `var tokenize = ...` assignment runs at load time and
 * replaces this hoisted declaration, so the rest of the script does not reach
 * this function through the name `tokenize`. It delegates to the tokenizer
 * explicitly instead of calling `tokenize` by name.
 *
 * @param {string} s - text to tokenize
 * @returns {string[]} tokens
 */
function tokenize (s) {
  return new natural.TreebankWordTokenizer().tokenize(s)
}
/**
 * Stem every token in a list with natural's Porter stemmer.
 *
 * NOTE: the file-level `var stem = ...` assignment runs at load time and
 * replaces this hoisted declaration, so the rest of the script does not reach
 * this function through the name `stem`. It delegates to the stemmer
 * explicitly instead of mapping `stem` over the list by name.
 *
 * @param {string[]} a - tokens
 * @returns {string[]} stemmed tokens
 */
function stem (a) {
  return a.map((token) => { return natural.PorterStemmer.stem(token) })
}
/**
 * Placeholder step that returns its argument unchanged. Nothing in the
 * visible part of this file calls it.
 *
 * @param {*} a - any value
 * @returns {*} the same value
 */
function inflect (a) {
  return a
}
/**
 * Decide whether a tagged token is kept as an index term, based on the first
 * letter of its part-of-speech tag.
 *
 * The categories follow the original comments, which reference
 * https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
 * NOTE(review): the tagger data files are named "..._from_posjs", so the
 * tags may be Penn Treebank rather than Brown - confirm.
 *
 * @param {Array} p - `[word, tag]` pair
 * @returns {boolean} true to keep the token, false to discard it
 */
function filterPOS (p) {
  var tag = p[1]
  // A token without a string tag cannot be classified; discard it.
  if (typeof tag !== 'string') return false

  switch (tag[0]) {
    // keep
    case 'N': // nouns
    case 'V': // verbs
    case 'J': // adjective
    case 'R': // adverb
      return true
    // discard
    case 'A': // pre-qualifiers and pre-quantifiers
    case 'B': // verb 'be'
    case 'D': // verb 'do' + determiners
    case 'E': // existential
    case 'H': // verb 'have'
    case 'I': // preposition
      return false
    default:
      // Remaining single-character tags are treated as punctuation and
      // dropped, except '*'; every other tag is kept.
      return tag.length !== 1 || tag === '*'
  }
}
/**
 * Singularize the word of a tagged token when its tag marks a plural noun.
 *
 * Tokens with any other tag are returned as-is. For plural nouns a new pair
 * is returned, so the caller's pair is not modified.
 *
 * @param {Array} p - `[word, tag]` pair
 * @returns {Array} `[word, tag]` pair with the word singularized if plural
 */
function singularize (p) {
  // Brown-style plural noun tags, plus the Penn Treebank plural proper-noun
  // tag 'NNPS'.
  // NOTE(review): 'NNPS' assumes the tagger may emit Penn tags - confirm.
  var pluralTags = ['NNS', 'NNS$', 'NPS', 'NPS$', 'NNPS']
  if (pluralTags.indexOf(p[1]) === -1) return p

  var singular = p.slice()
  singular[0] = inflector.singularize(p[0])
  return singular
}
/**
 * Reduce a tagged token to its word by returning the pair's first element.
 *
 * @param {Array} p - `[word, tag]` pair
 * @returns {*} the first element of the pair
 */
function stripTags (p) {
  var word = p[0]
  return word
}
/**
 * Remove every non-word character from each term.
 *
 * The `g` flag matters: without it only the first non-word character of each
 * term is removed. `\W` is ASCII-based, so it also removes accented letters.
 *
 * @param {string[]} a - list of terms
 * @returns {string[]} terms containing only [A-Za-z0-9_] characters
 */
function stripPunctuation (a) {
  return a.map((s) => { return s.replace(/\W/g, '') })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment