Skip to content

Instantly share code, notes, and snippets.

@blahah
Last active April 18, 2016 17:37
Show Gist options
  • Save blahah/6ef1c7b79bbcd8a585e987a70c371910 to your computer and use it in GitHub Desktop.
Save blahah/6ef1c7b79bbcd8a585e987a70c371910 to your computer and use it in GitHub Desktop.
Europe PubMed Central bibJSON -> search-index-optimised term array
#!/usr/bin/env node
// This script optimises Europe PMC (eupmc) metadata for inclusion in a
// full-text search index.
// Dependencies and module-level stream setup.
var Transform = require("stream").Transform
var util = require("util")
var natural = require('natural')
// NOTE: this assignment runs at load time and overwrites the hoisted
// `function tokenize` declared further down in this file, so every call to
// `tokenize` goes to natural's tokenizer method.
// The method is detached from its tokenizer instance here (no bind).
// NOTE(review): this only works if the method never relies on `this` — confirm.
var tokenize = (new natural.TreebankWordTokenizer()).tokenize
// NOTE: likewise overwrites the hoisted `function stem` declared further down.
var stem = natural.PorterStemmer.stem
// shared inflector instance, used by singularize()
var inflector = new natural.NounInflector()
var _ = require('lodash')
var fs = require('fs')
var JSONStream = require('JSONStream')
// first CLI argument: path of the input file to read
var read = fs.createReadStream(process.argv[2])
// second CLI argument: path of the output file, written as UTF-8
var write = fs.createWriteStream(process.argv[3], 'utf8')
// JSON parsing stream; called without a path pattern
// NOTE(review): presumably emits each parsed root value — confirm against JSONStream docs
var json = JSONStream.parse()
// Transform stream constructor operating in object mode. Its per-entry work
// is supplied by the `_transform` method assigned on the prototype.
function optimiser () {
  var streamOptions = { "objectMode": true }
  Transform.call(this, streamOptions)
}
// wire up the prototype chain so optimiser instances are Transform streams
util.inherits(optimiser, Transform)
// Part-of-speech tagger setup; the processing pipeline is only started from
// the tagger's callback, so no entry is transformed before the callback fires.
// Both paths are relative.
// NOTE(review): presumably resolved against the current working directory,
// so the script must be run from the directory holding these files — confirm.
var rules = "./tr_from_posjs.txt"
var lexicon = "./lexicon_from_posjs.json"
// NOTE(review): the third argument 'N' is presumably the default tag assigned
// to unknown words — confirm against natural's BrillPOSTagger documentation.
var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N', (err) => {
// a failure reported by the tagger is fatal: rethrow it
if (err) throw(err)
// input file -> JSON parser -> optimiser -> output file
read.pipe(json).pipe(new optimiser()).pipe(write)
})
// Transform step: turns one parsed entry into a single newline-terminated
// JSON line of the form {"id": <number>, "terms": [<string>, ...]}.
//   entry    - record object received from the upstream stream
//   encoding - unused
//   done     - callback invoked once the line has been pushed
optimiser.prototype._transform = function (entry, encoding, done) {
  // read the id first: preprocessTitle mutates the entry afterwards
  // NOTE(review): Number() yields NaN for ids that are not purely numeric
  // (JSON.stringify then emits null) — confirm the id format upstream.
  var pmcid = Number(getid(entry))

  var record = preprocessTitle(entry)
  var flattened = flattenRecord(record)
  var lowered = lowercase(flattened)
  var deduped = _.uniq(lowered)
  var terms = stripPunctuation(deduped)

  var line = JSON.stringify({ id: pmcid , terms: terms })
  this.push(line + '\n')
  done()
}
// Returns the `id` of the first element of `e.identifier` whose `type` is
// 'pmcid'. Throws a TypeError when no such element is found, because the
// lookup result is dereferenced unconditionally.
function getid (e) {
  var pmcidEntry = _.find(e.identifier, { type: 'pmcid' })
  return pmcidEntry.id
}
// Flattens a record into one array of terms: the title tokens, followed by
// each author's surname, followed by the publication year.
//   r - record whose `title` is already an array of tokens (as produced by
//       preprocessTitle); `author` (array of objects with `surname`) and
//       `year` are optional.
// Returns a new array of strings. Every downstream step calls string methods
// on each term, so anything that is not a usable string is handled here:
//   - a missing `author` list is treated as empty
//   - authors without a `surname` are skipped
//   - a missing `year` is omitted, and a numeric year is converted to a string
function flattenRecord (r) {
  const surnames = (r.author || [])
    .map((a) => a && a.surname)
    .filter((surname) => surname != null)
  const year = r.year == null ? [] : [String(r.year)]
  return r.title.concat(surnames, year)
}
// Normalises a record's title into an array of index terms, in place.
// Steps: replace every '/' with a space, tokenize, part-of-speech tag, drop
// unwanted parts of speech (filterPOS), singularize plural nouns, strip the
// tags, and stem each remaining word.
//   r - record whose `title` is a string
// Returns the same record object, with `title` replaced by the term array.
function preprocessTitle (r) {
  // A string pattern passed to replace() only substitutes the first match,
  // so a global regex is needed to split every slash-joined compound
  // (e.g. "A/B/C"), not just the first one.
  const words = tokenize(r.title.replace(/\//g, ' '))
  r.title = tagger.tag(words)
    .filter(filterPOS)
    .map(singularize)
    .map(stripTags)
    .map(stem)
  return r
}
// Returns a new array holding the lower-cased form of every string in `a`.
// Every element must be a string (toLowerCase is called on each one).
function lowercase (a) {
  var toLower = (term) => term.toLowerCase()
  return a.map(toLower)
}
// Splits a string into word tokens using natural's Treebank tokenizer.
//   s - text to tokenize
// Returns the tokenizer's output for `s`.
// The previous body called `tokenize` itself and could never return.
// NOTE: a module-level `var tokenize = ...` assignment near the top of this
// file overwrites this declaration at load time; this body only takes effect
// if that assignment is removed.
function tokenize (s) {
  return new natural.TreebankWordTokenizer().tokenize(s)
}
// Stems every word in an array with natural's Porter stemmer.
//   a - array of word strings
// Returns a new array of stems, in the same order.
// The previous body mapped the array through `stem` itself, so each element
// was fed back into this array-expecting function instead of the stemmer.
// NOTE: a module-level `var stem = ...` assignment near the top of this file
// overwrites this declaration at load time; this body only takes effect if
// that assignment is removed.
function stem (a) {
  return a.map((word) => natural.PorterStemmer.stem(word))
}
// Placeholder: returns its argument unchanged.
// NOTE(review): not referenced anywhere in the visible file — possibly a
// leftover from before singularize() took over inflection; confirm before removing.
function inflect (a) {
return a
}
// Predicate deciding whether a tagged token is kept for indexing.
//   p - [word, tag] pair; the decision is driven by the tag's first letter
// Returns true to keep the token, false to drop it.
// Tag letters follow
// https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
function filterPOS (p) {
  var tag = p[1]
  switch (tag[0]) {
    // keep
    case 'N': // nouns
    case 'V': // verbs
    case 'J': // adjective
    case 'R': // adverb
      return true
    // discard
    case 'A': // pre-qualifiers and pre-quantifiers
    case 'B': // verb 'be'
    case 'D': // verb 'do' + determiners
    case 'E': // existential
    case 'H': // verb 'have'
    case 'I': // preposition
      return false
    default:
      // any other one-character tag is treated as punctuation and dropped,
      // except '*'; every remaining tag is kept
      return tag.length !== 1 || tag === '*'
  }
}
// Converts the word of a tagged token to its singular form when the tag is
// one of the plural-noun tags listed below; other tokens pass through as-is.
//   p - [word, tag] pair; modified in place when singularized
// Returns the same pair.
function singularize (p) {
  var pluralNounTags = ['NNS', 'NNS$', 'NPS', 'NPS$']
  var isPluralNoun = pluralNounTags.indexOf(p[1]) !== -1
  if (isPluralNoun) {
    p[0] = inflector.singularize(p[0])
  }
  return p
}
// Drops the part-of-speech tag from a tagged token.
//   p - [word, tag] pair
// Returns the word (the first element of the pair).
function stripTags (p) {
  var word = p[0]
  return word
}
// Removes every non-word character from each term.
//   a - array of strings
// Returns a new array of the stripped strings, in the same order.
// The regex needs the `g` flag: without it replace() removes only the first
// non-word character, leaving e.g. "x-ray-based" as "xray-based".
// NOTE(review): \W matches anything outside [A-Za-z0-9_], so whitespace and
// non-ASCII letters (e.g. accented surnames) are removed too — confirm this
// matches how queries against the index are normalised.
function stripPunctuation (a) {
  return a.map((s) => s.replace(/\W/g, ''))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment