blahah · April 18, 2016 17:37
diff --git a/optimise_eupmc.js b/optimise_eupmc.js
 #!/usr/bin/env node

 // this script optimises the eupmc metadata for inclusion in a full-text
 // search index.

 var Transform = require("stream").Transform
 var util = require("util")

 var natural = require('natural')
 var tokenize = (new natural.TreebankWordTokenizer()).tokenize
 var stem = natural.PorterStemmer.stem
 var inflector = new natural.NounInflector()
 var _ = require('lodash')

 var fs = require('fs')

 var JSONStream = require('JSONStream')
 // Input file path is the first CLI argument, output file path the second.
 var read = fs.createReadStream(process.argv[2])
 var write = fs.createWriteStream(process.argv[3], 'utf8')

 // JSON parser stream, created without a path pattern. It is piped between
 // `read` and the optimiser inside the BrillPOSTagger callback further down.
 var json = JSONStream.parse()

 // Transform stream constructor. The stream is created in object mode, so it
 // receives and emits whole values rather than buffers.
 function optimiser () {
  var streamOptions = { objectMode: true }
  Transform.call(this, streamOptions)
 }

 // wire up the prototype chain: optimiser -> Transform
 util.inherits(optimiser, Transform)

 // Data files handed to the Brill part-of-speech tagger.
 var rules = "./tr_from_posjs.txt"
 var lexicon = "./lexicon_from_posjs.json"

 // The processing pipeline is only assembled from the tagger's callback, so no
 // record is handled before the tagger has reported back.
 // NOTE(review): 'N' is presumably the default tag for unknown words - confirm
 // against the natural documentation.
 var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N', function (loadError) {
  if (loadError) {
    throw loadError
  }

  // input file -> JSON parser -> optimiser -> output file
  var parsedEntries = read.pipe(json)
  var optimisedLines = parsedEntries.pipe(new optimiser())
  optimisedLines.pipe(write)
 })

 // Transform step: turn one parsed metadata record into one line of JSON of the
 // form {"id": <pmcid as a number>, "terms": [...]} followed by a newline.
 //
 // The terms are the processed title tokens, the author surnames and the year,
 // lower-cased, stripped of punctuation and de-duplicated.
 //
 // entry    - one record emitted by the JSON parser
 // encoding - unused
 // done     - stream callback; receives the error if processing the record
 //            fails, otherwise the output line
 optimiser.prototype._transform = function (entry, encoding, done) {
  var line
  try {
    var pmcid = Number(getid(entry))
    var terms = flattenRecord(preprocessTitle(entry))
    terms = stripPunctuation(lowercase(terms))
    // de-duplicate last, so terms that only become equal after lower-casing
    // and punctuation stripping collapse into one
    terms = _.uniq(terms)
    line = JSON.stringify({ id: pmcid, terms: terms }) + '\n'
  } catch (err) {
    // report the failure through the stream instead of throwing out of _transform
    return done(err)
  }
  // passing the chunk as second argument is equivalent to push() followed by done()
  done(null, line)
 }

 // Return the `id` of the first entry in e.identifier whose type is 'pmcid'.
 // Throws a TypeError when no such entry exists.
 function getid (e) {
  var pmcidEntry = _.find(e.identifier, { type: 'pmcid' })
  return pmcidEntry.id
 }

 // Flatten a record into one list: the title tokens, then every author's
 // surname, then the year.
 function flattenRecord (r) {
  var surnames = r.author.map(function (person) {
    return person.surname
  })
  var yearTerm = [r.year]
  return r.title.concat(surnames, yearTerm)
 }


 // Replace the record's title string with a list of processed terms:
 // slashes become spaces, the text is tokenized and POS-tagged, unwanted tags
 // are filtered out, plural nouns are singularized, tags are dropped and the
 // remaining words are stemmed.
 //
 // Mutates r.title in place and returns the same record object.
 function preprocessTitle (r) {
  // a string pattern would only replace the first slash, hence the global regex
  var title = r.title.replace(/\//g, ' ')
  r.title = tagger.tag(tokenize(title))
    .filter(filterPOS)
    .map(singularize)
    .map(stripTags)
    // wrap so that only the word is passed on, not map's index and array arguments
    .map((word) => { return stem(word) })
  return r
 }

 // Return a new array holding the lower-cased form of every string in `a`.
 function lowercase (a) {
  function toLower (term) {
    return term.toLowerCase()
  }
  return a.map(toLower)
 }

 // Split a string into word tokens using natural's Treebank word tokenizer.
 //
 // The previous body called itself unconditionally and could never return.
 // NOTE(review): the `var tokenize = ...` assignment near the top of the file
 // replaces this hoisted declaration at runtime, so this body only runs if that
 // assignment is removed.
 function tokenize (s) {
  return new natural.TreebankWordTokenizer().tokenize(s)
 }

 // Return a new array holding the Porter stem of every token in `a`.
 //
 // The previous body mapped with itself, so it failed on any non-empty array.
 // NOTE(review): the `var stem = ...` assignment near the top of the file
 // replaces this hoisted declaration at runtime, so this body only runs if that
 // assignment is removed.
 function stem (a) {
  return a.map((token) => { return natural.PorterStemmer.stem(token) })
 }

 // Identity function: hands back its argument untouched.
 function inflect (a) {
  var unchanged = a
  return unchanged
 }

 // Decide whether a tagged token is kept. `p` is a [word, tag] pair and only
 // the tag is inspected. Returns true to keep the token, false to drop it.
 function filterPOS (p) {
  // see https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
  var tag = p[1]
  var initial = tag[0]

  // verdict keyed on the first character of the tag
  var verdicts = {
    // keep
    N: true, // nouns
    V: true, // verbs
    J: true, // adjective
    R: true, // adverb
    // discard
    A: false, // pre-qualifiers and pre-quantifiers
    B: false, // verb 'be'
    D: false, // verb 'do' + determiners
    E: false, // existential #suchsartre
    H: false, // verb 'have'
    I: false // preposition
  }
  if (Object.prototype.hasOwnProperty.call(verdicts, initial)) {
    return verdicts[initial]
  }

  // ignore punctuation: any other one-character tag is dropped, except '*'
  return tag.length !== 1 || tag === '*'
 }

 // Singularize the word of a [word, tag] pair when its tag is one of the
 // plural-noun tags. The pair is modified in place and returned.
 function singularize (p) {
  var pluralNounTags = ['NNS', 'NNS$', 'NPS', 'NPS$']
  var isPluralNoun = pluralNounTags.indexOf(p[1]) !== -1
  if (!isPluralNoun) {
    return p
  }
  p[0] = inflector.singularize(p[0])
  return p
 }

 // Reduce a [word, tag] pair to just the word.
 function stripTags (p) {
  var word = p[0]
  return word
 }

 // Return a new array in which every non-word character has been removed from
 // each string of `a`.
 //
 // The global flag matters: without it only the first match per string is
 // removed. \W matches everything outside [A-Za-z0-9_], so non-ASCII letters
 // are removed as well.
 function stripPunctuation (a) {
  return a.map((s) => { return s.replace(/\W/g, '') })
 }
	#!/usr/bin/env node

	// this script optimises the eupmc metadata for inclusion in a full-text
	// search index.

	var Transform = require("stream").Transform
	var util = require("util")

	var natural = require('natural')
	var tokenize = (new natural.TreebankWordTokenizer()).tokenize
	var stem = natural.PorterStemmer.stem
	var inflector = new natural.NounInflector()
	var _ = require('lodash')

	var fs = require('fs')

	var JSONStream = require('JSONStream')
	// Input file path is the first CLI argument, output file path the second.
	var read = fs.createReadStream(process.argv[2])
	var write = fs.createWriteStream(process.argv[3], 'utf8')

	// JSON parser stream, created without a path pattern. It is piped between
	// `read` and the optimiser inside the BrillPOSTagger callback further down.
	var json = JSONStream.parse()

	// Transform stream constructor. The stream is created in object mode, so it
	// receives and emits whole values rather than buffers.
	function optimiser () {
	  var streamOptions = { objectMode: true }
	  Transform.call(this, streamOptions)
	}

	// wire up the prototype chain: optimiser -> Transform
	util.inherits(optimiser, Transform)

	// Data files handed to the Brill part-of-speech tagger.
	var rules = "./tr_from_posjs.txt"
	var lexicon = "./lexicon_from_posjs.json"

	// The processing pipeline is only assembled from the tagger's callback, so no
	// record is handled before the tagger has reported back.
	// NOTE(review): 'N' is presumably the default tag for unknown words - confirm
	// against the natural documentation.
	var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N', function (loadError) {
	  if (loadError) {
	    throw loadError
	  }

	  // input file -> JSON parser -> optimiser -> output file
	  var parsedEntries = read.pipe(json)
	  var optimisedLines = parsedEntries.pipe(new optimiser())
	  optimisedLines.pipe(write)
	})

	// Transform step: turn one parsed metadata record into one line of JSON of the
	// form {"id": <pmcid as a number>, "terms": [...]} followed by a newline.
	//
	// The terms are the processed title tokens, the author surnames and the year,
	// lower-cased, stripped of punctuation and de-duplicated.
	//
	// entry    - one record emitted by the JSON parser
	// encoding - unused
	// done     - stream callback; receives the error if processing the record
	//            fails, otherwise the output line
	optimiser.prototype._transform = function (entry, encoding, done) {
	  var line
	  try {
	    var pmcid = Number(getid(entry))
	    var terms = flattenRecord(preprocessTitle(entry))
	    terms = stripPunctuation(lowercase(terms))
	    // de-duplicate last, so terms that only become equal after lower-casing
	    // and punctuation stripping collapse into one
	    terms = _.uniq(terms)
	    line = JSON.stringify({ id: pmcid, terms: terms }) + '\n'
	  } catch (err) {
	    // report the failure through the stream instead of throwing out of _transform
	    return done(err)
	  }
	  // passing the chunk as second argument is equivalent to push() followed by done()
	  done(null, line)
	}

	// Return the `id` of the first entry in e.identifier whose type is 'pmcid'.
	// Throws a TypeError when no such entry exists.
	function getid (e) {
	  var pmcidEntry = _.find(e.identifier, { type: 'pmcid' })
	  return pmcidEntry.id
	}

	// Flatten a record into one list: the title tokens, then every author's
	// surname, then the year.
	function flattenRecord (r) {
	  var surnames = r.author.map(function (person) {
	    return person.surname
	  })
	  var yearTerm = [r.year]
	  return r.title.concat(surnames, yearTerm)
	}


	// Replace the record's title string with a list of processed terms:
	// slashes become spaces, the text is tokenized and POS-tagged, unwanted tags
	// are filtered out, plural nouns are singularized, tags are dropped and the
	// remaining words are stemmed.
	//
	// Mutates r.title in place and returns the same record object.
	function preprocessTitle (r) {
	  // a string pattern would only replace the first slash, hence the global regex
	  var title = r.title.replace(/\//g, ' ')
	  r.title = tagger.tag(tokenize(title))
	    .filter(filterPOS)
	    .map(singularize)
	    .map(stripTags)
	    // wrap so that only the word is passed on, not map's index and array arguments
	    .map((word) => { return stem(word) })
	  return r
	}

	// Return a new array holding the lower-cased form of every string in `a`.
	function lowercase (a) {
	  function toLower (term) {
	    return term.toLowerCase()
	  }
	  return a.map(toLower)
	}

	// Split a string into word tokens using natural's Treebank word tokenizer.
	//
	// The previous body called itself unconditionally and could never return.
	// NOTE(review): the `var tokenize = ...` assignment near the top of the file
	// replaces this hoisted declaration at runtime, so this body only runs if that
	// assignment is removed.
	function tokenize (s) {
	  return new natural.TreebankWordTokenizer().tokenize(s)
	}

	// Return a new array holding the Porter stem of every token in `a`.
	//
	// The previous body mapped with itself, so it failed on any non-empty array.
	// NOTE(review): the `var stem = ...` assignment near the top of the file
	// replaces this hoisted declaration at runtime, so this body only runs if that
	// assignment is removed.
	function stem (a) {
	  return a.map((token) => { return natural.PorterStemmer.stem(token) })
	}

	// Identity function: hands back its argument untouched.
	function inflect (a) {
	  var unchanged = a
	  return unchanged
	}

	// Decide whether a tagged token is kept. `p` is a [word, tag] pair and only
	// the tag is inspected. Returns true to keep the token, false to drop it.
	function filterPOS (p) {
	  // see https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
	  var tag = p[1]
	  var initial = tag[0]

	  // verdict keyed on the first character of the tag
	  var verdicts = {
	    // keep
	    N: true, // nouns
	    V: true, // verbs
	    J: true, // adjective
	    R: true, // adverb
	    // discard
	    A: false, // pre-qualifiers and pre-quantifiers
	    B: false, // verb 'be'
	    D: false, // verb 'do' + determiners
	    E: false, // existential #suchsartre
	    H: false, // verb 'have'
	    I: false // preposition
	  }
	  if (Object.prototype.hasOwnProperty.call(verdicts, initial)) {
	    return verdicts[initial]
	  }

	  // ignore punctuation: any other one-character tag is dropped, except '*'
	  return tag.length !== 1 || tag === '*'
	}

	// Singularize the word of a [word, tag] pair when its tag is one of the
	// plural-noun tags. The pair is modified in place and returned.
	//
	// The tag test is written as a list lookup; the earlier condition contained
	// backslash-escaped `\|\|` sequences, which are not valid JavaScript.
	function singularize (p) {
	  var pluralNounTags = ['NNS', 'NNS$', 'NPS', 'NPS$']
	  var isPluralNoun = pluralNounTags.indexOf(p[1]) !== -1
	  if (!isPluralNoun) {
	    return p
	  }
	  p[0] = inflector.singularize(p[0])
	  return p
	}

	// Reduce a [word, tag] pair to just the word.
	function stripTags (p) {
	  var word = p[0]
	  return word
	}

	// Return a new array in which every non-word character has been removed from
	// each string of `a`.
	//
	// The global flag matters: without it only the first match per string is
	// removed. \W matches everything outside [A-Za-z0-9_], so non-ASCII letters
	// are removed as well.
	function stripPunctuation (a) {
	  return a.map((s) => { return s.replace(/\W/g, '') })
	}