Last active
September 9, 2017 04:21
-
-
Save luizamboni/6f8a1dfe3221934398892d067c453d4c to your computer and use it in GitHub Desktop.
Transform a phrase into a Lucene query using Portuguese semantics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Vocabulary used to classify the words of a search phrase.
 *
 * - `substantives`, `attributes`, `brands` and `prepositions` are plain
 *   lower-case words that are compared by exact match.
 * - `metrics` entries are regular-expression fragments: they are joined with
 *   `|` and compiled into a single RegExp, so regex escapes must survive the
 *   string literal (hence the doubled backslashes below).
 */
const dictionary = {
  substantives: [
    "berço",
    "notebook",
    "smartphone",
    "tela"
  ],
  attributes: [
    "preto",
    "preta",
    "amarelo",
    "redondo",
    "redonda"
  ],
  brands: [
    "hp",
    "samsung",
    "lenovo",
    "lg",
    "quantum"
  ],
  prepositions: [
    "de", "da", "do", "para", "com"
  ],
  metrics: [
    '"',
    "cm",
    // A digit, an optional space, then "m" (e.g. "5m"). Written as "\d\s?m"
    // the string literal would drop the backslashes and yield "ds?m".
    "\\d\\s?m",
    "metro",
    "metros",
    "polegadas"
  ]
}
// Alternation patterns built from the dictionary lists. They are unanchored,
// so `.test()` succeeds when the pattern occurs anywhere in the input.
const prepReg = new RegExp(dictionary.prepositions.join("|"))
const metricsReg = new RegExp(dictionary.metrics.join("|"))
const brandsReg = new RegExp(dictionary.brands.join("|"))

// Memo of generated queries keyed by the raw phrase. A null-prototype object
// is used so phrases such as "constructor" or "toString" do not resolve to
// inherited Object.prototype members and get mistaken for cached results.
const cache = Object.create(null)
/**
 * Tells whether a term is one of the dictionary's prepositions.
 *
 * @param {string} subterm - Term to look up; compared by exact match.
 * @returns {number} 1 when the term is a known preposition, 0 otherwise.
 */
function isPreposition(subterm) {
  return dictionary.prepositions.includes(subterm) ? 1 : 0
}
/**
 * Tells whether a term qualifies another term (acts as a predicate).
 *
 * A term is a predicate when any of these holds:
 * - it contains a whole-word preposition followed by whitespace and another
 *   word (e.g. "de 25", "para berço");
 * - it is a metric, a brand or an attribute.
 *
 * @param {string} subterm - Single word or merged multi-word term.
 * @returns {number} 1 when the term is a predicate, 0 otherwise.
 */
function isPredicate(subterm) {
  const alternatives = dictionary.prepositions.join("|")
  // The preposition must start the term or follow whitespace, so that "de"
  // inside "cidade nova" does not count. `\S` (instead of `\w`) accepts words
  // that begin with an accented letter, such as "água".
  const prepositionPhrase = new RegExp(`(?:^|\\s)(?:${alternatives})\\s\\S`)

  const matches =
    prepositionPhrase.test(subterm) ||
    isMetric(subterm) ||
    isBrand(subterm) ||
    isAttribute(subterm)

  return matches ? 1 : 0
}
/**
 * Tells whether a term is one of the dictionary's brands.
 *
 * @param {string} subterm - Term to look up; compared by exact match.
 * @returns {number} 1 when the term is a known brand, 0 otherwise.
 */
function isBrand(subterm) {
  return dictionary.brands.includes(subterm) ? 1 : 0
}
/**
 * Tells whether a term is one of the dictionary's substantives (nouns).
 *
 * @param {string} subterm - Term to look up; compared by exact match.
 * @returns {number} 1 when the term is a known substantive, 0 otherwise.
 */
function isSubstantive(subterm) {
  return dictionary.substantives.includes(subterm) ? 1 : 0
}
/**
 * Tells whether a term contains a unit of measure.
 *
 * The check uses the module-level `metricsReg`, which is built from
 * `dictionary.metrics` without anchors, so a term that merely contains a
 * metric (e.g. `15"`) also counts.
 *
 * @param {string} subterm - Term to test.
 * @returns {number} 1 when the metrics pattern matches, 0 otherwise.
 */
function isMetric(subterm) {
  return metricsReg.test(subterm) ? 1 : 0
}
/**
 * Tells whether a term is one of the dictionary's attributes (adjectives).
 *
 * @param {string} subterm - Term to look up; compared by exact match.
 * @returns {number} 1 when the term is a known attribute, 0 otherwise.
 */
function isAttribute(subterm) {
  return dictionary.attributes.includes(subterm) ? 1 : 0
}
/**
 * Classifies each token and folds modifiers into the term they qualify.
 *
 * Pass 1 tags every token (in place) with the numeric flags `predicate`,
 * `substantive`, `brand`, `metric` and `attribute`, computed from the raw
 * word.
 *
 * Pass 2 builds the resulting term list:
 * - metric, attribute and preposition tokens are removed as standalone terms;
 * - a term preceded by a preposition becomes "<preposition> <term>";
 * - a term followed by a metric and/or attribute gets that word appended.
 *
 * Only the immediate neighbours are inspected, so a modifier whose neighbour
 * does not absorb it (e.g. the second of two consecutive attributes) is
 * dropped from the output.
 *
 * @param {Array<{t: string, i: number}>} tokens - Words of the phrase with
 *   their position. The token objects are mutated.
 * @returns {Array<object>} The surviving tokens, with merged `t`, adjusted
 *   `i` and the classification flags.
 */
function termsSplit(tokens) {
  // Pass 1: classify the raw words before any term text is rewritten.
  tokens.forEach(token => {
    const { t } = token
    token.predicate = isPredicate(t)
    token.substantive = isSubstantive(t)
    token.brand = isBrand(t)
    token.metric = isMetric(t)
    token.attribute = isAttribute(t)
  })

  // Pass 2: merge modifiers into their neighbours and drop them as terms.
  return tokens
    .map((token, i) => {
      const { t: term } = token

      if (isMetric(term) || isAttribute(term) || isPreposition(term)) {
        return undefined
      }

      const prevToken = tokens[i - 1]
      const nextToken = tokens[i + 1]

      if (prevToken && isPreposition(prevToken.t)) {
        token.t = [prevToken.t, term].join(" ")
        token.i = i - 1
        // The "preposition + word" predicate rule can only match the merged
        // text, so the flag computed in pass 1 from the bare word is stale.
        token.predicate = isPredicate(token.t)
      }

      if (nextToken) {
        if (isMetric(nextToken.t)) {
          token.t += " " + nextToken.t
          token.i--
        }
        if (isAttribute(nextToken.t)) {
          token.t += " " + nextToken.t
          token.i--
        }
      }

      return token
    })
    .filter(token => token)
}
/**
 * Turns a free-text phrase into a weighted Lucene query clause.
 *
 * The phrase is lower-cased, split on whitespace and grouped into terms by
 * `termsSplit`. Each term becomes a quoted phrase boosted by
 * `(termCount - index) + 2 * substantive + predicate + brand + metric`, and
 * the terms are joined with " +" inside parentheses.
 *
 * The unprefixed clause is memoised in `cache` per raw phrase; the field
 * prefix is applied on every call so cached and fresh results are identical.
 *
 * @param {string} terms - Phrase typed by the user.
 * @param {{name?: string}} [opts] - `name` is the Lucene field to prefix the
 *   clause with; when empty or missing no prefix is added.
 * @returns {string} `name:(...)` when a field name is given, `(...)` otherwise.
 */
function lucenize(terms, opts = { name: "" }) {
  let lucene

  // Own-property check: inherited keys such as "constructor" are not hits.
  if (Object.prototype.hasOwnProperty.call(cache, terms)) {
    lucene = cache[terms]
  } else {
    // Leading/trailing whitespace would otherwise produce empty "" terms.
    const words = terms.toLowerCase().split(/\s+/).filter(word => word !== "")
    const tokens = termsSplit(words.map((t, i) => ({ t, i })))

    const clauses = tokens.map((token, i) => {
      const { substantive, predicate, brand, metric } = token
      token.w = (tokens.length - i) + substantive * 2 + predicate + brand + metric
      // Backslash-escape `"` and `\` so a term like `tela 15"` cannot close
      // the quoted phrase early.
      const phrase = token.t.replace(/["\\]/g, "\\$&")
      return `"${phrase}"^${token.w}`
    })

    lucene = `(${clauses.join(" +")})`
    cache[terms] = lucene
  }

  const fieldName = opts && opts.name
  return fieldName ? `${fieldName}:${lucene}` : lucene
}
// Demo: print the generated query for a handful of sample phrases.
// The list is declared on its own so `terms` holds the phrases rather than
// the `undefined` returned by `forEach`.
const terms = [
  "notebook da hp preto",
  "kit berço",
  "kit para berço",
  "tela 15\"",
  "mesa redonda 50x50 cm",
  "coisa de 25 cm",
  "unknow preto",
  "smartphone samsung preto tela de x polegadas"
]

terms.forEach(t => {
  console.log(lucenize(t, { name: "raw_name" }))
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment