-
-
Save matt-west/6500993 to your computer and use it in GitHub Desktop.
/** | |
* @fileoverview Pearson correlation score algorithm. | |
* @author [email protected] (Matt West) | |
* @license Copyright 2013 Matt West. | |
* Licensed under MIT (http://opensource.org/licenses/MIT). | |
*/ | |
/** | |
* Calculate the person correlation score between two items in a dataset. | |
* | |
* @param {object} prefs The dataset containing data about both items that | |
* are being compared. | |
* @param {string} p1 Item one for comparison. | |
* @param {string} p2 Item two for comparison. | |
* @return {float} The pearson correlation score. | |
*/ | |
function pearsonCorrelation(prefs, p1, p2) { | |
var si = []; | |
for (var key in prefs[p1]) { | |
if (prefs[p2][key]) si.push(key); | |
} | |
var n = si.length; | |
if (n == 0) return 0; | |
var sum1 = 0; | |
for (var i = 0; i < si.length; i++) sum1 += prefs[p1][si[i]]; | |
var sum2 = 0; | |
for (var i = 0; i < si.length; i++) sum2 += prefs[p2][si[i]]; | |
var sum1Sq = 0; | |
for (var i = 0; i < si.length; i++) { | |
sum1Sq += Math.pow(prefs[p1][si[i]], 2); | |
} | |
var sum2Sq = 0; | |
for (var i = 0; i < si.length; i++) { | |
sum2Sq += Math.pow(prefs[p2][si[i]], 2); | |
} | |
var pSum = 0; | |
for (var i = 0; i < si.length; i++) { | |
pSum += prefs[p1][si[i]] * prefs[p2][si[i]]; | |
} | |
var num = pSum - (sum1 * sum2 / n); | |
var den = Math.sqrt((sum1Sq - Math.pow(sum1, 2) / n) * | |
(sum2Sq - Math.pow(sum2, 2) / n)); | |
if (den == 0) return 0; | |
return num / den; | |
} |
function pearson(x, y){
let promedio = (lista) => { return lista.reduce((s, a) => s + a, 0) / lista.length };
let n = x.length, prom_x = promedio(x) , prom_y = promedio(y);
return (x.map( (e, i, r) => (r = {x:e, y:y[i]}) ).reduce( (s, a) => s + a.x * a.y, 0) - n * prom_x * prom_y) /
((Math.sqrt(x.reduce( (s, a) => (s + a * a) , 0) - n * prom_x * prom_x)) *
(Math.sqrt(y.reduce( (s, a) => (s + a * a) , 0) - n * prom_y * prom_y)));
}
This is my version!. I just did it as a practice for an exam that I have soon
@joracha your version is perfect! It's fastest version at this time!
Unfortunately I found that it doesn't respect to "empty" values, but it have to.
I've improved it a bit:
function pearson (x, y) {
const promedio = l => l.reduce((s, a) => s + a, 0) / l.length
const calc = (v, prom) => Math.sqrt(v.reduce((s, a) => (s + a * a), 0) - n * prom * prom)
let n = x.length
let nn = 0
for (let i = 0; i < n; i++, nn++) {
if ((!x[i] && x[i] !== 0) || (!y[i] && y[i] !== 0)) {
nn--
continue
}
x[nn] = x[i]
y[nn] = y[i]
}
if (n !== nn) {
x = x.splice(0, nn)
y = y.splice(0, nn)
n = nn
}
const prom_x = promedio(x), prom_y = promedio(y)
return (x
.map((e, i) => ({ x: e, y: y[i] }))
.reduce((v, a) => v + a.x * a.y, 0) - n * prom_x * prom_y
) / (calc(x, prom_x) * calc(y, prom_y))
}
Test showed that the speed degradation is insignificant (6.92%):
// test.js
const initialPearson = require('../initialPearson')
const improvedPearson = require('../improvedPearson')
const series1 = [80.026413,80.330908,76.68564,72.095302,75.473899,82.647118]
const series2 = [81.662683,85.802179,78.148427,81.126326,77.853491,85.974228]
console.log('initialPearson', initialPearson(series1, series2))
console.log('improvedPearson', improvedPearson(series1, series2))
benchmarkjs.options({ testTime: 4000 })
benchmarkjs('initialPearson', () => initialPearson(series1, series2))
benchmarkjs('improvedPearson', () => improvedPearson(series1, series2))
console.log(benchmarkjs.results)
Output
$ node test.js
initialPearson 0.6917288370450843
improvedPearson 0.6917288370450843
[
{
name: 'initialPearson',
elapsed: 4397,
checks: 3,
totalIterations: 48518134,
perSecondIterations: 11034372,
isOptimized: null,
diff: '0%'
},
{
name: 'improvedPearson',
elapsed: 4395,
checks: 3,
totalIterations: 45139154,
perSecondIterations: 10270569,
isOptimized: null,
diff: '6.92%'
}
]
In case of having a null it works as expected (and even faster actually):
// test.js
const initialPearson = require('../initialPearson')
const improvedPearson = require('../improvedPearson')
const series1 = [80.026413,80.330908,76.68564,72.095302,null,82.647118]
const series2 = [81.662683,85.802179,78.148427,81.126326,77.853491,85.974228]
benchmarkjs.options({ testTime: 4000 })
benchmarkjs('initialPearson', () => initialPearson(series1, series2))
benchmarkjs('improvedPearson', () => improvedPearson(series1, series2))
console.log(benchmarkjs.results)
$ node test.js
initialPearson 0.5995223578720159
improvedPearson 0.6570987279478532
[
{
name: 'initialPearson',
elapsed: 4136,
checks: 2,
totalIterations: 26454098,
perSecondIterations: 6396058,
isOptimized: null,
diff: '0%'
},
{
name: 'improvedPearson',
elapsed: 4139,
checks: 2,
totalIterations: 26027407,
perSecondIterations: 6288332,
isOptimized: null,
diff: '1.68%'
}
]
Thank you!
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before the const calc ...
line.
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before the
const calc ...
line.
No it hasn't. The third line defines function calc
without calling it.
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before theconst calc ...
line.No it hasn't. The third line defines function
calc
without calling it.
Well, it's still bad form. You're referencing a global that's defined later in scope and generally that's bad form.
The current version of ESLint doesn't like this, but as you're only giving an example here I guess it might be fine.
I've rewrote this function in es6.
The code is more concise, but it does not work with associative arrays that were possible in the original code.