-
-
Save matt-west/6500993 to your computer and use it in GitHub Desktop.
/** | |
* @fileoverview Pearson correlation score algorithm. | |
* @author [email protected] (Matt West) | |
* @license Copyright 2013 Matt West. | |
* Licensed under MIT (http://opensource.org/licenses/MIT). | |
*/ | |
/** | |
* Calculate the person correlation score between two items in a dataset. | |
* | |
* @param {object} prefs The dataset containing data about both items that | |
* are being compared. | |
* @param {string} p1 Item one for comparison. | |
* @param {string} p2 Item two for comparison. | |
* @return {float} The pearson correlation score. | |
*/ | |
function pearsonCorrelation(prefs, p1, p2) { | |
var si = []; | |
for (var key in prefs[p1]) { | |
if (prefs[p2][key]) si.push(key); | |
} | |
var n = si.length; | |
if (n == 0) return 0; | |
var sum1 = 0; | |
for (var i = 0; i < si.length; i++) sum1 += prefs[p1][si[i]]; | |
var sum2 = 0; | |
for (var i = 0; i < si.length; i++) sum2 += prefs[p2][si[i]]; | |
var sum1Sq = 0; | |
for (var i = 0; i < si.length; i++) { | |
sum1Sq += Math.pow(prefs[p1][si[i]], 2); | |
} | |
var sum2Sq = 0; | |
for (var i = 0; i < si.length; i++) { | |
sum2Sq += Math.pow(prefs[p2][si[i]], 2); | |
} | |
var pSum = 0; | |
for (var i = 0; i < si.length; i++) { | |
pSum += prefs[p1][si[i]] * prefs[p2][si[i]]; | |
} | |
var num = pSum - (sum1 * sum2 / n); | |
var den = Math.sqrt((sum1Sq - Math.pow(sum1, 2) / n) * | |
(sum2Sq - Math.pow(sum2, 2) / n)); | |
if (den == 0) return 0; | |
return num / den; | |
} |
If you still need it :)
var data = new Array(
new Array(21,54,60,78,82),
new Array(20,54,54,65,45)
);
console.log(pearsonCorrelation(data,0,1))
please note the function works with qualitative distributions too (associative array)
Hi there, i used ur library just lazy to write one...i got a pearson relation as NaN for some huge dataset arrays. i tried writing a new code ..then got the actual value.
i have one array with values like 0.4554 of more than 20,000 rows and another with values like 20. as it is big file, i cannot link it here but just want to bring to ur attention.
function pearson(values) {
const n = values.length;
if (n == 0) return 0;
let meanX = 0;
let meanY = 0;
for (var i = 0; i < n; i++) {
meanX += values[i].x / n
meanY += values[i].y / n
}
let num = 0;
let den1 = 0;
let den2 = 0;
for (var i = 0; i < n; i++) {
let dx = (values[i].x - meanX);
let dy = (values[i].y - meanY);
num += dx * dy
den1 += dx * dx
den2 += dy * dy
}
const den = Math.sqrt(den1) * Math.sqrt(den2);
if (den == 0) return 0;
return num / den;
}
pearson([{x: 0, y: 0}, {x:1, y:1}])
thank u ,i just need this
I've rewrote this function in es6.
The code is more concise, but it does not work with associative arrays that were possible in the original code.
/**
* calculates pearson correlation
* @param {number[]} d1
* @param {number[]} d2
*/
export function corr(d1, d2) {
let { min, pow, sqrt } = Math
let add = (a, b) => a + b
let n = min(d1.length, d2.length)
if (n === 0) {
return 0
}
[d1, d2] = [d1.slice(0, n), d2.slice(0, n)]
let [sum1, sum2] = [d1, d2].map(l => l.reduce(add))
let [pow1, pow2] = [d1, d2].map(l => l.reduce((a, b) => a + pow(b, 2), 0))
let mulSum = d1.map((n, i) => n * d2[i]).reduce(add)
let dense = sqrt((pow1 - pow(sum1, 2) / n) * (pow2 - pow(sum2, 2) / n))
if (dense === 0) {
return 0
}
return (mulSum - (sum1 * sum2 / n)) / dense
}
function pearson(x, y){
let promedio = (lista) => { return lista.reduce((s, a) => s + a, 0) / lista.length };
let n = x.length, prom_x = promedio(x) , prom_y = promedio(y);
return (x.map( (e, i, r) => (r = {x:e, y:y[i]}) ).reduce( (s, a) => s + a.x * a.y, 0) - n * prom_x * prom_y) /
((Math.sqrt(x.reduce( (s, a) => (s + a * a) , 0) - n * prom_x * prom_x)) *
(Math.sqrt(y.reduce( (s, a) => (s + a * a) , 0) - n * prom_y * prom_y)));
}
This is my version!. I just did it as a practice for an exam that I have soon
@joracha your version is perfect! It's fastest version at this time!
Unfortunately I found that it doesn't respect to "empty" values, but it have to.
I've improved it a bit:
function pearson (x, y) {
const promedio = l => l.reduce((s, a) => s + a, 0) / l.length
const calc = (v, prom) => Math.sqrt(v.reduce((s, a) => (s + a * a), 0) - n * prom * prom)
let n = x.length
let nn = 0
for (let i = 0; i < n; i++, nn++) {
if ((!x[i] && x[i] !== 0) || (!y[i] && y[i] !== 0)) {
nn--
continue
}
x[nn] = x[i]
y[nn] = y[i]
}
if (n !== nn) {
x = x.splice(0, nn)
y = y.splice(0, nn)
n = nn
}
const prom_x = promedio(x), prom_y = promedio(y)
return (x
.map((e, i) => ({ x: e, y: y[i] }))
.reduce((v, a) => v + a.x * a.y, 0) - n * prom_x * prom_y
) / (calc(x, prom_x) * calc(y, prom_y))
}
Test showed that the speed degradation is insignificant (6.92%):
// test.js
const initialPearson = require('../initialPearson')
const improvedPearson = require('../improvedPearson')
const series1 = [80.026413,80.330908,76.68564,72.095302,75.473899,82.647118]
const series2 = [81.662683,85.802179,78.148427,81.126326,77.853491,85.974228]
console.log('initialPearson', initialPearson(series1, series2))
console.log('improvedPearson', improvedPearson(series1, series2))
benchmarkjs.options({ testTime: 4000 })
benchmarkjs('initialPearson', () => initialPearson(series1, series2))
benchmarkjs('improvedPearson', () => improvedPearson(series1, series2))
console.log(benchmarkjs.results)
Output
$ node test.js
initialPearson 0.6917288370450843
improvedPearson 0.6917288370450843
[
{
name: 'initialPearson',
elapsed: 4397,
checks: 3,
totalIterations: 48518134,
perSecondIterations: 11034372,
isOptimized: null,
diff: '0%'
},
{
name: 'improvedPearson',
elapsed: 4395,
checks: 3,
totalIterations: 45139154,
perSecondIterations: 10270569,
isOptimized: null,
diff: '6.92%'
}
]
In case of having a null it works as expected (and even faster actually):
// test.js
const initialPearson = require('../initialPearson')
const improvedPearson = require('../improvedPearson')
const series1 = [80.026413,80.330908,76.68564,72.095302,null,82.647118]
const series2 = [81.662683,85.802179,78.148427,81.126326,77.853491,85.974228]
benchmarkjs.options({ testTime: 4000 })
benchmarkjs('initialPearson', () => initialPearson(series1, series2))
benchmarkjs('improvedPearson', () => improvedPearson(series1, series2))
console.log(benchmarkjs.results)
$ node test.js
initialPearson 0.5995223578720159
improvedPearson 0.6570987279478532
[
{
name: 'initialPearson',
elapsed: 4136,
checks: 2,
totalIterations: 26454098,
perSecondIterations: 6396058,
isOptimized: null,
diff: '0%'
},
{
name: 'improvedPearson',
elapsed: 4139,
checks: 2,
totalIterations: 26027407,
perSecondIterations: 6288332,
isOptimized: null,
diff: '1.68%'
}
]
Thank you!
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before the const calc ...
line.
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before the
const calc ...
line.
No it hasn't. The third line defines function calc
without calling it.
The above code has a bug, n is used before being defined.
let n = x.length
needs to be before theconst calc ...
line.No it hasn't. The third line defines function
calc
without calling it.
Well, it's still bad form. You're referencing a global that's defined later in scope and generally that's bad form.
The current version of ESLint doesn't like this, but as you're only giving an example here I guess it might be fine.
could you give one example please for it with example dataset?