Created
April 22, 2024 04:28
-
-
Save inhumantsar/7b58d3e475cc8af1f3fba75bb37a85f8 to your computer and use it in GitHub Desktop.
Cosine Similarity implemented in Typescript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// slammed this together after reaading an interesting from-scratch series of posts on TF-IDF + Cosine Similarity. | |
// | |
// it will be slow, if it even works. totally untested and full of misguided practices. | |
// might come back to this some day and make it at least functional. | |
// | |
// https://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/ | |
// | |
class Vector extends Map<string, number> { | |
order: Set<string> | null = null; | |
static withOrder(v: Vocabulary) { | |
const vector = new Vector(); | |
vector.order = v; | |
return vector; | |
} | |
orderedEntries() { | |
return Array.from(this.entries()).sort((a, b) => a[0] === b[0] | |
? 0 | |
: a[0] < b[0] | |
? -1 | |
: 1); | |
} | |
orderedValues() { | |
return this.orderedEntries().map((entry) => entry[1]); | |
} | |
l2norm() { | |
const denom = Math.sqrt((this.orderedValues().map((val) => val ^ 2).reduce((prev, cur) => prev + cur))); | |
this.forEach((val, key) => this.set(key, val / denom)); | |
} | |
} | |
type Vocabulary = Set<string>; | |
class InverseDocumentFrequency { | |
private vocab: Vocabulary; | |
private _weights: Vector; | |
private documentFrequency: Vector; | |
private docs: CosineSimilarityDocument<any>[]; | |
get weights() { return this._weights.orderedValues(); } | |
constructor(docs: CosineSimilarityDocument<any>[], vocab: Vocabulary) { | |
this.docs = docs; | |
this.vocab = vocab; | |
this._weights = Vector.withOrder(this.vocab); | |
this.documentFrequency = Vector.withOrder(this.vocab); | |
} | |
updateFrequency(term: string) { | |
this.documentFrequency.set(term, (this.documentFrequency.get(term) || 0) + 1); | |
} | |
updateWeights() { | |
for (const [term, count] of this.documentFrequency) { | |
this._weights.set(term, Math.log(this.docs.length / (1 + count))); | |
} | |
} | |
} | |
class CosineSimilarityDocument<T> { | |
private doc: T; | |
private tf: Vector; | |
private _tfidf: Vector; | |
private idf: InverseDocumentFrequency; | |
private vocab: Vocabulary; | |
private toStringArray: (doc: T) => string[]; | |
get vector() { | |
if (this.idf.weights.length === 0) this.idf.updateWeights(); | |
if (this._tfidf.size === 0) { | |
this.tf.orderedEntries().forEach(([term, value], idx) => | |
this._tfidf.set(term, value * this.idf.weights[idx])); | |
this._tfidf.l2norm(); | |
} | |
return this._tfidf; | |
} | |
constructor(doc: T, vocab: Vocabulary, idf: InverseDocumentFrequency, toStringArray: (doc: T) => string[]) { | |
this.doc = doc; | |
this.toStringArray = toStringArray; | |
this.idf = idf; | |
this.vocab = vocab; | |
this._tfidf = Vector.withOrder(this.vocab); | |
this.tf = this.buildTf(); | |
} | |
private buildTf() { | |
const terms = this.toStringArray(this.doc); | |
const v = Vector.withOrder(this.vocab); | |
for (const term of terms) { | |
const prev = v.get(term) || 0; | |
v.set(term, prev + 1); | |
this.vocab.add(term); | |
if (prev === 0) this.idf.updateFrequency(term); | |
} | |
return v; | |
} | |
} | |
interface CosineSimilaritySearchResult<T> { | |
readonly doc: CosineSimilarityDocument<T>; | |
readonly result: number; | |
} | |
interface CosineSimilaritySearchResults<T> { | |
readonly tf: Vector; | |
readonly weights: number[]; | |
readonly results: CosineSimilaritySearchResult<T>[]; | |
} | |
class CosineSimilarityController<T> { | |
readonly vocab: Vocabulary; | |
readonly idf: InverseDocumentFrequency; | |
readonly docs: CosineSimilarityDocument<T>[]; | |
readonly toStringArray: (doc: T) => string[]; | |
private _searchCache: Map<T, CosineSimilaritySearchResults<T>> | |
constructor(docs: T[], toStringArray: (doc: T) => string[]) { | |
this.toStringArray = toStringArray; | |
this.vocab = new Set(); | |
this.docs = new Array(docs.length); | |
this.idf = new InverseDocumentFrequency(this.docs, this.vocab); | |
this._searchCache = new Map(); | |
docs.forEach((doc, idx) => this.docs[idx] = new CosineSimilarityDocument<T>(doc, this.vocab, this.idf, toStringArray)); | |
} | |
private getSearchTF(query: T): Vector { | |
const tf = Vector.withOrder(this.vocab); | |
const terms = this.toStringArray(query); | |
for (const term of terms) { | |
const prev = tf.get(term) || 0; | |
tf.set(term, prev + 1); | |
} | |
return tf; | |
} | |
private getCosine(a: Vector, b: Vector) { | |
let dotProduct = 0; | |
const aVals = a.orderedValues(); | |
const bVals = b.orderedValues(); | |
for (let idx = 0; idx < a.size; idx++) { | |
dotProduct += aVals[idx] * bVals[idx]; | |
} | |
return dotProduct / ( | |
Math.sqrt(aVals.reduce((prev, curr) => prev + curr ^ 2)) * | |
Math.sqrt(bVals.reduce((prev, curr) => prev + curr ^ 2)) | |
); | |
} | |
search(query: T): CosineSimilaritySearchResults<T> { | |
const tf = this.getSearchTF(query); | |
const tfidf = Vector.withOrder(this.vocab); | |
tf.orderedEntries().forEach(([term, value], idx) => | |
tfidf.set(term, value * this.idf.weights[idx])); | |
tfidf.l2norm(); | |
const results = this.docs.map((doc): CosineSimilaritySearchResult<T> => Object({ | |
doc: doc, | |
result: this.getCosine(tfidf, doc.vector) | |
})).sort((a, b) => a.result === b.result ? 0 | |
: a.result < b.result ? -1 : 1); | |
const result = { tf, weights: Array.from(this.idf.weights), results }; | |
this._searchCache.set(query, result); | |
return result; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment