inhumantsar · April 22, 2024 04:28
diff --git a/cosine-similarity.ts b/cosine-similarity.ts
 //
 // slammed this together after reaading an interesting from-scratch series of posts on TF-IDF + Cosine Similarity.
 //
 // it will be slow, if it even works. totally untested and full of misguided practices. 
 // might come back to this some day and make it at least functional.
 // 
 // https://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/
 //

 class Vector extends Map<string, number> {
    order: Set<string> | null = null;

    static withOrder(v: Vocabulary) {
        const vector = new Vector();
        vector.order = v;
        return vector;
    }

    orderedEntries() {
        return Array.from(this.entries()).sort((a, b) => a[0] === b[0]
            ? 0
            : a[0] < b[0]
                ? -1
                : 1);
    }

    orderedValues() {
        return this.orderedEntries().map((entry) => entry[1]);
    }

    l2norm() {
        const denom = Math.sqrt((this.orderedValues().map((val) => val ^ 2).reduce((prev, cur) => prev + cur)));
        this.forEach((val, key) => this.set(key, val / denom));
    }
 }

 type Vocabulary = Set<string>;

 class InverseDocumentFrequency {
    private vocab: Vocabulary;
    private _weights: Vector;
    private documentFrequency: Vector;
    private docs: CosineSimilarityDocument<any>[];

    get weights() { return this._weights.orderedValues(); }

    constructor(docs: CosineSimilarityDocument<any>[], vocab: Vocabulary) {
        this.docs = docs;
        this.vocab = vocab;
        this._weights = Vector.withOrder(this.vocab);
        this.documentFrequency = Vector.withOrder(this.vocab);
    }

    updateFrequency(term: string) {
        this.documentFrequency.set(term, (this.documentFrequency.get(term) || 0) + 1);
    }

    updateWeights() {
        for (const [term, count] of this.documentFrequency) {
            this._weights.set(term, Math.log(this.docs.length / (1 + count)));
        }
    }
 }

 class CosineSimilarityDocument<T> {
    private doc: T;
    private tf: Vector;
    private _tfidf: Vector;
    private idf: InverseDocumentFrequency;
    private vocab: Vocabulary;
    private toStringArray: (doc: T) => string[];

    get vector() {
        if (this.idf.weights.length === 0) this.idf.updateWeights();
        if (this._tfidf.size === 0) {
            this.tf.orderedEntries().forEach(([term, value], idx) =>
                this._tfidf.set(term, value * this.idf.weights[idx]));
            this._tfidf.l2norm();
        }
        return this._tfidf;
    }

    constructor(doc: T, vocab: Vocabulary, idf: InverseDocumentFrequency, toStringArray: (doc: T) => string[]) {
        this.doc = doc;
        this.toStringArray = toStringArray;
        this.idf = idf;
        this.vocab = vocab;
        this._tfidf = Vector.withOrder(this.vocab);
        this.tf = this.buildTf();
    }

    private buildTf() {
        const terms = this.toStringArray(this.doc);
        const v = Vector.withOrder(this.vocab);

        for (const term of terms) {
            const prev = v.get(term) || 0;
            v.set(term, prev + 1);
            this.vocab.add(term);
            if (prev === 0) this.idf.updateFrequency(term);
        }

        return v;
    }
 }

 interface CosineSimilaritySearchResult<T> {
    readonly doc: CosineSimilarityDocument<T>;
    readonly result: number;
 }

 interface CosineSimilaritySearchResults<T> {
    readonly tf: Vector;
    readonly weights: number[];
    readonly results: CosineSimilaritySearchResult<T>[];
 }

 class CosineSimilarityController<T> {
    readonly vocab: Vocabulary;
    readonly idf: InverseDocumentFrequency;
    readonly docs: CosineSimilarityDocument<T>[];
    readonly toStringArray: (doc: T) => string[];
    private _searchCache: Map<T, CosineSimilaritySearchResults<T>>

    constructor(docs: T[], toStringArray: (doc: T) => string[]) {
        this.toStringArray = toStringArray;
        this.vocab = new Set();
        this.docs = new Array(docs.length);
        this.idf = new InverseDocumentFrequency(this.docs, this.vocab);
        this._searchCache = new Map();
        docs.forEach((doc, idx) => this.docs[idx] = new CosineSimilarityDocument<T>(doc, this.vocab, this.idf, toStringArray));
    }

    private getSearchTF(query: T): Vector {
        const tf = Vector.withOrder(this.vocab);
        const terms = this.toStringArray(query);

        for (const term of terms) {
            const prev = tf.get(term) || 0;
            tf.set(term, prev + 1);
        }

        return tf;
    }

    private getCosine(a: Vector, b: Vector) {
        let dotProduct = 0;
        const aVals = a.orderedValues();
        const bVals = b.orderedValues();
        for (let idx = 0; idx < a.size; idx++) {
            dotProduct += aVals[idx] * bVals[idx];
        }
        return dotProduct / (
            Math.sqrt(aVals.reduce((prev, curr) => prev + curr ^ 2)) *
            Math.sqrt(bVals.reduce((prev, curr) => prev + curr ^ 2))
        );
    }

    search(query: T): CosineSimilaritySearchResults<T> {
        const tf = this.getSearchTF(query);
        const tfidf = Vector.withOrder(this.vocab);
        tf.orderedEntries().forEach(([term, value], idx) =>
            tfidf.set(term, value * this.idf.weights[idx]));
        tfidf.l2norm();

        const results = this.docs.map((doc): CosineSimilaritySearchResult<T> => Object({
            doc: doc,
            result: this.getCosine(tfidf, doc.vector)
        })).sort((a, b) => a.result === b.result ? 0
            : a.result < b.result ? -1 : 1);

        const result = { tf, weights: Array.from(this.idf.weights), results };
        this._searchCache.set(query, result);

        return result;
    }
 }
	//
	// slammed this together after reaading an interesting from-scratch series of posts on TF-IDF + Cosine Similarity.
	//
	// it will be slow, if it even works. totally untested and full of misguided practices.
	// might come back to this some day and make it at least functional.
	//
	// https://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/
	//

	class Vector extends Map<string, number> {
	order: Set<string> \| null = null;

	static withOrder(v: Vocabulary) {
	const vector = new Vector();
	vector.order = v;
	return vector;
	}

	orderedEntries() {
	return Array.from(this.entries()).sort((a, b) => a[0] === b[0]
	? 0
	: a[0] < b[0]
	? -1
	: 1);
	}

	orderedValues() {
	return this.orderedEntries().map((entry) => entry[1]);
	}

	l2norm() {
	const denom = Math.sqrt((this.orderedValues().map((val) => val ^ 2).reduce((prev, cur) => prev + cur)));
	this.forEach((val, key) => this.set(key, val / denom));
	}
	}

	type Vocabulary = Set<string>;

	class InverseDocumentFrequency {
	private vocab: Vocabulary;
	private _weights: Vector;
	private documentFrequency: Vector;
	private docs: CosineSimilarityDocument<any>[];

	get weights() { return this._weights.orderedValues(); }

	constructor(docs: CosineSimilarityDocument<any>[], vocab: Vocabulary) {
	this.docs = docs;
	this.vocab = vocab;
	this._weights = Vector.withOrder(this.vocab);
	this.documentFrequency = Vector.withOrder(this.vocab);
	}

	updateFrequency(term: string) {
	this.documentFrequency.set(term, (this.documentFrequency.get(term) \|\| 0) + 1);
	}

	updateWeights() {
	for (const [term, count] of this.documentFrequency) {
	this._weights.set(term, Math.log(this.docs.length / (1 + count)));
	}
	}
	}

	class CosineSimilarityDocument<T> {
	private doc: T;
	private tf: Vector;
	private _tfidf: Vector;
	private idf: InverseDocumentFrequency;
	private vocab: Vocabulary;
	private toStringArray: (doc: T) => string[];

	get vector() {
	if (this.idf.weights.length === 0) this.idf.updateWeights();
	if (this._tfidf.size === 0) {
	this.tf.orderedEntries().forEach(([term, value], idx) =>
	this._tfidf.set(term, value * this.idf.weights[idx]));
	this._tfidf.l2norm();
	}
	return this._tfidf;
	}

	constructor(doc: T, vocab: Vocabulary, idf: InverseDocumentFrequency, toStringArray: (doc: T) => string[]) {
	this.doc = doc;
	this.toStringArray = toStringArray;
	this.idf = idf;
	this.vocab = vocab;
	this._tfidf = Vector.withOrder(this.vocab);
	this.tf = this.buildTf();
	}

	private buildTf() {
	const terms = this.toStringArray(this.doc);
	const v = Vector.withOrder(this.vocab);

	for (const term of terms) {
	const prev = v.get(term) \|\| 0;
	v.set(term, prev + 1);
	this.vocab.add(term);
	if (prev === 0) this.idf.updateFrequency(term);
	}

	return v;
	}
	}

	interface CosineSimilaritySearchResult<T> {
	readonly doc: CosineSimilarityDocument<T>;
	readonly result: number;
	}

	interface CosineSimilaritySearchResults<T> {
	readonly tf: Vector;
	readonly weights: number[];
	readonly results: CosineSimilaritySearchResult<T>[];
	}

	class CosineSimilarityController<T> {
	readonly vocab: Vocabulary;
	readonly idf: InverseDocumentFrequency;
	readonly docs: CosineSimilarityDocument<T>[];
	readonly toStringArray: (doc: T) => string[];
	private _searchCache: Map<T, CosineSimilaritySearchResults<T>>

	constructor(docs: T[], toStringArray: (doc: T) => string[]) {
	this.toStringArray = toStringArray;
	this.vocab = new Set();
	this.docs = new Array(docs.length);
	this.idf = new InverseDocumentFrequency(this.docs, this.vocab);
	this._searchCache = new Map();
	docs.forEach((doc, idx) => this.docs[idx] = new CosineSimilarityDocument<T>(doc, this.vocab, this.idf, toStringArray));
	}

	private getSearchTF(query: T): Vector {
	const tf = Vector.withOrder(this.vocab);
	const terms = this.toStringArray(query);

	for (const term of terms) {
	const prev = tf.get(term) \|\| 0;
	tf.set(term, prev + 1);
	}

	return tf;
	}

	private getCosine(a: Vector, b: Vector) {
	let dotProduct = 0;
	const aVals = a.orderedValues();
	const bVals = b.orderedValues();
	for (let idx = 0; idx < a.size; idx++) {
	dotProduct += aVals[idx] * bVals[idx];
	}
	return dotProduct / (
	Math.sqrt(aVals.reduce((prev, curr) => prev + curr ^ 2)) *
	Math.sqrt(bVals.reduce((prev, curr) => prev + curr ^ 2))
	);
	}

	search(query: T): CosineSimilaritySearchResults<T> {
	const tf = this.getSearchTF(query);
	const tfidf = Vector.withOrder(this.vocab);
	tf.orderedEntries().forEach(([term, value], idx) =>
	tfidf.set(term, value * this.idf.weights[idx]));
	tfidf.l2norm();

	const results = this.docs.map((doc): CosineSimilaritySearchResult<T> => Object({
	doc: doc,
	result: this.getCosine(tfidf, doc.vector)
	})).sort((a, b) => a.result === b.result ? 0
	: a.result < b.result ? -1 : 1);

	const result = { tf, weights: Array.from(this.idf.weights), results };
	this._searchCache.set(query, result);

	return result;
	}
	}