
@calebhearth
Last active July 15, 2016 15:36
LSI - Latent Semantic Indexing
Look at occurrences of words in a corpus of documents, index them by word frequency, then derive a matrix of "physical closeness" between words, which approximates semantic closeness remarkably well. Stop words should be removed from documents first to avoid false positives.
Would stemming help LSI?
LSI is slow because it analyzes entire bodies of text.
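The "closeness" matrix above is usually computed with a truncated SVD of the term-document count matrix. A minimal sketch with a toy hand-built matrix (the corpus and counts here are illustrative, not from any real index):

```python
import numpy as np

# Toy term-document count matrix: rows = terms, columns = documents.
# In practice the counts would come from an index over a real corpus.
terms = ["cat", "dog", "pet", "stock", "market"]
counts = np.array([
    [2, 1, 0],  # cat
    [1, 2, 0],  # dog
    [1, 1, 0],  # pet
    [0, 0, 3],  # stock
    [0, 0, 2],  # market
], dtype=float)

# LSI: truncated SVD projects terms into a low-rank latent space.
U, s, Vt = np.linalg.svd(counts, full_matrices=False)
k = 2                          # keep the top-k latent dimensions
term_vecs = U[:, :k] * s[:k]   # each row is a term in latent space

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# "cat" and "dog" co-occur with "pet", so they land close together;
# "cat" and "stock" never co-occur, so they stay far apart.
print(cosine(term_vecs[0], term_vecs[1]))  # high (near 1)
print(cosine(term_vecs[0], term_vecs[3]))  # low (near 0)
```

Terms that never appear together in the same document end up nearly orthogonal in the latent space, which is the "semantic closeness" effect LSI is known for.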
Full Text Search
Removes stop words
Stems words
Can be indexed in Postgres, which is fast.
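As a toy illustration of the two normalizations above, here is a sketch of stop-word removal plus stemming in Python. Postgres actually uses dictionary-based Snowball stemmers and much larger stop-word lists; the tiny word list and naive suffix-stripper below are stand-ins:

```python
# Toy approximation of FTS normalization: lowercase, drop stop words, stem.
# Postgres uses Snowball dictionaries; this suffix-stripper only shows the idea.
STOP_WORDS = {"the", "a", "an", "and", "or", "of", "in", "to", "is", "were"}
SUFFIXES = ("ing", "ed", "es", "s")

def normalize(text: str) -> list[str]:
    out = []
    for word in text.lower().split():
        word = word.strip(".,!?")
        if word in STOP_WORDS:
            continue  # stop words carry little meaning; drop them
        for suf in SUFFIXES:
            # Strip one suffix, but keep at least a 3-letter stem.
            if word.endswith(suf) and len(word) - len(suf) >= 3:
                word = word[: -len(suf)]
                break
        out.append(word)
    return out

print(normalize("The dogs were chasing the cats"))  # ['dog', 'chas', 'cat']
```

Note that "dogs" and "cats" reduce to the same lexemes as their singular forms, which is exactly why stemming shrinks the index and improves recall.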
Build LSI using FTS indices in Postgres?
- Bookmark links (like Instapaper) that you read. Perform FTS indexing and LSI
indexing on each document
- LSI as a service for blogs as embedded service, as cross-link suggestion, etc.
- Search blogs for concepts, return both results that match term and results
that match concept (Google does this well for the Internet)
- Search bookmarks for concepts. A browser plugin could "bookmark" every site
  you visit, if desired.
- Blog service could index links, posts, and even backlinks with a Google
  integration to find internal and external related posts.
- Browser button "Recall Similar" that searches your history for pages similar
  to the current one, and also suggests pages from the service's index that
  you've not visited.
https://en.wikipedia.org/wiki/Latent_semantic_analysis
https://web.archive.org/web/20051210031540/http://research.nitle.org/lsi/lsa_explanation.htm
http://www.c2.com/cgi/wiki?LatentSemanticIndexing
https://www.postgresql.org/docs/current/static/textsearch-controls.html#TEXTSEARCH-PARSING-DOCUMENTS
https://github.com/jekyll/classifier-reborn/
CREATE MATERIALIZED VIEW article_lexemes AS
SELECT
  article_id,
  -- One row per article: a map of lexeme -> occurrence count.
  jsonb_object(array_agg(lexeme), array_agg(occurrences::text)) AS lexemes
FROM (
  SELECT
    id AS article_id,
    -- strip() drops position info; trim() removes the surrounding quotes.
    trim(both '''' from strip(lexeme::tsvector)::text) AS lexeme,
    -- A tsvector entry such as 'fox':2,7 lists positions; counting them
    -- gives the number of occurrences.
    array_upper(string_to_array(lexeme, ','), 1) AS occurrences
  FROM (
    SELECT
      id,
      to_tsvector('english', body) AS vector
    FROM articles
  ) x,
  unnest(string_to_array(vector::text, ' ')) AS lexeme
  GROUP BY lexeme, article_id
) y
GROUP BY article_id;
CREATE MATERIALIZED VIEW unique_lexemes AS
SELECT DISTINCT
jsonb_object_keys(lexemes) as lexeme
FROM article_lexemes
ORDER BY lexeme;
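With per-article lexeme counts (the shape of article_lexemes' jsonb column) and a vocabulary (unique_lexemes), building the LSI term-document matrix on the application side is mechanical. A sketch with hypothetical rows standing in for data fetched from the views:

```python
import numpy as np

# Stand-ins for rows fetched from the materialized views (hypothetical data):
# article_id -> {lexeme: occurrence count}, mirroring article_lexemes.
article_lexemes = {
    1: {"dog": 3, "cat": 2, "pet": 1},
    2: {"cat": 2, "pet": 2},
    3: {"stock": 4, "market": 2},
}
# Vocabulary, mirroring unique_lexemes.
vocab = sorted({lex for counts in article_lexemes.values() for lex in counts})
ids = sorted(article_lexemes)

# Term-document matrix: rows = lexemes, columns = articles.
matrix = np.array(
    [[article_lexemes[a].get(lex, 0) for a in ids] for lex in vocab],
    dtype=float,
)

# Project articles into LSI space via truncated SVD.
U, s, Vt = np.linalg.svd(matrix, full_matrices=False)
k = 2
doc_vecs = Vt[:k].T * s[:k]  # each row is an article in latent space

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# Articles 1 and 2 share cat/pet vocabulary; article 3 shares nothing.
print(cosine(doc_vecs[0], doc_vecs[1]))  # high (near 1)
print(cosine(doc_vecs[0], doc_vecs[2]))  # low (near 0)
```

Ranking all articles by cosine against one article's latent vector is the "related posts" / "Recall Similar" query from the idea list above.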