Skip to content

Instantly share code, notes, and snippets.

@sumanthprabhu
Last active March 29, 2023 14:03
Show Gist options
  • Save sumanthprabhu/8067221 to your computer and use it in GitHub Desktop.
Save sumanthprabhu/8067221 to your computer and use it in GitHub Desktop.
A rough SQL implementation of tf-idf <http://en.wikipedia.org/wiki/Tf%E2%80%93idf> Assuming you have run the snippet <https://gist.github.com/sumanthprabhu/8066438> to generate a count matrix and stored it in the file 'tfidf.csv', the output will be a table named "results" containing normalized scores for each term per tag.
load local data infile 'tfidf.csv' into table tfidf fields terminated by "|" lines terminated by '\n'(term, tag, count);
DELIMITER //
CREATE PROCEDURE tfidf_applier()
begin
declare res1 INT;
set res1 = (select count(distinct tag) from tfidf);
drop table if exists log_term_table;
create table log_term_table(term varchar(200), logval decimal(20,5));
insert into log_term_table (
select term, log(res1/count(tag)) from tfidf group by term
);
drop index term_log on log_term_table;
create index term_log on log_term_table(term);
drop index term_tfidf on tfidf;
create index term_tfidf on tfidf(term);
drop table if exists tmp_results;
create table tmp_results like tfidf;
insert into tmp_results(
select t1.term, t1.tag, t1.count*t2.logval as count from tfidf as t1
FORCE INDEX(term_tfidf) INNER JOIN log_term_table as t2
FORCE INDEX(term_log) on t1.term = t2.term
);
drop index term_tmp on tmp_results;
create index term_tmp on tmp_results(term);
drop table if exists results;
create table results (
term varchar(200), tag varchar(200), count decimal(20, 5)
);
insert into results(
select t3.term, t3.tag, t3.count/t4.maxcount from tmp_results as t3
FORCE INDEX(term_tmp) INNER JOIN (
select term, max(count) as maxcount from tmp_results group by term
)
as t4 on t3.term=t4.term
);
end
//
DELIMITER ;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment