Last active
March 29, 2023 14:03
-
-
Save sumanthprabhu/8067221 to your computer and use it in GitHub Desktop.
A rough SQL implementation of tf-idf <http://en.wikipedia.org/wiki/Tf%E2%80%93idf> Assuming you have run the snippet <https://gist.github.com/sumanthprabhu/8066438> to generate a count matrix and stored it in the file 'tfidf.csv', the output will be a table named "results" containing normalized scores for each term per tag.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
load local data infile 'tfidf.csv' into table tfidf fields terminated by "|" lines terminated by '\n'(term, tag, count); | |
DELIMITER // | |
CREATE PROCEDURE tfidf_applier() | |
begin | |
declare res1 INT; | |
set res1 = (select count(distinct tag) from tfidf); | |
drop table if exists log_term_table; | |
create table log_term_table(term varchar(200), logval decimal(20,5)); | |
insert into log_term_table ( | |
select term, log(res1/count(tag)) from tfidf group by term | |
); | |
drop index term_log on log_term_table; | |
create index term_log on log_term_table(term); | |
drop index term_tfidf on tfidf; | |
create index term_tfidf on tfidf(term); | |
drop table if exists tmp_results; | |
create table tmp_results like tfidf; | |
insert into tmp_results( | |
select t1.term, t1.tag, t1.count*t2.logval as count from tfidf as t1 | |
FORCE INDEX(term_tfidf) INNER JOIN log_term_table as t2 | |
FORCE INDEX(term_log) on t1.term = t2.term | |
); | |
drop index term_tmp on tmp_results; | |
create index term_tmp on tmp_results(term); | |
drop table if exists results; | |
create table results ( | |
term varchar(200), tag varchar(200), count decimal(20, 5) | |
); | |
insert into results( | |
select t3.term, t3.tag, t3.count/t4.maxcount from tmp_results as t3 | |
FORCE INDEX(term_tmp) INNER JOIN ( | |
select term, max(count) as maxcount from tmp_results group by term | |
) | |
as t4 on t3.term=t4.term | |
); | |
end | |
// | |
DELIMITER ; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment