Forked from cwerner13/01 create_paper_keyword.sql
Last active
March 29, 2018 07:25
-
-
Save alpenzoo/d07578e37ac92453078668db5ef92be9 to your computer and use it in GitHub Desktop.
RegExp_Split_To_Table: PostgreSQL command to split and trim a text field and create one row for each keyword in the expression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
drop table ba_paper_keyword; | |
create table ba_paper_keyword as | |
( | |
select p.id, p.keyword, count(p.keyword) count | |
from | |
(select id | |
, trim (both ';,.:?()"' from regexp_split_to_table(lower(paper.keyword), E'\\\s+')) AS keyword | |
from paper)p | |
group by p.id, p.keyword | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Drop TABLE ba_stop_words; | |
CREATE TABLE ba_stop_words | |
( | |
id bigint, | |
stop_word text | |
) | |
WITH ( | |
OIDS=FALSE | |
); | |
Insert into ba_stop_words (id, stop_word) values | |
(1, 'keyword') | |
,(2, 'keywords') | |
,(23, 'key') | |
,(24, 'words') | |
,(40, 'key-words') | |
,(41, 'keywords—') | |
,(42, 'keywords-') | |
,(43, '/') | |
,(44, '•') | |
,(10, '') | |
,(11, '-') | |
,(25, '') | |
,(26, '·') | |
,(27, '�') | |
,(28, '--') | |
,(29, '—') | |
,(30, 'i') | |
,(31, 'n') | |
,(32, 'b') | |
,(33, 'c') | |
,(34, 'd') | |
,(35, 'e') | |
,(3, 'or') | |
,(4, 'and') | |
,(5, 'the') | |
,(6, 'in') | |
,(7, 'for') | |
,(8, 'a') | |
,(9, 'on') | |
,(12, 'are') | |
,(13, 'with') | |
,(14, 'of') | |
,(15, 'from') | |
,(16, 'to') | |
,(17, 'by') | |
,(18, 'an') | |
,(19, 'is') | |
,(20, 'as') | |
,(21, 'at') | |
,(22, 'under') | |
,(21, 'it') | |
,(22, 'its') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- keyword frequency (minus stopword) | |
drop table ba_keywords; | |
create table ba_keywords as( | |
select x.keyword, sum(x.count) | |
from( | |
select id, keyword, count from ba_paper_keyword | |
except | |
select p.id, p.keyword, count | |
from | |
(select id, keyword, count from ba_paper_keyword) p | |
, (select * from ba_stop_words) s | |
where p.keyword = s.stop_word | |
)x | |
group by keyword | |
--order by 2 desc | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment