Skip to content

Instantly share code, notes, and snippets.

@alpenzoo
Forked from cwerner13/01 create_paper_keyword.sql
Last active March 29, 2018 07:25
Show Gist options
  • Save alpenzoo/d07578e37ac92453078668db5ef92be9 to your computer and use it in GitHub Desktop.
Save alpenzoo/d07578e37ac92453078668db5ef92be9 to your computer and use it in GitHub Desktop.
RegExp_Split_To_Table: PostgreSQL command to split and trim a text field and create one row for each keyword in the expression
drop table ba_paper_keyword;
create table ba_paper_keyword as
(
select p.id, p.keyword, count(p.keyword) count
from
(select id
, trim (both ';,.:?()"' from regexp_split_to_table(lower(paper.keyword), E'\\\s+')) AS keyword
from paper)p
group by p.id, p.keyword
)
Drop TABLE ba_stop_words;
CREATE TABLE ba_stop_words
(
id bigint,
stop_word text
)
WITH (
OIDS=FALSE
);
Insert into ba_stop_words (id, stop_word) values
(1, 'keyword')
,(2, 'keywords')
,(23, 'key')
,(24, 'words')
,(40, 'key-words')
,(41, 'keywords—')
,(42, 'keywords-')
,(43, '/')
,(44, '•')
,(10, '')
,(11, '-')
,(25, '')
,(26, '·')
,(27, '�')
,(28, '--')
,(29, '—')
,(30, 'i')
,(31, 'n')
,(32, 'b')
,(33, 'c')
,(34, 'd')
,(35, 'e')
,(3, 'or')
,(4, 'and')
,(5, 'the')
,(6, 'in')
,(7, 'for')
,(8, 'a')
,(9, 'on')
,(12, 'are')
,(13, 'with')
,(14, 'of')
,(15, 'from')
,(16, 'to')
,(17, 'by')
,(18, 'an')
,(19, 'is')
,(20, 'as')
,(21, 'at')
,(22, 'under')
,(21, 'it')
,(22, 'its')
--- keyword frequency (minus stopword)
drop table ba_keywords;
create table ba_keywords as(
select x.keyword, sum(x.count)
from(
select id, keyword, count from ba_paper_keyword
except
select p.id, p.keyword, count
from
(select id, keyword, count from ba_paper_keyword) p
, (select * from ba_stop_words) s
where p.keyword = s.stop_word
)x
group by keyword
--order by 2 desc
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment