Last active
March 4, 2025 15:20
-
-
Save mdchaney/948b88d2ca31b91b9c5d80919693cfbc to your computer and use it in GitHub Desktop.
English dictionary for PostgreSQL search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gonna : going to | |
wanna : want to | |
gotta : got to | |
ain't : am not : are not : is not : has not : have not | |
lemme : let me | |
gimme : give me | |
dunno : do not know | |
kinda : kind of | |
lotta : lot of | |
outta : out of | |
betcha : bet you | |
finna : fixing to | |
hafta : have to | |
sorta : sort of | |
til : until | |
bout : about | |
goin' : going | |
havin' : having | |
makin' : making | |
doin' : doing | |
seein' : seeing | |
comin' : coming | |
gettin' : getting | |
givin' : giving | |
knowin' : knowing | |
lookin' : looking | |
playin' : playing | |
runnin' : running | |
sayin' : saying | |
thinkin' : thinking | |
tryin' : trying | |
feelin' : feeling | |
talkin' : talking | |
walkin' : walking | |
eatin' : eating | |
livin' : living | |
callin' : calling | |
leavin' : leaving | |
sleepin' : sleeping | |
startin' : starting | |
listenin' : listening | |
workin' : working | |
lovin' : loving |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
a | |
an | |
the | |
and | |
or |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public; | |
COMMENT ON EXTENSION unaccent IS 'text search dictionary that removes accents'; | |
CREATE TEXT SEARCH DICTIONARY public.english_lyrics_stem ( | |
TEMPLATE = pg_catalog.snowball, | |
language = 'english', stopwords = 'english_lyrics' ); | |
CREATE TEXT SEARCH DICTIONARY public.english_casual ( | |
TEMPLATE = pg_catalog.thesaurus, | |
dictfile = 'english_casual', dictionary = 'public.english_lyrics_stem' ); | |
CREATE TEXT SEARCH CONFIGURATION public.lyrics_search ( | |
PARSER = pg_catalog."default" ); | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR asciiword WITH public.english_casual, public.english_lyrics_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR word WITH public.unaccent, public.english_casual, public.english_lyrics_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR numword WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR email WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR url WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR host WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR sfloat WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR version WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR hword_numpart WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR hword_part WITH public.unaccent, public.english_casual, public.english_lyrics_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR hword_asciipart WITH english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR numhword WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR asciihword WITH english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR hword WITH public.unaccent, public.english_casual, public.english_lyrics_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR url_path WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR file WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR "float" WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR "int" WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.lyrics_search | |
ADD MAPPING FOR uint WITH simple; | |
COMMENT ON TEXT SEARCH CONFIGURATION public.lyrics_search IS 'english lyrics search with stemming, accent removal, casual english thesaurus, and small stopword dictionary'; | |
CREATE TEXT SEARCH CONFIGURATION public.name_search ( | |
PARSER = pg_catalog."default" ); | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR asciiword WITH public.unaccent, english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR word WITH public.unaccent, english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR numword WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR email WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR url WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR host WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR sfloat WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR version WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR hword_numpart WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR hword_part WITH public.unaccent, english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR hword_asciipart WITH english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR numhword WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR asciihword WITH english_stem; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR url_path WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR file WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR "float" WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR "int" WITH simple; | |
ALTER TEXT SEARCH CONFIGURATION public.name_search | |
ADD MAPPING FOR uint WITH simple; | |
COMMENT ON TEXT SEARCH CONFIGURATION public.name_search IS 'english search with stemming and accent removal'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Full text search in pgsql requires some files to be placed in the | |
file system for access by the database. The following files are | |
required: | |
english_lyrics.stop | |
This goes in a directory called "tsearch_data" in the postgres | |
data directory. You can find it using pg_config: | |
% pg_config --sharedir | |
If that doesn't work, you can use "find" to look for "english.stop" | |
% find /usr -name english.stop | |
The contents of english_lyrics.stop are: | |
a | |
an | |
the | |
and | |
or | |
It is purposely tiny since lyric matching often involves | |
searching for a phrase, not a word. | |
We also have a thesaurus file called "english_casual.ths". | |
This thesaurus file is used to make sure that words like | |
"gonna" will match "going to" (as well as "gonna"). This | |
will help normalize lyrics for matching. | |
gonna : going to | |
wanna : want to | |
gotta : got to | |
ain't : am not : are not : is not : has not : have not | |
lemme : let me | |
gimme : give me | |
dunno : do not know | |
kinda : kind of | |
lotta : lot of | |
outta : out of | |
betcha : bet you | |
finna : fixing to | |
hafta : have to | |
sorta : sort of | |
til : until | |
bout : about | |
goin' : going | |
havin' : having | |
makin' : making | |
doin' : doing | |
seein' : seeing | |
comin' : coming | |
gettin' : getting | |
givin' : giving | |
knowin' : knowing | |
lookin' : looking | |
playin' : playing | |
runnin' : running | |
sayin' : saying | |
thinkin' : thinking | |
tryin' : trying | |
feelin' : feeling | |
talkin' : talking | |
walkin' : walking | |
eatin' : eating | |
livin' : living | |
callin' : calling | |
leavin' : leaving | |
sleepin' : sleeping | |
startin' : starting | |
listenin' : listening | |
workin' : working |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment