Last active
January 25, 2017 04:04
-
-
Save LinZap/3ebaa379560c1bb806bfce58d5221985 to your computer and use it in GitHub Desktop.
Tokenize SQL Server Ver. (中英分開,只斷中文,保留英數)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 測試用字串 | |
declare @str nvarchar(500)= | |
'Hello~Zap123提醒您!若(尚未)提交審查文件,務必 | |
參考官方部落格的說明準備tel:09123456789^^ | |
~thx!!你怎麼那麼厲害' | |
declare @enPattern nvarchar(15) = '%[-a-Z0-9_'']%' | |
declare @len int = len(@str) | |
declare @delimiter table(c char(2)) | |
declare @segment table(s nvarchar(200)) | |
declare @Token table(words nvarchar(200)) | |
declare @i int = 1 | |
declare @gram int = 5 | |
insert into @delimiter values(' '),(' '),(CHAR(13)),(CHAR(10)),(CHAR(9)), | |
(' '),(','),('.'),('-'),('*'),('!'),(';'),('?'),('"'),('〝'), | |
('〞'),('“'),('”'),('‘'),('’'),('。'),(','),(';'),('?'),('!'), | |
('、'),(':'),('」'),('「'),('『'),('』'),('《'),('》'),('〈'),('〉'), | |
('('),(')'),('('),(')'),('['),(']'),('〔'),('〕'),('﹝'),('﹞'), | |
('【'),('】'),('['),(']'),('{'),('}'),('…'),('﹏'),('~'),('—'),('╱'), | |
('/'),('/'),('%'),('%'),('^') | |
declare @tmpStr nvarchar(100)='' | |
declare @tmpChar nvarchar(2) | |
-- 去除 delimiter 並將英數與其他 pattern 分開 | |
while(@i<=@len) | |
begin | |
select @tmpChar=substring(@str,@i,1) | |
-- 遇到特殊符號 | |
if exists(select * from @delimiter where c=@tmpChar) | |
begin | |
if len(@tmpStr)>0 | |
begin | |
insert into @segment values(@tmpStr) | |
set @tmpStr='' | |
end | |
end | |
-- 遇到英數 | |
else if @tmpChar like @enPattern | |
begin | |
if @tmpStr like @enPattern | |
set @tmpStr = @tmpStr+@tmpChar | |
else if len(@tmpStr)<1 | |
set @tmpStr = @tmpChar | |
else | |
begin | |
insert into @segment values(@tmpStr) | |
set @tmpStr=@tmpChar | |
end | |
end | |
-- 遇到其他(中日韓文等) | |
else | |
begin | |
if @tmpStr like @enPattern | |
begin | |
insert into @segment values(@tmpStr) | |
set @tmpStr= @tmpChar | |
end | |
else if len(@tmpStr)<1 | |
begin | |
set @tmpStr = @tmpChar | |
end | |
else | |
set @tmpStr = @tmpStr+@tmpChar | |
end | |
set @i=@i+1 | |
end | |
if len(@tmpStr)>0 | |
insert into @segment values(@tmpStr) | |
-- 切 Token (Tokenize) 5-Gram 示範 | |
declare @seg nvarchar(200) | |
declare seg_cursor cursor for select s from @segment | |
open seg_cursor fetch next from seg_cursor into @seg | |
WHILE @@FETCH_STATUS = 0 | |
begin | |
set @gram = 5 | |
set @len = len(@seg) | |
-- 英數跳過不斷 | |
if @seg like @enPattern | |
begin | |
insert into @Token values(@seg) | |
fetch next from seg_cursor into @seg | |
continue | |
end | |
-- 真正的 Tokenize | |
while(@gram>0) | |
begin | |
set @i = 1 | |
while((@i+@gram-1)<=@len) | |
begin | |
insert into @Token values(SUBSTRING(@seg,@i,@gram)) | |
set @i=@i+1 | |
end | |
set @gram = @gram-1 | |
end | |
-- 下一個 segment | |
fetch next from seg_cursor into @seg | |
end | |
-- 打完收工 | |
select * from @Token |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Search Engine
搜尋引擎基底,透過 Tokenize 進行文字處理 (Text Processing),以建立索引並提供後續搜尋服務。
範例程式碼可以直接執行 (使用 SQL Server)
原理
要處理的字串會經過以下的步驟,逐步執行
範例
ngram=5
five-gram
four-gram
trigram
bigram
unigram
Usage
建立索引表(PK or cluster on token),將 Tokenize 後的 Token 與欲搜尋之資料關聯,就可以完成簡易版的搜尋功能
License MPL v1.1
MPL v1.1