Created
June 4, 2014 10:05
-
-
Save tromika/cbffaa666010dec651eb to your computer and use it in GitHub Desktop.
MS SQL Server Corpus Tokenizer Function - Get the specific token from a corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This function is based on http://ole.michelsen.dk/blog/split-string-to-table-using-transact-sql/
-- Thanks for the core logic, by the way.
-- Drop any existing dbo.Tokenize so this script can be re-run.
-- The existence check is schema-qualified so it inspects the same object the
-- DROP statement targets, whatever the caller's default schema is.
IF OBJECT_ID(N'[dbo].[Tokenize]') IS NOT NULL
    DROP FUNCTION [dbo].[Tokenize];
GO
-- dbo.Tokenize: splits @String on @Delimiter and returns the @token-th piece.
--
-- Parameters:
--   @String     text to split.
--   @Delimiter  single-character separator.
--   @token      1-based index of the wanted piece, passed as text; it is
--               converted to a number for the comparison, so it must contain
--               a numeric value.
--
-- Result set: at most one row with columns
--   myRowId  position of the piece within @String (1 = first piece)
--   Data     the text of that piece
-- No row is returned when @token does not match any piece number.
--
-- Note: the split is a recursive CTE that recurses once per delimiter found.
-- SQL Server's default recursion limit is 100, so a string with more than 100
-- delimiters needs OPTION (MAXRECURSION n) on the statement that calls this
-- function.
CREATE FUNCTION [dbo].[Tokenize]
(
    @String NVARCHAR(4000),
    @Delimiter NCHAR(1),
    @token NCHAR(3)
)
RETURNS TABLE
AS
RETURN
(
    -- Split: one row per piece. stpos is where the piece starts; endpos is the
    -- position of the delimiter that ends it (0 when the piece runs to the end
    -- of the string).
    WITH Split (stpos, endpos) AS
    (
        -- The first piece deliberately starts at 0 rather than 1: SUBSTRING
        -- from position 0 for endpos characters yields exactly the characters
        -- before the first delimiter.
        SELECT
            0 AS stpos,
            CHARINDEX(@Delimiter, @String) AS endpos
        UNION ALL
        SELECT
            endpos + 1,
            CHARINDEX(@Delimiter, @String, endpos + 1)
        FROM Split
        WHERE endpos > 0
    ),
    Numbered AS
    (
        SELECT
            -- Number the pieces by start position so the numbering always
            -- follows their order in @String.
            ROW_NUMBER() OVER (ORDER BY stpos) AS myRowId,
            -- DATALENGTH / 2 is used instead of LEN because LEN ignores
            -- trailing spaces, which would shorten the last piece or produce
            -- a negative length when @String ends in spaces.
            SUBSTRING(
                @String,
                stpos,
                COALESCE(NULLIF(endpos, 0), DATALENGTH(@String) / 2 + 1) - stpos
            ) AS Data
        FROM Split
    )
    SELECT
        myRowId,
        Data
    FROM Numbered
    WHERE myRowId = CAST(@token AS BIGINT)
)
GO
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment