Skip to content

Instantly share code, notes, and snippets.

@tromika
Created June 4, 2014 10:05
Show Gist options
  • Save tromika/cbffaa666010dec651eb to your computer and use it in GitHub Desktop.
Save tromika/cbffaa666010dec651eb to your computer and use it in GitHub Desktop.
MS SQL Server Corpus Tokenizer Function - Get the specific token from a corpus
-- Based on the split-string CTE from http://ole.michelsen.dk/blog/split-string-to-table-using-transact-sql/
-- Thanks to the author for the core idea.
-- Drop any previous version of the function so the CREATE below can run again.
-- The existence check is schema-qualified and limited to function object types
-- (FN = scalar, IF = inline table-valued, TF = multi-statement table-valued) so
-- that it tests exactly the object DROP FUNCTION is about to remove.
IF EXISTS (
    SELECT 1
    FROM sys.objects
    WHERE object_id = OBJECT_ID(N'[dbo].[Tokenize]')
      AND type IN (N'FN', N'IF', N'TF')
)
    DROP FUNCTION [dbo].[Tokenize];
GO
-- dbo.Tokenize: returns one token of @String, where tokens are the pieces
-- obtained by splitting @String on @Delimiter.
--
-- Parameters:
--   @String    text to split (at most 4000 characters).
--   @Delimiter single-character separator.
--   @token     1-based position of the token to return. Declared INT so it
--              compares directly with the row number; numeric strings such as
--              N'2' still convert implicitly.
--
-- Returns: a table with columns (myRowId, Data). It holds one row when token
-- number @token exists and no rows otherwise. Consecutive delimiters and a
-- trailing delimiter produce empty-string tokens.
--
-- Note: the split uses a recursive CTE, one recursion level per delimiter.
-- SQL Server stops at 100 levels by default, so strings with more than 100
-- delimiters need OPTION (MAXRECURSION n) on the calling statement.
CREATE FUNCTION [dbo].[Tokenize]
(
    @String    NVARCHAR(4000),
    @Delimiter NCHAR(1),
    @token     INT
)
RETURNS TABLE
AS
RETURN
(
    -- Each row is one token: stpos is the position of its first character,
    -- endpos the position of the delimiter that ends it (0 for the last token).
    WITH Split (stpos, endpos) AS
    (
        SELECT
            1 AS stpos,
            CHARINDEX(@Delimiter, @String) AS endpos
        UNION ALL
        SELECT
            endpos + 1,
            CHARINDEX(@Delimiter, @String, endpos + 1)
        FROM Split
        WHERE endpos > 0
    )
    SELECT
        sub.myRowId,
        sub.Data
    FROM (
        SELECT
            -- Number tokens by their start position so the numbering is
            -- deterministic and follows the order of the string.
            ROW_NUMBER() OVER (ORDER BY stpos) AS myRowId,
            SUBSTRING(
                @String,
                stpos,
                CASE
                    WHEN endpos > 0 THEN endpos - stpos
                    -- Last token: take the rest of the string. 4000 is the
                    -- declared maximum length of @String; SUBSTRING stops at
                    -- the end of the value. LEN() is avoided because it
                    -- ignores trailing spaces, which could make the length
                    -- negative and raise an error.
                    ELSE 4000
                END
            ) AS Data
        FROM Split
    ) AS sub
    WHERE sub.myRowId = @token
)
GO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment