Created
August 2, 2024 12:37
-
-
Save Taytay/5b516f7caf22483d12c6617f621d7e87 to your computer and use it in GitHub Desktop.
Snowflake enhanced regex_substr. This allows you to use python style regexes in Snowflake.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Originally found here: https://github.com/sfc-gh-gpavlik/SnowflakeUDFs/blob/main/RegularExpressions/regexp2.sql | |
-- Edited to use python and group_num by taytay: https://github.com/taytay/ | |
/******************************************************************************************************** | |
* * | |
* Snowflake Regular Expression Extensions * | |
* * | |
* Copyright (c) 2020 Snowflake Computing Inc. All rights reserved. * | |
* * | |
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in * | |
* compliance with the License. You may obtain a copy of the License at * | |
* * | |
* http://www.apache.org/licenses/LICENSE-2.0 * | |
* * | |
* Unless required by applicable law or agreed to in writing, software distributed under the License * | |
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * | |
* implied. See the License for the specific language governing permissions and limitations under the * | |
* License. * | |
* * | |
* Copyright (c) 2020, 2021, 2022, 2023 Snowflake Computing Inc. All rights reserved. * | |
* * | |
********************************************************************************************************/ | |
CREATE OR REPLACE FUNCTION REGEXP_SUBSTR2( | |
SUBJECT STRING, | |
PATTERN STRING, | |
POSITION FLOAT, | |
OCCURRENCE FLOAT, | |
PARAMETERS STRING, | |
GROUP_NUM FLOAT | |
) | |
RETURNS STRING | |
LANGUAGE PYTHON | |
RUNTIME_VERSION = '3.10' | |
PACKAGES = ('pandas', 'nltk') | |
HANDLER = 'regexp_substr2' | |
AS $$ | |
import re | |
from re import Match, Pattern | |
from typing import Optional | |
import pandas as pd | |
from _snowflake import vectorized | |
def compile_pattern(pattern: str, parameters: str) -> Pattern[str]: | |
regex_flags = 0 | |
if parameters is not None and parameters != '': | |
if 'i' in parameters: | |
regex_flags |= re.IGNORECASE | |
if 'm' in parameters: | |
regex_flags |= re.MULTILINE | |
if 's' in parameters: | |
regex_flags |= re.DOTALL | |
if 'u' in parameters: | |
regex_flags |= re.UNICODE | |
if 'x' in parameters: | |
regex_flags |= re.VERBOSE | |
return re.compile(pattern, regex_flags) | |
def process_row(subject: str, pattern_compiled: Pattern[str], position: float, occurrence: float, parameters: str, group_num: float) -> Optional[str]: | |
# Slice the subject string from the given position | |
str_part = subject[position:] | |
match : Match[str] | None = None | |
# Find the nth occurrence | |
for _ in range(occurrence): | |
match = pattern_compiled.search(str_part) | |
if not match: | |
return "" | |
str_part = str_part[match.start() + 1:] | |
if match is not None: | |
if 'e' in parameters: | |
group_count = len(match.groups()) | |
if group_count > 0: | |
group_index = int(group_num) if group_num is not None else 1 | |
if 1 <= group_index <= group_count: | |
return match.group(group_index) | |
return match.group(0) | |
else: | |
return "" | |
@vectorized(input=pd.DataFrame) | |
def regexp_substr2(params: pd.DataFrame) -> pd.Series: | |
subject_series = params[0] | |
pattern = params[1][0] | |
position = params[2][0] | |
occurrence = params[3][0] | |
parameters = params[4][0] | |
group_num = params[5][0] | |
# Adjust parameters if necessary | |
occurrence = int(occurrence) | |
if occurrence < 1: | |
occurrence = 1 | |
position = int(position) - 1 | |
if position < 0: | |
position = 0 | |
if group_num is None or group_num < 1: | |
group_num = 1 | |
else: | |
group_num = int(group_num) | |
if group_num < 1: | |
group_num = 1 | |
pattern_compiled = compile_pattern(pattern, parameters) | |
return subject_series.apply( | |
lambda row: process_row( | |
row, pattern_compiled, position, occurrence, parameters, group_num | |
) | |
) | |
$$; | |
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float, PARAMETERS string) | |
returns string | |
language SQL | |
as | |
$$ | |
regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, PARAMETERS, 0) | |
$$; | |
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float) | |
returns string | |
language SQL | |
as | |
$$ | |
regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, '', 0) | |
$$; | |
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float) | |
returns string | |
language SQL | |
as | |
$$ | |
regexp_substr2(SUBJECT, PATTERN, POSITION, 1, '', 0) | |
$$; | |
create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string) | |
returns string | |
language SQL | |
as | |
$$ | |
regexp_substr2(SUBJECT, PATTERN, 1, 1, '', 0) | |
$$; | |
-- Returns the second group, which is ABCDEF | |
SELECT REGEXP_SUBSTR2('1234567890ABCDEF', $$ | |
(\d+) | |
(\D+).* # Look, I've got comments in my regex! | |
$$, 1, 1, 'eix', 2); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment