Taytay · August 2, 2024 12:37
diff --git a/regexp_substr2.sql b/regexp_substr2.sql
 -- Originally found here: https://github.com/sfc-gh-gpavlik/SnowflakeUDFs/blob/main/RegularExpressions/regexp2.sql
 -- Edited to use python and group_num by taytay: https://github.com/taytay/

 /********************************************************************************************************
 *                                                                                                       *
 *                             Snowflake Regular Expression Extensions                                   *
 *                                                                                                       *
 *  Copyright (c) 2020 Snowflake Computing Inc. All rights reserved.                                     *
 *                                                                                                       *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in  *
 *  compliance with the License. You may obtain a copy of the License at                                 *
 *                                                                                                       *
 *                             http://www.apache.org/licenses/LICENSE-2.0                                *
 *                                                                                                       *
 *  Unless required by applicable law or agreed to in writing, software distributed under the License    *
 *  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  *
 *  implied. See the License for the specific language governing permissions and limitations under the   *
 *  License.                                                                                             *
 *                                                                                                       *
 *  Copyright (c) 2020, 2021, 2022, 2023 Snowflake Computing Inc. All rights reserved.                   *
 *                                                                                                       *
 ********************************************************************************************************/

 CREATE OR REPLACE FUNCTION REGEXP_SUBSTR2(
  SUBJECT STRING, 
  PATTERN STRING, 
  POSITION FLOAT, 
  OCCURRENCE FLOAT, 
  PARAMETERS STRING, 
  GROUP_NUM FLOAT
 )
 RETURNS STRING
 LANGUAGE PYTHON
 RUNTIME_VERSION = '3.10'
 PACKAGES = ('pandas', 'nltk')
 HANDLER = 'regexp_substr2'
 AS $$
 import re
 from re import Match, Pattern
 from typing import Optional
 import pandas as pd
 from _snowflake import vectorized


 def compile_pattern(pattern: str, parameters: str) -> Pattern[str]:
    regex_flags = 0
    if parameters is not None and parameters != '':
        if 'i' in parameters:
            regex_flags |= re.IGNORECASE
        if 'm' in parameters:
            regex_flags |= re.MULTILINE
        if 's' in parameters:
            regex_flags |= re.DOTALL
        if 'u' in parameters:
            regex_flags |= re.UNICODE
        if 'x' in parameters:
            regex_flags |= re.VERBOSE
    return re.compile(pattern, regex_flags)

 def process_row(subject: str, pattern_compiled: Pattern[str], position: float, occurrence: float, parameters: str, group_num: float) -> Optional[str]:
    # Slice the subject string from the given position
    str_part = subject[position:]

    match : Match[str] | None = None
    # Find the nth occurrence
    for _ in range(occurrence):
        match = pattern_compiled.search(str_part)
        if not match:
            return ""
        str_part = str_part[match.start() + 1:]

    if match is not None:
        if 'e' in parameters:
            group_count = len(match.groups())
            if group_count > 0:
                group_index = int(group_num) if group_num is not None else 1
                if 1 <= group_index <= group_count:
                    return match.group(group_index)
        return match.group(0)
    else:
        return ""


 @vectorized(input=pd.DataFrame)
 def regexp_substr2(params: pd.DataFrame) -> pd.Series:
    subject_series = params[0]
    pattern = params[1][0]
    position = params[2][0]
    occurrence = params[3][0]
    parameters = params[4][0]
    group_num = params[5][0]

    # Adjust parameters if necessary
    occurrence = int(occurrence)
    
    if occurrence < 1:
        occurrence = 1

    position = int(position) - 1
    if position < 0:
        position = 0

    if group_num is None or group_num < 1:
      group_num = 1
    else:
      group_num = int(group_num)
      if group_num < 1:
          group_num = 1

    pattern_compiled = compile_pattern(pattern, parameters)

    return subject_series.apply(
        lambda row: process_row(
            row, pattern_compiled, position, occurrence, parameters, group_num
        )
    )
 $$;


 create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float, PARAMETERS string)
 returns string
 language SQL
 as
 $$
    regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, PARAMETERS, 0)
 $$;

 create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float)
 returns string
 language SQL
 as
 $$
    regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, '', 0)
 $$;

 create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float)
 returns string
 language SQL
 as
 $$
    regexp_substr2(SUBJECT, PATTERN, POSITION, 1, '', 0)
 $$;

 create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string)
 returns string
 language SQL
 as
 $$
    regexp_substr2(SUBJECT, PATTERN, 1, 1, '', 0)
 $$;

 -- Returns the second group, which is ABCDEF
 SELECT REGEXP_SUBSTR2('1234567890ABCDEF', $$
        (\d+)
        (\D+).*   # Look, I've got comments in my regex!
        $$, 1, 1, 'eix', 2);
	-- Originally found here: https://github.com/sfc-gh-gpavlik/SnowflakeUDFs/blob/main/RegularExpressions/regexp2.sql
	-- Edited to use python and group_num by taytay: https://github.com/taytay/

	/********************************************************************************************************
	* *
	* Snowflake Regular Expression Extensions *
	* *
	* Copyright (c) 2020 Snowflake Computing Inc. All rights reserved. *
	* *
	* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in *
	* compliance with the License. You may obtain a copy of the License at *
	* *
	* http://www.apache.org/licenses/LICENSE-2.0 *
	* *
	* Unless required by applicable law or agreed to in writing, software distributed under the License *
	* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or *
	* implied. See the License for the specific language governing permissions and limitations under the *
	* License. *
	* *
	* Copyright (c) 2020, 2021, 2022, 2023 Snowflake Computing Inc. All rights reserved. *
	* *
	********************************************************************************************************/

	CREATE OR REPLACE FUNCTION REGEXP_SUBSTR2(
	SUBJECT STRING,
	PATTERN STRING,
	POSITION FLOAT,
	OCCURRENCE FLOAT,
	PARAMETERS STRING,
	GROUP_NUM FLOAT
	)
	RETURNS STRING
	LANGUAGE PYTHON
	RUNTIME_VERSION = '3.10'
	PACKAGES = ('pandas', 'nltk')
	HANDLER = 'regexp_substr2'
	AS $$
	import re
	from re import Match, Pattern
	from typing import Optional
	import pandas as pd
	from _snowflake import vectorized


	def compile_pattern(pattern: str, parameters: str) -> Pattern[str]:
	regex_flags = 0
	if parameters is not None and parameters != '':
	if 'i' in parameters:
	regex_flags \|= re.IGNORECASE
	if 'm' in parameters:
	regex_flags \|= re.MULTILINE
	if 's' in parameters:
	regex_flags \|= re.DOTALL
	if 'u' in parameters:
	regex_flags \|= re.UNICODE
	if 'x' in parameters:
	regex_flags \|= re.VERBOSE
	return re.compile(pattern, regex_flags)

	def process_row(subject: str, pattern_compiled: Pattern[str], position: float, occurrence: float, parameters: str, group_num: float) -> Optional[str]:
	# Slice the subject string from the given position
	str_part = subject[position:]

	match : Match[str] \| None = None
	# Find the nth occurrence
	for _ in range(occurrence):
	match = pattern_compiled.search(str_part)
	if not match:
	return ""
	str_part = str_part[match.start() + 1:]

	if match is not None:
	if 'e' in parameters:
	group_count = len(match.groups())
	if group_count > 0:
	group_index = int(group_num) if group_num is not None else 1
	if 1 <= group_index <= group_count:
	return match.group(group_index)
	return match.group(0)
	else:
	return ""


	@vectorized(input=pd.DataFrame)
	def regexp_substr2(params: pd.DataFrame) -> pd.Series:
	subject_series = params[0]
	pattern = params[1][0]
	position = params[2][0]
	occurrence = params[3][0]
	parameters = params[4][0]
	group_num = params[5][0]

	# Adjust parameters if necessary
	occurrence = int(occurrence)

	if occurrence < 1:
	occurrence = 1

	position = int(position) - 1
	if position < 0:
	position = 0

	if group_num is None or group_num < 1:
	group_num = 1
	else:
	group_num = int(group_num)
	if group_num < 1:
	group_num = 1

	pattern_compiled = compile_pattern(pattern, parameters)

	return subject_series.apply(
	lambda row: process_row(
	row, pattern_compiled, position, occurrence, parameters, group_num
	)
	)
	$$;


	create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float, PARAMETERS string)
	returns string
	language SQL
	as
	$$
	regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, PARAMETERS, 0)
	$$;

	create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float, OCCURRENCE float)
	returns string
	language SQL
	as
	$$
	regexp_substr2(SUBJECT, PATTERN, POSITION, OCCURRENCE, '', 0)
	$$;

	create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string, POSITION float)
	returns string
	language SQL
	as
	$$
	regexp_substr2(SUBJECT, PATTERN, POSITION, 1, '', 0)
	$$;

	create or replace function REGEXP_SUBSTR2(SUBJECT string, PATTERN string)
	returns string
	language SQL
	as
	$$
	regexp_substr2(SUBJECT, PATTERN, 1, 1, '', 0)
	$$;

	-- Returns the second group, which is ABCDEF
	SELECT REGEXP_SUBSTR2('1234567890ABCDEF', $$
	(\d+)
	(\D+).* # Look, I've got comments in my regex!
	$$, 1, 1, 'eix', 2);