Created
June 21, 2022 16:26
-
-
Save sbutterfield/952dd3f702fcb54daebd2405e8eb4cc9 to your computer and use it in GitHub Desktop.
apex uri utils for matching and parsing URI & URL groups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @author: Shawn Butterfield
 * Utility class for URI (URL/URN/Email) data validation, parsing and matching.
 * Methods for pattern extraction defined by URI standard syntax, here:
 * http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax
 *
 * Several public methods are still unimplemented stubs that return null; each one is
 * marked as such in its own comment.
 */
public class URIUtils {

    /** Matches a leading URI scheme followed by a colon, e.g. "http:" or "mailto:". */
    private static final String REGEX_URL_SCHEME = '^([a-zA-Z][a-zA-Z0-9\\+\\.\\-]+):';

    /** Compiled once; the scheme regex is a constant so there is no reason to recompile per call. */
    private static final Pattern SCHEME_PATTERN = Pattern.compile(REGEX_URL_SCHEME);

    /** Name of the static resource that stores the URI/email regular expression. */
    private static final String URI_EMAIL_PATTERN_RESOURCE = 'REGEX_URI_EMAIL_PATTERN';

    /** Characters that can screw up an api request due to encoding; replaced wherever they occur. */
    private static final String[] UNSAFE_CHARS = new String[] { ',', '.', '&', '_', '0' };

    /** Fragments that are only removed when they are the very end of the string. */
    private static final String[] UNSAFE_SUFFIXES = new String[] { 'Inc', 'Corp', 'Co', 'Ltd' };

    // Lazily populated by getUriEmailPattern() so that the SOQL query behind it only runs
    // for callers that actually need domain parsing.
    private static Pattern uriEmailPattern;
    private static Boolean uriEmailPatternLoaded = false;

    /**
     * Not implemented yet: always returns null.
     * @param input the candidate email address (currently ignored)
     * @return null
     */
    public static Boolean isValidEmail(String input) {
        return null;
    }

    /**
     * Strips characters and trailing fragments that are known to break API requests.
     *
     * Every occurrence of an entry in UNSAFE_CHARS is replaced with a space. Afterwards each
     * entry of UNSAFE_SUFFIXES is removed, but only when the (trimmed) string ends with it;
     * occurrences elsewhere in the string are left alone. The suffix list is walked once, in
     * order, and the comparison is case-sensitive.
     *
     * @param input the raw string; may be null
     * @return the cleaned, trimmed string, or null when input is null
     */
    public static String safeEncode(String input) {
        if (input == null) {
            return input;
        }
        String result = input;
        for (String unsafeChar : UNSAFE_CHARS) {
            result = result.replace(unsafeChar, ' ');
        }
        // Trim first: a replaced trailing character (e.g. the dot in "Inc.") would otherwise
        // leave a space that hides the suffix from endsWith().
        result = result.trim();
        for (String suffix : UNSAFE_SUFFIXES) {
            if (result.endsWith(suffix)) {
                // removeEnd only touches the trailing occurrence, unlike replace().
                result = result.removeEnd(suffix).trim();
            }
        }
        return result;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the candidate URI (currently ignored)
     * @return null
     */
    public static Boolean isValidUri(String input) {
        return null;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the URI to inspect (currently ignored)
     * @return null
     */
    public static Boolean hasSubDomain(String input) {
        return null;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the URI to inspect (currently ignored)
     * @return null
     */
    public static String parseSchemeFromUri(String input) {
        return null;
    }

    /**
     * Prefixes "http://" to a URL that does not already start with a scheme.
     *
     * A scheme is anything matched by REGEX_URL_SCHEME at the start of the string, so
     * "https://x", "ftp://x" and "mailto:x" are all returned untouched.
     * NOTE(review): "host:8080" also looks like a scheme to that regex and is returned as-is.
     *
     * @param url the URL to normalise; may be null
     * @return url unchanged when it is null or already has a scheme, otherwise 'http://' + url
     */
    public static String ensureStartsWithHttp(String url) {
        if (url == null) {
            return url;
        }
        return SCHEME_PATTERN.matcher(url).find() ? url : 'http://' + url;
    }

    /**
     * Extracts the registrable domain (e.g. "salesforce.com", "salesforce.co.uk") from a URI
     * or an email address, using the regex stored in the REGEX_URI_EMAIL_PATTERN static resource.
     *
     * @param input a URI or email address; may be null or blank
     * @return the lower-cased domain when the input matches the pattern;
     *         null when the input does not match the pattern at all;
     *         the input itself when it is blank, when the pattern is unavailable, or when an
     *         exception is raised while parsing (in that case the returned value may already
     *         have been truncated at its last "/" by the re-submission step below)
     */
    public static String parseDomain(String input) {
        if (String.isBlank(input)) {
            return input;
        }
        String candidate = input;
        try {
            System.debug('PD - INPUT: ' + candidate);
            Pattern uriPattern = getUriEmailPattern();
            if (uriPattern == null) {
                System.debug(LoggingLevel.ERROR, 'An error occurred while parsing domain, returning to input stream: no URI pattern available');
                return candidate;
            }
            Matcher m = uriPattern.matcher(candidate);
            // Every branch but the first returns; the loop exists only so that a truncated
            // input can be re-submitted to the matcher.
            while (m.matches()) {
                String whole = m.group(0);
                Boolean hasAt = whole.contains('@');

                /* Fix inputs with "@" character in a parameter that can screw up parsing, remove anything
                 * after the last occurrence of "/" char.
                 *   Group4 should be null
                 *   Group2 should contain at least one dot character if this is still a URI
                 * A given input may hit this logic more than once before being declared valid or invalid.
                 * NOTE(review): a null Group2 raises here and ends up in the catch block below,
                 * exactly as before; the regex is not visible from this class, so that is left as-is.
                 */
                if (hasAt
                        && m.group(4) == null
                        && m.group(2).countMatches('.') >= 1
                        && whole.contains('/')) {
                    candidate = whole.substringBeforeLast('/');
                    System.debug('PD - INPUT INSIDE: ' + candidate);
                    m = m.reset(candidate);
                }
                // An input that carries a valid email address (and possibly other characteristics).
                else if (hasAt
                        && m.group(2) != null
                        && m.group(5) != null
                        && m.group(6) != null) {
                    // Handle ccSLD's and sub-domains in email addresses
                    if (whole.countMatches('.') >= 2
                            || m.group(5).length() <= 2
                            || m.group(6).length() <= 2) {
                        System.debug('PD - Group 0: ' + whole);
                        return parseOutCcSld(m);
                    }
                    return joinDomain(m);
                }
                // Sub-domains and ccSLD's in a URI.
                else if (whole.countMatches('.') > 2
                        || m.group(5).length() <= 2
                        || m.group(6).length() <= 2) {
                    return parseOutCcSld(m);
                }
                // Final branch, this just assumes a clean URI and extracts the domain
                else {
                    return joinDomain(m);
                }
            }
        }
        catch (Exception e) {
            System.debug(LoggingLevel.ERROR, 'An error occurred while parsing domain, returning to input stream ' + e.getStackTraceString());
            return candidate;
        }
        // The pattern did not match the input at all.
        return null;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the URI to inspect (currently ignored)
     * @return null
     */
    public static String parseDomainWithSubdomain(String input) {
        return null;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the URI to inspect (currently ignored)
     * @return null
     */
    public static String parseParameters(String input) {
        return null;
    }

    /**
     * Not implemented yet: always returns null.
     * @param input the URI to inspect (currently ignored)
     * @param n     the parameter position (currently ignored)
     * @return null
     */
    public static String parseParameter(String input, Integer n) {
        return null;
    }

    /** Builds "group5.group6" from the current match, lower-cased. */
    private static String joinDomain(Matcher m) {
        return (m.group(5) + '.' + m.group(6)).toLowerCase();
    }

    /**
     * Builds the domain for a match that may contain a sub-domain or a country-code
     * second-level domain.
     *
     * When Groups 4, 5 and 6 are all present and Group 5 is at most 3 characters long, Group 5
     * is treated as a ccSLD ("co" in "salesforce.co.uk") and the last label of Group 4 is
     * prepended. Otherwise the result is just Group5.Group6 ("www.salesforce.com" gives
     * "salesforce.com").
     * NOTE(review): the length heuristic also fires for genuine three-letter domains
     * (Group 5 = "ibm"), which then keep the label in front of them - confirm whether that
     * is acceptable.
     *
     * @param m a matcher positioned on a successful match of the URI/email pattern
     * @return the lower-cased domain
     */
    private static String parseOutCcSld(Matcher m) {
        String prefix = '';
        if (m.group(4) != null
                && m.group(5) != null
                && m.group(6) != null
                && m.group(5).length() <= 3) {
            String leading = m.group(4);
            prefix = (leading.contains('.') ? leading.substringAfterLast('.') : leading) + '.';
        }
        return (prefix + m.group(5) + '.' + m.group(6)).toLowerCase();
    }

    /**
     * Returns the compiled URI/email pattern, loading and compiling it on first use.
     * The outcome (including "not available") is remembered for the rest of the transaction
     * so the static resource is queried at most once.
     *
     * @return the compiled pattern, or null when the regex could not be loaded
     */
    private static Pattern getUriEmailPattern() {
        if (!uriEmailPatternLoaded) {
            uriEmailPatternLoaded = true;
            String regex = getURLPatternString();
            if (regex != null) {
                uriEmailPattern = Pattern.compile(regex);
            }
        }
        return uriEmailPattern;
    }

    /**
     * Current URL Regex Pattern builds a match using 11 different capture groups which cover:
     *   Scheme/Protocol: http, https, ftp(s), uri, afp, mailto, service, email address (user@example.com) etc.
     *   Fully Qualified Scheme: http://, https://, afp://
     *   Authority/Domain: www.salesforce.com, salesforce.com
     *   TLD: .com, .sk
     *   ccSLDs: .co.uk, .com.br etc
     *   IP Address (v4)
     *   Port Number (ICANN)
     *   Path, Sub Path and File Name: /pub/Main.aspx
     *   Query: ?adp=1
     *   Fragment: .aspx, .asp, %$query
     * https://www.usb.regexlib.com:6553/Search.aspx?query=1#%Metadata >> yields:
     *   Group0: [INPUT] https://www.usb.regexlib.com:6553/Search.aspx?query=1#%Metadata
     *   Group1: https
     *   Group2: [No Match]
     *   Group3: [No Match]
     *   Group4: www.usb
     *   Group5: regexlib
     *   Group6: com
     *   Group7: 6553
     *   Group8: [No Match]
     *   Group9: /Search.aspx
     *   Group10: query=1
     *   Group11: %Metadata
     *
     * Reads that regex from the static resource named REGEX_URI_EMAIL_PATTERN.
     *
     * @return the regex as a string, or null when the resource is missing, empty or unreadable
     */
    private static String getURLPatternString() {
        String result;
        try {
            // Query into a list so a missing resource is an ordinary empty result rather
            // than a QueryException.
            List<StaticResource> resources = [
                SELECT Body
                FROM StaticResource
                WHERE Name = :URI_EMAIL_PATTERN_RESOURCE
                LIMIT 1
            ];
            if (resources.isEmpty()) {
                System.debug(LoggingLevel.ERROR, 'Unable to retrieve REGEX_URI_EMAIL_PATTERN as a string from static resources.');
            }
            else {
                Blob content = resources[0].Body;
                if (content != null && content.size() > 0) {
                    result = content.toString();
                }
            }
        }
        catch (Exception e) {
            System.debug(LoggingLevel.ERROR, 'Unable to retrieve REGEX_URI_EMAIL_PATTERN as a string from static resources.' + e);
        }
        return result;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment