# Single-line version:
(?i)\b(https?:\/{1,3})?((?:(?:[\w.\-]+\.(?:[a-z]{2,13})|(?<=http:\/\/|https:\/\/)[\w.\-]+)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)(?:\w+(?:[.\-]+\w+)*\.(?:[a-z]{2,13})|(?:(?:[0-9](?!\d)|[1-9][0-9](?!\d)|1[0-9]{2}(?!\d)|2[0-4][0-9](?!\d)|25[0-5](?!\d))[.]?){4})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))*(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])?))
# Commented multi-line version:
(?xi)
\b
(https?:\/{1,3})?                        # Capture $1: (optional) URL scheme, colon, and slashes
(                                        # Capture $2: Entire matched URL (other than optional protocol://)
  (?:
    (?:
      [\w.\-]+\.                         # looks like domain name
      (?:[a-z]{2,13})                    # ending in common popular gTLDs
      |                                  #
      (?<=http:\/\/|https:\/\/)[\w.\-]+  # hostname preceded by http:// or https://
    )
    \/                                   # followed by a slash
  )
  (?:                                    # One or more:
    [^\s()<>{}\[\]]+                     # Run of non-space, non-()<>{}[]
    |                                    # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)   # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\)                          # balanced parens, non-recursive: (…)
  )+
  (?:                                    # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)   # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\)                          # balanced parens, non-recursive: (…)
    |                                    # or
    [^\s`!()\[\]{};:'\".,<>?«»“”‘’]      # not a space or one of these punct chars
  )
  |                                      # OR, the following to match naked domains:
  (?:
    (?<!@)                               # not preceded by a @, avoid matching foo@_gmail.com_
    (?:
      \w+
      (?:[.\-]+\w+)*
      \.                                 # avoid matching the last two parts of an email domain (e.g. the co.uk at the end of an address)
      (?:[a-z]{2,13})                    # ending in common popular gTLDs
      |                                  # or
      (?:(?:[0-9](?!\d)|[1-9][0-9](?!\d)|1[0-9]{2}(?!\d)|2[0-4][0-9](?!\d)|25[0-5](?!\d))[.]?){4}  # IPv4 address, as seen in https://stackoverflow.com/a/13166657/650558
    )
    \b
    \/?
    (?!@)                                # not succeeded by a @, avoid matching "foo.na" when it is the local part of an email address (foo.na@…)
    (?:                                  # One or more:
      [^\s()<>{}\[\]]+                   # Run of non-space, non-()<>{}[]
      |                                  # or
      \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
      |
      \([^\s]+?\)                        # balanced parens, non-recursive: (…)
    )*
    (?:                                  # End with:
      \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
      |
      \([^\s]+?\)                        # balanced parens, non-recursive: (…)
      |                                  # or
      [^\s`!()\[\]{};:'\".,<>?«»“”‘’]    # not a space or one of these punct chars
    )?
  )
)
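A minimal usage sketch in Python, assuming the third-party regex package (pip install regex): the stdlib re module rejects the variable-width lookbehind (?<=http:\/\/|https:\/\/), and the sample strings below are only illustrations.

import regex  # third-party; stdlib re rejects the variable-width lookbehind above

# The single-line pattern from the gist, verbatim:
URL_RE = regex.compile(r"""(?i)\b(https?:\/{1,3})?((?:(?:[\w.\-]+\.(?:[a-z]{2,13})|(?<=http:\/\/|https:\/\/)[\w.\-]+)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)(?:\w+(?:[.\-]+\w+)*\.(?:[a-z]{2,13})|(?:(?:[0-9](?!\d)|[1-9][0-9](?!\d)|1[0-9]{2}(?!\d)|2[0-4][0-9](?!\d)|25[0-5](?!\d))[.]?){4})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))*(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])?))""")

samples = [
    "Search at https://www.google.co.jp/search?q=hello today",
    "See http://hostname/whatever or just example.com for details",
]

for text in samples:
    for m in URL_RE.finditer(text):
        # Group 1 is the optional scheme://, group 2 is everything after it.
        print(repr(m.group(1)), repr(m.group(2)))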
Theoretically, would 63 be a reasonably future-proof max for the TLD, since that's the upper limit for the DNS? http://en.wikipedia.org/wiki/Domain_Name_System#cite_ref-rfc1034_1-2
Yes, it's 63 bytes for each "label" (64 with the length byte). A label in DNS is basically one of the parts separated by the dots.
www.google.com -> labels = {www, google, com}
The total maximum size of the domain, after punycoding, is 255 bytes, AFAIK.
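A quick sketch of the limits being discussed, as a plain-ASCII check that ignores IDNA/punycode encoding for brevity (the 255-byte wire-format limit corresponds to 253 characters of printable hostname):

def fits_dns(hostname: str) -> bool:
    # Each label (the parts between the dots) may be at most 63 bytes,
    # and the textual name as a whole at most 253 characters.
    labels = hostname.encode("ascii").split(b".")
    return all(0 < len(label) <= 63 for label in labels) and len(hostname) <= 253

print(fits_dns("www.google.com"))   # True  -- labels: www, google, com
print(fits_dns("a" * 64 + ".com"))  # False -- first label exceeds 63 bytes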
BTW, I think the regex I worked on is better ;) : https://gist.github.com/HenkPoley/8899766
There is a typo in the single-line version: a missing /) (lines 22-23 in the expanded version).
Here is the corrected single line:
(?xi)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.-]+\./)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.-][a-z0-9]+)*\.\b/?(?!@)))
I just made another change in the second option (for naked domains):
changed line 39 to
(?:\b
changed line 40 to
(?<![@.])
These two changes prevent double email suffixes from being identified as URLs (for example, addresses ending in .co.il and .co.uk were returning co.il and co.uk as results).
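To see why the stricter lookbehind helps, here is a toy demonstration with a heavily simplified stand-in pattern (not the gist's regex; it only isolates the (?<!@) vs. (?<![@.]) difference):

import re

text = "email me at someone@mail.co.il please"

# With only (?<!@), a match can still start right after a dot inside the address:
loose = re.compile(r"\b(?<!@)[a-z0-9]+\.[a-z]{2,13}\b")
print(loose.findall(text))   # ['co.il']

# With (?<![@.]), a match can start neither after '@' nor after '.':
strict = re.compile(r"\b(?<![@.])[a-z0-9]+\.[a-z]{2,13}\b")
print(strict.findall(text))  # []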
@mgmort I incorporated your bug fix from May 11, thanks. I tried out your proposals from Apr 6, but they result in overmatching, e.g. I was seeing matches on random words like "this" in the middle of sentences.
UPDATE: Modified so that naked URLs without a protocol prefix are now capable of matching more advanced URLs. Also escaped / as \/ and " as \" so it's easier to copy and paste this regex into more code.
I also just backed out an accidental edit I made 12 days ago that re-ordered the gTLD code. Oops!
Just made a tweak so that this regex can match IP address URLs, such as http://127.0.0.1/
BREAKING CHANGE
$1 now returns the scheme:// (e.g. https:// or http://), and $2 returns the remainder of the URL (everything after the scheme://). Previously $1 returned the entire URL, so if you're swapping this updated regex into your code, you'll want to change references of "$1" to "$1$2".
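As a quick migration illustration, here is a sketch using a simplified stand-in pattern (scheme in group 1, remainder in group 2; it is not the full regex above):

import re

toy = re.compile(r"\b(https?://)?((?:[\w-]+\.)+[a-z]{2,13}(?:/[^\s]*)?)", re.I)
text = "Docs: https://example.com/page and example.org"

# Replacement templates that used "$1" for the whole URL now need both groups:
print(toy.sub(r"<\1\2>", text))
# -> Docs: <https://example.com/page> and <example.org>
# (In Python 3.5+ an unmatched \1 is substituted as an empty string.)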
Other Improvements
- Returning the scheme:// separately from the rest of the URL, which can be useful (see above)
- IPv4 addresses (I'll likely be dead and buried before anyone wants to link an IPv6 URL)
- Bare (single) hostnames with no periods, e.g. http://hostname/whatever, now work
- Naked domain URLs, e.g. domain.com (this was somewhat working, but should now work better)
- Internationalized domains, i.e. domains beginning with xn--
Line 26 should probably be )*
@rmalouf Can you give me an example of where this fails to identify a URL without that change to line 26? (I hate to make adjustments based on hypotheticals because this dang regex is so complex.)
License for this code?
Thanks!
Public domain is fine by me. I published this to be freely used.
❤️
Thanks!
Thank you both for the prompt reply!
It seems like the regex would still have a catastrophic backtracking issue when a string has multiple trailing punctuation characters:
e.g. https://www.google.co.jp/search?q=hello&client=safari?????????????
Check https://regex101.com
Possibly, but I run it in an environment (.NET) where I can specify a timeout for my regex, to handle edge cases like this that have never come up for me.
That being said, if you solve the backtracking issue, definitely let me know.
To put it in context: I just tested your URL on regex101. When I end your URL with 12 question marks, it executes in TWELVE MILLISECONDS. When I add the 13th question mark, regex101 complains about catastrophic backtracking...
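For anyone who wants to reproduce that exponential blow-up locally rather than on regex101, here is a toy demonstration (a classic nested-quantifier pattern, not the URL regex itself; the time roughly doubles with each extra character, just like the question marks above):

import re
import time

toy = re.compile(r"(a+)+$")        # nested quantifiers backtrack catastrophically on failure
for n in range(16, 23):
    subject = "a" * n + "!"        # the trailing "!" guarantees the match fails
    start = time.perf_counter()
    toy.search(subject)
    print(n, f"{time.perf_counter() - start:.3f}s")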
"But is it illegal though."
Any source on the 2–13 character limit for the TLD?