gruber · November 4, 2024 20:04 · DanieleQ97 · Jan 6, 2021 · jonpincus · Feb 18, 2021
diff --git a/Liberal Regex Pattern for All URLs b/Liberal Regex Pattern for All URLs
 The regex patterns in this gist are intended to match any URLs,
 including "mailto:foo@example.com", "x-whatever://foo", etc. For a
 pattern that attempts only to match web URLs (http, https), see:
 https://gist.github.com/gruber/8891611


 # Single-line version of pattern:

 (?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))


 # Multi-line commented version of same pattern:

 (?xi)
 # Flags: x = free-spacing (whitespace and "#" comments in the pattern are
 # ignored), i = case-insensitive. Only capture 1 holds the URL; the inner
 # parenthesised groups in the balanced-paren branches also capture, but
 # only as a side effect of grouping.
 \b
 (							# Capture 1: entire matched URL
  (?:
    [a-z][\w-]+:				# URL scheme (a letter, then 1+ word chars or hyphens) and colon
    (?:
      /{1,3}						# 1-3 slashes
      |								#   or
      [a-z0-9%]						# Single letter or digit or '%'
      								# (Trying not to match e.g. "URI::Escape")
    )
    |							#   or
    www\d{0,3}[.]				# "www.", "www1.", "www2." … "www999."
    |							#   or
    [a-z0-9.\-]+[.][a-z]{2,4}/	# looks like domain name (last label 2-4 letters) followed by a slash
  )
  (?:							# One or more:
    [^\s()<>]+						# Run of non-space, non-()<>
    |								#   or
    \(([^\s()<>]+|(\([^\s()<>]+\)))*\)	# balanced parens, up to 2 levels
  )+
  (?:							# End with:
    \(([^\s()<>]+|(\([^\s()<>]+\)))*\)	# balanced parens, up to 2 levels
    |									#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]		# not a space or one of these punctuation chars
  )
 )
	The regex patterns in this gist are intended to match any URLs,
	including "mailto:foo@example.com", "x-whatever://foo", etc. For a
	pattern that attempts only to match web URLs (http, https), see:
	https://gist.github.com/gruber/8891611


	# Single-line version of pattern:

	(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))


	# Multi-line commented version of same pattern:

	(?xi)
	\b
	( # Capture 1: entire matched URL
	(?:
	[a-z][\w-]+: # URL protocol and colon
	(?:
	/{1,3} # 1-3 slashes
	| # or
	[a-z0-9%] # Single letter or digit or '%'
	# (Trying not to match e.g. "URI::Escape")
	)
	| # or
	www\d{0,3}[.] # "www.", "www1.", "www2." … "www999."
	| # or
	[a-z0-9.\-]+[.][a-z]{2,4}/ # looks like domain name followed by a slash
	)
	(?: # One or more:
	[^\s()<>]+ # Run of non-space, non-()<>
	| # or
	\(([^\s()<>]+|(\([^\s()<>]+\)))*\) # balanced parens, up to 2 levels
	)+
	(?: # End with:
	\(([^\s()<>]+|(\([^\s()<>]+\)))*\) # balanced parens, up to 2 levels
	| # or
	[^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
	)
	)