Skip to content

Instantly share code, notes, and snippets.

@jasonlvhit
Last active August 29, 2015 13:57
Show Gist options
  • Save jasonlvhit/9568924 to your computer and use it in GitHub Desktop.
Save jasonlvhit/9568924 to your computer and use it in GitHub Desktop.
URL regex representation
import re
# Country-code top-level domains, upper-case (the patterns built from this
# list are compiled case-insensitively).
cctlds = [
    'AC', 'AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AN', 'AO', 'AQ', 'AR',
    'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD',
    'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', 'BR',
    'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD',
    'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', 'CU', 'CV',
    'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM',
    'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'EU', 'FI', 'FJ',
    'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF',
    'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', 'GT', 'GU',
    'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', 'ID',
    'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', 'JE', 'JM', 'JO',
    'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR',
    'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU',
    'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH',
    'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV',
    'MW', 'MX', 'MY', 'MZ', 'NA', 'NC', 'NE', 'NF', 'NG',
    'NI', 'NL', 'NO', 'NP', 'NR', 'NU', 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG',
    'PH', 'PK', 'PL', 'PM', 'PN', 'PR', 'PS', 'PT', 'PW',
    'PY', 'QA', 'RE', 'RO', 'RS', 'RU', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE',
    'SG', 'SH', 'SI', 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO',
    'SR', 'ST', 'SU', 'SV', 'SX', 'SY', 'SZ', 'TC', 'TD', 'TF', 'TG', 'TH',
    'TJ', 'TK', 'TL', 'TM', 'TN', 'TO', 'TP', 'TR', 'TT',
    'TV', 'TW', 'TZ', 'UA', 'UG', 'UK', 'UM', 'US', 'UY', 'UZ', 'VA', 'VC',
    'VE', 'VG', 'VI', 'VN', 'VU', 'WF', 'WS', 'YE', 'YT',
    'ZA', 'ZM', 'ZW',
]

# Generic top-level domains, upper-case.
gtlds = [
    'AERO', 'ARPA', 'ASIA', 'BIZ', 'CAT', 'COM', 'COOP', 'EDU', 'GOV',
    'INFO', 'INT', 'JOBS', 'MIL', 'MOBI', 'MUSEUM', 'NAME',
    'NET', 'ORG', 'PRO', 'TEL', 'TRAVEL',
]

# Regex alternation body ("A|B|C", no surrounding group) covering every TLD.
# Alternation takes the first alternative that matches, not the longest, so
# the entries are emitted longest-first: otherwise 'CO' would win over 'COM'
# and 'IN' over 'INFO'.  The sort is stable, so equal-length entries keep
# their list order.
tlds_re = '|'.join(sorted(cctlds + gtlds, key=len, reverse=True))
# Matches a URL whose host is a domain name ending in one of the TLDs listed
# in ``tlds_re``.  The scheme, port and path are all optional.
#
# Groups: ``url``/1 = whole match, 2 = scheme, 3 = labels before the TLD
# (including the trailing dot), 4 = TLD, 5 = ":port", 6 = path.
#
# The pattern is compiled with re.VERBOSE, so whitespace and "#" comments
# inside it are ignored by the regex engine.
p_re = re.compile(
    r'''(?P<url>
        (http://|https://|ftp://)?     # scheme, optional
        ((?:[a-z\d][a-z\d-]*\.)+)      # one or more dot-terminated labels
        (%s)                           # top-level domain
        (?![a-z\d-])                   # TLD must be the whole final label,
                                       # so "example.com" is not cut short
                                       # to "example.co"
        (:\d{1,5})?                    # port, optional
        (/\S*)?                        # path, optional
    )''' % tlds_re,
    re.IGNORECASE | re.VERBOSE)
# Matches a URL whose host is a dotted-quad IPv4 address.  Unlike a domain
# URL, the scheme is required here; port and path are optional.
#
# Groups: ``url``/1 = whole match, 2 = scheme, 3 = IPv4 address,
# 4 = ":port", 5 = path.
#
# The pattern is compiled with re.VERBOSE, so whitespace and "#" comments
# inside it are ignored by the regex engine.
ip_re = re.compile(
    r'''(?P<url>
        (http://|https://|ftp://)      # protocol, required
        (                              # IPv4 address, each octet 0-255
            (?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}
            (?:25[0-5]|2[0-4]\d|[01]?\d\d?)
        )
        (?!\d)                         # never stop in the middle of a number
        (:\d{1,5})?                    # port, optional
        (/\S*)?                        # path, optional
    )''',
    re.IGNORECASE | re.VERBOSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment