Skip to content

Instantly share code, notes, and snippets.

@drinks
Created November 21, 2011 16:32
Show Gist options
  • Save drinks/1383155 to your computer and use it in GitHub Desktop.
Save drinks/1383155 to your computer and use it in GitHub Desktop.
Python Regex (?x) fail
from nltk import regexp_tokenize
# Tokenizer pattern in verbose mode ((?x)): one alternative per line, each
# followed by an inline regex comment.
# The spaces inside the first (bill number) alternative are NOT escaped, so
# verbose mode drops them from the pattern.  This is the variant that fails
# to keep 'H.R. 2354' together in the demonstration further down.
regex1 = r'''(?x)
(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+ # Bills
| ([A-Z]\.)+ # Abbreviations (U.S.A., etc.)
| ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.)
| (Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc.
| \d*\.\d+ # Numbers with decimal points.
| \d\d?:\d\d # Times.
| \$?[,0-9]+ # Numbers with thousands separators.
| (((a|A)|(p|P))\.(m|M)\.) # a.m., p.m., A.M., P.M.
| \w+((-|')\w+)* # Words with optional internal hyphens.
| \$?\d+(\.\d+)?%? # Currency and percentages.
| \.\.\. # Ellipsis
| [][.,;"'?():-_`]
'''
# The same alternatives as regex1 collapsed onto one line, still with the
# (?x) flag, so the unescaped spaces in the bill alternative are still ignored.
regex2 = r'''(?x)(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''
# The same one-line pattern as regex2 but WITHOUT the (?x) flag: spaces in the
# pattern are matched literally, so a bill such as 'H.R. 2354' stays one token.
regex3 = r'''(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''
# Tokenize the same sample with each of the three pattern variants.
# Only regex3 (no verbose flag) keeps the bill number 'H.R. 2354' together;
# with (?x) the unescaped spaces in regex1/regex2 are dropped from the pattern.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(regexp_tokenize('blah H.R. 2354 blah', regex1))
# prints ['blah', 'H.R.', '2354', 'blah']
print(regexp_tokenize('blah H.R. 2354 blah', regex2))
# prints ['blah', 'H.R.', '2354', 'blah']
print(regexp_tokenize('blah H.R. 2354 blah', regex3))
# prints ['blah', 'H.R. 2354', 'blah']
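'''
Turns out you have to escape literal whitespace in verbose mode: under (?x),
unescaped whitespace in the pattern is ignored, so every space that must be
matched has to be written as backslash-space (or as a character class, [ ]).
'''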
# Corrected verbose-mode tokenizer pattern.
#
# * Every literal space in the bill alternative is written as backslash-space,
#   because unescaped whitespace is ignored under (?x).
# * All groups are non-capturing, so findall-based tokenizers return the whole
#   matched token rather than the contents of the groups.
# * In the final punctuation class the hyphen is escaped and listed on its
#   own.  The previous ':-_' was parsed as a range from ':' to '_', which
#   matched characters such as '<', '=', '>', '@' and '^' and never matched
#   a literal '-'.
#
# NOTE(review): the "Currency and percentages" alternative can never be
# selected, because anything it could match starts with an optional '$' and a
# digit, which the earlier thousands-separator alternative already matches.
# It is left in place so that number tokenization is not changed here.
regex = r'''(?x)
(?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
| (?:[A-Z]\.)+ # Abbreviations (U.S.A., etc.)
| [A-Z]+\&[A-Z]+ # Internal ampersands (AT&T, etc.)
| (?:Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc.
| \d*\.\d+ # Numbers with decimal points.
| \d\d?:\d\d # Times.
| \$?[,0-9]+ # Numbers with thousands separators.
| (?:a|A|p|P)\.(?:m|M)\. # a.m., p.m., A.M., P.M.
| \w+(?:(?:-|')\w+)* # Words with optional internal hyphens or apostrophes.
| \$?\d+(?:\.\d+)?%? # Currency and percentages.
| \.\.\. # Ellipsis
| [\]\[.,;"'?():_`\-] # Single punctuation characters.
'''
# With the spaces escaped, the verbose pattern keeps the bill number together.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(regexp_tokenize('blah H.R. 2354 blah', regex))
# prints ['blah', 'H.R. 2354', 'blah']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment