Created
November 21, 2011 16:32
-
-
Save drinks/1383155 to your computer and use it in GitHub Desktop.
Python Regex (?x) fail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import regexp_tokenize
# Three spellings of the same tokenizer pattern, used to show how the (?x)
# (verbose) flag changes the meaning of the literal spaces in the "Bills"
# alternative.
#
# Every group is non-capturing, so findall-style tokenizers return the whole
# match instead of tuples of sub-groups.  The hyphen in the final punctuation
# class is written last so it is a literal '-' rather than an accidental
# ':'-to-'_' range.

# Verbose, multi-line layout.  The spaces after "\." are NOT escaped, so (?x)
# discards them and "H.R. 2354" can no longer match as one bill token.
regex1 = r'''(?x)
    (?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+  # Bills
  | (?:[A-Z]\.)+                  # Abbreviations (U.S.A., etc.)
  | [A-Z]+\&[A-Z]+                # Internal ampersands (AT&T, etc.)
  | (?:Mr\.|Dr\.|Mrs\.|Ms\.)      # Mr., Mrs., etc.
  | \d*\.\d+                      # Numbers with decimal points.
  | \d\d?:\d\d                    # Times.
  | \$?[,0-9]+                    # Numbers with thousands separators.
  | [aApP]\.[mM]\.                # a.m., p.m., A.M., P.M.
  | \w+(?:(?:-|')\w+)*            # Words with optional internal hyphens.
  | \$?\d+(?:\.\d+)?%?            # Currency and percentages.
  | \.\.\.                        # Ellipsis
  | [][.,;"'?():_`-]              # Individual punctuation marks.
'''

# The same alternatives on a single line, with plain (unescaped) spaces.
_ONE_LINE_PATTERN = r'''(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|(?:[A-Z]\.)+|[A-Z]+\&[A-Z]+|(?:Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|[aApP]\.[mM]\.|\w+(?:(?:-|')\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\.|[][.,;"'?():_`-]'''

# One-line form WITH (?x): the spaces are still ignored, so it behaves like
# regex1.
regex2 = '(?x)' + _ONE_LINE_PATTERN

# One-line form WITHOUT (?x): the spaces are literal, so the bill matches
# as a single token.
regex3 = _ONE_LINE_PATTERN
# Tokenize the same sentence with each pattern variant.  print(...) with a
# single argument produces the same output on Python 2 and Python 3.
print(regexp_tokenize('blah H.R. 2354 blah', regex1))
# prints ['blah', 'H.R.', '2354', 'blah']
print(regexp_tokenize('blah H.R. 2354 blah', regex2))
# prints ['blah', 'H.R.', '2354', 'blah']
print(regexp_tokenize('blah H.R. 2354 blah', regex3))
# prints ['blah', 'H.R. 2354', 'blah']
# Turns out you have to escape your whitespace in verbose mode...
#
# Under (?x) an unescaped space in the pattern is ignored, so every space
# that must match literally is written as "\ " below.  Groups are
# non-capturing so findall-style tokenizers return whole-match strings, and
# the hyphen in the punctuation class is written last so it is a literal '-'
# rather than an accidental ':'-to-'_' range.
regex = r'''(?x)
    (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+  # Bills
  | (?:[A-Z]\.)+                  # Abbreviations (U.S.A., etc.)
  | [A-Z]+\&[A-Z]+                # Internal ampersands (AT&T, etc.)
  | (?:Mr\.|Dr\.|Mrs\.|Ms\.)      # Mr., Mrs., etc.
  | \d*\.\d+                      # Numbers with decimal points.
  | \d\d?:\d\d                    # Times.
  | \$?[,0-9]+                    # Numbers with thousands separators.
  | [aApP]\.[mM]\.                # a.m., p.m., A.M., P.M.
  | \w+(?:(?:-|')\w+)*            # Words with optional internal hyphens.
  | \$?\d+(?:\.\d+)?%?            # Currency and percentages.
  | \.\.\.                        # Ellipsis
  | [][.,;"'?():_`-]              # Individual punctuation marks.
'''
# With the spaces escaped, the verbose pattern keeps the bill as one token.
print(regexp_tokenize('blah H.R. 2354 blah', regex))
# prints ['blah', 'H.R. 2354', 'blah']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment