drinks · November 21, 2011 16:32
diff --git a/gistfile1.py b/gistfile1.py
 from nltk import regexp_tokenize

 # Tokenizer pattern written in verbose mode ((?x)): one alternative per line,
 # each followed by an in-pattern comment. Verbose mode ignores unescaped
 # whitespace inside the pattern, so the plain spaces in the "Bills"
 # alternative are dropped and cannot match a space in the input.
 regex1 = r'''(?x)
  (?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+     # Bills
 | ([A-Z]\.)+                                              # Abbreviations (U.S.A., etc.)
 | ([A-Z]+\&[A-Z]+)                                        # Internal ampersands (AT&T, etc.)
 | (Mr\.|Dr\.|Mrs\.|Ms\.)                                  # Mr., Mrs., etc.
 | \d*\.\d+                                                # Numbers with decimal points.
 | \d\d?:\d\d                                              # Times.
 | \$?[,0-9]+                                              # Numbers with thousands separators.
 | (((a|A)|(p|P))\.(m|M)\.)                                # a.m., p.m., A.M., P.M.
 | \w+((-|')\w+)*                                          # Words with optional internal hyphens.
 | \$?\d+(\.\d+)?%?                                        # Currency and percentages.
 | \.\.\.                                                  # Ellipsis
 | [][.,;"'?():-_`]
 '''

 # The same alternatives as regex1 collapsed onto a single line. The (?x) flag
 # is still present, so the unescaped spaces are ignored here as well.
 regex2 = r'''(?x)(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''

 # The single-line pattern again, this time WITHOUT the (?x) flag, so the
 # spaces in the "Bills" alternative are literal and match spaces in the input.
 regex3 = r'''(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''

 # Tokenize the same sample sentence with each pattern variant and show the
 # result. print(...) with a single argument behaves the same on Python 2 and
 # Python 3, whereas the bare print statement is a syntax error on Python 3.
 print(regexp_tokenize('blah H.R. 2354 blah', regex1))
 # prints ['blah', 'H.R.', '2354', 'blah']

 print(regexp_tokenize('blah H.R. 2354 blah', regex2))
 # prints ['blah', 'H.R.', '2354', 'blah']

 print(regexp_tokenize('blah H.R. 2354 blah', regex3))
 # prints ['blah', 'H.R. 2354', 'blah']

 # Bare string literal used as a block comment; it has no runtime effect.
 '''
 Turns out you have to escape your whitespace in verbose mode...
 '''

 # Verbose-mode ((?x)) pattern in which every literal space of the "Bills"
 # alternative is written as "\ ", so it is kept instead of being ignored.
 # NOTE(review): in the final character class, ":-_" is parsed as a range from
 # ':' to '_' rather than as three separate characters - confirm this is
 # intended.
 regex = r'''(?x)
  (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
 | ([A-Z]\.)+                                              # Abbreviations (U.S.A., etc.)
 | ([A-Z]+\&[A-Z]+)                                        # Internal ampersands (AT&T, etc.)
 | (Mr\.|Dr\.|Mrs\.|Ms\.)                                  # Mr., Mrs., etc.
 | \d*\.\d+                                                # Numbers with decimal points.
 | \d\d?:\d\d                                              # Times.
 | \$?[,0-9]+                                              # Numbers with thousands separators.
 | (((a|A)|(p|P))\.(m|M)\.)                                # a.m., p.m., A.M., P.M.
 | \w+((-|')\w+)*                                          # Words with optional internal hyphens.
 | \$?\d+(\.\d+)?%?                                        # Currency and percentages.
 | \.\.\.                                                  # Ellipsis
 | [][.,;"'?():-_`]
  '''

 # Show the tokens produced by the whitespace-escaped verbose pattern.
 # print(...) with one argument works on both Python 2 and Python 3.
 print(regexp_tokenize('blah H.R. 2354 blah', regex))
 # prints ['blah', 'H.R. 2354', 'blah']
	from nltk import regexp_tokenize

	# Tokenizer pattern written in verbose mode ((?x)), one alternative per line.
	# The alternation bars are plain "|": a backslash-escaped "\|" would match a
	# literal pipe character instead of separating the alternatives.
	# Verbose mode ignores unescaped whitespace, so the plain spaces in the
	# "Bills" alternative are dropped and cannot match a space in the input.
	regex1 = r'''(?x)
	(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+ # Bills
	| ([A-Z]\.)+ # Abbreviations (U.S.A., etc.)
	| ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.)
	| (Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc.
	| \d*\.\d+ # Numbers with decimal points.
	| \d\d?:\d\d # Times.
	| \$?[,0-9]+ # Numbers with thousands separators.
	| (((a|A)|(p|P))\.(m|M)\.) # a.m., p.m., A.M., P.M.
	| \w+((-|')\w+)* # Words with optional internal hyphens.
	| \$?\d+(\.\d+)?%? # Currency and percentages.
	| \.\.\. # Ellipsis
	| [][.,;"'?():-_`]
	'''

	# The same alternatives as the multi-line verbose pattern, collapsed onto a
	# single line; (?x) is still set, so the unescaped spaces are ignored.
	# Alternation uses plain "|" (an escaped "\|" would match a literal pipe), and
	# the "*" quantifiers on the decimal-number and hyphenated-word alternatives
	# are present so ".5" and plain words such as "blah" are matched.
	regex2 = r'''(?x)(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''

	# The single-line pattern WITHOUT the (?x) flag, so the spaces in the "Bills"
	# alternative are literal and match spaces in the input.
	# Alternation uses plain "|" (an escaped "\|" would match a literal pipe), and
	# the "*" quantifiers on the decimal-number and hyphenated-word alternatives
	# are present so ".5" and plain words such as "blah" are matched.
	regex3 = r'''(?:H|S)\. ?(?:(?:J|R)\. )?(?:Con\. )?(?:Res\. )?\d+|([A-Z]\.)+|([A-Z]+\&[A-Z]+)|(Mr\.|Dr\.|Mrs\.|Ms\.)|\d*\.\d+|\d\d?:\d\d|\$?[,0-9]+|(((a|A)|(p|P))\.(m|M)\.)|\w+((-|')\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''

	# Tokenize the same sample sentence with each pattern variant and show the
	# result. print(...) with a single argument behaves the same on Python 2 and
	# Python 3, whereas the bare print statement is a syntax error on Python 3.
	print(regexp_tokenize('blah H.R. 2354 blah', regex1))
	# prints ['blah', 'H.R.', '2354', 'blah']

	print(regexp_tokenize('blah H.R. 2354 blah', regex2))
	# prints ['blah', 'H.R.', '2354', 'blah']

	print(regexp_tokenize('blah H.R. 2354 blah', regex3))
	# prints ['blah', 'H.R. 2354', 'blah']

	# Bare string literal used as a block comment; it has no runtime effect.
	'''
	Turns out you have to escape your whitespace in verbose mode...
	'''

	# Verbose-mode ((?x)) pattern in which every literal space of the "Bills"
	# alternative is written as "\ ", so it is kept instead of being ignored.
	# Alternation uses plain "|": a backslash-escaped "\|" would match a literal
	# pipe character instead of separating the alternatives.
	regex = r'''(?x)
	(?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
	| ([A-Z]\.)+ # Abbreviations (U.S.A., etc.)
	| ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.)
	| (Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc.
	| \d*\.\d+ # Numbers with decimal points.
	| \d\d?:\d\d # Times.
	| \$?[,0-9]+ # Numbers with thousands separators.
	| (((a|A)|(p|P))\.(m|M)\.) # a.m., p.m., A.M., P.M.
	| \w+((-|')\w+)* # Words with optional internal hyphens.
	| \$?\d+(\.\d+)?%? # Currency and percentages.
	| \.\.\. # Ellipsis
	| [][.,;"'?():-_`]
	'''

	# Show the tokens produced by the whitespace-escaped verbose pattern.
	# print(...) with one argument works on both Python 2 and Python 3.
	print(regexp_tokenize('blah H.R. 2354 blah', regex))
	# prints ['blah', 'H.R. 2354', 'blah']