Created
August 20, 2009 06:09
-
-
Save fitoria/170867 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ADDRESSES_RE = re.compile(r"""(?x) | |
| (?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection. | |
| \b | |
| # Ignore things that look like dates -- e.g., "21 May 2009". | |
| # This is a problem e.g. in cases where there's a May Street. | |
| (?! | |
| \d+\s+ | |
| (?:January|February|March|April|May|June|July|August|September|October|November|December) | |
| ,?\s+ | |
| \d\d\d\d | |
| ) | |
| # Ignore intersections that are prefixed by "University of", like | |
| # "University of Texas at Austin". This is a common false positive. | |
| (?<! | |
| [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s | |
| ) | |
| (?: | |
| # SEGMENT ("FOO BETWEEN BAR AND BAZ") | |
| (?: | |
| %(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s | |
| | | |
| %(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s | |
| ) | |
| | | |
| # BLOCK/ADDRESS | |
| (?: | |
| ( | |
| (?: | |
| (?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ] | |
| (?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )? | |
| [Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff] | |
| | | |
| \d+\ *-\ *\d+ | |
| | | |
| \d+ | |
| ) | |
| \ + | |
| ) | |
| %(STREET_NAME_CAPTURE)s | |
| # ignore the intersection in parentheses so that it's not picked | |
| # up as a separate location. We do this by consuming the string | |
| # but *not* capturing it. | |
| (?: | |
| \ + | |
| \(? | |
| between | |
| \ + | |
| %(STREET_NAME_NOCAPTURE)s | |
| \ + | |
| and | |
| \ + | |
| %(STREET_NAME_NOCAPTURE)s | |
| \)? | |
| )? | |
| ) | |
| | | |
| # INTERSECTION | |
| (?: | |
| # Common intersection prefixes. They're included here so that the | |
| # regex doesn't include them as part of the street name. | |
| (?: | |
| (?: | |
| [Nn]ear | | |
| [Aa]t | | |
| [Oo]n | | |
| [Tt]o | | |
| [Aa]round | | |
| [Ii]ntersection\ of | | |
| [Cc]orner\ of | | |
| [Aa]rea\ of | | |
| [Aa]reas?\ surrounding | | |
| vicinity\ of | | |
| ran\ down | | |
| running\ down | | |
| crossed | |
| ) | |
| \ + | |
| )? | |
| \b | |
| (?:%(STREET_NAME_CAPTURE)s) | |
| (\ +) | |
| ( | |
| (?: | |
| [Aa][Nn][Dd] | | |
| [Aa][Tt] | | |
| [Nn][Ee][Aa][Rr] | | |
| & | | |
| [Aa][Rr][Oo][Uu][Nn][Dd] | | |
| [Tt][Oo][Ww][Aa][Rr][Dd][Ss]? | | |
| [Oo][Ff][Ff] | | |
| (?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] | | |
| (?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt] | |
| ) | |
| \ + | |
| ) | |
| (?:%(STREET_NAME_CAPTURE)s) | |
| ) | |
| ) | |
| # OPTIONAL CITY SUFFIX | |
| (?: | |
| (?: | |
| ,?\s+in | | |
| , | |
| ) | |
| \s+ | |
| # CITY NAME | |
| ( | |
| [A-Z][a-z][A-Za-z]* # One initial-capped word | |
| (?: | |
| ,?\ Jr\.?,? | |
| | | |
| \ [A-Z][a-z][A-Za-z]* | |
| | | |
| -[A-Za-z]+ # Hyphenated words (e.g. "Croton-on-Hudson" in NY) | |
| ){0,4} # Initial-capped words | |
| ) | |
| )? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment