Created
January 21, 2012 04:07
-
-
Save mpurbo/1651234 to your computer and use it in GitHub Desktop.
Japanese Address Regular Expression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static final String SPACE = "[\\s\u3000]"; | |
public static final String NUMBER = "[0-9\uff10-\uff19]"; // 0-90-9 | |
public static final String HYPHEN = "[-\uff0d\u2212]"; | |
public static final String NOT_NUMBER_OR_HYPHEN = "[^" + NUMBER.substring(1, NUMBER.length()-1) + HYPHEN.substring(1); | |
public static final String NUMBERS = "([0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d]+)"; // 0-90-9一二三四五六七八九 | |
public static final String CHOUME = "\u4e01\u76ee"; // 丁目 | |
public static final String BANCHI = "\u756a\u5730?"; // 番地 | |
public static final String GOU = "\u53f7"; // 号 | |
// 丁目番地号 combined | |
public static final String CBG = | |
"(?:" + | |
"(?:" + | |
NUMBERS + "(?:" + HYPHEN + "|" + CHOUME + ")" + | |
")?" + NUMBERS + "(?:" + HYPHEN + "|" + BANCHI + ")" + | |
")?" + NUMBERS + "(?:" + GOU + ")?"; | |
public static final String JUSHO_KAMOSHIRENAI = "[\u90fd\u9053\u5e9c\u770c\u5e02\u533a\u753a\u6751]"; // 都道府県市区町村 | |
public static final String DELIMITERS = "\\s\u3000\\;\\<\\>\u3002\u3001"; // [whitespaces][double byte space];<> 。、 | |
public static final String GETC = "[^" + DELIMITERS + "]"; | |
public static final String NOT_ROMAJI_NUMBERS_OR_DELIMITERS = GETC.substring(0, GETC.length()-1) + NUMBER.substring(1); | |
public static final String JOU = "(?:" + NUMBERS + "(?:\u6761(" + NOT_ROMAJI_NUMBERS_OR_DELIMITERS + "+)))?"; // 条 | |
public static final String REGEX_ADDRESS = | |
"(" + GETC + "{2,5}" + JUSHO_KAMOSHIRENAI + ")(" + | |
NOT_ROMAJI_NUMBERS_OR_DELIMITERS + | |
"+)" + JOU + CBG; | |
public static final Pattern PATTERN_ADDRESS = Pattern.compile(REGEX_ADDRESS); |
Very helpful, thanks. Here's a PowerShell version adapted from this:
# PowerShell Japanese Address Regular Expression
#
# PowerShell strings do not treat the backslash as an escape character, so each
# pattern below reaches the .NET regex engine exactly as written. The engine
# itself interprets \s and \uXXXX, which is why a single backslash is used
# (a doubled backslash would be read by the engine as a literal backslash).
# Single-quoted strings are used so that nothing is interpolated by accident.

# One whitespace character: \s or the ideographic space U+3000.
$SPACE = '[\s\u3000]'
# One digit: ASCII 0-9 or full-width U+FF10..U+FF19.
$NUMBER = '[0-9\uff10-\uff19]'
# One hyphen-like character: ASCII hyphen-minus, U+FF0D, or U+2212.
$HYPHEN = '[-\uff0d\u2212]'
# Negated class of digits and hyphens. NUMBER loses both brackets, HYPHEN only
# its opening bracket, so HYPHEN's closing bracket terminates the class.
$NOT_NUMBER_OR_HYPHEN = '[^' + $NUMBER.Substring(1, $NUMBER.Length - 2) + $HYPHEN.Substring(1)
# Capturing group: one or more ASCII digits, full-width digits, or kanji numerals 一..九.
$NUMBERS = '([0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d]+)'
$CHOUME = '\u4e01\u76ee'  # 丁目
$BANCHI = '\u756a\u5730?' # 番, optionally followed by 地
$GOU = '\u53f7'           # 号
# 丁目 / 番地 / 号 combined: [ [ N (hyphen|丁目) ] N (hyphen|番地?) ] N [号]
$CBG = '(?:' +
           '(?:' + $NUMBERS + '(?:' + $HYPHEN + '|' + $CHOUME + ')' + ')?' +
           $NUMBERS + '(?:' + $HYPHEN + '|' + $BANCHI + ')' +
       ')?' +
       $NUMBERS + '(?:' + $GOU + ')?'
# One of the administrative-division suffix characters 都 道 府 県 市 区 町 村.
$JUSHO_KAMOSHIRENAI = '[\u90fd\u9053\u5e9c\u770c\u5e02\u533a\u753a\u6751]'
# Character-class contents (no brackets): whitespace, U+3000, ; < > and the
# ideographic full stop / comma (U+3002, U+3001).
$DELIMITERS = '\s\u3000;<>\u3002\u3001'
# One character that is not a delimiter.
$GETC = '[^' + $DELIMITERS + ']'
# One character that is neither a delimiter nor an ASCII/full-width digit:
# GETC without its closing bracket, followed by NUMBER's contents and bracket.
$NOT_ROMAJI_NUMBERS_OR_DELIMITERS = $GETC.Substring(0, $GETC.Length - 1) + $NUMBER.Substring(1)
# Optional 条 part: captured number, 条, then a captured run of non-digit,
# non-delimiter characters.
$JOU = '(?:' + $NUMBERS + '(?:\u6761(' + $NOT_ROMAJI_NUMBERS_OR_DELIMITERS + '+)))?'
# Full expression. Group 1: 2-5 non-delimiter characters plus one suffix
# character; group 2: the following run of non-digit, non-delimiter characters;
# groups 3-4: the 条 part; groups 5-7: the 丁目 / 番地 / 号 numbers.
$REGEX_ADDRESS = '(' + $GETC + '{2,5}' + $JUSHO_KAMOSHIRENAI + ')' +
                 '(' + $NOT_ROMAJI_NUMBERS_OR_DELIMITERS + '+)' +
                 $JOU +
                 $CBG
$PATTERN_ADDRESS = [regex]::new($REGEX_ADDRESS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
super helpful
thank you