Created
August 6, 2014 09:16
-
-
Save terrycojones/94b8c1e0401a51c9f652 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package AC::Name; | |
use AC::Location; | |
my $thisYearFull = `date +%Y`; | |
# TODO: fix this on Jan 1st, 2100. | |
my $thisYear = $thisYearFull - 2000; | |
my $DEFAULT_SUBTYPE = 'H3N2'; | |
# The values in this hash will be subject to canonicalization, so don't try | |
# to do that here, just get them into a form we can uniformly deal with in | |
# one place (i.e., below, not here). | |
my %globalRewrites = ( | |
'AX147 (A/WYOMING/3/2003)' => 'A/WYOMING/3/2003 AX147', | |
'X-147 (A/WYOMING/3/2003)' => 'A/WYOMING/3/2003 AX147', | |
'X147 A/WYOMONG/3/2003' => 'A/WYOMING/3/2003 AX147', | |
'X147 A/WYOMING/3/2003' => 'A/WYOMING/3/2003 AX147', | |
'IVR134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134', | |
'IVR 134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134', | |
'IVR-134 (A/WYOM)' => 'A/WYOMING/3/2003 IVR-134', | |
'IVR 135 ( A/KUM/102/2002)' => 'A/KUMAMOTO/102/2002 IVR-135', | |
'IVR135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135', | |
'IVR 135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135', | |
'IVR-135 (A/KUMA)' => 'A/KUMAMOTO/102/2002 IVR-135', | |
"IVR-136 (A/C'CHURCH/28)" => 'A/CHRISTCHURCH/28/2003 IVR-136', | |
"IVR-137 A/C'CHUURCH/28/2003" => 'A/CHRISTCHURCH/28/2003 IVR-137', | |
"IVR-137(A/C'CHURCH/28)" => 'A/CHRISTCHURCH/28/2003 IVR-137', | |
'IVR-137 (A/CHRISTCHURCH/28/2003)' => 'A/CHRISTCHURCH/28/2003 IVR-137', | |
'IVR-137 A/CHRISTCHURCH/28/2003' => 'A/CHRISTCHURCH/28/2003 IVR-137', | |
'IVR138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138', | |
'IVR-138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138', | |
'IVR 138 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-138', | |
'IVR139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139', | |
'IVR 139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139', | |
'IVR-139 (A/WELLINGTON/1/2004)' => 'A/WELLINGTON/1/2004 IVR-139', | |
'A/PERTH/263/003' => 'A/PERTH/263/2003', | |
'A/ANHUI/397/20032' => 'A/ANHUI/397/2003', | |
'A/ANHUI/377/20032' => 'A/ANHUI/377/2003', | |
'VN-A/DUCK/VIETNAM/NIVR-1/2003' => 'A/DUCK/VIETNAM/NIVR-1/2003', | |
'**A/CHRISTCHURCH/413/2004' => 'A/CHRISTCHURCH/413/2004', | |
'**A/MALAYSIA/2400/2004' => 'A/MALAYSIA/2400/2004', | |
'ALYON/1313/06' => 'A/LYON/1313/2006', | |
'A/HONG KONGK/25572004' => 'A/HONG KONG/2557/2004', | |
'A/HONG/KONG/1550/2002' => 'A/HONG-KONG/1550/2002', | |
'A/BRISBANE/5/2002*' => 'A/BRISBANE/05/2002', | |
'A/CAILIFORNIA/7/2004' => 'A/CALIFORNIA/7/2004', | |
'A/ULAN/UDE/01/2001' => 'A/ULAN UDE/01/2001', | |
'A.BRISBANE/332/2003' => 'A/BRISBANE/332/2003', | |
'A/PHILIPPINES/472//2002' => 'A/PHILIPPINES/472/2002', | |
'A/HONG KONG/136//03' => 'A/HONG-KONG/136/2003', | |
"A/ISRAEL/3'/03" => 'A/ISRAEL/3/2003', | |
'A/LYON/CHU/52.58/06' => 'A/LYON-CHU/52.58/06', | |
'A/LYON/CHU/52.339/06' => 'A/LYON-CHU/52.339/06', | |
); | |
sub canonicalize_str($){ | |
# Get rid of chars that are not allowed to appear as part of a strain/serum name. | |
# Anything we don't like becomes a '-'. | |
my ($s) = @_; | |
# Note that in the following if I combine some subs into one line, the RHS ---- | |
# results in an "Ambiguous range in transliteration operator" error from perl. | |
$s = uc($s); | |
$s =~ s/[^0-9A-Z]/-/g; | |
$s =~ s/-+/-/g; | |
$s =~ s/^-+//; | |
$s =~ s/-+$//; | |
$s; | |
} | |
sub canonicalize_name($$;$){ | |
my ($name, $matrix, $default_subtype) = @_; | |
$default_subtype = $DEFAULT_SUBTYPE unless defined $default_subtype; | |
my ($letter, $species, $location, $middle, $year, $rest, $subtype); | |
# print STDERR "Canonicalize '$name' -> "; | |
# Look for a global rewrite. | |
$name = uc($globalRewrites{$name}) if (exists $globalRewrites{uc($name)} || exists $globalRewrites{$name}); | |
if ($name =~ /^(.*)\(\s*(H\s*\d+\s*N\s*\d+)\s*\)\s*$/i){ | |
$name = $1; | |
$subtype = $2; | |
$name =~ s/ +$//; | |
$subtype =~ s/\s//g; | |
$subtype = canonicalize_str($subtype); | |
} | |
else { | |
$subtype = $default_subtype; | |
} | |
if (0){ | |
# Nothing. This is here for indenting purposes - so all regexes below are equally indented. | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
([^/]*) # Species | |
/ | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
\s*$ # No rest | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, ''); | |
#print STDERR "Part 1\n"; | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
([^/]*) # Species | |
/ | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
\s+ # Some form of whitespace separating the rest. | |
(.*) # Rest can be anything. | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, $6); | |
#print STDERR "Part 2\n"; | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
([^/]*) # Species | |
/ | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
# No space before rest. | |
([^\d/].*) # Rest cannot start with a digit or slash. | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5, $6); | |
#print STDERR "Part 3\n"; | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
# No species part. | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
\s*$ | |
# No rest. | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, ''); | |
#print STDERR "Part 4\n"; | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
# No species part. | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
\s+ # Some form of whitespace separating the rest. | |
(.*) # Rest can be anything. | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, $5); | |
#print STDERR "Part 5\n"; | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
/ | |
# No species part. | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
([^\d/].*) # Rest cannot start with a digit or slash. | |
:xi){ | |
($letter, $species, $location, $middle, $year, $rest) = ($1, 'HUMAN', $2, $3, $4, $5); | |
#print STDERR "Part 6\n"; | |
} | |
elsif ($name =~ m:/.*/:){ | |
# Anything with at least 2 slashes might have been meant to be a name. | |
# Leave $letter undefined here. | |
$matrix->warning("'$name' looks like it might be a strain/serum name, but it could not be parsed."); | |
} | |
if (defined $letter){ | |
#print STDERR "$name = L=$letter, S=$species, L=$location, M=$middle, Y=$year, R=$rest\n"; | |
($letter, $species, $location, $middle, $year, $rest) = map { | |
canonicalize_str($_) } ($letter, $species, $location, $middle, $year, $rest); | |
$species = AC::Species::canonicalizeSpecies($species); | |
$location = AC::Location::canonicalizeLocation($location); | |
# Don't allow the middle part to start with leading zeroes. | |
$middle =~ s/^0+//; | |
# Don't allow the middle part to start or end with a hyphen. | |
$middle =~ s/-$//; | |
$middle =~ s/^-//; | |
# The rest part had better not have any slashes. | |
$rest =~ y|/|-|; | |
# Tack anything that appears at the end (after the date) onto the middle part. | |
if ($rest ne '-' && $rest ne ''){ | |
if (substr($rest, 0, 1) ne '-'){ | |
$middle .= '-'; | |
} | |
$rest =~ s/-$//; | |
$middle .= $rest; | |
} | |
$year = canonicalize_year($year, $matrix); | |
$name = "${letter}_$subtype/$species/$location/$middle/$year"; | |
} | |
else { | |
#print STDERR "NON-CANON: '$name' -> "; | |
my $orig = $name; | |
$name = canonicalize_str($name); | |
#print STDERR "'$name'.\n"; | |
$matrix->warning("Non-canonical name '$orig' -> '$name'."); | |
} | |
# print STDERR "canonicalized to '$name'.\n"; | |
return $name; | |
} | |
sub canonicalize_year($;$) | |
{ | |
my ($year, $matrix) = @_; | |
# make 4-digit year | |
if (length($year) == 2) { | |
if ($year > $thisYear) { | |
# This must be in the 20th century. Otherwise it would have to be in the future. | |
$year = "19$year"; | |
} | |
else { | |
$year = "20$year"; | |
} | |
} | |
elsif (length($year) == 4 && $year =~ /^(19|20)\d\d$/) { | |
if (int($year) > $thisYearFull || int($year) < 1968) { | |
if (defined $matrix) { | |
print STDERR "Found an implausible year: $year.\n"; | |
$matrix->warning("Found an implausible year: $year."); | |
} | |
} | |
} | |
else { | |
if (defined $matrix) { | |
print STDERR "Found an implausible or wrong-length year: $year.\n"; | |
$matrix->warning("Found an implausible or wrong-length year ($year)."); | |
} | |
} | |
return $year; | |
} | |
sub canonicalize_location_and_year_in_name($;$) | |
{ | |
my ($name, $matrix) = @_; | |
my ($letter, $species, $location, $middle, $year, $rest, $subtype); | |
if (0){ | |
# Nothing. This is here for indenting purposes - so all regexes below are equally indented. | |
} | |
elsif ($name =~ m:^([AB]) # Strain type | |
# No Species | |
/ | |
([^/]*) # Location | |
/ | |
([^/]*) # Middle part | |
/ | |
(\d\d|\d\d\d\d) # Year | |
(.*)$ # No rest | |
:xi){ | |
($letter, $location, $middle, $year, $rest) = ($1, $2, $3, $4, $5); | |
} | |
if (defined $letter) { | |
if (length($year) == 2) { | |
if (int($year) > 68) { | |
$year = "19$year"; | |
} | |
elsif (int($year) <= $thisYear) { | |
$year = "20$year"; | |
} | |
} | |
$location = AC::Location::canonicalizeLocation($location); | |
$letter = uc($letter); | |
$middle = uc($middle); | |
$rest = uc($rest); | |
if (defined $species) { | |
$name = "$letter/$species/$location/$middle/$year$rest"; | |
} | |
else { | |
$name = "$letter/$location/$middle/$year$rest"; | |
} | |
} | |
return $name; | |
} | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment