Created
February 17, 2015 15:18
-
-
Save mdoering/a42ab1f9b76e70309633 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.gbif.markus.udf; | |
import java.util.UUID; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import com.google.common.base.Strings; | |
import org.apache.commons.lang3.StringUtils; | |
import org.apache.hadoop.hive.ql.exec.Description; | |
import org.apache.hadoop.hive.ql.exec.UDF; | |
import org.apache.hadoop.io.Text; | |
/** | |
* A simple UDF for Hive that parses UUIDs (uuid:XYZ, uri:uuid:XYZ, uuid:XYZ, uuid:xyz) into canonical UUID representations. | |
*/ | |
@Description( | |
name = "identifierType", | |
value = "_FUNC_(field)") | |
public class IdentifierTypeUDF extends UDF { | |
private final Text text = new Text(); | |
public static enum IdentifierTypeEnum { | |
URL, URN, | |
INTEGER, DOUBLE, | |
UUID, UUID_URN, UUID_PREFIX, | |
DOI, DOI_NAME, DOI_URN, DOI_HTTP, | |
LSID, LSID_URN, LSID_HTTP, | |
TRIPLET, | |
HANDLER, ARK, | |
OTHER, NONE | |
} | |
private static final Pattern uuidUrn = Pattern.compile("^urn:uuid:[a-f0-9-]+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern uuidPrefix = Pattern.compile("^uuid:[a-f0-9-]+$", Pattern.CASE_INSENSITIVE); | |
private static final String DOI = " *10(\\.[0-9]+)+/.+$"; | |
private static final Pattern doi = Pattern.compile("^doi:"+DOI, Pattern.CASE_INSENSITIVE); | |
private static final Pattern doiName = Pattern.compile("^"+DOI, Pattern.CASE_INSENSITIVE); | |
private static final Pattern doiUrn = Pattern.compile("^urn:doi:"+DOI, Pattern.CASE_INSENSITIVE); | |
private static final Pattern doiHttp = Pattern.compile("^https?://(dx\\.)?doi\\.org/" | |
+ "(urn:)?(doi:)?" + DOI, Pattern.CASE_INSENSITIVE); | |
private static final Pattern lsid = Pattern.compile("^lsid:.+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern lsidUrn = Pattern.compile("^urn:lsid:.+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern lsidHttp = Pattern.compile("^http://lsid.tdwg.org/(summary/)?" | |
+ "(urn:)?lsid:.+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern url = Pattern.compile("^http(s?)://.+/.+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern urn = Pattern.compile("^urn:([a-z]+):.+$", Pattern.CASE_INSENSITIVE); | |
private static final Pattern triplet = Pattern.compile("^(\\w+)[ :.-](\\w+)[ :.-](.+)$", Pattern.CASE_INSENSITIVE); | |
public Text evaluate(Text field) { | |
if (field == null) { | |
set(IdentifierTypeEnum.NONE); | |
} else { | |
final String val = field.toString(); | |
if (Strings.isNullOrEmpty(val)) { | |
set(IdentifierTypeEnum.NONE); | |
} else { | |
if (!tryNumber(val)) { | |
if (!tryUUID(val)) { | |
if (!tryDOI(val)) { | |
if (!tryLSID(val)) { | |
if (!tryUrln(val)) { | |
if (!tryTriplet(val)) { | |
set(IdentifierTypeEnum.OTHER); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
return text; | |
} | |
private void set(IdentifierTypeEnum type) { | |
text.set(type.name()); | |
} | |
private boolean tryUrln(String val) { | |
if (url.matcher(val).matches()) { | |
set(IdentifierTypeEnum.URL); | |
return true; | |
} else { | |
Matcher m = urn.matcher(val); | |
if (m.find()) { | |
String scheme = m.group(1); | |
if (StringUtils.isAlpha(scheme)) { | |
text.set("URN:"+scheme); | |
} else { | |
set(IdentifierTypeEnum.URN); | |
} | |
return true; | |
} | |
} | |
return false; | |
} | |
private boolean tryLSID(String val) { | |
if (lsidUrn.matcher(val).matches()) { | |
set(IdentifierTypeEnum.LSID_URN); | |
} else if (lsidHttp.matcher(val).matches()) { | |
set(IdentifierTypeEnum.LSID_HTTP); | |
} else if (lsid.matcher(val).matches()) { | |
set(IdentifierTypeEnum.LSID); | |
} else { | |
return false; | |
} | |
return true; | |
} | |
private boolean tryTriplet(String val) { | |
if (triplet.matcher(val).matches()) { | |
set(IdentifierTypeEnum.TRIPLET); | |
return true; | |
} | |
return false; | |
} | |
private boolean tryUUID(String val) { | |
try { | |
UUID.fromString(val); | |
set(IdentifierTypeEnum.UUID); | |
} catch (Exception e) { | |
if (uuidUrn.matcher(val).matches()) { | |
set(IdentifierTypeEnum.UUID_URN); | |
} else if (uuidPrefix.matcher(val).matches()) { | |
set(IdentifierTypeEnum.UUID_PREFIX); | |
} else { | |
return false; | |
} | |
} | |
return true; | |
} | |
private boolean tryDOI(String val) { | |
if (doiUrn.matcher(val).matches()) { | |
set(IdentifierTypeEnum.DOI_URN); | |
} else if (doiHttp.matcher(val).matches()) { | |
set(IdentifierTypeEnum.DOI_HTTP); | |
} else if (doiName.matcher(val).matches()) { | |
set(IdentifierTypeEnum.DOI_NAME); | |
} else if (doi.matcher(val).matches()) { | |
set(IdentifierTypeEnum.DOI); | |
} else { | |
return false; | |
} | |
return true; | |
} | |
private boolean tryNumber(String val) { | |
try { | |
Integer.valueOf(val); | |
set(IdentifierTypeEnum.INTEGER); | |
} catch (Exception e) { | |
try { | |
Double.valueOf(val); | |
set(IdentifierTypeEnum.DOUBLE); | |
} catch (Exception e1) { | |
return false; | |
} | |
} | |
return true; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment