airawat · December 20, 2022 15:39
diff --git a/00-CustomGenericUDFHive-NVL2 b/00-CustomGenericUDFHive-NVL2
 This gist covers a simple Hive genericUDF in Java, that mimics NVL2 functionality in Oracle.
 NVL2 is used to handle nulls and conditionally substitute values.
 
 Included:
 1.  Input data
 2.  Expected results
 3.  UDF code in java
 4.  Hive query to demo the UDF
 5.  Output
 
 Note: The dataset is very small - as the purpose of this gist is instructional.  :)
 
 About the NVL2 functionality in this demo:
 ------------------------------------------
 Return type: String
 Parameters: Three comma separated strings, we will refer to as:
 expr1, expr2, expr3
 Purpose: If expr1 is null, NVL2 returns expr3, otherwise, expr3
diff --git a/01-InputData b/01-InputData
 **************************
 Input data
 **************************
 
 1. Execute locally on the node you are running Hive client from
 -----------------------------------------------------------------
 Create input file/data to use for the demo.
 Since this gist is merely for instructional purpose, the dataset is small.
 
 cd ~
 mkdir hiveProject
 cd hiveProject
 vi Departments_UDFTest
 
 Paste this..ensuring the fields are delimited by tabs and record with new line.
 d001	Marketing
 d002	Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008
 d009	Customer Service
 
 2.  Hadoop commands
 --------------------
 hadoop fs -mkdir hiveProject
 hadoop fs -put hiveProject/Departments_UDFTest hiveProject
diff --git a/02-HiveTableSetup b/02-HiveTableSetup
 *********************************************
 Setting up the Hive table
 *********************************************
 In hive shell....
 
 a) Create table:
 
 CREATE EXTERNAL TABLE IF NOT EXISTS departments_UDFTest
 (
 deptNo String,
 deptName String
 )
 Row format delimited
 fields terminated by '\t'
 LOCATION '/user/akhanolk/hiveProject';
 
 b) Quick test:
 
 Select * from departments_UDFTest;
 
 d001	Marketing
 d002	Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008	NULL
 d009	Customer Service
diff --git a/03a-UDFCode b/03a-UDFCode
 package khanolkar.HiveUDFs;

 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

 @UDFType(deterministic = true)
 @Description(name = "NVL2", value = "NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise returns expr2;", extended = "NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise retruns expr2")
 public class NVL2GenericUDF extends GenericUDF {

 	private GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
 	private ObjectInspector[] argumentOIs;

 	@Override
 	public ObjectInspector initialize(ObjectInspector[] arguments)
 			throws UDFArgumentException {
 		argumentOIs = arguments;

 		// First check - do we have the right number of arguments?
 		if (arguments.length != 3) {
 			throw new UDFArgumentLengthException(
 					"The operator 'NVL2'  accepts 3 arguments.");
 		}

 		// Second check - throw exception if any complex types have been passed
 		// as parameters
 		if (arguments[0].getCategory() != ObjectInspector.Category.PRIMITIVE
 				|| arguments[1].getCategory() != ObjectInspector.Category.PRIMITIVE
 				|| arguments[2].getCategory() != ObjectInspector.Category.PRIMITIVE)
 			throw new UDFArgumentTypeException(0,
 					"Only primitive type arguments are accepted");

 		// Third check - throw exception if the data types across parameters are
 		// different
 		if (!(returnOIResolver.update(arguments[0]) && returnOIResolver
 				.update(arguments[1]))
 				|| !(returnOIResolver.update(arguments[1]) && returnOIResolver
 						.update(arguments[2]))) {
 			throw new UDFArgumentTypeException(2,
 					"The arguments of function NLV2 should have the same type, "
 							+ "but they are different: \""
 							+ arguments[0].getTypeName() + "\" and \""
 							+ arguments[1].getTypeName() + "\" and \""
 							+ arguments[2].getTypeName() + "\"");
 		}
 		returnOIResolver = new GenericUDFUtils.ReturnObjectInspectorResolver(
 				true);

 		return returnOIResolver.get();
 	}

 	@Override
 	public Object evaluate(DeferredObject[] arguments) throws HiveException {
 		// The NVL2 functionality
 		Object retVal = returnOIResolver.convertIfNecessary(arguments[0].get(),
 				argumentOIs[0]);
 		if (retVal == null) {
 			retVal = returnOIResolver.convertIfNecessary(arguments[2].get(),
 					argumentOIs[2]);
 		} else {
 			retVal = returnOIResolver.convertIfNecessary(arguments[1].get(),
 					argumentOIs[1]);
 		}
 		return retVal;
 	}

 	@Override
 	public String getDisplayString(String[] children) {
 		StringBuilder sb = new StringBuilder();
 		sb.append("if ");
 		sb.append(children[0]);
 		sb.append(" is null ");
 		sb.append("returns");
 		sb.append(children[2]);
 		sb.append("else ");
 		sb.append("returns");
 		sb.append(children[1]);
 		return sb.toString();
 	}

 }
diff --git a/03b-SomeNotesOnUDF b/03b-SomeNotesOnUDF
 Concepts
 -------------------------
 There are three methods-
 1. initialize() - called once, at first. The goal of this method is to determine the return type 
 from the arguments. The user can also throw an Exception to signal that bad types are being sent 
 to the method. The returnOIResolver is a built-in class that determines the return type by finding 
 the type of non-null variables and using that type.  The ObjectInspector is used to the transform 
 raw records into objects that Hive can access.  The initialize() method is passed an 
 ObjectInspector for each argument

 2. evaluate() - where the logic for the function should be written.
 The evaluate method has access to the values passed to the method stored in an array of DeferredObject 
 values. The returnOIResolver created in the initialize method is used to get values from the 
 DeferredObjects.

 3. getDisplayString() - The final method to override is getDisplayString(), is used inside the Hadoop 
 tasks to display debugging information when the function is being used.

 Annotations: 
 @UDFType(deterministic = true) annotation: Indicates that the UDF returns the same value any time its 
 called
 @Description(...) annotation: Includes information that is displayed when you do a describe on the UDF
diff --git a/04-ExpectedResults b/04-ExpectedResults
 ****************************
 Expected results
 ****************************
 
 Query: 
 select deptNo,NVL2(deptName,deptName,'Procrastrination') from departments_UDFTest;
 
 The null in the department name for department d008, should be returned as "Procrastrination".
 For the rest of the records, the query should return the data in Hive, as is.
 
diff --git a/05-HiveTest b/05-HiveTest
 *******************************
 Testing the UDF
 *******************************
 1) Add jar
 hive> add jar hiveProject/jars/NVL2GenericUDF.jar;

 2) Create alias for the function
 hive> CREATE TEMPORARY FUNCTION NVL2
 AS 'khanolkar.HiveUDFs.NVL2GenericUDF';

 3) Test the description provided
 hive> DESCRIBE FUNCTION NVL2; 
 OK
 NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise returns expr2;
 
 4) Test if there are checks in place for number of parameters
 hive> select deptNo,NVL2(deptName,deptName) from departments_UDFTest;  
 FAILED: SemanticException [Error 10015]: Line 1:14 Arguments length mismatch 'deptName': The operator 'NVL2'  accepts 3 arguments.

 hive> select deptNo,NVL2(deptName,deptName,123,1) from departments_UDFTest;    
 FAILED: SemanticException [Error 10015]: Line 1:14 Arguments length mismatch '1': The operator 'NVL2'  accepts 3 arguments.

 5) Results
 hive> select deptNo,NVL2(deptName,deptName,'Procrastrination') from departments_UDFTest;
 OK
 d001	Marketing
 d002	Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008	Procrastrination
 d009	Customer Service
	This gist covers a simple Hive genericUDF in Java, that mimics NVL2 functionality in Oracle.
	NVL2 is used to handle nulls and conditionally substitute values.

	Included:
	1. Input data
	2. Expected results
	3. UDF code in java
	4. Hive query to demo the UDF
	5. Output

	Note: The dataset is very small - as the purpose of this gist is instructional. :)

	About the NVL2 functionality in this demo:
	------------------------------------------
	Return type: String
	Parameters: Three comma separated strings, we will refer to as:
	expr1, expr2, expr3
	Purpose: If expr1 is null, NVL2 returns expr3, otherwise, expr3
	**************************
	Input data
	**************************

	1. Execute locally on the node you are running Hive client from
	-----------------------------------------------------------------
	Create input file/data to use for the demo.
	Since this gist is merely for instructional purpose, the dataset is small.

	cd ~
	mkdir hiveProject
	cd hiveProject
	vi Departments_UDFTest

	Paste this..ensuring the fields are delimited by tabs and record with new line.
	d001 Marketing
	d002 Finance
	d003 Human Resources
	d004 Production
	d005 Development
	d006 Quality Management
	d007 Sales
	d008
	d009 Customer Service

	2. Hadoop commands
	--------------------
	hadoop fs -mkdir hiveProject
	hadoop fs -put hiveProject/Departments_UDFTest hiveProject
	*********************************************
	Setting up the Hive table
	*********************************************
	In hive shell....

	a) Create table:

	CREATE EXTERNAL TABLE IF NOT EXISTS departments_UDFTest
	(
	deptNo String,
	deptName String
	)
	Row format delimited
	fields terminated by '\t'
	LOCATION '/user/akhanolk/hiveProject';

	b) Quick test:

	Select * from departments_UDFTest;

	d001 Marketing
	d002 Finance
	d003 Human Resources
	d004 Production
	d005 Development
	d006 Quality Management
	d007 Sales
	d008 NULL
	d009 Customer Service
	package khanolkar.HiveUDFs;

	import org.apache.hadoop.hive.ql.exec.Description;
	import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
	import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
	import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
	import org.apache.hadoop.hive.ql.metadata.HiveException;
	import org.apache.hadoop.hive.ql.udf.UDFType;
	import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
	import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
	import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

	@UDFType(deterministic = true)
	@Description(name = "NVL2", value = "NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise returns expr2;", extended = "NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise retruns expr2")
	public class NVL2GenericUDF extends GenericUDF {

	private GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
	private ObjectInspector[] argumentOIs;

	@Override
	public ObjectInspector initialize(ObjectInspector[] arguments)
	throws UDFArgumentException {
	argumentOIs = arguments;

	// First check - do we have the right number of arguments?
	if (arguments.length != 3) {
	throw new UDFArgumentLengthException(
	"The operator 'NVL2' accepts 3 arguments.");
	}

	// Second check - throw exception if any complex types have been passed
	// as parameters
	if (arguments[0].getCategory() != ObjectInspector.Category.PRIMITIVE
	\|\| arguments[1].getCategory() != ObjectInspector.Category.PRIMITIVE
	\|\| arguments[2].getCategory() != ObjectInspector.Category.PRIMITIVE)
	throw new UDFArgumentTypeException(0,
	"Only primitive type arguments are accepted");

	// Third check - throw exception if the data types across parameters are
	// different
	if (!(returnOIResolver.update(arguments[0]) && returnOIResolver
	.update(arguments[1]))
	\|\| !(returnOIResolver.update(arguments[1]) && returnOIResolver
	.update(arguments[2]))) {
	throw new UDFArgumentTypeException(2,
	"The arguments of function NLV2 should have the same type, "
	+ "but they are different: \""
	+ arguments[0].getTypeName() + "\" and \""
	+ arguments[1].getTypeName() + "\" and \""
	+ arguments[2].getTypeName() + "\"");
	}
	returnOIResolver = new GenericUDFUtils.ReturnObjectInspectorResolver(
	true);

	return returnOIResolver.get();
	}

	@Override
	public Object evaluate(DeferredObject[] arguments) throws HiveException {
	// The NVL2 functionality
	Object retVal = returnOIResolver.convertIfNecessary(arguments[0].get(),
	argumentOIs[0]);
	if (retVal == null) {
	retVal = returnOIResolver.convertIfNecessary(arguments[2].get(),
	argumentOIs[2]);
	} else {
	retVal = returnOIResolver.convertIfNecessary(arguments[1].get(),
	argumentOIs[1]);
	}
	return retVal;
	}

	@Override
	public String getDisplayString(String[] children) {
	StringBuilder sb = new StringBuilder();
	sb.append("if ");
	sb.append(children[0]);
	sb.append(" is null ");
	sb.append("returns");
	sb.append(children[2]);
	sb.append("else ");
	sb.append("returns");
	sb.append(children[1]);
	return sb.toString();
	}

	}
	Concepts
	-------------------------
	There are three methods-
	1. initialize() - called once, at first. The goal of this method is to determine the return type
	from the arguments. The user can also throw an Exception to signal that bad types are being sent
	to the method. The returnOIResolver is a built-in class that determines the return type by finding
	the type of non-null variables and using that type. The ObjectInspector is used to the transform
	raw records into objects that Hive can access. The initialize() method is passed an
	ObjectInspector for each argument

	2. evaluate() - where the logic for the function should be written.
	The evaluate method has access to the values passed to the method stored in an array of DeferredObject
	values. The returnOIResolver created in the initialize method is used to get values from the
	DeferredObjects.

	3. getDisplayString() - The final method to override is getDisplayString(), is used inside the Hadoop
	tasks to display debugging information when the function is being used.

	Annotations:
	@UDFType(deterministic = true) annotation: Indicates that the UDF returns the same value any time its
	called
	@Description(...) annotation: Includes information that is displayed when you do a describe on the UDF
	****************************
	Expected results
	****************************

	Query:
	select deptNo,NVL2(deptName,deptName,'Procrastrination') from departments_UDFTest;

	The null in the department name for department d008, should be returned as "Procrastrination".
	For the rest of the records, the query should return the data in Hive, as is.
	*******************************
	Testing the UDF
	*******************************
	1) Add jar
	hive> add jar hiveProject/jars/NVL2GenericUDF.jar;

	2) Create alias for the function
	hive> CREATE TEMPORARY FUNCTION NVL2
	AS 'khanolkar.HiveUDFs.NVL2GenericUDF';

	3) Test the description provided
	hive> DESCRIBE FUNCTION NVL2;
	OK
	NVL2(expr1,expr2,expr3) returns expr3, if expr1 is null, otherwise returns expr2;

	4) Test if there are checks in place for number of parameters
	hive> select deptNo,NVL2(deptName,deptName) from departments_UDFTest;
	FAILED: SemanticException [Error 10015]: Line 1:14 Arguments length mismatch 'deptName': The operator 'NVL2' accepts 3 arguments.

	hive> select deptNo,NVL2(deptName,deptName,123,1) from departments_UDFTest;
	FAILED: SemanticException [Error 10015]: Line 1:14 Arguments length mismatch '1': The operator 'NVL2' accepts 3 arguments.

	5) Results
	hive> select deptNo,NVL2(deptName,deptName,'Procrastrination') from departments_UDFTest;
	OK
	d001 Marketing
	d002 Finance
	d003 Human Resources
	d004 Production
	d005 Development
	d006 Quality Management
	d007 Sales
	d008 Procrastrination
	d009 Customer Service