Last active
January 23, 2019 10:30
-
-
Save stdatalabs/aaf286a9155c6e0c60d70dd1751155d8 to your computer and use it in GitHub Desktop.
A Spark UDF to find the MD5 message digest of a column. More @ stdatalabs.blogspot.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.{SparkConf, SparkContext} | |
import org.apache.spark.sql._ | |
import org.apache.spark.sql.hive.HiveContext | |
val hiveContext = new HiveContext(sc) | |
import hiveContext.implicits._ | |
import hiveContext.sql | |
import sqlContext.implicits._ | |
import java.security.MessageDigest | |
/**
 * A Spark UDF to find the MD5 message digest of a column.
 *
 * More discussion at stdatalabs.blogspot.com
 *
 * @author Sachin Thirumala
 */
/**
 * Row schema for the customer_info table.
 *
 * @param name    customer name
 * @param dob     date of birth, stored as text
 * @param address street address
 * @param city    city
 */
case class customer_info(
  name: String,
  dob: String,
  address: String,
  city: String
)
// Sample rows used to demonstrate the UDF.
// NOTE(review): "31-09-1973" is not a real calendar date (September has 30 days).
// It is kept verbatim because dob is plain text and changing it would change the hash.
val details = sc.parallelize(Seq(
  customer_info(name = "Sachin", dob = "10-10-1972", address = "#24, Malad", city = "Mumbai"),
  customer_info(name = "Sourav", dob = "31-09-1973", address = "#41, ultadanga", city = "Kolkata"),
  customer_info(name = "Sehwag", dob = "23-10-1981", address = "#23, Dwaraka", city = "Delhi"),
  customer_info(name = "Rahul", dob = "31-12-1971", address = "#41, Whitefield", city = "Bangalore")
))
// Turn the RDD into a DataFrame (columns come from the case class fields) and
// expose it to SQL under the name "customer_info".
val customerDF = sqlContext.createDataFrame[customer_info](details)
customerDF.registerTempTable("customer_info")
/**
 * Computes the MD5 message digest of a string, rendered as upper-case hexadecimal.
 *
 * The text is encoded as UTF-8 before hashing, so the result does not depend on the
 * platform default charset of the JVM that happens to run the function.
 *
 * @param s text to hash; may be null (for example a NULL column value)
 * @return the 32-character upper-case hex MD5 digest of `s`, or null when `s` is null
 */
def dbms_crypto(s: String): String = {
  import java.nio.charset.StandardCharsets

  // Spark hands NULL column values to a String UDF as null; pass them through
  // instead of failing the whole task with a NullPointerException.
  Option(s).map { text =>
    // A fresh MessageDigest per call keeps the function free of shared mutable state.
    val digest = MessageDigest.getInstance("MD5")
    digest
      .digest(text.getBytes(StandardCharsets.UTF_8))
      // Mask to 0..255 so negative bytes still render as exactly two hex digits.
      .map(b => "%02X".format(b & 0xFF))
      .mkString
  }.orNull
}
// Make dbms_crypto callable from Spark SQL under the same name.
sqlContext.udf.register("dbms_crypto", (s: String) => dbms_crypto(s))
// Hash the concatenation of name, dob and address for every row and print the result.
sqlContext.sql(
  "select dbms_crypto(CONCAT(name,dob,address)) hash_key from customer_info"
).show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment