Skip to content

Instantly share code, notes, and snippets.

@jarutis
Created September 12, 2015 12:45
Show Gist options
  • Save jarutis/4f66d8300e7e10b5a692 to your computer and use it in GitHub Desktop.
Save jarutis/4f66d8300e7e10b5a692 to your computer and use it in GitHub Desktop.
Image vectorisation transformer for Spark
Display the source blob
Display the rendered blob
Raw
{"metadata":{"name":"Image tests","user_save_timestamp":"1970-01-01T00:00:00.000Z","auto_save_timestamp":"1970-01-01T00:00:00.000Z","language_info":{"name":"scala","file_extension":"scala","codemirror_mode":"text/x-scala"},"trusted":true,"customLocalRepo":"/home/jjarutis/.m2/repository","customRepos":null,"customDeps":null,"customImports":null,"customArgs":null,"customSparkConf":{"spark.app.name":"Images","spark.master":"yarn-client","spark.executor.memory":"3G","spark.executor.instances":"20","spark.sql.shuffle.partitions":"400","spark.executor.cores":"3","spark.yarn.jar":"hdfs:///user/jjarutis/spark-assembly-1.4.1-hadoop2.6.0-cdh5.4.4.jar"}},"cells":[{"metadata":{"trusted":true,"input_collapsed":false,"output_stream_collapsed":true,"collapsed":false},"cell_type":"code","source":":dp\ndeeplearning4j:dl4j-spark-ml:0.4-rc1.2\norg.nd4j % canova-nd4j-image % 0.0.0.6\ncom.twelvemonkeys.imageio % imageio-core % 3.1.1\n- org.apache.hadoop % _ % _\n- org.apache.spark % _ % _","outputs":[{"name":"stdout","output_type":"stream","text":"warning: there were 4 feature warning(s); re-run with -feature for details\nglobalScope.jars: Array[String] = [Ljava.lang.String;@229ae253\nres4: List[String] = List(/home/jjarutis/.m2/repository/cache/org.nd4j/canova-api/jars/canova-api-0.0.0.6.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.5.1/jackson-annotations-2.5.1.jar, /home/jjarutis/.m2/repository/com/google/guava/guava/18.0/guava-18.0.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.5.1/jackson-databind-2.5.1.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/dataformat/jackson-dataformat-yaml/2.5.1/jackson-dataformat-yaml-2.5.1.jar, /home/jjarutis/.m2/repository/io/netty/netty-buffer/4.0.28.Final/netty-buffer-4.0.28.Final.jar, /home/jjarutis/.m2/repository/org/nd4j/nd4j-bytebuddy/0.4-rc1.2/nd4j-bytebuddy-0.4-rc1.2.jar, /home/jja..."},{"metadata":{},"data":{"text/html":"<div class=\"container-fluid\"><div><div 
class=\"col-md-12\"><div>\n <script data-this=\"{&quot;dataId&quot;:&quot;anonf72cbdccaea85f83053f5089d3de40ba&quot;,&quot;dataInit&quot;:[{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/cache/org.nd4j/canova-api/jars/canova-api-0.0.0.6.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.5.1/jackson-annotations-2.5.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/google/guava/guava/18.0/guava-18.0.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.5.1/jackson-databind-2.5.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/fasterxml/jackson/dataformat/jackson-dataformat-yaml/2.5.1/jackson-dataformat-yaml-2.5.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/io/netty/netty-buffer/4.0.28.Final/netty-buffer-4.0.28.Final.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/nd4j/nd4j-bytebuddy/0.4-rc1.2/nd4j-bytebuddy-0.4-rc1.2.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/cache/org.nd4j/canova-data-image/jars/canova-data-image-0.0.0.6.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/springframework/spring-core/3.2.5.RELEASE/spring-core-3.2.5.RELEASE.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/deeplearning4j/dl4j-spark-ml/0.4-rc1.2/dl4j-spark-ml-0.4-rc1.2.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/github/jai-imageio/jai-imageio-core/1.3.0/jai-imageio-core-1.3.0.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/commons-logging/commons-logging/1.1.1/commons-logging-1.1.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-bmp/3.1.1/imageio-bmp-3.1.1.jar&quot;},{&quot;string 
value&quot;:&quot;/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-core/3.1.1/imageio-core-3.1.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/io/netty/netty-common/4.0.28.Final/netty-common-4.0.28.Final.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-tiff/3.1.1/imageio-tiff-3.1.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/cache/org.nd4j/nd4j-x86/jars/nd4j-x86-0.4-rc1.2.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/apache/commons/commons-compress/1.8/commons-compress-1.8.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/cache/org.deeplearning4j/dl4j-spark-ml/jars/dl4j-spark-ml-0.4-rc1.2.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/yaml/snakeyaml/1.12/snakeyaml-1.12.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/json/json/20131018/json-20131018.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/org/scala-lang/scala-library/2.10.4/scala-library-2.10.4.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/twelvemonkeys/common/common-io/3.1.1/common-io-3.1.1.jar&quot;},{&quot;string value&quot;:&quot;/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.5.1/jackson-core-2.5.1.jar&quot;}],&quot;genId&quot;:&quot;1543162579&quot;}\" type=\"text/x-scoped-javascript\">/*<![CDATA[*/req(['../javascripts/notebook/playground','../javascripts/notebook/magic/tableChart'], \n function(playground, _magictableChart) {\n // data ==> data-this (in observable.js's scopedEval) ==> this in JS => { dataId, dataInit, ... 
}\n // this ==> scope (in observable.js's scopedEval) ==> this.parentElement ==> div.container below (toHtml)\n\n playground.call(data,\n this\n ,\n {\n \"f\": _magictableChart,\n \"o\": {\"headers\":[\"string value\"],\"nrow\":56,\"shown\":25,\"width\":600,\"height\":400}\n }\n \n \n \n );\n }\n );/*]]>*/</script>\n </div></div></div></div>"},"output_type":"execute_result","execution_count":1}]},{"metadata":{"trusted":true,"input_collapsed":false,"output_stream_collapsed":true,"collapsed":false},"cell_type":"code","source":"import java.util.UUID\nimport java.net.URL\nimport org.canova.image.loader.ImageLoader\nimport org.apache.spark.ml.UnaryTransformer\nimport org.apache.spark.sql.types.{DataType, StringType}\nimport org.apache.spark.ml.param.{IntParam, ParamValidators}\nimport org.apache.spark.ml.feature.Tokenizer\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\nimport org.deeplearning4j.spark.util.conversions._\nimport org.deeplearning4j.spark.sql.types.VectorUDT","outputs":[{"name":"stdout","output_type":"stream","text":"import java.util.UUID\nimport java.net.URL\nimport org.canova.image.loader.ImageLoader\nimport org.apache.spark.ml.UnaryTransformer\nimport org.apache.spark.sql.types.{DataType, StringType}\nimport org.apache.spark.ml.param.{IntParam, ParamValidators}\nimport org.apache.spark.ml.feature.Tokenizer\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\nimport org.deeplearning4j.spark.util.conversions._\nimport org.deeplearning4j.spark.sql.types.VectorUDT\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":2}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"/**\n * Spark ML transformer that fetches the image behind each URL in the input column\n * and returns its pixels as a row vector.\n *\n * height, width and channels are passed to canova's ImageLoader and default to 28, 28 and 3.\n */\nclass ImageVectorizerFromURL(override val uid: String) extends UnaryTransformer[String, Vector, ImageVectorizerFromURL] {\n  def this() = this( s\"imgVec_${UUID.randomUUID()}\" )\n\n  val height = new IntParam(this, \"height\", \"image height\", ParamValidators.gtEq(0))\n  val width = new
IntParam(this, \"width\", \"image width\", ParamValidators.gtEq(0))\n  val channels = new IntParam(this, \"channels\", \"number of channels\", ParamValidators.gtEq(0))\n\n  setDefault(height -> 28)\n  setDefault(width -> 28)\n  setDefault(channels -> 3)\n\n  def getHeight: Int = $(height)\n  def getWidth: Int = $(width)\n  def getChannels: Int = $(channels)\n\n  def setHeight(value: Int): this.type = set(height, value)\n  def setWidth(value: Int): this.type = set(width, value)\n  def setChannels(value: Int): this.type = set(channels, value)\n\n  // A def, not a val: a val is evaluated once in the constructor, where only the\n  // defaults are set, so later setHeight/setWidth/setChannels calls were ignored.\n  def imageLoader: ImageLoader = new ImageLoader($(height), $(width), $(channels))\n\n  override protected def createTransformFunc: String => Vector = {\n    // Read the params when the function is created and capture plain Ints, so the\n    // function reflects the current settings and holds no reference to this transformer.\n    val (h, w, c) = ($(height), $(width), $(channels))\n    urlString: String =>\n      val imgStream = new URL(urlString).openStream()\n      // Close the stream even when decoding fails.\n      try toVector(new ImageLoader(h, w, c).asRowVector(imgStream)) finally imgStream.close()\n  }\n\n  override protected def validateInputType(inputType: DataType): Unit = {\n    require(inputType == StringType, s\"Input type must be string type but got $inputType.\")\n  }\n\n  override protected def outputDataType: DataType = VectorUDT()\n}","outputs":[{"name":"stdout","output_type":"stream","text":"defined class ImageVectorizerFromURL\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":8}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"import org.apache.spark.sql.hive.HiveContext\nval sqlContext = new HiveContext(sc)\nimport sqlContext.implicits._","outputs":[{"name":"stdout","output_type":"stream","text":"import org.apache.spark.sql.hive.HiveContext\nsqlContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@5727e405\nimport sqlContext.implicits._\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":9}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val links = 
Seq(Tuple1(\"http://www.foldl.me/uploads/2015/conditional-gans-face-generation/lfwcrop/Akbar_Al_Baker_0001.jpg\"))\n .toDF(\"link\")\nlinks.show","outputs":[{"name":"stdout","output_type":"stream","text":"+--------------------+\n| link|\n+--------------------+\n|http://www.foldl....|\n+--------------------+\n\nlinks: org.apache.spark.sql.DataFrame = [link: string]\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":10}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val imgVectorizer = new ImageVectorizerFromURL()\n .setInputCol(\"link\")\n .setOutputCol(\"image_vector\")\n .setHeight(64)\n .setWidth(64)\n .setChannels(3)","outputs":[{"name":"stdout","output_type":"stream","text":"imgVectorizer: ImageVectorizerFromURL = imgVec_3393257b-e527-49c4-af97-bc8fdf208fcf\n"},{"metadata":{},"data":{"text/html":"imgVec_3393257b-e527-49c4-af97-bc8fdf208fcf"},"output_type":"execute_result","execution_count":11}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val vectorised_links = imgVectorizer.transform(links)\nvectorised_links.show","outputs":[{"name":"stdout","output_type":"stream","text":"+--------------------+--------------------+\n| link| image_vector|\n+--------------------+--------------------+\n|http://www.foldl....|[205.0,202.0,193....|\n+--------------------+--------------------+\n\nvectorised_links: org.apache.spark.sql.DataFrame = [link: string, image_vector: vector]\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":12}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":true},"cell_type":"code","source":"","outputs":[]}],"nbformat":4}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment