Created
September 12, 2015 12:45
-
-
Save jarutis/4f66d8300e7e10b5a692 to your computer and use it in GitHub Desktop.
image vectorisation transformer for spark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| {"metadata":{"name":"Image tests","user_save_timestamp":"1970-01-01T00:00:00.000Z","auto_save_timestamp":"1970-01-01T00:00:00.000Z","language_info":{"name":"scala","file_extension":"scala","codemirror_mode":"text/x-scala"},"trusted":true,"customLocalRepo":"/home/jjarutis/.m2/repository","customRepos":null,"customDeps":null,"customImports":null,"customArgs":null,"customSparkConf":{"spark.app.name":"Images","spark.master":"yarn-client","spark.executor.memory":"3G","spark.executor.instances":"20","spark.sql.shuffle.partitions":"400","spark.executor.cores":"3","spark.yarn.jar":"hdfs:///user/jjarutis/spark-assembly-1.4.1-hadoop2.6.0-cdh5.4.4.jar"}},"cells":[{"metadata":{"trusted":true,"input_collapsed":false,"output_stream_collapsed":true,"collapsed":false},"cell_type":"code","source":":dp\ndeeplearning4j:dl4j-spark-ml:0.4-rc1.2\norg.nd4j % canova-nd4j-image % 0.0.0.6\ncom.twelvemonkeys.imageio % imageio-core % 3.1.1\n- org.apache.hadoop % _ % _\n- org.apache.spark % _ % _","outputs":[{"name":"stdout","output_type":"stream","text":"warning: there were 4 feature warning(s); re-run with -feature for details\nglobalScope.jars: Array[String] = [Ljava.lang.String;@229ae253\nres4: List[String] = List(/home/jjarutis/.m2/repository/cache/org.nd4j/canova-api/jars/canova-api-0.0.0.6.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.5.1/jackson-annotations-2.5.1.jar, /home/jjarutis/.m2/repository/com/google/guava/guava/18.0/guava-18.0.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.5.1/jackson-databind-2.5.1.jar, /home/jjarutis/.m2/repository/com/fasterxml/jackson/dataformat/jackson-dataformat-yaml/2.5.1/jackson-dataformat-yaml-2.5.1.jar, /home/jjarutis/.m2/repository/io/netty/netty-buffer/4.0.28.Final/netty-buffer-4.0.28.Final.jar, /home/jjarutis/.m2/repository/org/nd4j/nd4j-bytebuddy/0.4-rc1.2/nd4j-bytebuddy-0.4-rc1.2.jar, /home/jja..."},{"metadata":{},"data":{"text/html":"<div class=\"container-fluid\"><div><div class=\"col-md-12\"><div>\n <script data-this=\"{"dataId":"anonf72cbdccaea85f83053f5089d3de40ba","dataInit":[{"string value":"/home/jjarutis/.m2/repository/cache/org.nd4j/canova-api/jars/canova-api-0.0.0.6.jar"},{"string value":"/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.5.1/jackson-annotations-2.5.1.jar"},{"string value":"/home/jjarutis/.m2/repository/com/google/guava/guava/18.0/guava-18.0.jar"},{"string value":"/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.5.1/jackson-databind-2.5.1.jar"},{"string value":"/home/jjarutis/.m2/repository/com/fasterxml/jackson/dataformat/jackson-dataformat-yaml/2.5.1/jackson-dataformat-yaml-2.5.1.jar"},{"string value":"/home/jjarutis/.m2/repository/io/netty/netty-buffer/4.0.28.Final/netty-buffer-4.0.28.Final.jar"},{"string value":"/home/jjarutis/.m2/repository/org/nd4j/nd4j-bytebuddy/0.4-rc1.2/nd4j-bytebuddy-0.4-rc1.2.jar"},{"string value":"/home/jjarutis/.m2/repository/cache/org.nd4j/canova-data-image/jars/canova-data-image-0.0.0.6.jar"},{"string value":"/home/jjarutis/.m2/repository/org/springframework/spring-core/3.2.5.RELEASE/spring-core-3.2.5.RELEASE.jar"},{"string value":"/home/jjarutis/.m2/repository/deeplearning4j/dl4j-spark-ml/0.4-rc1.2/dl4j-spark-ml-0.4-rc1.2.jar"},{"string value":"/home/jjarutis/.m2/repository/com/github/jai-imageio/jai-imageio-core/1.3.0/jai-imageio-core-1.3.0.jar"},{"string value":"/home/jjarutis/.m2/repository/commons-logging/commons-logging/1.1.1/commons-logging-1.1.1.jar"},{"string value":"/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-bmp/3.1.1/imageio-bmp-3.1.1.jar"},{"string value":"/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-core/3.1.1/imageio-core-3.1.1.jar"},{"string value":"/home/jjarutis/.m2/repository/io/netty/netty-common/4.0.28.Final/netty-common-4.0.28.Final.jar"},{"string value":"/home/jjarutis/.m2/repository/com/twelvemonkeys/imageio/imageio-tiff/3.1.1/imageio-tiff-3.1.1.jar"},{"string value":"/home/jjarutis/.m2/repository/cache/org.nd4j/nd4j-x86/jars/nd4j-x86-0.4-rc1.2.jar"},{"string value":"/home/jjarutis/.m2/repository/org/apache/commons/commons-compress/1.8/commons-compress-1.8.jar"},{"string value":"/home/jjarutis/.m2/repository/cache/org.deeplearning4j/dl4j-spark-ml/jars/dl4j-spark-ml-0.4-rc1.2.jar"},{"string value":"/home/jjarutis/.m2/repository/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar"},{"string value":"/home/jjarutis/.m2/repository/org/yaml/snakeyaml/1.12/snakeyaml-1.12.jar"},{"string value":"/home/jjarutis/.m2/repository/org/json/json/20131018/json-20131018.jar"},{"string value":"/home/jjarutis/.m2/repository/org/scala-lang/scala-library/2.10.4/scala-library-2.10.4.jar"},{"string value":"/home/jjarutis/.m2/repository/com/twelvemonkeys/common/common-io/3.1.1/common-io-3.1.1.jar"},{"string value":"/home/jjarutis/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.5.1/jackson-core-2.5.1.jar"}],"genId":"1543162579"}\" type=\"text/x-scoped-javascript\">/*<![CDATA[*/req(['../javascripts/notebook/playground','../javascripts/notebook/magic/tableChart'], \n function(playground, _magictableChart) {\n // data ==> data-this (in observable.js's scopedEval) ==> this in JS => { dataId, dataInit, ... }\n // this ==> scope (in observable.js's scopedEval) ==> this.parentElement ==> div.container below (toHtml)\n\n playground.call(data,\n this\n ,\n {\n \"f\": _magictableChart,\n \"o\": {\"headers\":[\"string value\"],\"nrow\":56,\"shown\":25,\"width\":600,\"height\":400}\n }\n \n \n \n );\n }\n );/*]]>*/</script>\n </div></div></div></div>"},"output_type":"execute_result","execution_count":1}]},{"metadata":{"trusted":true,"input_collapsed":false,"output_stream_collapsed":true,"collapsed":false},"cell_type":"code","source":"import java.util.UUID\nimport java.net.URL\nimport org.canova.image.loader.ImageLoader\nimport org.apache.spark.ml.UnaryTransformer\nimport org.apache.spark.sql.types.{DataType, StringType}\nimport org.apache.spark.ml.param.{IntParam, ParamValidators}\nimport org.apache.spark.ml.feature.Tokenizer\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\nimport org.deeplearning4j.spark.util.conversions._\nimport org.deeplearning4j.spark.sql.types.VectorUDT","outputs":[{"name":"stdout","output_type":"stream","text":"import java.util.UUID\nimport java.net.URL\nimport org.canova.image.loader.ImageLoader\nimport org.apache.spark.ml.UnaryTransformer\nimport org.apache.spark.sql.types.{DataType, StringType}\nimport org.apache.spark.ml.param.{IntParam, ParamValidators}\nimport org.apache.spark.ml.feature.Tokenizer\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\nimport org.deeplearning4j.spark.util.conversions._\nimport org.deeplearning4j.spark.sql.types.VectorUDT\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":2}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"class ImageVectorizerFromURL(override val uid: String) extends UnaryTransformer[String, Vector, ImageVectorizerFromURL] {\n def this() = this( s\"imgVec_${UUID.randomUUID()}\" ) \n \n val height = new IntParam(this, \"height\", \"image height\", ParamValidators.gtEq(0))\n val width = new IntParam(this, \"width\", \"image width\", ParamValidators.gtEq(0))\n val channels = new IntParam(this, \"channels\", \"number of channels\", ParamValidators.gtEq(0))\n\n setDefault(height -> 28)\n setDefault(width -> 28)\n setDefault(channels -> 3)\n\n def getHeight: Int = $(height)\n def getWidth: Int = $(width)\n def getChannels: Int = $(channels)\n\n def setHeight(value: Int): this.type = set(height, value)\n def setWidth(value: Int): this.type = set(width, value)\n def setChannels(value: Int): this.type = set(channels, value)\n \n val imageLoader = new ImageLoader($(height), $(width), $(channels))\n \n override protected def createTransformFunc: String => Vector = {\n urlString: String => \n val imgStream = new URL(urlString).openStream()\n toVector(imageLoader.asRowVector(imgStream))\n }\n\n override protected def validateInputType(inputType: DataType): Unit = {\n require(inputType == StringType, s\"Input type must be string type but got $inputType.\")\n }\n\n override protected def outputDataType: DataType = VectorUDT()\n}","outputs":[{"name":"stdout","output_type":"stream","text":"defined class ImageVectorizerFromURL\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":8}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"import org.apache.spark.sql.hive.HiveContext\nval sqlContext = new HiveContext(sc)\nimport sqlContext.implicits._","outputs":[{"name":"stdout","output_type":"stream","text":"import org.apache.spark.sql.hive.HiveContext\nsqlContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@5727e405\nimport sqlContext.implicits._\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":9}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val links = Seq(Tuple1(\"http://www.foldl.me/uploads/2015/conditional-gans-face-generation/lfwcrop/Akbar_Al_Baker_0001.jpg\"))\n .toDF(\"link\")\nlinks.show","outputs":[{"name":"stdout","output_type":"stream","text":"+--------------------+\n| link|\n+--------------------+\n|http://www.foldl....|\n+--------------------+\n\nlinks: org.apache.spark.sql.DataFrame = [link: string]\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":10}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val imgVectorizer = new ImageVectorizerFromURL()\n .setInputCol(\"link\")\n .setOutputCol(\"image_vector\")\n .setHeight(64)\n .setWidth(64)\n .setChannels(3)","outputs":[{"name":"stdout","output_type":"stream","text":"imgVectorizer: ImageVectorizerFromURL = imgVec_3393257b-e527-49c4-af97-bc8fdf208fcf\n"},{"metadata":{},"data":{"text/html":"imgVec_3393257b-e527-49c4-af97-bc8fdf208fcf"},"output_type":"execute_result","execution_count":11}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":false},"cell_type":"code","source":"val vectorised_links = imgVectorizer.transform(links)\nvectorised_links.show","outputs":[{"name":"stdout","output_type":"stream","text":"+--------------------+--------------------+\n| link| image_vector|\n+--------------------+--------------------+\n|http://www.foldl....|[205.0,202.0,193....|\n+--------------------+--------------------+\n\nvectorised_links: org.apache.spark.sql.DataFrame = [link: string, image_vector: vector]\n"},{"metadata":{},"data":{"text/html":""},"output_type":"execute_result","execution_count":12}]},{"metadata":{"trusted":true,"input_collapsed":false,"collapsed":true},"cell_type":"code","source":"","outputs":[]}],"nbformat":4} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment