Created
December 17, 2016 14:01
-
-
Save maasg/28109dc3ce14b894bde3dab40a42ab4b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "simpleParser", | |
| "user_save_timestamp": "1970-01-01T01:00:00.000Z", | |
| "auto_save_timestamp": "1970-01-01T01:00:00.000Z", | |
| "language_info": { | |
| "name": "scala", | |
| "file_extension": "scala", | |
| "codemirror_mode": "text/x-scala" | |
| }, | |
| "trusted": true, | |
| "customLocalRepo": null, | |
| "customRepos": null, | |
| "customDeps": null, | |
| "customImports": null, | |
| "customArgs": null, | |
| "customSparkConf": null | |
| }, | |
| "cells": [ | |
| { | |
| "metadata": { | |
| "id": "3DEE3EAD68454E10ABF852DA3AC42986" | |
| }, | |
| "cell_type": "markdown", | |
| "source": "Created with the [Spark Notebook](http://spark-notebook.io)." | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "id": "8B10C3D6F82E4FCFA9D0F758644B1B20" | |
| }, | |
| "cell_type": "code", | |
| "source": "val dataSample =\"\"\"2016-11-10T07:01:37|AAA|S16.12|MN-MN/AAA-329044|288364|2|3\n2016-11-10T07:01:37|BBB|S16.12|MN-MN/AAA-329044/BBB-1|304660|0|0\n2016-11-10T07:01:37|TSB|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1|332164|NA|NA\n2016-11-10T07:01:37|RX|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1/RX-1|357181|0|1\"\"\".split('\\n')", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "dataSample: Array[String] = Array(2016-11-10T07:01:37|AAA|S16.12|MN-MN/AAA-329044|288364|2|3, 2016-11-10T07:01:37|BBB|S16.12|MN-MN/AAA-329044/BBB-1|304660|0|0, 2016-11-10T07:01:37|TSB|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1|332164|NA|NA, 2016-11-10T07:01:37|RX|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1/RX-1|357181|0|1)\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 1, | |
| "time": "Took: 869 milliseconds, at 2016-12-17 14:31" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "id": "8B59014E7EC945DF887447EB2F4989C3" | |
| }, | |
| "cell_type": "code", | |
| "source": "val data = sparkContext.parallelize(dataSample)", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "data: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[1] at parallelize at <console>:79\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 4, | |
| "time": "Took: 553 milliseconds, at 2016-12-17 14:31" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "id": "D4D4B12D0604468681CE4F71892AE1AC" | |
| }, | |
| "cell_type": "code", | |
| "source": "val records= data.map(line=> line.split(\"\\\\|\"))", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "records: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[6] at map at <console>:81\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 20, | |
| "time": "Took: 399 milliseconds, at 2016-12-17 14:50" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "id": "541B139FD0294B3FACFEAF3155A65CC4" | |
| }, | |
| "cell_type": "code", | |
| "source": "val numberExtractor = \"\\\\d+\".r.unanchored", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "numberExtractor: scala.util.matching.UnanchoredRegex = \\d+\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 11, | |
| "time": "Took: 418 milliseconds, at 2016-12-17 14:45" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "presentation": { | |
| "tabs_state": "{\n \"tab_id\": \"#tab1319781912-0\"\n}", | |
| "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" | |
| }, | |
| "id": "0E9C32CA99A24F4482C5464E73CD0D01" | |
| }, | |
| "cell_type": "code", | |
| "source": "val field3Exploded = records.map{arr => arr.take(3) ++ numberExtractor.findAllIn(arr.drop(3).head) ++ arr.drop(4)}", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "field3Exploded: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[7] at map at <console>:85\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 21, | |
| "time": "Took: 519 milliseconds, at 2016-12-17 14:51" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": false, | |
| "presentation": { | |
| "tabs_state": "{\n \"tab_id\": \"#tab1744608766-0\"\n}", | |
| "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" | |
| }, | |
| "id": "9824FCDE9D184A2B8CF699E03515F32A" | |
| }, | |
| "cell_type": "code", | |
| "source": "field3Exploded.collect.foreach(arr=> println(arr.mkString(\",\")))", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": "2016-11-10T07:01:37,AAA,S16.12,329044,288364,2,3\n2016-11-10T07:01:37,BBB,S16.12,329044,1,304660,0,0\n2016-11-10T07:01:37,TSB,S16.12,329044,1,1,332164,NA,NA\n2016-11-10T07:01:37,RX,S16.12,329044,1,1,1,357181,0,1\n" | |
| }, | |
| { | |
| "metadata": {}, | |
| "data": { | |
| "text/html": "" | |
| }, | |
| "output_type": "execute_result", | |
| "execution_count": 22, | |
| "time": "Took: 643 milliseconds, at 2016-12-17 14:51" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true, | |
| "input_collapsed": false, | |
| "collapsed": true, | |
| "id": "D2370FE5A35E499886DF2B0532B22B88" | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "outputs": [] | |
| } | |
| ], | |
| "nbformat": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment