Skip to content

Instantly share code, notes, and snippets.

@maasg
Created December 17, 2016 14:01
Show Gist options
  • Select an option

  • Save maasg/28109dc3ce14b894bde3dab40a42ab4b to your computer and use it in GitHub Desktop.

Select an option

Save maasg/28109dc3ce14b894bde3dab40a42ab4b to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "simpleParser",
"user_save_timestamp": "1970-01-01T01:00:00.000Z",
"auto_save_timestamp": "1970-01-01T01:00:00.000Z",
"language_info": {
"name": "scala",
"file_extension": "scala",
"codemirror_mode": "text/x-scala"
},
"trusted": true,
"customLocalRepo": null,
"customRepos": null,
"customDeps": null,
"customImports": null,
"customArgs": null,
"customSparkConf": null
},
"cells": [
{
"metadata": {
"id": "3DEE3EAD68454E10ABF852DA3AC42986"
},
"cell_type": "markdown",
"source": "created with the [Spark Notebook](http://spark-notebook.io)"
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"id": "8B10C3D6F82E4FCFA9D0F758644B1B20"
},
"cell_type": "code",
"source": "val dataSample =\"\"\"2016-11-10T07:01:37|AAA|S16.12|MN-MN/AAA-329044|288364|2|3\n2016-11-10T07:01:37|BBB|S16.12|MN-MN/AAA-329044/BBB-1|304660|0|0\n2016-11-10T07:01:37|TSB|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1|332164|NA|NA\n2016-11-10T07:01:37|RX|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1/RX-1|357181|0|1\"\"\".split('\\n')",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "dataSample: Array[String] = Array(2016-11-10T07:01:37|AAA|S16.12|MN-MN/AAA-329044|288364|2|3, 2016-11-10T07:01:37|BBB|S16.12|MN-MN/AAA-329044/BBB-1|304660|0|0, 2016-11-10T07:01:37|TSB|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1|332164|NA|NA, 2016-11-10T07:01:37|RX|S16.12|MN-MN/AAA-329044/BBB-1/TSB-1/RX-1|357181|0|1)\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 1,
"time": "Took: 869 milliseconds, at 2016-12-17 14:31"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"id": "8B59014E7EC945DF887447EB2F4989C3"
},
"cell_type": "code",
"source": "val data = sparkContext.parallelize(dataSample)",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "data: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[1] at parallelize at <console>:79\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 4,
"time": "Took: 553 milliseconds, at 2016-12-17 14:31"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"id": "D4D4B12D0604468681CE4F71892AE1AC"
},
"cell_type": "code",
"source": "val records= data.map(line=> line.split(\"\\\\|\"))",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "records: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[6] at map at <console>:81\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 20,
"time": "Took: 399 milliseconds, at 2016-12-17 14:50"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"id": "541B139FD0294B3FACFEAF3155A65CC4"
},
"cell_type": "code",
"source": "val numberExtractor = \"\\\\d+\".r.unanchored",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "numberExtractor: scala.util.matching.UnanchoredRegex = \\d+\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 11,
"time": "Took: 418 milliseconds, at 2016-12-17 14:45"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"presentation": {
"tabs_state": "{\n \"tab_id\": \"#tab1319781912-0\"\n}",
"pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}"
},
"id": "0E9C32CA99A24F4482C5464E73CD0D01"
},
"cell_type": "code",
"source": "val field3Exploded = records.map{arr => arr.take(3) ++ numberExtractor.findAllIn(arr.drop(3).head) ++ arr.drop(4)}",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "field3Exploded: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[7] at map at <console>:85\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 21,
"time": "Took: 519 milliseconds, at 2016-12-17 14:51"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": false,
"presentation": {
"tabs_state": "{\n \"tab_id\": \"#tab1744608766-0\"\n}",
"pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}"
},
"id": "9824FCDE9D184A2B8CF699E03515F32A"
},
"cell_type": "code",
"source": "field3Exploded.collect.foreach(arr=> println(arr.mkString(\",\")))",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "2016-11-10T07:01:37,AAA,S16.12,329044,288364,2,3\n2016-11-10T07:01:37,BBB,S16.12,329044,1,304660,0,0\n2016-11-10T07:01:37,TSB,S16.12,329044,1,1,332164,NA,NA\n2016-11-10T07:01:37,RX,S16.12,329044,1,1,1,357181,0,1\n"
},
{
"metadata": {},
"data": {
"text/html": ""
},
"output_type": "execute_result",
"execution_count": 22,
"time": "Took: 643 milliseconds, at 2016-12-17 14:51"
}
]
},
{
"metadata": {
"trusted": true,
"input_collapsed": false,
"collapsed": true,
"id": "D2370FE5A35E499886DF2B0532B22B88"
},
"cell_type": "code",
"source": "",
"outputs": []
}
],
"nbformat": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment