nicokosi · August 19, 2018 16:18
diff --git a/spark-word-count.ipynb b/spark-word-count.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Spark-word-count.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "[View in Colaboratory](https://colab.research.google.com/gist/nicokosi/3814ce9aeda33f40d1e1befa0a9982d9/spark-word-count.ipynb)"
      ]
    },
    {
      "metadata": {
        "id": "8GKSS9wR9z6P",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "Install [Apache Spark for Python](https://spark.apache.org/docs/latest):"
      ]
    },
    {
      "metadata": {
        "id": "Z_gZNjXb7Iic",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 245
        },
        "outputId": "1358d581-5f61-4f63-a4b8-51243fab5fad"
      },
      "cell_type": "code",
      "source": [
        "!pip install pyspark"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting pyspark\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ee/2f/709df6e8dc00624689aa0a11c7a4c06061a7d00037e370584b9f011df44c/pyspark-2.3.1.tar.gz (211.9MB)\n",
            "\u001b[K    100% |████████████████████████████████| 211.9MB 128kB/s \n",
            "\u001b[?25hCollecting py4j==0.10.7 (from pyspark)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n",
            "\u001b[K    100% |████████████████████████████████| 204kB 23.5MB/s \n",
            "\u001b[?25hBuilding wheels for collected packages: pyspark\n",
            "  Running setup.py bdist_wheel for pyspark ... \u001b[?25l-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \b/\b \b-\b \b\\\b \b|\b \bdone\n",
            "\u001b[?25h  Stored in directory: /content/.cache/pip/wheels/37/48/54/f1b63f0dbb729e20c92f1bbcf1c53c03b300e0b93ca1781526\n",
            "Successfully built pyspark\n",
            "Installing collected packages: py4j, pyspark\n",
            "Successfully installed py4j-0.10.7 pyspark-2.3.1\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "bliUVBXK-JUX",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "Run one of [Spark code examples](https://github.com/apache/spark/tree/master/examples), [\"word count\"](https://github.com/apache/spark/blob/master/examples/src/main/python/wordcount.py):"
      ]
    },
    {
      "metadata": {
        "id": "wvRc6m3p75xM",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 165
        },
        "outputId": "d810daea-1c4f-4612-ede6-daacbaee3150"
      },
      "cell_type": "code",
      "source": [
        "from __future__ import print_function\n",
        "\n",
        "import sys\n",
        "from operator import add\n",
        "\n",
        "from pyspark.sql import SparkSession\n",
        "\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    if len(sys.argv) != 2:\n",
        "        print(\"Usage: wordcount <file>\", file=sys.stderr)\n",
        "        sys.exit(-1)\n",
        "\n",
        "    spark = SparkSession\\\n",
        "        .builder\\\n",
        "        .appName(\"PythonWordCount\")\\\n",
        "        .getOrCreate()\n",
        "\n",
        "    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])\n",
        "    counts = lines.flatMap(lambda x: x.split(' ')) \\\n",
        "                  .map(lambda x: (x, 1)) \\\n",
        "                  .reduceByKey(add)\n",
        "    output = counts.collect()\n",
        "    for (word, count) in output:\n",
        "        print(\"%s: %i\" % (word, count))\n",
        "\n",
        "spark.stop()"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Usage: wordcount <file>\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "error",
          "ename": "SystemExit",
          "evalue": "ignored",
          "traceback": [
            "An exception has occurred, use %tb to see the full traceback.\n",
            "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m -1\n"
          ]
        },
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2890: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n",
            "  warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n"
          ],
          "name": "stderr"
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "Spark-word-count.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"[View in Colaboratory](https://colab.research.google.com/gist/nicokosi/3814ce9aeda33f40d1e1befa0a9982d9/spark-word-count.ipynb)"
	]
	},
	{
	"metadata": {
	"id": "8GKSS9wR9z6P",
	"colab_type": "text"
	},
	"cell_type": "markdown",
	"source": [
	"Install [Apache Spark for Python](https://spark.apache.org/docs/latest):"
	]
	},
	{
	"metadata": {
	"id": "Z_gZNjXb7Iic",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 245
	},
	"outputId": "1358d581-5f61-4f63-a4b8-51243fab5fad"
	},
	"cell_type": "code",
	"source": [
	"!pip install pyspark"
	],
	"execution_count": 10,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Collecting pyspark\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/ee/2f/709df6e8dc00624689aa0a11c7a4c06061a7d00037e370584b9f011df44c/pyspark-2.3.1.tar.gz (211.9MB)\n",
	"\u001b[K 100% \|████████████████████████████████\| 211.9MB 128kB/s \n",
	"\u001b[?25hCollecting py4j==0.10.7 (from pyspark)\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n",
	"\u001b[K 100% \|████████████████████████████████\| 204kB 23.5MB/s \n",
	"\u001b[?25hBuilding wheels for collected packages: pyspark\n",
	" Running setup.py bdist_wheel for pyspark ... \u001b[?25l-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \b/\b \b-\b \b\\\b \b\|\b \bdone\n",
	"\u001b[?25h Stored in directory: /content/.cache/pip/wheels/37/48/54/f1b63f0dbb729e20c92f1bbcf1c53c03b300e0b93ca1781526\n",
	"Successfully built pyspark\n",
	"Installing collected packages: py4j, pyspark\n",
	"Successfully installed py4j-0.10.7 pyspark-2.3.1\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"id": "bliUVBXK-JUX",
	"colab_type": "text"
	},
	"cell_type": "markdown",
	"source": [
	"Run one of [Spark code examples](https://github.com/apache/spark/tree/master/examples), [\"word count\"](https://github.com/apache/spark/blob/master/examples/src/main/python/wordcount.py):"
	]
	},
	{
	"metadata": {
	"id": "wvRc6m3p75xM",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 165
	},
	"outputId": "d810daea-1c4f-4612-ede6-daacbaee3150"
	},
	"cell_type": "code",
	"source": [
	"from __future__ import print_function\n",
	"\n",
	"import sys\n",
	"from operator import add\n",
	"\n",
	"from pyspark.sql import SparkSession\n",
	"\n",
	"\n",
	"if __name__ == \"__main__\":\n",
	" if len(sys.argv) != 2:\n",
	" print(\"Usage: wordcount <file>\", file=sys.stderr)\n",
	" sys.exit(-1)\n",
	"\n",
	" spark = SparkSession\\\n",
	" .builder\\\n",
	" .appName(\"PythonWordCount\")\\\n",
	" .getOrCreate()\n",
	"\n",
	" lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])\n",
	" counts = lines.flatMap(lambda x: x.split(' ')) \\\n",
	" .map(lambda x: (x, 1)) \\\n",
	" .reduceByKey(add)\n",
	" output = counts.collect()\n",
	" for (word, count) in output:\n",
	" print(\"%s: %i\" % (word, count))\n",
	"\n",
	"spark.stop()"
	],
	"execution_count": 12,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Usage: wordcount <file>\n"
	],
	"name": "stderr"
	},
	{
	"output_type": "error",
	"ename": "SystemExit",
	"evalue": "ignored",
	"traceback": [
	"An exception has occurred, use %tb to see the full traceback.\n",
	"\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m -1\n"
	]
	},
	{
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2890: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n",
	" warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n"
	],
	"name": "stderr"
	}
	]
	}
	]
	}