Last active
March 12, 2023 15:27
-
-
Save kirisakow/3ad1566387afc19bf359a7f7cc329f3f to your computer and use it in GitHub Desktop.
Run Spark and PySpark in Google Colab
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "toc_visible": true, | |
| "authorship_tag": "ABX9TyMkLMYgQaqDpeJaTu+HFQyc", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/kirisakow/3ad1566387afc19bf359a7f7cc329f3f/run_spark_pyspark_in_colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Run Spark and PySpark in Google Colab\n", | |
| "\n", | |
| "Inspired by https://towardsdatascience.com/pyspark-on-google-colab-101-d31830b238be\n", | |
| "\n", | |
| "Refactored and extended by me." | |
| ], | |
| "metadata": { | |
| "id": "P2XQoc18uBX2" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Setting up PySpark in Colab\n", | |
| "\n", | |
| "Download Spark and unpack the archive:" | |
| ], | |
| "metadata": { | |
| "id": "pyGmNgB1KBDm" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "_RZbj1RquAhT" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# dlcdn.apache.org only hosts current releases; a pinned older release such as 3.3.0 is served from archive.apache.org\n", | |
| "! wget --quiet --continue https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3-scala2.13.tgz\n", | |
| "! tar zxf ./spark-3.3.0-bin-hadoop3-scala2.13.tgz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Set the environment variables:" | |
| ], | |
| "metadata": { | |
| "id": "1L5zNX3ZKQ8q" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import os\n", | |
| "\n", | |
| "# Tell Spark where the JDK and the unpacked Spark distribution live.\n", | |
| "os.environ.update({\n", | |
| "    \"JAVA_HOME\": \"/usr/lib/jvm/java-11-openjdk-amd64/\",\n", | |
| "    \"SPARK_HOME\": \"/content/spark-3.3.0-bin-hadoop3-scala2.13\",\n", | |
| "})" | |
| ], | |
| "metadata": { | |
| "id": "ltLoeMMe6CgS" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Next, install and import the [findspark](https://pypi.org/project/findspark/) library, which locates Spark on the system and makes PySpark importable like a regular library:" | |
| ], | |
| "metadata": { | |
| "id": "76iw8pIeKW-h" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# %pip (rather than !pip) installs into the environment of the running kernel\n", | |
| "%pip install -q findspark\n", | |
| "\n", | |
| "import findspark\n", | |
| "\n", | |
| "# locate the Spark installation so that pyspark can be imported in the next cell\n", | |
| "findspark.init()" | |
| ], | |
| "metadata": { | |
| "id": "0LegObO8KXoE" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Import SparkSession from pyspark.sql and create a SparkSession instance:" | |
| ], | |
| "metadata": { | |
| "id": "fZiiBm77Kqyw" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from pyspark.sql import SparkSession\n", | |
| "\n", | |
| "# Build (or reuse) a local session named \"Colab\" with the Spark UI on port 4050.\n", | |
| "spark = (\n", | |
| "    SparkSession.builder\n", | |
| "    .master(\"local\")\n", | |
| "    .appName(\"Colab\")\n", | |
| "    .config(\"spark.ui.port\", \"4050\")\n", | |
| "    .getOrCreate()\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "id": "H8YhBk8N02tk" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Loading data into PySpark" | |
| ], | |
| "metadata": { | |
| "id": "Vp6PITybK70L" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from pathlib import Path\n", | |
| "\n", | |
| "# Remote sample file and the local path it is downloaded to.\n", | |
| "filename = 'sample_books.json'\n", | |
| "source_location = '/'.join(['https://raw.githubusercontent.com/GarvitArya/pyspark-demo/main', filename])\n", | |
| "target_location = str(Path('/tmp/') / filename)\n", | |
| "\n", | |
| "! wget --quiet --continue {source_location} -O {target_location}\n", | |
| "\n", | |
| "# Read the downloaded JSON into a DataFrame and print its schema.\n", | |
| "df = spark.read.json(target_location)\n", | |
| "df.printSchema()" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "nFRfA9PoK7du", | |
| "outputId": "910c92ed-6f8d-4b86-fb73-ac68670681de" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "root\n", | |
| " |-- author: string (nullable = true)\n", | |
| " |-- edition: string (nullable = true)\n", | |
| " |-- price: double (nullable = true)\n", | |
| " |-- title: string (nullable = true)\n", | |
| " |-- year_written: long (nullable = true)\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "df.show(n=4, truncate=False)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "j26dsrWkAUCI", | |
| "outputId": "ef13c429-faf6-42aa-8fe4-add98398f960" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "+---------------+--------------+-----+----------------+------------+\n", | |
| "|author |edition |price|title |year_written|\n", | |
| "+---------------+--------------+-----+----------------+------------+\n", | |
| "|Austen, Jane |Penguin |18.2 |Northanger Abbey|1814 |\n", | |
| "|Tolstoy, Leo |Penguin |12.7 |War and Peace |1865 |\n", | |
| "|Tolstoy, Leo |Penguin |13.5 |Anna Karenina |1875 |\n", | |
| "|Woolf, Virginia|Harcourt Brace|25.0 |Mrs. Dalloway |1925 |\n", | |
| "+---------------+--------------+-----+----------------+------------+\n", | |
| "only showing top 4 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "df.count()" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "zEXncJDXBaOp", | |
| "outputId": "bec69d67-9d7b-4ab6-bb63-9996ed42385d" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "13" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 7 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Shorten the column name, then preview three columns ordered by edition and author.\n", | |
| "df = df.withColumnRenamed('year_written', 'year')\n", | |
| "(\n", | |
| "    df.select('year', 'edition', 'author')\n", | |
| "    .orderBy('edition', 'author')\n", | |
| "    .show(5)\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "vERvGMVDBvQh", | |
| "outputId": "fec5116b-3171-4d46-8c46-12dd5374f32e" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "+----+-----------------+--------------------+\n", | |
| "|year| edition| author|\n", | |
| "+----+-----------------+--------------------+\n", | |
| "|1999| Harcourt Brace|Cunnningham, Michael|\n", | |
| "|2000| Harcourt Brace| Rowling, J.K.|\n", | |
| "|1925| Harcourt Brace| Woolf, Virginia|\n", | |
| "|1967|Harper Perennial| Marquez|\n", | |
| "|1814| Penguin| Austen, Jane|\n", | |
| "+----+-----------------+--------------------+\n", | |
| "only showing top 5 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Keep titled books written after 1950 and priced above 10, listed from most to least expensive.\n", | |
| "df_filtered = df.filter(\"year > 1950 AND price > 10 AND title IS NOT NULL\")\n", | |
| "(\n", | |
| "    df_filtered.select('title', 'price', 'year')\n", | |
| "    .orderBy('price', ascending=False)\n", | |
| "    .show(n=10, truncate=False)\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "MDYYt0-eDxR3", | |
| "outputId": "e8df9d00-123e-4523-bb4b-ab3a9cac7fe5" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "+-----------------------------+-----+----+\n", | |
| "|title |price|year|\n", | |
| "+-----------------------------+-----+----+\n", | |
| "|Harry Potter |19.95|2000|\n", | |
| "|One Hundred Years of Solitude|14.0 |1967|\n", | |
| "|The Hours |12.35|1999|\n", | |
| "+-----------------------------+-----+----+\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Distinct (title, year) pairs among the filtered books whose title mentions Harry Potter.\n", | |
| "(\n", | |
| "    df_filtered.select('title', 'year')\n", | |
| "    .filter(\"title LIKE '%Harry Potter%'\")\n", | |
| "    .distinct()\n", | |
| "    .orderBy('year', ascending=True)\n", | |
| "    .show(n=20, truncate=False)\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "F26mDuHIIXo6", | |
| "outputId": "e0c8c42a-9f23-4068-c436-f63c784e9981" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "+------------+----+\n", | |
| "|title |year|\n", | |
| "+------------+----+\n", | |
| "|Harry Potter|2000|\n", | |
| "+------------+----+\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Import the functions module under an alias: `from pyspark.sql.functions import max`\n", | |
| "# would replace Python's built-in max() for every cell run afterwards.\n", | |
| "from pyspark.sql import functions as F\n", | |
| "\n", | |
| "# Aggregate to a single row/column and pull out the scalar value.\n", | |
| "highest_price = df.agg(F.max(\"price\")).collect()[0][0]\n", | |
| "print(\"highest price:\", highest_price)\n", | |
| "\n", | |
| "# Show every book sold at that price (all columns).\n", | |
| "df.filter(df.price == highest_price).show(20, False)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "tu250KG8I6Pm", | |
| "outputId": "a4a24a6f-afe4-40e4-db3a-a0e0f6e9f659" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "highest price: 29.0\n", | |
| "+---------------+-------+-----+-------------------+----+\n", | |
| "|author |edition|price|title |year|\n", | |
| "+---------------+-------+-----+-------------------+----+\n", | |
| "|Woolf, Virginia|Penguin|29.0 |A Room of One's Own|1922|\n", | |
| "+---------------+-------+-----+-------------------+----+\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment