Skip to content

Instantly share code, notes, and snippets.

@rounakdatta
Created June 6, 2019 12:42
Show Gist options
  • Save rounakdatta/62fd39e82261f1ee85da9744254bc7da to your computer and use it in GitHub Desktop.
My experiments with PySpark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"from pyspark.sql.types import StructType, StructField\n",
"from pyspark.sql.types import DoubleType, IntegerType, StringType"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Build (or reuse) a local Spark session for this notebook.\n",
"spark = (\n",
"    SparkSession.builder\n",
"    .master(\"local\")\n",
"    .appName(\"Hotstar\")\n",
"    .getOrCreate()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Explicit schema for the CSV. Frequency is an integer count and Percentage\n",
"# a float, so use the matching Spark types (all three columns were previously\n",
"# StringType, which left the numeric columns unusable for arithmetic and made\n",
"# the IntegerType/DoubleType imports above dead code).\n",
"schema = StructType([\n",
"    StructField(\"Letter\", StringType()),\n",
"    StructField(\"Frequency\", IntegerType()),\n",
"    StructField(\"Percentage\", DoubleType())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.csv(\"letter_frequency.csv\", header=True, schema=schema)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----------+----------+\n",
"| Letter| Frequency|Percentage|\n",
"+-------+----------+----------+\n",
"| \"A\"| 24373121| 8.1|\n",
"| \"B\"| 4762938| 1.6|\n",
"| \"C\"| 8982417| 3.0|\n",
"| \"D\"| 10805580| 3.6|\n",
"| \"E\"| 37907119| 12.6|\n",
"| \"F\"| 7486889| 2.5|\n",
"| \"G\"| 5143059| 1.7|\n",
"| \"H\"| 18058207| 6.0|\n",
"| \"I\"| 21820970| 7.3|\n",
"| \"J\"| 474021| 0.2|\n",
"| \"K\"| 1720909| 0.6|\n",
"| \"L\"| 11730498| 3.9|\n",
"| \"M\"| 7391366| 2.5|\n",
"| \"N\"| 21402466| 7.1|\n",
"| \"O\"| 23215532| 7.7|\n",
"| \"P\"| 5719422| 1.9|\n",
"| \"Q\"| 297237| 0.1|\n",
"| \"R\"| 17897352| 5.9|\n",
"| \"S\"| 19059775| 6.3|\n",
"| \"T\"| 28691274| 9.5|\n",
"+-------+----------+----------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Persist as Parquet. Use overwrite mode so the notebook survives\n",
"# Restart & Run All -- the default ('error') mode raises AnalysisException\n",
"# when ./letter_frequency.parquet already exists from a previous run.\n",
"df.write.mode(\"overwrite\").parquet(\"./letter_frequency.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SQLContext\n",
"sc = spark.sparkContext\n",
"sqlContext = SQLContext(sc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reading Parquet files in PySpark (Method 1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"dfParquet = sqlContext.read.parquet(\"./letter_frequency.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----------+----------+\n",
"| Letter| Frequency|Percentage|\n",
"+-------+----------+----------+\n",
"| \"A\"| 24373121| 8.1|\n",
"| \"B\"| 4762938| 1.6|\n",
"| \"C\"| 8982417| 3.0|\n",
"| \"D\"| 10805580| 3.6|\n",
"| \"E\"| 37907119| 12.6|\n",
"| \"F\"| 7486889| 2.5|\n",
"| \"G\"| 5143059| 1.7|\n",
"| \"H\"| 18058207| 6.0|\n",
"| \"I\"| 21820970| 7.3|\n",
"| \"J\"| 474021| 0.2|\n",
"| \"K\"| 1720909| 0.6|\n",
"| \"L\"| 11730498| 3.9|\n",
"| \"M\"| 7391366| 2.5|\n",
"| \"N\"| 21402466| 7.1|\n",
"| \"O\"| 23215532| 7.7|\n",
"| \"P\"| 5719422| 1.9|\n",
"| \"Q\"| 297237| 0.1|\n",
"| \"R\"| 17897352| 5.9|\n",
"| \"S\"| 19059775| 6.3|\n",
"| \"T\"| 28691274| 9.5|\n",
"+-------+----------+----------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"dfParquet.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- Letter: string (nullable = true)\n",
" |-- Frequency: string (nullable = true)\n",
" |-- Percentage: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"dfParquet.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reading Parquet files in PySpark (Method 2) [*Recommended*]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"dfParquetAlternate = spark.read.parquet(\"./letter_frequency.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----------+----------+\n",
"| Letter| Frequency|Percentage|\n",
"+-------+----------+----------+\n",
"| \"A\"| 24373121| 8.1|\n",
"| \"B\"| 4762938| 1.6|\n",
"| \"C\"| 8982417| 3.0|\n",
"| \"D\"| 10805580| 3.6|\n",
"| \"E\"| 37907119| 12.6|\n",
"| \"F\"| 7486889| 2.5|\n",
"| \"G\"| 5143059| 1.7|\n",
"| \"H\"| 18058207| 6.0|\n",
"| \"I\"| 21820970| 7.3|\n",
"| \"J\"| 474021| 0.2|\n",
"| \"K\"| 1720909| 0.6|\n",
"| \"L\"| 11730498| 3.9|\n",
"| \"M\"| 7391366| 2.5|\n",
"| \"N\"| 21402466| 7.1|\n",
"| \"O\"| 23215532| 7.7|\n",
"| \"P\"| 5719422| 1.9|\n",
"| \"Q\"| 297237| 0.1|\n",
"| \"R\"| 17897352| 5.9|\n",
"| \"S\"| 19059775| 6.3|\n",
"| \"T\"| 28691274| 9.5|\n",
"+-------+----------+----------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"dfParquetAlternate.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment