Skip to content

Instantly share code, notes, and snippets.

@habedi
Created January 17, 2022 09:38
Show Gist options
  • Save habedi/6e274a9c4c374fec9d12316b1fff7c5a to your computer and use it in GitHub Desktop.
Example code for loading a CSV file as a DataFrame in Databricks Community Edition and saving it as a table
# Loading PySpark modules
from pyspark.sql import DataFrame
from pyspark.sql.types import *
#from pyspark.context import SparkContext
#from pyspark.sql.session import SparkSession
# sc = SparkContext('local')
# spark = SparkSession(sc)
# Schema for the 'badges' table: (column name, Spark type); every column is
# declared non-nullable (nullable=False).
_badge_columns = [
    ("UserId", IntegerType()),
    ("Name", StringType()),
    ("Date", TimestampType()),
    ("Class", IntegerType()),
]
badges_schema = StructType(
    [StructField(name, dtype, False) for name, dtype in _badge_columns]
)

# Read the gzipped, tab-separated file from DBFS with the explicit schema.
# NOTE(review): relies on the Databricks-provided `spark` session being in scope.
source_file_name = "/FileStore/tables/badges_csv.gz"
df = spark.read.csv(source_file_name, sep="\t", header=True, schema=badges_schema)

# Persist the DataFrame as a Parquet-backed managed table, replacing any
# existing table of the same name.
table_name = "badges"
(
    df.write
    .format("parquet")
    .option("parquet.enable.dictionary", "true")
    .option("parquet.page.write-checksum.enabled", "false")
    .mode("overwrite")
    .saveAsTable(table_name)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment