Created
January 17, 2022 09:38
-
-
Save habedi/6e274a9c4c374fec9d12316b1fff7c5a to your computer and use it in GitHub Desktop.
Example code for loading a CSV file as a DataFrame in Databricks Community Edition and saving it as a table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load a tab-separated CSV file from DBFS into a DataFrame and persist it
# as a managed Parquet-backed table in Databricks (Community Edition).
#
# NOTE: the `spark` SparkSession is provided implicitly by the Databricks
# runtime; uncomment the lines below to build one when running elsewhere.

# Loading PySpark modules — import the schema types explicitly rather than
# via `import *` so the names used below are visible at a glance.
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# from pyspark.context import SparkContext
# from pyspark.sql.session import SparkSession
# sc = SparkContext('local')
# spark = SparkSession(sc)

# Defining a schema for the 'badges' table — all four columns are declared
# non-nullable (third StructField argument is False).
badges_schema = StructType([
    StructField('UserId', IntegerType(), False),
    StructField('Name', StringType(), False),
    StructField('Date', TimestampType(), False),
    StructField('Class', IntegerType(), False),
])

source_file_name = "/FileStore/tables/badges_csv.gz"
# The file is tab-separated despite the .csv name (sep="\t"); Spark
# decompresses the .gz transparently and uses the explicit schema above
# instead of inferring types.
df = spark.read.csv(source_file_name, sep="\t", header=True, schema=badges_schema)

# Saving the df as a parquet file on DBFS, registered as a table.
# Dictionary encoding is enabled and page write checksums disabled;
# mode('overwrite') replaces any existing table of the same name.
table_name = "badges"
df.write.format("parquet").option("parquet.enable.dictionary", "true") \
    .option("parquet.page.write-checksum.enabled", "false").mode('overwrite') \
    .saveAsTable(table_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment