Last active
September 24, 2017 15:25
-
-
Save jcaristy/7cbc049fd3778f75ee3b16eaf839a36b to your computer and use it in GitHub Desktop.
[PySpark: Leer un CSV] #pyspark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PySpark on Databricks: read a CSV file, inspect it, aggregate it and save a table.
# NOTE: `display` and `dbutils` are globals provided by the Databricks notebook
# runtime; they are not available in a plain Python/PySpark process.

# List the dataset directory on DBFS (Python equivalent of the `%fs ls` cell magic).
display(dbutils.fs.ls("dbfs:/databricks-datasets/online_retail/data-001/"))

# Databricks already provides an initialized `spark` SparkSession; elsewhere it
# can be created like this. getOrCreate() reuses an existing session if there is one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the CSV with default options. The result is a DataFrame (not an RDD);
# because no header option is given, the header line is read as a regular row.
path = "dbfs:/databricks-datasets/online_retail/data-001/data.csv"
data = spark.read.csv(path)
data.take(20)

# Read the CSV into a DataFrame, using the first line as column names and
# letting Spark infer the column types.
df = spark.read.csv(path, header=True, inferSchema=True)
display(df)

# Print the schema of the DataFrame.
df.printSchema()

# DataFrame operations: total amount (UnitPrice * Quantity) per invoice.
# The product is given an explicit name so the aggregation sums only that column.
line_totals = df.select(
    df["InvoiceNo"],
    (df["UnitPrice"] * df["Quantity"]).alias("LineTotal"),
)
display(line_totals.groupBy("InvoiceNo").sum("LineTotal"))

# Save the DataFrame as a table through the DataFrameWriter.
# The default save mode raises an error if the table "countries" already exists,
# so rerunning this step requires dropping the table first or choosing a mode.
r1 = df.select("Country").distinct().orderBy("Country")
r1.write.saveAsTable("countries")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment