import numpy as np   ## numpy version
import pandas as pd  ## pandas version
import pyspark       ## pyspark version
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType
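
## Print the library versions the comments above refer to, so they appear
## in the repro output (__version__ is standard for all three libraries)
print("numpy", np.__version__, "| pandas", pd.__version__, "| pyspark", pyspark.__version__)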
## Define dataframe row count
L = 5*365*86400

## Define an arbitrary number of additional columns to add
## 7 extra columns (for a total of 8) collects successfully using databricks-connect
## 8 extra columns (for a total of 9) fails to collect using databricks-connect
## 50 columns works fine in a databricks notebook
ExtraCols = 50
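
## Rough payload arithmetic (an assumption about the failure mode, not a
## confirmed cause): L = 5*365*86400 = 157,680,000 rows. At 4-byte ints,
## 9 columns is roughly 5.7 GB of raw values versus roughly 5.0 GB at 8
## columns, so the 8-vs-9 threshold looks more like a transfer-size limit
## than a column-count limit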

## Assumes an existing SparkContext `sc`, as provided by the pyspark shell
## or a Databricks notebook; otherwise create one first via
## SparkSession.builder.getOrCreate().sparkContext
sqlContext = SQLContext(sc)
ID = sqlContext.range(1, L)

## Create a spark dataframe with a single column containing sequential
## integer values from 1 to L-1 (the range end is exclusive)
Remote = ID.withColumn("ID", ID["id"].cast(IntegerType()))

## Create additional columns by copying the ID column
for i in range(1, ExtraCols + 1):
    Remote = Remote.withColumn("N" + str(i), Remote["ID"])

## Print out the structure of the Remote DataFrame
Remote.printSchema()

## Attempt to collect the Spark DataFrame to a local pandas DataFrame
Local = Remote.toPandas()

## Print the first rows
Local.head()

## End the Spark session
sc.stop()
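
## ------------------------------------------------------------------
## Hedged workaround sketch, separate from the repro above and untested
## against this exact failure: Arrow-based serialization changes how
## toPandas() moves data to the driver and may be worth trying when the
## plain collect fails over databricks-connect.
## "spark.sql.execution.arrow.pyspark.enabled" is the Spark 3.x config key;
## on Spark 2.x the key is "spark.sql.execution.arrow.enabled".
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

## Collect a deliberately small frame first to confirm the Arrow path works
## before re-running the full-width repro above
Check = spark.range(1, 1000).toPandas()
print(Check.head())
spark.stop()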