Skip to content

Instantly share code, notes, and snippets.

@holdenk
Last active July 25, 2016 21:09
Show Gist options
  • Save holdenk/1d1fa5e5d234327a12f5bc1a84069591 to your computer and use it in GitHub Desktop.
scala> val df =spark.read.format("csv").option("header", "false").option("inferSchema", "true").load("/home/holden/Downloads/ex*.csv")
df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2125 more fields]
scala> df.collect()
16/07/25 12:53:40 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
res9: Array[org.apache.spark.sql.Row] = Array([Date,Lifetime Total Likes,Daily New Likes,Daily Unlikes,Daily Page Engaged Users,Weekly Page Engaged Users,28 Days Page Engaged Users,Daily Like Sources - On Your Page,Daily Total Reach,Weekly Total Reach,28 Days Total Reach,Daily Organic Reach,Weekly Organic Reach,28 Days Organic Reach,Daily Total Impressions,Weekly Total Impressions,28 Days Total Impressions,Daily Organic impressions,Weekly Organic impressions,28 Days Organic impressions,Daily Reach of page posts,Weekly Reach of page posts,28 Days Reach of page posts,Daily Organic Reach of Page posts,Weekly Organic Reach of Page posts,28 Days Organic Reach of Page posts,Daily Total Impressions of your posts,Weekly Total Impressions of your posts,28 Days Total Impressions of your posts,Dai...
scala> val df =spark.read.format("csv").option("header", "false").option("inferSchema", "false").load("/home/holden/Downloads/ex*.csv")
df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2125 more fields]
scala> val miniHeader = df.take(1)
miniHeader: Array[org.apache.spark.sql.Row] = Array([Date,Lifetime Total Likes,Daily New Likes,Daily Unlikes,Daily Page Engaged Users,Weekly Page Engaged Users,28 Days Page Engaged Users,Daily Like Sources - On Your Page,Daily Total Reach,Weekly Total Reach,28 Days Total Reach,Daily Organic Reach,Weekly Organic Reach,28 Days Organic Reach,Daily Total Impressions,Weekly Total Impressions,28 Days Total Impressions,Daily Organic impressions,Weekly Organic impressions,28 Days Organic impressions,Daily Reach of page posts,Weekly Reach of page posts,28 Days Reach of page posts,Daily Organic Reach of Page posts,Weekly Organic Reach of Page posts,28 Days Organic Reach of Page posts,Daily Total Impressions of your posts,Weekly Total Impressions of your posts,28 Days Total Impressions of your pos...
scala> miniHeader(0).toSeq
res11: Seq[Any] = WrappedArray(Date, Lifetime Total Likes, Daily New Likes, Daily Unlikes, Daily Page Engaged Users, Weekly Page Engaged Users, 28 Days Page Engaged Users, Daily Like Sources - On Your Page, Daily Total Reach, Weekly Total Reach, 28 Days Total Reach, Daily Organic Reach, Weekly Organic Reach, 28 Days Organic Reach, Daily Total Impressions, Weekly Total Impressions, 28 Days Total Impressions, Daily Organic impressions, Weekly Organic impressions, 28 Days Organic impressions, Daily Reach of page posts, Weekly Reach of page posts, 28 Days Reach of page posts, Daily Organic Reach of Page posts, Weekly Organic Reach of Page posts, 28 Days Organic Reach of Page posts, Daily Total Impressions of your posts, Weekly Total Impressions of your posts, 28 Days Total Impressions of yo...
scala> miniHeader(0).toSeq
res12: Seq[Any] = WrappedArray(Date, Lifetime Total Likes, Daily New Likes, Daily Unlikes, Daily Page Engaged Users, Weekly Page Engaged Users, 28 Days Page Engaged Users, Daily Like Sources - On Your Page, Daily Total Reach, Weekly Total Reach, 28 Days Total Reach, Daily Organic Reach, Weekly Organic Reach, 28 Days Organic Reach, Daily Total Impressions, Weekly Total Impressions, 28 Days Total Impressions, Daily Organic impressions, Weekly Organic impressions, 28 Days Organic impressions, Daily Reach of page posts, Weekly Reach of page posts, 28 Days Reach of page posts, Daily Organic Reach of Page posts, Weekly Organic Reach of Page posts, 28 Days Organic Reach of Page posts, Daily Total Impressions of your posts, Weekly Total Impressions of your posts, 28 Days Total Impressions of yo...
scala> val idx1 = res12.indexOf("Date")
idx1: Int = 0
scala> df.schema
res13: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,StringType,true), StructField(_c1,StringType,true), StructField(_c2,StringType,true), StructField(_c3,StringType,true), StructField(_c4,StringType,true), StructField(_c5,StringType,true), StructField(_c6,StringType,true), StructField(_c7,StringType,true), StructField(_c8,StringType,true), StructField(_c9,StringType,true), StructField(_c10,StringType,true), StructField(_c11,StringType,true), StructField(_c12,StringType,true), StructField(_c13,StringType,true), StructField(_c14,StringType,true), StructField(_c15,StringType,true), StructField(_c16,StringType,true), StructField(_c17,StringType,true), StructField(_c18,StringType,true), StructField(_c19,StringType,true), StructField(_c20,StringType,true), StructField(_c...
scala> df.select("_c0").collect()
res14: Array[org.apache.spark.sql.Row] = Array([Date], [2/1/2012], [3/2/2012], [3/3/2012])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment