Skip to content

Instantly share code, notes, and snippets.

@durgaswaroop
Last active December 27, 2017 08:09
Show Gist options
  • Save durgaswaroop/646ffb6283aa0238277aa16ae0771016 to your computer and use it in GitHub Desktop.
Save durgaswaroop/646ffb6283aa0238277aa16ae0771016 to your computer and use it in GitHub Desktop.
First part of the Datasets in Apache Spark
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.desc;
/**
 * Demonstrates basic Apache Spark {@code Dataset} operations: reading text,
 * CSV and JSON files, then selecting, filtering, dropping and sorting columns.
 *
 * <p>The input files ({@code fake-text.txt}, {@code fake-people.csv},
 * {@code fake-people.json}) are resolved relative to the working directory.
 */
public class DataSetMain {

    /**
     * Runs the demo against a local Spark master and prints sample rows of
     * each intermediate result to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Initialize SparkSession
        SparkSession spark = SparkSession.builder()
                .appName("Freblogg-Spark")
                .master("local")
                .getOrCreate();

        try {
            // Read text file
            Dataset<String> lipsumDs = spark.read().textFile("fake-text.txt");
            lipsumDs.show(5);

            // Read csv file; the "header" option makes the first line supply the column names
            Dataset<Row> peopleDs = spark.read().option("header", "true").csv("fake-people.csv");
            peopleDs.show(5);

            // Read json file
            Dataset<Row> peopleJsonDs = spark.read().json("fake-people.json");
            peopleJsonDs.show(5);
            peopleJsonDs.select("id", "first_name", "last_name", "email", "gender", "ip_address").show(5);

            // Selections
            peopleDs.select("email").show(5);
            peopleDs.select(col("email"), col("gender")).show(5);

            // Filtering: keep rows with 5 < id <= 10.
            // leq/gt are the Java-friendly names for the Scala operators <= and >.
            // NOTE(review): no schema inference is requested for the CSV, so "id" is
            // presumably read as a string and compared via Spark's implicit cast — confirm.
            peopleDs.filter(col("id").leq(10).and(col("id").gt(5))).show();

            // Drop columns
            peopleDs.drop("last_name", "ip_address").show(5);

            // Sorting
            peopleDs.sort(desc("first_name")).show(5);
        } finally {
            // Always release the underlying SparkContext, even if a read or query fails.
            spark.stop();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment