Skip to content

Instantly share code, notes, and snippets.

@RahulJyala7
Created April 3, 2019 08:35
Show Gist options
  • Save RahulJyala7/908f69ec53941ba21968c6eda8f19406 to your computer and use it in GitHub Desktop.
Save RahulJyala7/908f69ec53941ba21968c6eda8f19406 to your computer and use it in GitHub Desktop.
PySpark
from pyspark.sql import SparkSession
from pymongo import MongoClient
# Module-level MongoDB client pointed at the server on localhost:27017.
# NOTE(review): `client` is not referenced again in the visible code —
# confirm it is still needed before keeping it at import time.
client = MongoClient('mongodb://localhost:27017/')
def sparkdataframe(file):
    """Load a JSON dataset into Spark and return selected business columns.

    Gets or creates a SparkSession named "yelp", reads ``file`` with
    ``spark.read.json`` and projects the result down to the columns
    ``business_id``, ``address``, ``categories``, ``city``, ``name``,
    ``stars`` and ``state``.

    Args:
        file: Location of the JSON data, passed unchanged to
            ``spark.read.json``.

    Returns:
        The Spark DataFrame restricted to the seven selected columns.
    """
    spark = (
        SparkSession.builder
        .appName("yelp")
        # Placeholder option carried over from the original session setup.
        .config("spark.some.config.option", "some-value")
        # Output URI for the MongoDB Spark connector (presumably database
        # `sink`, collection `sh`); nothing in this function writes to it.
        .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/sink.sh")
        .getOrCreate()
    )

    # Load the JSON data file to create a DataFrame from a large JSON dataset.
    df = spark.read.json(file)
    return df.select(
        "business_id", "address", "categories", "city", "name", "stars", "state"
    )
# Build the Spark DataFrame of selected business columns from the Yelp dump.
data = sparkdataframe(
    file="/home/rahul/Documents/DataScience.Material/yelp_dataset/business.json",
)

# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE(review): per the PySpark docs, toPandas() loads all rows into the
# driver's memory — confirm the selected data is small enough for that.
df = data.toPandas()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment