Skip to content

Instantly share code, notes, and snippets.

@amywieliczka
Created October 16, 2019 23:42
Show Gist options
  • Save amywieliczka/a10abff10c26cf95d531c3c20eff3ce0 to your computer and use it in GitHub Desktop.
Save amywieliczka/a10abff10c26cf95d531c3c20eff3ce0 to your computer and use it in GitHub Desktop.
from pyspark.sql import SparkSession
def grade_collection(id):
    """Return the fraction of a collection's items whose title is 'Untitled'.

    Runs two ``COUNT(*)`` queries against the ``calisphere`` temp view: one
    for items in the collection whose ``title`` array contains 'Untitled',
    and one for every item in the collection.

    Relies on a module-level ``spark`` session and on the ``calisphere``
    view having been registered (see ``calisphere_dataset``).

    Args:
        id: Registry collection identifier; it is interpolated into the
            collection URL used to filter items. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        ``untitled / total`` as a float, or ``0.0`` when the collection has
        no items at all.
    """
    # NOTE: `id` ends up inside the SQL text, so only pass trusted ids.
    collection_url = f"https://registry.cdlib.org/api/v1/collection/{id}/"
    in_collection = f"array_contains(collection_url, '{collection_url}')"

    # DataFrame.show() only prints and returns None, so dividing two show()
    # results raises TypeError; read the scalar counts instead.
    untitled_count = spark.sql(
        f"SELECT COUNT(*) FROM calisphere "
        f"WHERE {in_collection} "
        f"AND array_contains(title, 'Untitled')"
    ).first()[0]
    total_count = spark.sql(
        f"SELECT COUNT(*) FROM calisphere WHERE {in_collection}"
    ).first()[0]

    # An unknown or empty collection would otherwise divide by zero.
    if not total_count:
        return 0.0
    return untitled_count / total_count
def calisphere_dataset(spark, filename='solrdump-2019-07-30.jsonl'):
    """Load a Calisphere Solr dump and register it as the ``calisphere`` view.

    Reads ``filename`` with ``spark.read.json``, registers the resulting
    DataFrame as the temp view ``calisphere``, prints its schema, and then
    shows up to 20 untruncated ``title`` values for items in registry
    collection 27086 whose ``title`` array contains 'Untitled'.

    Args:
        spark: The SparkSession used to read the file and run SQL.
        filename: Path of the JSON dump to load. Defaults to the dump the
            script was originally written against.
    """
    calisphere_df = spark.read.json(filename)
    # Register the view so later SQL can refer to the data by name.
    calisphere_df.createOrReplaceTempView("calisphere")
    calisphere_df.printSchema()

    titles_df = spark.sql(
        "SELECT title FROM calisphere "
        "WHERE array_contains(collection_url, "
        "'https://registry.cdlib.org/api/v1/collection/27086/') "
        "AND array_contains(title, 'Untitled')"
    )
    # 20 rows, truncate=False so long titles are printed in full.
    titles_df.show(20, False)
if __name__ == "__main__":
    # `spark` is deliberately a module-level name: grade_collection() reads
    # it as a global.
    spark = (
        SparkSession.builder
        .appName("Python Spark SQL data source example")
        .getOrCreate()
    )
    calisphere_dataset(spark)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment