Created October 16, 2019 23:42
-
-
Save amywieliczka/a10abff10c26cf95d531c3c20eff3ce0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession | |
def grade_collection(id, spark=None):
    """Return the fraction of a collection's items that are titled 'Untitled'.

    Runs two ``COUNT(*)`` queries against the ``calisphere`` temp view (which
    must already be registered in the session): one over every row whose
    ``collection_url`` array contains the registry URL built from *id*, and
    one further restricted to rows whose ``title`` array contains
    ``'Untitled'``.

    Args:
        id: Registry collection identifier, interpolated into the collection
            URL. The name shadows the builtin but is kept so existing callers
            keep working.
        spark: Session used to run the queries. When omitted, the current
            session is obtained with ``SparkSession.builder.getOrCreate()``.

    Returns:
        ``untitled / total`` as a float, or ``0.0`` when no row matches the
        collection (avoids a ZeroDivisionError).
    """
    if spark is None:
        spark = SparkSession.builder.getOrCreate()

    # NOTE(review): `id` is formatted straight into the SQL text; only pass
    # trusted identifiers here.
    collection_url = f"https://registry.cdlib.org/api/v1/collection/{id}/"
    in_collection = f"array_contains(collection_url, '{collection_url}')"

    # DataFrame.show() only prints and returns None, so dividing two show()
    # results raises TypeError. Pull the scalar counts out of the single
    # result row instead.
    total = spark.sql(
        f"SELECT COUNT(*) FROM calisphere WHERE {in_collection}"
    ).first()[0]
    untitled = spark.sql(
        f"SELECT COUNT(*) FROM calisphere WHERE {in_collection} "
        "AND array_contains(title, 'Untitled')"
    ).first()[0]

    if not total:
        return 0.0
    return untitled / total
def calisphere_dataset(
    spark,
    filename='solrdump-2019-07-30.jsonl',
    collection_id=27086,
):
    """Register the dump as the ``calisphere`` view and show untitled items.

    Reads *filename* with ``spark.read.json``, exposes the resulting
    DataFrame as the temp view ``calisphere``, prints its schema, and then
    prints up to 20 untruncated ``title`` values for rows in the given
    collection whose ``title`` array contains ``'Untitled'``.

    Args:
        spark: Session used to read the file and run the query.
        filename: Path of the JSON file to load. Defaults to the dump name
            that was previously hard-coded.
        collection_id: Registry collection identifier used to build the
            collection URL filter. Defaults to the previously hard-coded
            27086.

    Returns:
        None. Output is printed by ``printSchema()`` and ``show()``.
    """
    records = spark.read.json(filename)
    records.createOrReplaceTempView("calisphere")
    records.printSchema()

    # NOTE(review): collection_id is formatted straight into the SQL text;
    # only pass trusted identifiers here.
    untitled_titles = spark.sql(
        "SELECT title FROM calisphere "
        "WHERE array_contains(collection_url, "
        f"'https://registry.cdlib.org/api/v1/collection/{collection_id}/') "
        "AND array_contains(title, 'Untitled')"
    )
    # 20 rows, truncate=False so long titles are printed in full.
    untitled_titles.show(20, False)
if __name__ == "__main__":
    # `spark` is intentionally bound at module level (not inside a main()
    # function) so code in this module that looks up a global `spark`
    # continues to find it when the file is run as a script.
    spark = (
        SparkSession.builder
        .appName("Python Spark SQL data source example")
        .getOrCreate()
    )
    try:
        calisphere_dataset(spark)
    finally:
        # Stop the session even if the job fails so its resources are
        # released.
        spark.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.