from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import sys

# Local Spark session; the title pattern comes from the first CLI argument.
sc = SparkContext('local')
spark = SparkSession(sc)
patt = sys.argv[1]

# Load the Open Library dump (one JSON record per line).
df = spark.read.json("file:///tmp/ol_cdump.json")

# Keep titles containing the pattern and sort by page count, largest first.
filtered = (df.filter(df.title.contains(patt))
            .select("title", "publish_date", "number_of_pages")
            .orderBy(df.number_of_pages.desc()))
filtered.show(truncate=False)

# Collapse to one partition so Spark writes a single CSV file.
filtered.repartition(1).write.mode("overwrite").csv('file:///tmp/filtered/')
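Submit the script to a local Spark master with four threads, passing the title pattern as the application argument: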
~/spark-2.4.5-bin-hadoop2.7/bin/spark-submit --master local[4] ~/filter_book.py "Harry Potter"
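Spark names the output file part-00000-<uuid>-c000.csv inside /tmp/filtered/, so use a glob rather than a fixed file name to peek at the first rows: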
head -10 /tmp/filtered/part-*
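As a quick sanity check, the CSV can also be read back into a DataFrame. This is a minimal sketch, assuming the job above has already written /tmp/filtered/ and using the column order from the select() in filter_book.py:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").getOrCreate()

# The CSV was written without a header row, so supply column names
# matching the select() above: title, publish_date, number_of_pages.
check = spark.read.csv("file:///tmp/filtered/",
                       schema="title STRING, publish_date STRING, number_of_pages INT")
check.show(5, truncate=False)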