- Select all "Harry Potter" books
- Book with more pages
- Top 5 authors with more written books (assume author in first position in the array, "key" field)
(assuming each row is a different book)
- Top 5 Genres with more books
- Avg. number of pages (needs cleaning)
- Per publish year, get the number of authors that published at least one book
val sqlStmt = """
select
title
, regexp_replace(authors[0].key, '/authors/', '') as author
, cast(regexp_extract(publish_date,'\\d{4}', 0) as int) as publish_year
, number_of_pages
, genres
from clean_table
where title is not null
"""
val transformedDF = spark.sql(sqlStmt)
transformedDF.createOrReplaceTempView("transformed_table")
- Select all "Harry Potter" books
select *
from transformed_table
where title like '%Harry Potter%'
limit 100
- Book with more pages
select title, number_of_pages
from transformed_table
where number_of_pages = (
select max(number_of_pages)
from transformed_table
)
- Top 5 authors with more written books (assume author in first position in the array, "key" field)
(assuming each row is a different book)
select
author, count(*) as num_written_books
from transformed_table
where author is not null
group by author
order by count(*) desc
limit 5
--https://openlibrary.org/authors/OL1224818A/California._Dept._of_Water_Resources.
--https://openlibrary.org/authors/OL4283462A/Jir%C5%8D_Akagawa
- Top 5 Genres with more books
transformedDF.select($"title",explode($"genres").alias("genre")).createOrReplaceTempView("exploded_genres")
select genre, count(*)
from exploded_genres
group by genre
order by count(*) desc
limit 5
- Avg. number of pages (needs cleaning)
import org.apache.spark.ml.feature.Imputer
import org.apache.spark.sql.types.DoubleType
val df = transformedDF.select(transformedDF("number_of_pages").cast(DoubleType).as("number_of_pages"))
val imputer = new Imputer()
.setInputCols(df.columns)
.setOutputCols(df.columns.map(c => s"${c}_imputed"))
.setStrategy("median")
val fixed = imputer.fit(df).transform(df)
fixed.show(20, false)
fixed.select(avg($"number_of_pages_imputed")).show()
fixed.select(avg($"number_of_pages")).show()
- Per publish year, get the number of authors that published at least one book
select
publish_year, count(distinct author) as num_authors
from transformed_table
where author is not null and publish_year is not null
group by publish_year
order by publish_year asc