Skip to content

Instantly share code, notes, and snippets.

@zhu3pang
Last active July 3, 2021 04:54
Show Gist options
  • Save zhu3pang/91990e27fd3149d7dabd1a8519ab242f to your computer and use it in GitHub Desktop.
Save zhu3pang/91990e27fd3149d7dabd1a8519ab242f to your computer and use it in GitHub Desktop.
spark sql 列转行
df = spark.createDataFrame([
{'id': 1, u'姓名': u'张三', u'分数': 88, u'科目': u'数学'},
{'id': 2, u'姓名': u'李雷', u'分数': 67, u'科目': u'数学'},
{'id': 3, u'姓名': u'宫九', u'分数': 77, u'科目': u'数学'},
{'id': 4, u'姓名': u'王五', u'分数': 65, u'科目': u'数学'},
{'id': 1, u'姓名': u'张三', u'分数': 77, u'科目': u'英语'},
{'id': 2, u'姓名': u'李雷', u'分数': 90, u'科目': u'英语'},
{'id': 3, u'姓名': u'宫九', u'分数': 24, u'科目': u'英语'},
{'id': 4, u'姓名': u'王五', u'分数': 90, u'科目': u'英语'},
{'id': 1, u'姓名': u'张三', u'分数': 33, u'科目': u'语文'},
{'id': 2, u'姓名': u'李雷', u'分数': 87, u'科目': u'语文'},
{'id': 3, u'姓名': u'宫九', u'分数': 92, u'科目': u'语文'},
{'id': 4, u'姓名': u'王五', u'分数': 87, u'科目': u'语文'},
])
df.show()
# +---+----+----+----+
# | id|分数|姓名|科目|
# +---+----+----+----+
# | 1| 88|张三|数学|
# | 2| 67|李雷|数学|
# | 3| 77|宫九|数学|
# | 4| 65|王五|数学|
# | 1| 77|张三|英语|
# | 2| 90|李雷|英语|
# | 3| 24|宫九|英语|
# | 4| 90|王五|英语|
# | 1| 33|张三|语文|
# | 2| 87|李雷|语文|
# | 3| 92|宫九|语文|
# | 4| 87|王五|语文|
# +---+----+----+----+
# spark sql 列转行
spark_df_pivot = spark.sql("""
SELECT
*
FROM df
PIVOT
(
SUM(`分数`)
FOR `科目` in ('数学','英语')
)
ORDER BY id
""").cache()
spark_df_pivot.show()
# +---+----+----+----+
# | id|姓名|数学|英语|
# +---+----+----+----+
# | 1|张三| 88| 77|
# | 2|李雷| 67| 90|
# | 3|宫九| 77| 24|
# | 4|王五| 65| 90|
# +---+----+----+----+
# spark scala
val spark_scala_pivot =
df.groupBy("id", "姓名")
.pivot("科目")
.agg(sum("分数"))
spark_scala_pivot.show()
# +---+----+----+----+----+
# | id|姓名|数学|英语|语文|
# +---+----+----+----+----+
# | 2|李雷| 67| 90| 87|
# | 1|张三| 88| 77| 33|
# | 3|宫九| 77| 24| 92|
# | 4|王五| 65| 90| 87|
# +---+----+----+----+----+
# hive sql 列转行
hive_df_pivot = spark.sql("""
select
id,
`姓名`,
nvl(m ['数学'], 0) as `数学`,
nvl(m ['英语'], 0) as `英语`
from(
select
id,
`姓名`,
collect(`科目`, `分数`) as m
from
df
group by
id,
`姓名`
) t_b
""").cache()
hive_df_pivot.show()
# id 姓名 数学 英语
# 4 王五 65 90
# 1 张三 88 77
# 2 李雷 67 90
# 3 宫九 77 24
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment