Skip to content

Instantly share code, notes, and snippets.

@solidpple
Last active December 19, 2017 02:42
Show Gist options
  • Save solidpple/28478a3a7502a32eabf308965aa4f1bf to your computer and use it in GitHub Desktop.
Save solidpple/28478a3a7502a32eabf308965aa4f1bf to your computer and use it in GitHub Desktop.
# How to turn an ArrayType() column into multiple columns.
# NOTE(review): `hc` and `sc` are not defined in this snippet; presumably a
# HiveContext/SQLContext and a SparkContext provided by the shell -- confirm.
df = hc.createDataFrame(
    sc.parallelize([['a', [1, 2, 3]], ['b', [2, 3, 4]]]),
    ["key", "value"]
)
df.printSchema()
df.show()
# Expected output of printSchema() followed by show():
'''
root
|-- key: string (nullable = true)
|-- value: array (nullable = true)
| |-- element: long (containsNull = true)
+---+-------+
|key| value|
+---+-------+
| a|[1,2,3]|
| b|[2,3,4]|
+---+-------+
'''
# Individual array elements can be accessed with Python's [] indexing.
df.select("key", df.value[0], df.value[1], df.value[2]).show()
# Expected output: one column per selected array element.
'''
+---+--------+--------+--------+
|key|value[0]|value[1]|value[2]|
+---+--------+--------+--------+
| a| 1| 2| 3|
| b| 2| 3| 4|
+---+--------+--------+--------+
'''
# For a StructType() column: pack the first three array elements into a
# struct whose fields are named value1, value2 and value3.
# NOTE(review): `psf` is not defined in this snippet; presumably
# `from pyspark.sql import functions as psf` -- confirm.
df2 = df.select(
    "key",
    psf.struct(
        *(df.value[idx].alias("value%d" % (idx + 1)) for idx in range(3))
    ).alias("value")
)
df2.printSchema()
df2.show()
'''
root
|-- key: string (nullable = true)
|-- value: struct (nullable = false)
| |-- value1: long (nullable = true)
| |-- value2: long (nullable = true)
| |-- value3: long (nullable = true)
+---+-------+
|key| value|
+---+-------+
| a|[1,2,3]|
| b|[2,3,4]|
+---+-------+
'''
# The fields of the struct column can be expanded into separate top-level
# columns by selecting them with `*`.
df2.select('key', 'value.*').show()
# Expected output: one column per struct field.
'''
+---+------+------+------+
|key|value1|value2|value3|
+---+------+------+------+
| a| 1| 2| 3|
| b| 2| 3| 4|
+---+------+------+------+
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment