Menziess · March 19, 2024 12:13
diff --git a/flatten_structs.py b/flatten_structs.py
 from pyspark.sql.functions import *


 def flatten_structs(df):
     """Omits lists, and flattens structs into regular columns.

     Only the top-level fields of ``df.schema.jsonValue()`` are inspected:

     * a field whose JSON ``type`` is a plain string is selected as-is;
     * a field whose JSON ``type`` is a dict tagged ``'struct'`` contributes
       one ``parent.child`` column for every sub-field whose own type is a
       plain string; sub-fields with a dict or list type are skipped;
     * every other field is skipped.

     Each skipped column is reported on stdout as ``Omitted column <name>``.

     Args:
         df: Object exposing ``schema.jsonValue()`` and ``select(...)``
             (presumably a PySpark DataFrame -- confirm against callers).

     Returns:
         The result of ``df.select(...)`` over the kept columns. Every
         column is aliased to its selector string, so flattened struct
         members end up named ``parent.child``.

     >>> flatten_structs(test_df).show() # doctest: +NORMALIZE_WHITESPACE
     Omitted column rootstructype.nestedstructtype
     Omitted column arraytype
     +---+--------+---------+------------------+------------------+------------------+
     | id|   money|timestamp|structtype.number1|structtype.number2|structtype.number3|
     +---+--------+---------+------------------+------------------+------------------+
     |  1|$100.000|       14|                 1|                 2|                 3|
     |  1|$200.000|       15|                 3|                 2|                 1|
     |  1| $10.000|       17|                 1|                 3|                 2|
     |  2|   -$100|       17|                 3|                 1|                 2|
     |  2|    $100|       14|                 2|                 1|                 3|
     +---+--------+---------+------------------+------------------+------------------+
     """
     struct_selectors = []

     for c in df.schema.jsonValue()['fields']:
         c_type = c['type']
         if isinstance(c_type, str):
             # The schema JSON spells this field's type as a bare string.
             struct_selectors.append(c['name'])
         elif isinstance(c_type, dict) and c_type['type'] == 'struct':
             for field in c_type['fields']:
                 # Only one level of nesting is flattened; anything deeper
                 # is dropped and reported instead of being recursed into.
                 if isinstance(field['type'], (dict, list)):
                     print('Omitted column', c['name'] + '.' + field['name'])
                 else:
                     struct_selectors.append('.'.join([c['name'], field['name']]))
         else:
             print('Omitted column', c['name'])

     # Alias each column to its own selector so a nested member keeps its
     # full dotted path as the output column name.
     return df.select(*[
         col(selector).alias(selector)
         for selector in struct_selectors
     ])
	from pyspark.sql.functions import *


	def flatten_structs(df):
	    """Omits lists, and flattens structs into regular columns.

	    Top-level fields of ``df.schema.jsonValue()`` are handled as follows:
	    a field whose JSON ``type`` is a plain string is kept unchanged; a
	    field whose JSON ``type`` is a dict tagged ``'struct'`` is expanded
	    into one ``parent.child`` column per sub-field with a plain-string
	    type; all remaining fields and sub-fields are left out, and an
	    ``Omitted column <name>`` line is printed for each of them.

	    Args:
	        df: Object exposing ``schema.jsonValue()`` and ``select(...)``
	            (presumably a PySpark DataFrame -- confirm against callers).

	    Returns:
	        The result of ``df.select(...)`` over the kept columns, each
	        aliased to its selector string (``parent.child`` for flattened
	        struct members).

	    >>> flatten_structs(test_df).show() # doctest: +NORMALIZE_WHITESPACE
	    Omitted column rootstructype.nestedstructtype
	    Omitted column arraytype
	    +---+--------+---------+------------------+------------------+------------------+
	    | id|   money|timestamp|structtype.number1|structtype.number2|structtype.number3|
	    +---+--------+---------+------------------+------------------+------------------+
	    |  1|$100.000|       14|                 1|                 2|                 3|
	    |  1|$200.000|       15|                 3|                 2|                 1|
	    |  1| $10.000|       17|                 1|                 3|                 2|
	    |  2|   -$100|       17|                 3|                 1|                 2|
	    |  2|    $100|       14|                 2|                 1|                 3|
	    +---+--------+---------+------------------+------------------+------------------+
	    """
	    struct_selectors = []

	    for c in df.schema.jsonValue()['fields']:
	        name, kind = c['name'], c['type']

	        if isinstance(kind, str):
	            struct_selectors.append(name)
	        elif isinstance(kind, dict) and kind['type'] == 'struct':
	            for field in kind['fields']:
	                qualified = f"{name}.{field['name']}"
	                # Flattening stops after one level: a sub-field that is
	                # itself structured is reported rather than expanded.
	                if isinstance(field['type'], (dict, list)):
	                    print('Omitted column', qualified)
	                else:
	                    struct_selectors.append(qualified)
	        else:
	            print('Omitted column', name)

	    # The alias keeps the dotted selector as the resulting column name.
	    return df.select(*[
	        col(selector).alias(selector)
	        for selector in struct_selectors
	    ])