Skip to content

Instantly share code, notes, and snippets.

@nsivabalan
Last active February 9, 2022 23:04
Show Gist options
  • Save nsivabalan/2bda3e9f3335cefd44c004936b446c3e to your computer and use it in GitHub Desktop.
Save nsivabalan/2bda3e9f3335cefd44c004936b446c3e to your computer and use it in GitHub Desktop.
select * from hudi_mor3_rt;
+-----------------------------------+------------------------------------+----------------------------------+--------------------------------------+-------------------------------------------------------------------------------+--------------------+-----------------------+-----------------------+------------------+-----------------------------+--+
| hudi_mor3_rt._hoodie_commit_time | hudi_mor3_rt._hoodie_commit_seqno | hudi_mor3_rt._hoodie_record_key | hudi_mor3_rt._hoodie_partition_path | hudi_mor3_rt._hoodie_file_name | hudi_mor3_rt.uuid | hudi_mor3_rt.array_1 | hudi_mor3_rt.array_2 | hudi_mor3_rt.ts | hudi_mor3_rt.partitionpath |
+-----------------------------------+------------------------------------+----------------------------------+--------------------------------------+-------------------------------------------------------------------------------+--------------------+-----------------------+-----------------------+------------------+-----------------------------+--+
| 20220209224904138 | 20220209224904138_0_1 | 82 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 82 | [] | [] | 82 | partition |
| 20220209224851153 | 20220209224851153_0_602 | 9 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 9 | [] | [] | 9 | partition |
| 20220209224904138 | 20220209224904138_0_2 | 66 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 66 | [] | [] | 66 | partition |
| 20220209224851153 | 20220209224851153_0_604 | 21 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 21 | [] | [] | 21 | partition |
| 20220209224904138 | 20220209224904138_0_3 | 71 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 71 | [] | [] | 71 | partition |
| 20220209224904138 | 20220209224904138_0_4 | 55 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 55 | [] | [] | 55 | partition |
| 20220209224851153 | 20220209224851153_0_607 | 10 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 10 | [] | [] | 10 | partition |
| 20220209224851153 | 20220209224851153_0_608 | 39 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 39 | [] | [] | 39 | partition |
| 20220209224851153 | 20220209224851153_0_609 | 3 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 3 | [] | [] | 3 | partition |
| 20220209224904138 | 20220209224904138_0_5 | 60 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 60 | [] | [] | 60 | partition |
| 20220209224904138 | 20220209224904138_0_6 | 89 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 89 | [] | [] | 89 | partition |
| 20220209224851153 | 20220209224851153_0_612 | 44 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 44 | [] | [] | 44 | partition |
| 20220209224851153 | 20220209224851153_0_613 | 28 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 28 | [] | [] | 28 | partition |
| 20220209224904138 | 20220209224904138_0_7 | 94 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 94 | [] | [] | 94 | partition |
| 20220209224904138 | 20220209224904138_0_8 | 78 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 78 | [] | [] | 78 | partition |
| 20220209224851153 | 20220209224851153_0_616 | 33 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 33 | [] | [] | 33 | partition |
| 20220209224851153 | 20220209224851153_0_617 | 17 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 17 | [] | [] | 17 | partition |
| 20220209224904138 | 20220209224904138_0_9 | 83 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 83 | [] | [] | 83 | partition |
| 20220209224904138 | 20220209224904138_0_10 | 67 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 67 | [] | [] | 67 | partition |
| 20220209224851153 | 20220209224851153_0_620 | 22 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 22 | [] | [] | 22 | partition |
| 20220209224904138 | 20220209224904138_0_11 | 72 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 72 | [] | [] | 72 | partition |
| 20220209224904138 | 20220209224904138_0_12 | 56 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 56 | [] | [] | 56 | partition |
| 20220209224851153 | 20220209224851153_0_623 | 11 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 11 | [] | [] | 11 | partition |
| 20220209224851153 | 20220209224851153_0_624 | 4 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 4 | [] | [] | 4 | partition |
| 20220209224904138 | 20220209224904138_0_13 | 61 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 61 | [] | [] | 61 | partition |
| 20220209224851153 | 20220209224851153_0_626 | 45 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 45 | [] | [] | 45 | partition |
| 20220209224851153 | 20220209224851153_0_627 | 29 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 29 | [] | [] | 29 | partition |
| 20220209224904138 | 20220209224904138_0_14 | 95 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 95 | [] | [] | 95 | partition |
| 20220209224904138 | 20220209224904138_0_15 | 50 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0
select * from hudi_mor3_rt;
+-----------------------------------+------------------------------------+----------------------------------+--------------------------------------+-------------------------------------------------------------------------------+--------------------+-----------------------+-----------------------+------------------+-----------------------------+--+
| hudi_mor3_rt._hoodie_commit_time | hudi_mor3_rt._hoodie_commit_seqno | hudi_mor3_rt._hoodie_record_key | hudi_mor3_rt._hoodie_partition_path | hudi_mor3_rt._hoodie_file_name | hudi_mor3_rt.uuid | hudi_mor3_rt.array_1 | hudi_mor3_rt.array_2 | hudi_mor3_rt.ts | hudi_mor3_rt.partitionpath |
+-----------------------------------+------------------------------------+----------------------------------+--------------------------------------+-------------------------------------------------------------------------------+--------------------+-----------------------+-----------------------+------------------+-----------------------------+--+
| 20220209224904138 | 20220209224904138_0_1 | 82 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 82 | [] | [] | 82 | partition |
| 20220209224851153 | 20220209224851153_0_602 | 9 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 9 | [] | [] | 9 | partition |
| 20220209224904138 | 20220209224904138_0_2 | 66 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 66 | [] | [] | 66 | partition |
| 20220209224851153 | 20220209224851153_0_604 | 21 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 21 | [] | [] | 21 | partition |
| 20220209224904138 | 20220209224904138_0_3 | 71 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 71 | [] | [] | 71 | partition |
| 20220209224904138 | 20220209224904138_0_4 | 55 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 55 | [] | [] | 55 | partition |
| 20220209224851153 | 20220209224851153_0_607 | 10 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 10 | [] | [] | 10 | partition |
| 20220209224851153 | 20220209224851153_0_608 | 39 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 39 | [] | [] | 39 | partition |
| 20220209224851153 | 20220209224851153_0_609 | 3 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0_0-235-14661_20220209224851153.parquet | 3 | [] | [] | 3 | partition |
| 20220209224904138 | 20220209224904138_0_5 | 60 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0 | 60 | [] | [] | 60 | partition |
| 20220209224904138 | 20220209224904138_0_6 | 89 | partition | 19a65467-4945-444b-8ec1-4d168fa2a8a2-0
select * from hudi_mor4_rt;
Query 20220209_230102_00002_xn9dw, FAILED, 1 node
Splits: 17 total, 0 done (0.00%)
0:07 [0 rows, 0B] [0 rows/s, 0B/s]
Query 20220209_230102_00002_xn9dw failed: readDirect unsupported in RemoteBlockReader
select * from hudi_mor4_ro;
_hoodie_commit_time | _hoodie_commit_seqno | _hoodie_record_key | _hoodie_partition_path | _h
---------------------+-------------------------+--------------------+------------------------+-------------------------------
20220209225514907 | 20220209225514907_0_1 | 82 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_2 | 9 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_3 | 66 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_4 | 21 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_5 | 71 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_6 | 55 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_7 | 10 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_8 | 39 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_9 | 3 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_10 | 60 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_11 | 89 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_12 | 44 | partition | 3030dd9e-640d-4bc3-8d3c-612186
def gen_data(start, stop):
    """Build the test records for the half-open id range ``[start, stop)``.

    Args:
        start: First integer id to generate (inclusive).
        stop: Integer id at which generation stops (exclusive).

    Returns:
        A list with one dict per id ``i`` in ``range(start, stop)``. In each
        dict ``uuid`` and ``ts`` are both ``str(i)``, ``partitionpath`` is
        always ``"partition"``, and ``array_1`` / ``array_2`` are fresh empty
        lists. The list is empty when ``start >= stop``.
    """
    return [
        {
            "uuid": str(i),
            "partitionpath": "partition",
            "array_1": [],  # array does not need to be populated to reproduce issue
            "array_2": [],  # two arrays need to be defined in the schema to reproduce
            "ts": str(i),
        }
        for i in range(start, stop)
    ]
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
# Schema applied when reading the generated records. Every field is declared
# nullable (third argument True). `uuid`, `partitionpath` and `ts` are strings;
# `array_1` and `array_2` are each an array of structs with two string fields.
schema = StructType([
    StructField("uuid", StringType(), True),
    StructField("partitionpath", StringType(), True),
    StructField(
        "array_1",
        ArrayType(StructType([
            StructField("field_1", StringType(), True),
            StructField("field_2", StringType(), True),
        ])),
        True,
    ),
    StructField(
        "array_2",
        ArrayType(StructType([
            StructField("field_3", StringType(), True),
            StructField("field_4", StringType(), True),
        ])),
        True,
    ),
    StructField("ts", StringType(), True),
])
# Path handed to `.save(...)` when the table is written.
destination = "/user/hive/warehouse/hudi_mor3"

# Options passed to the Hudi writer via `.options(**hudi_write_options)`.
hudi_write_options = {
    # Table identity and write behaviour.
    "hoodie.table.name": "hudi_mor3",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.write.table.name": "hudi_mor3",
    "hoodie.datasource.write.table.type": "MERGE_ON_READ",
    # Columns used as partition path and record key.
    "hoodie.datasource.write.partitionpath.field": "partitionpath",
    "hoodie.datasource.write.recordkey.field": "uuid",
    # Hive sync settings.
    "hoodie.datasource.hive_sync.enable": True,
    "hoodie.datasource.hive_sync.jdbcurl": "jdbc:hive2://hiveserver:10000",
    "hoodie.datasource.hive_sync.database": "default",
    "hoodie.datasource.hive_sync.table": "hudi_mor3",
    "hoodie.datasource.hive_sync.partition_fields": "partitionpath",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
}
# First write: records for ids 0-99, saved to `destination` in "overwrite" mode.
# NOTE(review): gen_data returns Python dicts rather than JSON strings; confirm
# spark.read.json parses this RDD as intended.
df = spark.read.json(spark.sparkContext.parallelize(gen_data(0, 100)), schema)
df.write.format("hudi").options(**hudi_write_options).mode("overwrite").save(destination)
# Second write: records for ids 50-99 again, saved in "append" mode. These record
# keys were already written by the first step, and hudi_write_options sets the
# write operation to "upsert".
df = spark.read.json(spark.sparkContext.parallelize(gen_data(50, 100)), schema)
df.write.format("hudi").options(**hudi_write_options).mode("append").save(destination)
# Print rows from default.hudi_mor3_rt.
# NOTE(review): presumably this is the table registered by Hive sync for the
# MERGE_ON_READ table written above; confirm.
spark.sql("select * from default.hudi_mor3_rt").show()
spark.sql("select * from default.hudi_mor3_rt").show()
+-------------------+--------------------+------------------+----------------------+--------------------+----+-------------+-------+-------+---+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path| _hoodie_file_name|uuid|partitionpath|array_1|array_2| ts|
+-------------------+--------------------+------------------+----------------------+--------------------+----+-------------+-------+-------+---+
| 20220209224904138|20220209224904138...| 82| partition|19a65467-4945-444...| 82| partition| []| []| 82|
| 20220209224851153|20220209224851153...| 9| partition|19a65467-4945-444...| 9| partition| []| []| 9|
| 20220209224904138|20220209224904138...| 66| partition|19a65467-4945-444...| 66| partition| []| []| 66|
| 20220209224851153|20220209224851153...| 21| partition|19a65467-4945-444...| 21| partition| []| []| 21|
| 20220209224904138|20220209224904138...| 71| partition|19a65467-4945-444...| 71| partition| []| []| 71|
| 20220209224904138|20220209224904138...| 55| partition|19a65467-4945-444...| 55| partition| []| []| 55|
| 20220209224851153|20220209224851153...| 10| partition|19a65467-4945-444...| 10| partition| []| []| 10|
| 20220209224851153|20220209224851153...| 39| partition|19a65467-4945-444...| 39| partition| []| []| 39|
| 20220209224851153|20220209224851153...| 3| partition|19a65467-4945-444...| 3| partition| []| []| 3|
| 20220209224904138|20220209224904138...| 60| partition|19a65467-4945-444...| 60| partition| []| []| 60|
| 20220209224904138|20220209224904138...| 89| partition|19a65467-4945-444...| 89| partition| []| []| 89|
| 20220209224851153|20220209224851153...| 44| partition|19a65467-4945-444...| 44| partition| []| []| 44|
| 20220209224851153|20220209224851153...| 28| partition|19a65467-4945-444...| 28| partition| []| []| 28|
| 20220209224904138|20220209224904138...| 94| partition|19a65467-4945-444...| 94| partition| []| []| 94|
| 20220209224904138|20220209224904138...| 78| partition|19a65467-4945-444...| 78| partition| []| []| 78|
| 20220209224851153|20220209224851153...| 33| partition|19a65467-4945-444...| 33| partition| []| []| 33|
| 20220209224851153|20220209224851153...| 17| partition|19a65467-4945-444...| 17| partition| []| []| 17|
| 20220209224904138|20220209224904138...| 83| partition|19a65467-4945-444...| 83| partition| []| []| 83|
| 20220209224904138|20220209224904138...| 67| partition|19a65467-4945-444...| 67| partition| []| []| 67|
| 20220209224851153|20220209224851153...| 22| partition|19a65467-4945-444...| 22| partition| []| []| 22|
+-------------------+--------------------+------------------+----------------------+--------------------+----+-------------+-------+-------+---+
only showing top 20 rows
trino:default> select * from hudi_mor4_ro;
_hoodie_commit_time | _hoodie_commit_seqno | _hoodie_record_key | _hoodie_partition_path | _h
---------------------+-------------------------+--------------------+------------------------+-------------------------------
20220209225514907 | 20220209225514907_0_1 | 82 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_2 | 9 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_3 | 66 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_4 | 21 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_5 | 71 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_6 | 55 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_7 | 10 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_16 | 33 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_17 | 17 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_18 | 83 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_19 | 67 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_20 | 22 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_21 | 72 | partition | 3030dd9e-640d-4bc3-8d3c-612186
.
.
.
trino:default> select * from hudi_mor4_rt;
_hoodie_commit_time | _hoodie_commit_seqno | _hoodie_record_key | _hoodie_partition_path | _h
---------------------+-------------------------+--------------------+------------------------+-------------------------------
20220209225514907 | 20220209225514907_0_1 | 82 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_64 | 91 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_65 | 75 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_66 | 30 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_67 | 59 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_68 | 14 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_69 | 80 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_70 | 7 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_71 | 64 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_72 | 48 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_73 | 98 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_74 | 53 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_75 | 37 | partition | 3030dd9e-640d-4bc3-8d3c-612186
20220209225514907 | 20220209225514907_0_76 | 1 | partition | 3030dd9e-640d-4bc3-8d3c-612186
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment