Created
July 12, 2023 11:18
-
-
Save Megaprog/89313a0ee654925d736ce185db2ca12f to your computer and use it in GitHub Desktop.
Spark local read from JDBC and write to parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.parquet.hadoop.ParquetOutputFormat | |
import org.apache.spark.sql.SaveMode | |
import org.apache.spark.sql.SparkSession | |
fun main() {
    // One-shot local job: pull a single day of USDT-TRX transactions from
    // Postgres over JDBC and dump them to a local parquet file.
    val spark = createSparkSession("test", 1)

    val transactions = spark.read().format("jdbc")
        .option("url", "jdbc:postgresql://localhost:6432/pg-atlas-v2-stg-other?user=postgres")
        .option("query", """select * from usdt_trx_udm_v2_transactions
where consensus_time >= '2023-07-11 00:00:00Z' and consensus_time < '2023-07-12 00:00:00Z'
order by consensus_time, hash""")
        // Stream rows from the server in batches instead of loading the whole result set.
        .option("fetchsize", 5000)
        .load()

    // Overwrite so the job is safely re-runnable for the same day.
    transactions.write()
        .mode(SaveMode.Overwrite)
        .parquet("./transactions.parquet")
}
/**
 * Creates (or reuses) a [SparkSession] configured for single-machine batch work.
 *
 * The session runs with a `local[threads]` master, the web UI disabled, UTC
 * session time zone with java.time (Java 8) date/time API enabled, Kryo
 * serialization, and a parquet writer tuned for large row groups with snappy
 * compression (PARQUET_2_0 writer version).
 *
 * @param name    application name reported by Spark
 * @param threads number of local worker threads for the `local[...]` master
 * @return a configured [SparkSession] (an existing one may be returned by [SparkSession.Builder.getOrCreate])
 */
fun createSparkSession(name: String, threads: Int): SparkSession {
    val mib = 1024L * 1024L // bytes per mebibyte, for the parquet size settings below
    return SparkSession.builder()
        .appName(name)
        .master("local[$threads]")
        // Headless batch job: no need for the Spark web UI.
        .config("spark.ui.enabled", false)
        .config("spark.sql.caseSensitive", true)
        // Use java.time types for dates/timestamps instead of java.sql.
        .config("spark.sql.datetime.java8API.enabled", true)
        .config("spark.sql.legacy.utcTimestampFunc.enabled", true)
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.parquet.enableVectorizedReader", false)
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1g")
        // Parquet writer tuning: snappy compression, large row groups and pages.
        .config(ParquetOutputFormat.COMPRESSION, "snappy")
        .config(ParquetOutputFormat.BLOCK_SIZE, 512 * mib)
        .config(ParquetOutputFormat.PAGE_SIZE, 16 * mib)
        .config(ParquetOutputFormat.DICTIONARY_PAGE_SIZE, 32 * mib)
        .config(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, 8 * mib)
        .config(ParquetOutputFormat.MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, 500)
        .config(ParquetOutputFormat.WRITER_VERSION, "PARQUET_2_0")
        // Allow large collect() results back to the driver.
        .config("spark.driver.maxResultSize", "4g")
        .getOrCreate()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment