Created
July 12, 2023 11:18
-
-
Save Megaprog/89313a0ee654925d736ce185db2ca12f to your computer and use it in GitHub Desktop.
Spark local read from JDBC and write to parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.parquet.hadoop.ParquetOutputFormat | |
import org.apache.spark.sql.SaveMode | |
import org.apache.spark.sql.SparkSession | |
fun main() {
    // One-shot local job: pull a single day of USDT-TRX transactions from
    // Postgres over JDBC and dump them to a local parquet file.
    val spark = createSparkSession("test", 1)

    val transactions = spark.read().format("jdbc")
        .option("url", "jdbc:postgresql://localhost:6432/pg-atlas-v2-stg-other?user=postgres")
        .option("query", """select * from usdt_trx_udm_v2_transactions
where consensus_time >= '2023-07-11 00:00:00Z' and consensus_time < '2023-07-12 00:00:00Z'
order by consensus_time, hash""")
        // Stream rows from the server in batches instead of loading the whole result set.
        .option("fetchsize", 5000)
        .load()

    // Overwrite so the job is safely re-runnable for the same day.
    transactions.write()
        .mode(SaveMode.Overwrite)
        .parquet("./transactions.parquet")
}
/**
 * Creates (or reuses) a [SparkSession] configured for single-machine batch work.
 *
 * The session runs with a `local[threads]` master, the web UI disabled, UTC
 * session time zone with java.time (Java 8) date/time API enabled, Kryo
 * serialization, and a parquet writer tuned for large row groups with snappy
 * compression (PARQUET_2_0 writer version).
 *
 * @param name    application name reported by Spark
 * @param threads number of local worker threads for the `local[...]` master
 * @return a configured [SparkSession] (an existing one may be returned by [SparkSession.Builder.getOrCreate])
 */
fun createSparkSession(name: String, threads: Int): SparkSession {
    val mib = 1024L * 1024L // bytes per mebibyte, for the parquet size settings below
    return SparkSession.builder()
        .appName(name)
        .master("local[$threads]")
        // Headless batch job: no need for the Spark web UI.
        .config("spark.ui.enabled", false)
        .config("spark.sql.caseSensitive", true)
        // Use java.time types for dates/timestamps instead of java.sql.
        .config("spark.sql.datetime.java8API.enabled", true)
        .config("spark.sql.legacy.utcTimestampFunc.enabled", true)
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.parquet.enableVectorizedReader", false)
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1g")
        // Parquet writer tuning: snappy compression, large row groups and pages.
        .config(ParquetOutputFormat.COMPRESSION, "snappy")
        .config(ParquetOutputFormat.BLOCK_SIZE, 512 * mib)
        .config(ParquetOutputFormat.PAGE_SIZE, 16 * mib)
        .config(ParquetOutputFormat.DICTIONARY_PAGE_SIZE, 32 * mib)
        .config(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, 8 * mib)
        .config(ParquetOutputFormat.MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, 500)
        .config(ParquetOutputFormat.WRITER_VERSION, "PARQUET_2_0")
        // Allow large collect() results back to the driver.
        .config("spark.driver.maxResultSize", "4g")
        .getOrCreate()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment