Hungsiro506 · August 4, 2017 01:22
diff --git a/spark_jdbc_upsert.scala b/spark_jdbc_upsert.scala
 //////////////////// Upsert //////////////////////////////////////////////////////////////////
 import java.sql._
 // Template for a batched JDBC write executed once per partition. Every quoted
 // literal below ("NUMBER OF WORKERS", "JDBCURL", "YOUR PREPARED STATEMENT",
 // "# Of Rows you want per batch") is a placeholder to replace with a real value.
 dataframe.coalesce("NUMBER OF WORKERS").mapPartitions((d) => Iterator(d)).foreach { batch =>
   val dbc: Connection = DriverManager.getConnection("JDBCURL")
   try {
     val st: PreparedStatement = dbc.prepareStatement("YOUR PREPARED STATEMENT")
     try {
       // Send the partition's rows to the database in fixed-size chunks.
       batch.grouped("# Of Rows you want per batch").foreach { session =>
         session.foreach { x =>
           // Binds statement parameter 1 from row field index 1; adjust both
           // indices to match the real statement and row layout.
           st.setDouble(1, x.getDouble(1))
           st.addBatch()
         }
         st.executeBatch()
       }
     } finally {
       // Release the statement even when a batch fails.
       st.close()
     }
   } finally {
     // Always release the connection, including when preparing or executing throws.
     dbc.close()
   }
 }
 //// PG insert on conflict ///////////////////////
 INSERT INTO the_table (id, column_1, column_2) 
 VALUES (1, 'A', 'X'), (2, 'B', 'Y'), (3, 'C', 'Z')
 ON CONFLICT (id) DO UPDATE 
  SET column_1 = excluded.column_1, 
      column_2 = excluded.column_2;



 //////////////////////////////////////////////////////////////////////////////////////////////


 /**
  * Builds the parameterized `INSERT INTO ... VALUES (?, ...)` statement for `table`.
  *
  * The column list follows the field order of `rddSchema`. When `tableSchema` is
  * supplied, each column is emitted under the name found in `tableSchema` (matched
  * case-sensitively or case-insensitively depending on `isCaseSensitive`), so the
  * existing table's spelling is used rather than the RDD's. All names are quoted via
  * `dialect.quoteIdentifier`.
  *
  * @param table           target table name, interpolated into the statement as-is
  * @param rddSchema       schema of the data being written; drives column order and count
  * @param tableSchema     schema of the existing table, if known
  * @param isCaseSensitive whether column names are compared case-sensitively
  * @param dialect         JDBC dialect used to quote identifiers
  * @return the insert statement with one `?` placeholder per `rddSchema` field
  * @throws AnalysisException if an `rddSchema` column has no match in `tableSchema`
  */
 def getInsertStatement(
      table: String,
      rddSchema: StructType,
      tableSchema: Option[StructType],
      isCaseSensitive: Boolean,
      dialect: JdbcDialect): String = {
    val quotedColumns: Array[String] = tableSchema match {
      case None =>
        // No table schema to reconcile against: use the RDD's own column names.
        rddSchema.fields.map(field => dialect.quoteIdentifier(field.name))

      case Some(existing) =>
        val sameName =
          if (isCaseSensitive) org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
          else org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution
        val existingNames = existing.fieldNames
        // Keep rddSchema's ordering but take the spelling from the table schema.
        rddSchema.fields.map { col =>
          existingNames.find(candidate => sameName(candidate, col.name)) match {
            case Some(matched) => dialect.quoteIdentifier(matched)
            case None =>
              throw new AnalysisException(
                s"""Column "${col.name}" not found in schema $tableSchema""")
          }
        }
    }
    val columns = quotedColumns.mkString(",")
    val placeholders = Array.fill(rddSchema.fields.length)("?").mkString(",")
    s"INSERT INTO $table ($columns) VALUES ($placeholders)"
  }

  /**
   * Writes `df` to the table named in `options` by running `savePartition` on every
   * partition with a shared insert statement.
   *
   * If `options.numPartitions` is set and smaller than the current partition count,
   * the DataFrame is coalesced down to that many partitions first.
   *
   * @param df              data to write
   * @param tableSchema     schema of the existing table, if known; forwarded to
   *                        `getInsertStatement`
   * @param isCaseSensitive whether column names are compared case-sensitively
   * @param options         JDBC options supplying url, table, batch size, isolation
   *                        level and the optional partition count
   * @throws IllegalArgumentException if `options.numPartitions` is set to a value <= 0
   */
  def saveTable(
      df: DataFrame,
      tableSchema: Option[StructType],
      isCaseSensitive: Boolean,
      options: JDBCOptions): Unit = {
    // Pull everything the per-partition closure needs into local vals up front.
    val url = options.url
    val table = options.table
    val dialect = JdbcDialects.get(url)
    val rddSchema = df.schema
    val getConnection: () => Connection = createConnectionFactory(options)
    val batchSize = options.batchSize
    val isolationLevel = options.isolationLevel

    val insertStmt = getInsertStatement(table, rddSchema, tableSchema, isCaseSensitive, dialect)

    val requestedPartitions = options.numPartitions
    // Reject a non-positive partition count before doing any work.
    requestedPartitions.foreach { n =>
      if (n <= 0) {
        throw new IllegalArgumentException(
          s"Invalid value `$n` for parameter `${JDBCOptions.JDBC_NUM_PARTITIONS}` in table writing " +
            "via JDBC. The minimum value is 1.")
      }
    }
    // Only ever reduce the partition count; otherwise keep the DataFrame as-is.
    val repartitionedDF = requestedPartitions
      .filter(n => n < df.rdd.getNumPartitions)
      .fold(df)(n => df.coalesce(n))

    repartitionedDF.rdd.foreachPartition { rows =>
      savePartition(
        getConnection, table, rows, rddSchema, insertStmt, batchSize, dialect, isolationLevel)
    }
  }
	//////////////////// Upsert //////////////////////////////////////////////////////////////////
	import java.sql._
	// Template for a batched JDBC write executed once per partition. Every quoted
	// literal below ("NUMBER OF WORKERS", "JDBCURL", "YOUR PREPARED STATEMENT",
	// "# Of Rows you want per batch") is a placeholder to replace with a real value.
	dataframe.coalesce("NUMBER OF WORKERS").mapPartitions((d) => Iterator(d)).foreach { batch =>
	  val dbc: Connection = DriverManager.getConnection("JDBCURL")
	  try {
	    val st: PreparedStatement = dbc.prepareStatement("YOUR PREPARED STATEMENT")
	    try {
	      // Send the partition's rows to the database in fixed-size chunks.
	      batch.grouped("# Of Rows you want per batch").foreach { session =>
	        session.foreach { x =>
	          // Binds statement parameter 1 from row field index 1; adjust both
	          // indices to match the real statement and row layout.
	          st.setDouble(1, x.getDouble(1))
	          st.addBatch()
	        }
	        st.executeBatch()
	      }
	    } finally {
	      // Release the statement even when a batch fails.
	      st.close()
	    }
	  } finally {
	    // Always release the connection, including when preparing or executing throws.
	    dbc.close()
	  }
	}
	//// PG insert on conflict ///////////////////////
	INSERT INTO the_table (id, column_1, column_2)
	VALUES (1, 'A', 'X'), (2, 'B', 'Y'), (3, 'C', 'Z')
	ON CONFLICT (id) DO UPDATE
	SET column_1 = excluded.column_1,
	column_2 = excluded.column_2;



	//////////////////////////////////////////////////////////////////////////////////////////////


	/**
	 * Builds the parameterized `INSERT INTO ... VALUES (?, ...)` statement for `table`.
	 *
	 * The column list follows the field order of `rddSchema`. When `tableSchema` is
	 * supplied, each column is emitted under the name found in `tableSchema` (matched
	 * case-sensitively or case-insensitively depending on `isCaseSensitive`), so the
	 * existing table's spelling is used rather than the RDD's. All names are quoted via
	 * `dialect.quoteIdentifier`.
	 *
	 * @param table           target table name, interpolated into the statement as-is
	 * @param rddSchema       schema of the data being written; drives column order and count
	 * @param tableSchema     schema of the existing table, if known
	 * @param isCaseSensitive whether column names are compared case-sensitively
	 * @param dialect         JDBC dialect used to quote identifiers
	 * @return the insert statement with one `?` placeholder per `rddSchema` field
	 * @throws AnalysisException if an `rddSchema` column has no match in `tableSchema`
	 */
	def getInsertStatement(
	    table: String,
	    rddSchema: StructType,
	    tableSchema: Option[StructType],
	    isCaseSensitive: Boolean,
	    dialect: JdbcDialect): String = {
	  val quotedColumns: Array[String] = tableSchema match {
	    case None =>
	      // No table schema to reconcile against: use the RDD's own column names.
	      rddSchema.fields.map(field => dialect.quoteIdentifier(field.name))

	    case Some(existing) =>
	      val sameName =
	        if (isCaseSensitive) org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
	        else org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution
	      val existingNames = existing.fieldNames
	      // Keep rddSchema's ordering but take the spelling from the table schema.
	      rddSchema.fields.map { col =>
	        existingNames.find(candidate => sameName(candidate, col.name)) match {
	          case Some(matched) => dialect.quoteIdentifier(matched)
	          case None =>
	            throw new AnalysisException(
	              s"""Column "${col.name}" not found in schema $tableSchema""")
	        }
	      }
	  }
	  val columns = quotedColumns.mkString(",")
	  val placeholders = Array.fill(rddSchema.fields.length)("?").mkString(",")
	  s"INSERT INTO $table ($columns) VALUES ($placeholders)"
	}

	/**
	 * Writes `df` to the table named in `options` by running `savePartition` on every
	 * partition with a shared insert statement.
	 *
	 * If `options.numPartitions` is set and smaller than the current partition count,
	 * the DataFrame is coalesced down to that many partitions first.
	 *
	 * @param df              data to write
	 * @param tableSchema     schema of the existing table, if known; forwarded to
	 *                        `getInsertStatement`
	 * @param isCaseSensitive whether column names are compared case-sensitively
	 * @param options         JDBC options supplying url, table, batch size, isolation
	 *                        level and the optional partition count
	 * @throws IllegalArgumentException if `options.numPartitions` is set to a value <= 0
	 */
	def saveTable(
	    df: DataFrame,
	    tableSchema: Option[StructType],
	    isCaseSensitive: Boolean,
	    options: JDBCOptions): Unit = {
	  // Pull everything the per-partition closure needs into local vals up front.
	  val url = options.url
	  val table = options.table
	  val dialect = JdbcDialects.get(url)
	  val rddSchema = df.schema
	  val getConnection: () => Connection = createConnectionFactory(options)
	  val batchSize = options.batchSize
	  val isolationLevel = options.isolationLevel

	  val insertStmt = getInsertStatement(table, rddSchema, tableSchema, isCaseSensitive, dialect)

	  val requestedPartitions = options.numPartitions
	  // Reject a non-positive partition count before doing any work.
	  requestedPartitions.foreach { n =>
	    if (n <= 0) {
	      throw new IllegalArgumentException(
	        s"Invalid value `$n` for parameter `${JDBCOptions.JDBC_NUM_PARTITIONS}` in table writing " +
	          "via JDBC. The minimum value is 1.")
	    }
	  }
	  // Only ever reduce the partition count; otherwise keep the DataFrame as-is.
	  val repartitionedDF = requestedPartitions
	    .filter(n => n < df.rdd.getNumPartitions)
	    .fold(df)(n => df.coalesce(n))

	  repartitionedDF.rdd.foreachPartition { rows =>
	    savePartition(
	      getConnection, table, rows, rddSchema, insertStmt, batchSize, dialect, isolationLevel)
	  }
	}
No results found