Created
May 28, 2020 19:37
-
-
Save syedatifakhtar/e67fbd53d97b964d430e19f077c94fed to your computer and use it in GitHub Desktop.
Generate random records from a JSON schema in Scala, without Spark.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{BufferedWriter, DataOutputStream, File, FileWriter}
import java.nio.charset.StandardCharsets
import java.nio.file.Files

import scala.io.Source.fromFile
import scala.util.Random
import scala.util.parsing.json.JSON
/**
 * Command-line tool that writes randomly generated, delimiter-separated records
 * for one or more tables, driven by per-table JSON schema files.
 *
 * Expected arguments (each in `--key=value` form):
 *  - `schemaBasePath`: directory containing one `<table>.json` file per table
 *  - `outputBasePath`: directory under which `<table>/part-0000.csv` is written
 *  - `numRecords`: number of rows to generate per table
 *  - `tables`: comma-separated table names
 *
 * Each schema file is a flat JSON object mapping a column name to one of the
 * type names registered in [[generators]].
 */
object DBGenSansSpark {
  // Not read anywhere inside this object; kept because they are public members.
  val master = "local[2]"
  val appName = "testing"
  val r = scala.util.Random

  /** Separator written between the column values of a record. */
  private val FieldSeparator = "\u0001"

  /**
   * Random value generators keyed by schema type name.
   *
   * Held in a `val` so the map is built once rather than on every lookup.
   */
  val generators: Map[String, Unit => Any] = Map[String, Unit => Any](
    "INT" -> (_ => r.nextInt(100)),
    "STRING" -> (_ => Random.alphanumeric.take(8).mkString("")),
    "DOUBLE" -> (_ => r.nextDouble()),
    "LONG" -> (_ => r.nextLong())
  )

  type JSONMap = Map[String, String]

  /**
   * Parses `--key=value` arguments into a map.
   *
   * Only the leading `--` and the first `=` are treated as syntax, so values may
   * themselves contain `=` or `--` (for example file paths). An empty value
   * (`--key=`) is accepted.
   *
   * @param args raw command-line arguments
   * @return map from argument name to its value
   * @throws IllegalArgumentException if an argument is not of the form `--key=value`
   */
  def argsParser(args: Array[String]): Map[String, String] =
    args.map { arg =>
      arg.stripPrefix("--").split("=", 2) match {
        case Array(key, value) if arg.startsWith("--") && key.nonEmpty =>
          key -> value
        case _ =>
          throw new IllegalArgumentException(
            s"Malformed argument '$arg'; expected the form --key=value")
      }
    }.toMap

  /**
   * Entry point: loads every requested schema, then writes one output file per table.
   *
   * @param args command-line arguments, see the object documentation
   * @throws IllegalArgumentException if a required argument is missing or malformed,
   *                                  or if a schema file is invalid
   */
  def main(args: Array[String]): Unit = {
    val argsMap = argsParser(args)
    val schemaBasePath = requiredArg(argsMap, "schemaBasePath") // /Users/$user/workspace/test_data/schemas
    val outputBasePath = requiredArg(argsMap, "outputBasePath") // /Users/$user/workspace/test_data/output
    val numRecords = requiredArg(argsMap, "numRecords").toInt
    val tableNames = requiredArg(argsMap, "tables") // table1,table2
      .split(",").map(_.trim).filter(_.nonEmpty).distinct.toSeq

    // Load and validate every schema before any output is produced, so a bad
    // schema fails the run without leaving partially written tables behind.
    val schemas: Seq[(String, JSONMap)] = tableNames.map { tableName =>
      tableName -> loadSchema(s"${schemaBasePath}/${tableName}.json")
    }

    schemas.foreach { case (tableName, schema) =>
      writeTable(tableName, schema, numRecords, outputBasePath)
    }
  }

  /** Returns the value of a mandatory argument or fails with a descriptive error. */
  private def requiredArg(argsMap: Map[String, String], name: String): String =
    argsMap.getOrElse(
      name,
      throw new IllegalArgumentException(s"Missing required argument --$name=<value>"))

  /**
   * Reads a schema file and checks that it is a JSON object whose values are all
   * type names known to [[generators]]. The file handle is always released.
   */
  private def loadSchema(path: String): JSONMap = {
    val source = fromFile(path, "UTF-8")
    val text = try source.getLines().mkString finally source.close()
    JSON.parseFull(text) match {
      case Some(fields: Map[_, _]) =>
        fields.iterator.map {
          case (column: String, typeName: String) if generators.contains(typeName) =>
            column -> typeName
          case (column, other) =>
            throw new IllegalArgumentException(
              s"Schema $path: column '$column' has unsupported type '$other'; " +
                s"supported types are ${generators.keys.toSeq.sorted.mkString(", ")}")
        }.toMap
      case _ =>
        throw new IllegalArgumentException(s"Schema $path does not contain a valid JSON object")
    }
  }

  /**
   * Generates `numRecords` rows for one table and writes them to
   * `<outputBasePath>/<tableName>/part-0000.csv`, one record per line.
   */
  private def writeTable(tableName: String,
                         schema: JSONMap,
                         numRecords: Int,
                         outputBasePath: String): Unit = {
    val tableOutputPath = s"$outputBasePath/$tableName/part-0000.csv"
    println(s"Writing files for table: $tableName at location $tableOutputPath")
    val file = new File(tableOutputPath)
    // Unlike File.mkdirs(), this raises an exception when the directory cannot be created.
    Files.createDirectories(file.getParentFile.toPath)
    val bw: BufferedWriter = Files.newBufferedWriter(file.toPath, StandardCharsets.UTF_8)
    try {
      // Rows are produced one at a time and written immediately, so memory use
      // does not grow with numRecords.
      Iterator.fill(numRecords)(generateRowForSchemaType(schema)).foreach { record =>
        bw.write(record.mkString(FieldSeparator))
        bw.newLine()
      }
    } finally {
      bw.close() // also flushes any buffered output
    }
    println(s"Write completed for files for table: $tableName at location $tableOutputPath")
  }

  /**
   * Produces one random row for the given schema.
   *
   * Values follow the iteration order of the schema map, which is not guaranteed
   * to match the order in which columns appear in the schema file.
   *
   * @throws IllegalArgumentException if the schema names a type with no generator
   */
  private def generateRowForSchemaType(json: JSONMap): Seq[Any] =
    json.values.map { typeName =>
      val generate = generators.getOrElse(
        typeName,
        throw new IllegalArgumentException(s"No generator registered for type '$typeName'"))
      generate(())
    }.toVector
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment