Created January 26, 2018 10:37
Repartition small files on HDFS
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
object App {
def lsFiles(path: String, ss: SparkSession) = {
FileSystem.get(ss.sparkContext.hadoopConfiguration).listStatus(new Path(path)) {
def repackageFile(path: String, out: String, ss: SparkSession) {"compression", "snappy").text(out)
def process(path: String, outPrefix: String, ss: SparkSession) = {
lsFiles(path, ss).foreach { sPath =>
val subDir = lsFiles(sPath, ss)
if (subDir.size > 10) {
println(s"repackaging $subDir")
val output = outPrefix + sPath.split("/").last
repackageFile(sPath, output, ss)
