@phstudy
Created November 17, 2014 03:58
Spark720pStats
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class Spark720pStats {

    public static void main(String[] args) {
        String appName = "720pStats";
        String master = "local[10]";

        SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Project-specific setup (e.g. S3 credentials); the Configuration class is not included in this gist.
        Configuration.configure(sc);

        // Count requests for 720p .mp4 files per bitrate variant ("2m", "3m", "4m").
        JavaPairRDD<String, Integer> counts = sc.textFile("s3n://mycdn.logs/*")
                // Each log line is tab-separated with 19 fields; field 7 holds the requested path.
                .map(line -> line.split("\t"))
                .map(fields -> (fields.length != 19) ? "" : fields[7])
                .filter(path -> StringUtils.endsWithIgnoreCase(path, ".mp4"))
                .filter(path -> StringUtils.contains(path, "720p"))
                // Key each request by its bitrate variant, then sum the counts per key.
                .mapToPair(path -> new Tuple2<>(
                        Arrays.asList("2m", "3m", "4m").stream()
                                .filter(path::contains)
                                .findFirst()
                                .orElse("unknown"),
                        1))
                .reduceByKey((a, b) -> a + b);

        System.out.println(counts.collect());

        sc.stop();
    }
}
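
The job calls Configuration.configure(sc), a helper that is not part of this gist. A minimal sketch of what such a helper might look like, assuming its only job is to put s3n credentials on the Hadoop configuration (the class name and the placeholder key values are assumptions, not the author's actual code):

import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical stand-in for the missing Configuration helper.
public class Configuration {

    public static void configure(JavaSparkContext sc) {
        // Reading s3n:// paths requires AWS credentials on the Hadoop configuration.
        // Replace the placeholders with real keys, or supply them another way
        // (environment variables, core-site.xml, IAM roles, ...).
        sc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "YOUR_ACCESS_KEY");
        sc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET_KEY");
    }
}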