Created November 16, 2012 20:29
diff --git a/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java b/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java
new file mode 100644
index 0000000..6e2ec68
--- /dev/null
+++ b/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010 Mozilla Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.mozilla.socorro.pig.eval;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+public class OperatingSystemBag extends EvalFunc<DataBag> {
+
+    private static final String OPERATING_SYSTEM_PREFIX = "OS|";
+    private static final Pattern newlinePattern = Pattern.compile("\n");
+    private static final Pattern pipePattern = Pattern.compile("\\|");
+
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0) {
+            return null;
+        }
+
+        reporter.progress();
+        DataBag db = bagFactory.newDefaultBag();
+        for (String dumpline : newlinePattern.split((String)input.get(0))) {
+            if (dumpline.startsWith(OPERATING_SYSTEM_PREFIX)) {
+                // os_name, os_version
+                String[] splits = pipePattern.split(dumpline, -1);
+                Tuple t = tupleFactory.newTuple(splits.length-1);
+                for (int i=1; i < splits.length; i++) {
+                    t.set(i-1, splits[i]);
+                }
+                if (t.size() > 0) {
+                    db.add(t);
+                }
+            }
+        }
+
+        return db;
+    }
+}
diff --git a/src/main/pig/crashstats.pig b/src/main/pig/crashstats.pig
index 9233c00..3f4d977 100644
--- a/src/main/pig/crashstats.pig
+++ b/src/main/pig/crashstats.pig
@@ -1,5 +1,5 @@
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
-REGISTER 'lib/akela-0.4-SNAPSHOT.jar'
+REGISTER 'akela-0.4-SNAPSHOT.jar'
SET pig.logfile socorro-crashstats.log;
SET default_parallel 30;
diff --git a/src/main/pig/dumpsizetrends.pig b/src/main/pig/dumpsizetrends.pig
index a15401e..19920f3 100644
--- a/src/main/pig/dumpsizetrends.pig
+++ b/src/main/pig/dumpsizetrends.pig
@@ -2,13 +2,15 @@ REGISTER 'akela-0.4-SNAPSHOT.jar'
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
SET pig.logfile socorro-dumpsizetrends.log;
-SET default_parallel 2;
+SET default_parallel 10;
+SET mapred.task.timeout 1800000;
SET mapred.compress.map.output true;
SET mapred.map.output.compression.codec org.apache.hadoop.io.compress.SnappyCodec;
DEFINE JsonMap com.mozilla.pig.eval.json.JsonMap();
DEFINE FormatDate com.mozilla.pig.eval.date.FormatDate('yyyy-MM-dd');
DEFINE BytesSize com.mozilla.pig.eval.BytesSize();
+DEFINE OperatingSystemBag com.mozilla.socorro.pig.eval.OperatingSystemBag();
raw = LOAD 'hbase://crash_reports' USING com.mozilla.pig.load.HBaseMultiScanLoader('$start_date', '$end_date',
        'yyMMdd',
@@ -18,11 +20,13 @@ raw = LOAD 'hbase://crash_reports' USING com.mozilla.pig.load.HBaseMultiScanLoad
        meta_json:chararray,
        processed_json:chararray,
        raw_dump:bytearray);
-gen_data = FOREACH raw GENERATE JsonMap(meta_json) AS meta_json_map:map[], meta_json, processed_json, raw_dump;
-filtered_data = FILTER gen_data BY meta_json_map#'timestamp' IS NOT NULL;
+gen_data = FOREACH raw GENERATE JsonMap(meta_json) AS meta_json_map:map[], meta_json, processed_json, JsonMap(processed_json) AS processed_json_map:map[], raw_dump;
+filtered_data = FILTER gen_data BY meta_json_map#'timestamp' IS NOT NULL
+        AND processed_json_map#'dump' IS NOT NULL;
sizes = FOREACH filtered_data GENERATE FormatDate((meta_json_map#'timestamp'*1000.0)) AS day,
        meta_json_map#'ProductName' AS product_name:chararray,
        meta_json_map#'Version' AS product_version:chararray,
+        FLATTEN(OperatingSystemBag(processed_json_map#'dump')) AS (os_name:chararray, os_version:chararray),
        (meta_json_map#'ProductID' IS NULL OR meta_json_map#'ProductID' != '{aa3c5121-dab2-40e2-81ca-7ea25febc110}' ? 1 : 0) AS is_xul:int,
        BytesSize(raw_dump) AS raw_size:long,
        BytesSize(meta_json) AS meta_size:long,
@@ -34,8 +38,8 @@ filtered_sizes = FILTER sizes BY product_name == 'Firefox' OR
        product_name == 'B2G';
STORE filtered_sizes INTO '$start_date-$end_date-dumpsizes' USING PigStorage();
-grouped = GROUP filtered_sizes BY (day,product_name,product_version,is_xul);
-daily_sums = FOREACH grouped GENERATE FLATTEN(group) AS (day,product_name,product_version,is_xul),
+grouped = GROUP filtered_sizes BY (day,product_name,product_version,os_name,is_xul);
+daily_sums = FOREACH grouped GENERATE FLATTEN(group) AS (day,product_name,product_version,os_name,is_xul),
        COUNT(filtered_sizes) AS doc_count:long,
        AVG(filtered_sizes.raw_size) AS avg_raw_size:double,
        AVG(filtered_sizes.meta_size) AS avg_meta_size:double,
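
For reference, a minimal standalone sketch of what the new UDF does: OperatingSystemBag scans the pipe-delimited processed dump text, keeps only lines beginning with "OS|", and emits the remaining pipe-separated fields as a tuple, which the FLATTEN in dumpsizetrends.pig names (os_name, os_version). The input path 'sample_dumps' and the aliases below are hypothetical, and the LOAD assumes each record already carries a whole dump as a single chararray field (a multi-line dump would not arrive that way with the default PigStorage loader).

-- Hypothetical smoke test for the UDF; path and aliases are examples only.
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
DEFINE OperatingSystemBag com.mozilla.socorro.pig.eval.OperatingSystemBag();

-- Assumes each record holds an entire processed dump as one chararray field.
dumps = LOAD 'sample_dumps' AS (dump:chararray);

-- A dump line such as "OS|Windows NT|6.1.7601 Service Pack 1" yields one
-- (os_name, os_version) tuple in the bag returned by the UDF.
os_info = FOREACH dumps GENERATE FLATTEN(OperatingSystemBag(dump)) AS (os_name:chararray, os_version:chararray);
DUMP os_info;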