@rhelmer
Created November 16, 2012 20:29
diff --git a/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java b/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java
new file mode 100644
index 0000000..6e2ec68
--- /dev/null
+++ b/src/main/java/com/mozilla/socorro/pig/eval/OperatingSystemBag.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010 Mozilla Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.mozilla.socorro.pig.eval;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+public class OperatingSystemBag extends EvalFunc<DataBag> {
+
+ private static final String OPERATING_SYSTEM_PREFIX = "OS|";
+ private static final Pattern newlinePattern = Pattern.compile("\n");
+ private static final Pattern pipePattern = Pattern.compile("\\|");
+
+ private static final BagFactory bagFactory = BagFactory.getInstance();
+ private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+
+ public DataBag exec(Tuple input) throws IOException {
+ if (input == null || input.size() == 0) {
+ return null;
+ }
+
+ reporter.progress();
+ DataBag db = bagFactory.newDefaultBag();
+ for (String dumpline : newlinePattern.split((String)input.get(0))) {
+ if (dumpline.startsWith(OPERATING_SYSTEM_PREFIX)) {
+ // os_name, os_version
+ String[] splits = pipePattern.split(dumpline, -1);
+ Tuple t = tupleFactory.newTuple(splits.length-1);
+ for (int i=1; i < splits.length; i++) {
+ t.set(i-1, splits[i]);
+ }
+ if (t.size() > 0) {
+ db.add(t);
+ }
+ }
+ }
+
+ return db;
+ }
+}
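
For reference, the new UDF splits the processed dump on newlines, keeps lines beginning with "OS|", and splits the remainder on pipes into an (os_name, os_version) tuple. A minimal standalone sketch of that parsing, outside of Pig (the sample dump line and class name are illustrative, not part of the patch):

    import java.util.Arrays;
    import java.util.regex.Pattern;

    public class OperatingSystemBagSketch {
        private static final String OPERATING_SYSTEM_PREFIX = "OS|";
        private static final Pattern PIPE = Pattern.compile("\\|");

        public static void main(String[] args) {
            // Hypothetical processed-dump excerpt in the pipe-delimited
            // minidump_stackwalk format the UDF expects.
            String dump = "OS|Windows NT|6.1.7601 Service Pack 1\n"
                        + "CPU|x86|GenuineIntel family 6 model 23 stepping 10|2";
            for (String line : dump.split("\n")) {
                if (line.startsWith(OPERATING_SYSTEM_PREFIX)) {
                    String[] fields = PIPE.split(line, -1);
                    // Drop the leading "OS" marker; the remaining fields become
                    // the (os_name, os_version) tuple added to the output bag.
                    System.out.println(Arrays.toString(
                            Arrays.copyOfRange(fields, 1, fields.length)));
                }
            }
        }
    }

Running this prints [Windows NT, 6.1.7601 Service Pack 1], mirroring the tuple the UDF would put into its DataBag for that dump.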
diff --git a/src/main/pig/crashstats.pig b/src/main/pig/crashstats.pig
index 9233c00..3f4d977 100644
--- a/src/main/pig/crashstats.pig
+++ b/src/main/pig/crashstats.pig
@@ -1,5 +1,5 @@
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
-REGISTER 'lib/akela-0.4-SNAPSHOT.jar'
+REGISTER 'akela-0.4-SNAPSHOT.jar'
SET pig.logfile socorro-crashstats.log;
SET default_parallel 30;
diff --git a/src/main/pig/dumpsizetrends.pig b/src/main/pig/dumpsizetrends.pig
index a15401e..19920f3 100644
--- a/src/main/pig/dumpsizetrends.pig
+++ b/src/main/pig/dumpsizetrends.pig
@@ -2,13 +2,15 @@ REGISTER 'akela-0.4-SNAPSHOT.jar'
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
SET pig.logfile socorro-dumpsizetrends.log;
-SET default_parallel 2;
+SET default_parallel 10;
+SET mapred.task.timeout 1800000;
SET mapred.compress.map.output true;
SET mapred.map.output.compression.codec org.apache.hadoop.io.compress.SnappyCodec;
DEFINE JsonMap com.mozilla.pig.eval.json.JsonMap();
DEFINE FormatDate com.mozilla.pig.eval.date.FormatDate('yyyy-MM-dd');
DEFINE BytesSize com.mozilla.pig.eval.BytesSize();
+DEFINE OperatingSystemBag com.mozilla.socorro.pig.eval.OperatingSystemBag();
raw = LOAD 'hbase://crash_reports' USING com.mozilla.pig.load.HBaseMultiScanLoader('$start_date', '$end_date',
'yyMMdd',
@@ -18,11 +20,13 @@ raw = LOAD 'hbase://crash_reports' USING com.mozilla.pig.load.HBaseMultiScanLoad
meta_json:chararray,
processed_json:chararray,
raw_dump:bytearray);
-gen_data = FOREACH raw GENERATE JsonMap(meta_json) AS meta_json_map:map[], meta_json, processed_json, raw_dump;
-filtered_data = FILTER gen_data BY meta_json_map#'timestamp' IS NOT NULL;
+gen_data = FOREACH raw GENERATE JsonMap(meta_json) AS meta_json_map:map[], meta_json, processed_json, JsonMap(processed_json) AS processed_json_map:map[], raw_dump;
+filtered_data = FILTER gen_data BY meta_json_map#'timestamp' IS NOT NULL
+ AND processed_json_map#'dump' IS NOT NULL;
sizes = FOREACH filtered_data GENERATE FormatDate((meta_json_map#'timestamp'*1000.0)) AS day,
meta_json_map#'ProductName' AS product_name:chararray,
meta_json_map#'Version' AS product_version:chararray,
+ FLATTEN(OperatingSystemBag(processed_json_map#'dump')) AS (os_name:chararray, os_version:chararray),
(meta_json_map#'ProductID' IS NULL OR meta_json_map#'ProductID' != '{aa3c5121-dab2-40e2-81ca-7ea25febc110}' ? 1 : 0) AS is_xul:int,
BytesSize(raw_dump) AS raw_size:long,
BytesSize(meta_json) AS meta_size:long,
@@ -34,8 +38,8 @@ filtered_sizes = FILTER sizes BY product_name == 'Firefox' OR
product_name == 'B2G';
STORE filtered_sizes INTO '$start_date-$end_date-dumpsizes' USING PigStorage();
-grouped = GROUP filtered_sizes BY (day,product_name,product_version,is_xul);
-daily_sums = FOREACH grouped GENERATE FLATTEN(group) AS (day,product_name,product_version,is_xul),
+grouped = GROUP filtered_sizes BY (day,product_name,product_version,os_name,is_xul);
+daily_sums = FOREACH grouped GENERATE FLATTEN(group) AS (day,product_name,product_version,os_name,is_xul),
COUNT(filtered_sizes) AS doc_count:long,
AVG(filtered_sizes.raw_size) AS avg_raw_size:double,
AVG(filtered_sizes.meta_size) AS avg_meta_size:double,
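
One behavioral note on the FLATTEN(OperatingSystemBag(...)) projection above: FLATTEN emits one output row per (os_name, os_version) tuple in the bag, and a record whose bag comes back empty (a dump with no OS| line) produces no output row at all. A minimal Java sketch of that expansion, with hypothetical class name and sample values:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class FlattenSketch {
        public static void main(String[] args) {
            // Each inner array plays the role of the bag the UDF returns for
            // one crash report; each element stands for one OS tuple.
            String[][] bagsPerRow = {
                { "Windows NT|6.1.7601" },
                { },                      // dump had no OS| line: empty bag
                { "Linux|3.2.0" }
            };

            List<String> flattened = new ArrayList<>();
            for (String[] bag : bagsPerRow) {
                // One output row per tuple; an empty bag contributes nothing,
                // so that crash report drops out of the daily sums.
                flattened.addAll(Arrays.asList(bag));
            }
            System.out.println(flattened); // [Windows NT|6.1.7601, Linux|3.2.0]
        }
    }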