Created
October 9, 2012 15:48
-
-
Save ottomata/3859662 to your computer and use it in GitHub Desktop.
monthly subdomain counts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| DEFINE EXTRACT org.apache.pig.builtin.REGEX_EXTRACT_ALL(); | |
| LOG_FIELDS = LOAD '$input' USING PigStorage(' ') AS (hostname:chararray, udplog_sequence:chararray, timestamp:chararray, request_time:chararray, remote_addr:chararray, http_status:chararray, bytes_sent:chararray, request_method:chararray, uri:chararray, proxy_host:chararray, content_type:chararray, referer:chararray, x_forwarded_for:chararray, user_agent); | |
| -- only count text/html. '-' Comes from varnish. | |
| -- See: https://gerrit.wikimedia.org/r/gitweb?p=analytics/wikistats.git;a=blob;f=squids/SquidCountArchiveProcessLogRecord.pm;h=5b0d03d6473ce5d63afc6f9495af8651bd90f74b;hb=HEAD#l18 | |
| LOG_FIELDS = FILTER LOG_FIELDS BY content_type == 'text/html' OR (content_type == '-' AND uri MATCHES '.*(.*(\\.m\\..*?\\/wiki\\/|\\.m\\..*?\\/w\\/index.php).*).*'); | |
| -- only count 200 and 302 response statuses | |
| LOG_FIELDS = FILTER LOG_FIELDS BY (http_status MATCHES '^.*(200|302)$'); | |
| -- Extract the Month and subdomain out of the request log fields | |
| MONTH_SUBDOMAIN = FOREACH LOG_FIELDS GENERATE FLATTEN(EXTRACT(timestamp, '^.*(\\d\\d\\d\\d-\\d\\d)-.*')) as month:chararray, FLATTEN (EXTRACT(uri, 'https?://(en|ja|es|de|ru|fr)\\..+.+')) as subdomain:chararray; | |
| -- Discard any null subdomains | |
| MONTH_SUBDOMAIN = FILTER MONTH_SUBDOMAIN BY subdomain IS NOT NULL; | |
| -- Group by month and subdomain | |
| MONTH_SUBDOMAIN_GROUP = GROUP MONTH_SUBDOMAIN BY (month, subdomain) PARALLEL 3; | |
| -- Generate a COUNT | |
| MONTH_SUBDOMAIN_COUNT = FOREACH MONTH_SUBDOMAIN_GROUP GENERATE FLATTEN(group), COUNT($1) as num; | |
| -- Save the results | |
| STORE MONTH_SUBDOMAIN_COUNT into '$output'; | |
| -- This will generate output like: | |
| -- 2012-09 de 2 | |
| -- 2012-09 en 14 | |
| -- 2012-09 es 1 | |
| -- 2012-09 fr 1 | |
| -- 2012-09 ja 4 | |
| -- 2012-09 ru 3 | |
| -- 2012-10 en 14 | |
| -- 2012-10 ja 6 | |
| -- 2012-10 ru 3 | |
| -- The Total Count has to be grouped and created separately from subdomain count | |
| -- Run this if you want to generate a count of total requests per month. | |
| -- MONTH_TOTAL_COUNT = FOREACH (GROUP MONTH_SUBDOMAIN BY month) GENERATE FLATTEN(group), COUNT(MONTH_SUBDOMAIN); | |
| -- STORE MONTH_TOTAL_COUNT into '/user/otto/logs0/month_total_count.0'; | |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 9 columns, instead of 8 in line 1.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Date,Total,All projects,English,Spanish,Japanese,German,Russian,French | |
| 2008/07/01,9548996328,5135131814,973755192,445589582,755196668,113662290,311381016 | |
| 2008/08/01,9501001543,4974344046,969975425,529752195,778407514,121195082,312562951 | |
| 2008/09/01,10631664080,5479533823,998032211,640332042,817672904,157256924,407994179 | |
| 2008/10/01,10745804182,5509405206,974745059,659209518,829895189,173119199,426838989 | |
| 2008/11/01,10845469196,5415832055,925842966,618951709,1007854264,193302177,456433510 | |
| 2008/12/01,10471563564,5043588824,867622536,429593165,1434267602,225726994,398506291 | |
| 2009/01/01,11496646419,5615498275,1020040333,525897906,1270923077,244179932,489329700 | |
| 2009/02/01,11679535567,5943558064,1015724496,664767977,981701403,261464142,489587152 | |
| 2009/03/01,12024862000,6092089410,965824332,708812666,978324435,284750415,510839619 | |
| 2009/04/01,11352618456,5988817321,935725490,622529405,816911070,275645537,513244212 | |
| 2009/05/01,11825871871,6066080834,1054087578,712546000,875354334,285393526,517848086 | |
| 2009/06/01,11511697836,5818924182,1076231552,689177935,908943732,287430317,478907393 | |
| 2009/07/01,10765973183,5614497075,1071919060,568949734,818710227,249613491,402046348 | |
| 2009/08/01,10938212049,5604048759,1056863842,670114201,812759742,263291318,405401461 | |
| 2009/09/01,11878057812,5937922018,1036022366,805384131,888953956,305019697,468445343 | |
| 2009/10/01,12257819634,6041279704,1042122587,793041104,884514330,336490934,517691251 | |
| 2009/11/01,12008718875,5841504417,1018879252,750257494,903770333,350558815,517541264 | |
| 2009/12/01,10626616804,5258610830,947765303,500123563,759783399,342298682,456708198 | |
| 2010/01/01,11987314019,5883232992,1083201946,608651538,916028392,377526978,532464602 | |
| 2010/02/01,12516731479,6227014896,1060245098,776934909,890617318,414429038,527713863 | |
| 2010/03/01,12202523180,6446465754,979748084,740804448,809344045,358678986,493544465 | |
| 2010/04/01,15032439023,8213623221,1152028494,953794373,957331874,438267171,550218307 | |
| 2010/05/01,15224406861,8126959544,1276258833,999782289,986952926,467543830,573315069 | |
| 2010/06/01,14593247972,7841269693,1260772500,939990212,955135652,456898699,536451155 | |
| 2010/07/01,13895953915,7728611935,1275749319,762596690,901839447,386032551,482220859 | |
| 2010/08/01,13761406064,7435888138,1222127187,872071201,936931491,408043313,510089112 | |
| 2010/09/01,14528728262,7572220871,1200138702,1018472015,989808185,494069475,586890265 | |
| 2010/10/01,14532380230,7526863400,1185314758,1020227911,973748712,525512847,598622500 | |
| 2010/11/01,14938670017,7552720233,1192956965,1029077915,1062730731,569946243,657362614 | |
| 2010/12/01,13948551131,7197323670,1104993076,748120629,965473265,587252199,619085980 | |
| 2011/01/01,15494531200,7886321417,1230270379,870565050,1155753949,643666675,697123896 | |
| 2011/02/01,15602268966,7905363844,1235772520,1017498578,1010369643,663129902,657610990 | |
| 2011/03/01,15206260328,7532208472,1195941376,1071499890,1000548752,649943184,656422906 | |
| 2011/04/01,14675654011,7345492982,1185126548,961460241,902105087,646546023,624116684 | |
| 2011/05/01,15136658234,7313738255,1255860535,1074132146,1035675916,647533682,653114631 | |
| 2011/06/01,14722782747,7129490662,1227660371,979439804,1009279530,626252967,626267778 | |
| 2011/07/01,14140710162,6968286733,1247518154,858854640,959536382,568028951,593023383 | |
| 2011/08/01,15061062187,7263497053,1321459774,1054094317,966935930,586031773,654016689 | |
| 2011/09/01,15837395045,7483008091,1314236262,1195023458,988041057,667136987,682681634 | |
| 2011/10/01,17102055307,7881085908,1322496602,1234884164,1028580510,740428763,745267659 | |
| 2011/11/01,17422228297,8314617939,1321316510,1214871541,1079434677,789448080,760596481 | |
| 2011/12/01,16365307778,7950128284,1253215909,899244274,1038191276,810251079,723308805 | |
| 2012/01/01,18054256464,8576588259,1438625504,1035012747,1192489116,872056989,790998265 | |
| 2012/02/01,18127912871,8782733975,1431170650,1211719057,1144831051,903756130,780606431 | |
| 2012/03/01,17273498605,8168174170,1333339996,1187560765,1067371844,858648904,769176782 | |
| 2012/04/01,17336954399,8387710959,1292741905,1105258369,1048780015,864363791,773087509 | |
| 2012/05/01,18020959870,8620167603,1410908220,1254907230,1072648074,899248097,806491651 | |
| 2012/06/01,18066023582,8675989210,1501801845,1227918217,1107375315,830865119,768239729 | |
| 2012/07/01,17720799507,8778080687,1521317171,1108629589,1073868677,762226287,728345715 | |
| 2012/08/01,18185903211,8870258331,1583046066,1291653170,1023852535,773053670,740917699 | |
| 2012/09/01,19145307872,9165370101,1604478704,1403795210,1075123176,837174443,808082611 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 2011-10 de 402290000 | |
| 2011-10 en 2782304000 | |
| 2011-10 es 435003000 | |
| 2011-10 fr 276849000 | |
| 2011-10 ja 379727000 | |
| 2011-10 ru 282076000 | |
| 2011-11 de 1021732000 | |
| 2011-11 en 7186976000 | |
| 2011-11 es 1028719000 | |
| 2011-11 fr 690185000 | |
| 2011-11 ja 895412000 | |
| 2011-11 ru 726400000 | |
| 2011-12 de 979836000 | |
| 2011-12 en 6734719000 | |
| 2011-12 es 741720000 | |
| 2011-12 fr 652753000 | |
| 2011-12 ja 849979000 | |
| 2011-12 ru 760479000 | |
| 2012-01 de 1259571000 | |
| 2012-01 en 8081972000 | |
| 2012-01 es 957310000 | |
| 2012-01 fr 781668000 | |
| 2012-01 ja 1124228000 | |
| 2012-01 ru 879589000 | |
| 2012-02 de 1196934000 | |
| 2012-02 en 8181349000 | |
| 2012-02 es 1089469000 | |
| 2012-02 fr 749207000 | |
| 2012-02 ja 1140781000 | |
| 2012-02 ru 876332000 | |
| 2012-03 de 1187178000 | |
| 2012-03 en 8510510000 | |
| 2012-03 es 1162490000 | |
| 2012-03 fr 807071000 | |
| 2012-03 ja 1177780000 | |
| 2012-03 ru 895417000 | |
| 2012-04 de 1069360000 | |
| 2012-04 en 8062220000 | |
| 2012-04 es 1009067000 | |
| 2012-04 fr 747679000 | |
| 2012-04 ja 1091761000 | |
| 2012-04 ru 833177000 | |
| 2012-05 de 1068356000 | |
| 2012-05 en 7346812000 | |
| 2012-05 es 1067691000 | |
| 2012-05 fr 721372000 | |
| 2012-05 ja 937070000 | |
| 2012-05 ru 835675000 | |
| 2012-06 de 1012555000 | |
| 2012-06 en 6898143000 | |
| 2012-06 es 964575000 | |
| 2012-06 fr 636165000 | |
| 2012-06 ja 916162000 | |
| 2012-06 ru 740796000 | |
| 2012-07 de 935121000 | |
| 2012-07 en 6812590000 | |
| 2012-07 es 834117000 | |
| 2012-07 fr 578309000 | |
| 2012-07 ja 892255000 | |
| 2012-07 ru 664163000 | |
| 2012-08 de 916334000 | |
| 2012-08 en 6905478000 | |
| 2012-08 es 980069000 | |
| 2012-08 fr 591036000 | |
| 2012-08 ja 926223000 | |
| 2012-08 ru 677537000 | |
| 2012-09 de 933207000 | |
| 2012-09 en 6950454000 | |
| 2012-09 es 1043038000 | |
| 2012-09 fr 646489000 | |
| 2012-09 ja 881399000 | |
| 2012-09 ru 713935000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment