brunoocasali · April 28, 2018 19:17 · brunoocasali · Apr 28, 2018 · brunoocasali · Apr 28, 2018
diff --git a/script-base.pig b/script-base.pig
 -- NOTES:
 -- double dash denotes comments
 -- $ denotes shell command
 -- everything else is Pig Latin, executed in Grunt
 -- Data set downloaded from http://www.ncdc.noaa.gov

 -- Load data into Hadoop
 -- Copy the local input.txt into HDFS (a relative destination lands in the user's HDFS home directory)
 $ hadoop fs -put ./input.txt input.txt
 -- List the HDFS home directory to confirm the upload
 $ hadoop fs -ls
 -- Print the end of the uploaded file as a quick sanity check
 $ hadoop fs -tail hdfs://localhost.localdomain:8020/user/cloudera/input.txt
 -- Start the Grunt shell; the Pig Latin statements below are entered there
 $ pig

 -- Read every .txt file in the working HDFS directory as comma-separated
 -- records and attach a typed, 22-column schema.
 rawData = LOAD '*.txt' USING PigStorage(',') AS (
     station:int, wban:int, date:chararray,
     temp:double, temp_count:int,
     dewp:double, dewp_count:int,
     slp:double, slp_count:int,
     stp:double, stp_count:int,
     visibility:double, visibility_count:int,
     wind:double, wind_count:int,
     wind_max:double, wind_gust:int,
     temp_max:chararray, temp_min:chararray,
     precipitation:chararray, snow:double, frshtt:chararray
 );
 -- Print ten records to check that the schema was applied as expected.
 rawDataSample = LIMIT rawData 10;
 DUMP rawDataSample;

 -- You should now see 10 sample records with the correct schema.
 -- Let's do some manipulations

 -- Top ten snow readings: keep rows whose snow value is below 999.9
 -- (presumably the data set's missing-value marker; confirm against the
 -- NOAA format notes), sort descending, keep ten, then project
 -- station, the four-digit year pulled from the date, and the snow value.
 snowDays = FILTER rawData BY snow < 999.9;
 snowDaysOrdered = ORDER snowDays BY snow DESC;
 snowDaysOrderedLimited = LIMIT snowDaysOrdered 10;
 snowSummary = FOREACH snowDaysOrderedLimited GENERATE
     station,
     REGEX_EXTRACT(date, '(\\d{4})', 1) AS year,
     snow;
 DUMP snowSummary;


 -- Average temperature per year, lowest average first.
 -- The filtered relation is called validTemps (not temp) so that the alias
 -- does not shadow the temp column it is filtered on and projected from.
 validTemps = FILTER rawData BY temp < 999.9;
 tempWithYear = FOREACH validTemps GENERATE REGEX_EXTRACT(date, '(\\d{4})', 1) AS year, temp;
 tempByYear = GROUP tempWithYear BY year;
 -- Show the schema of the grouped relation. The statement is terminated with
 -- a semicolon like every other statement, so the next line parses on its own.
 DESCRIBE tempByYear;
 avgByYear = FOREACH tempByYear GENERATE group, AVG(tempWithYear.temp) AS averageTemp;
 -- ORDER sorts ascending when no direction is given.
 avgByYearOrdered = ORDER avgByYear BY averageTemp;
 -- Print the execution plan, then run it and print the result.
 EXPLAIN avgByYearOrdered;
 DUMP avgByYearOrdered;
diff --git a/script.pig b/script.pig
 -- Average temperature per year from fixed-width weather records.
 -- Each input row is loaded as one string and sliced by character offset:
 -- characters 15-18 are the year, characters 87-91 the signed temperature.
 raw = LOAD '/user/hadoop/trabalho/' AS (line:chararray);
 yearAndTemp = FOREACH raw GENERATE (INT) SUBSTRING(line, 15, 19) AS year:int, (INT) SUBSTRING(line, 87, 92) AS temp:int;
 -- Drop rows that must not enter the average: rows too short or malformed
 -- (the substring/cast yields null) and rows carrying 9999, the NCDC record
 -- layout's "temperature missing" marker.
 -- NOTE(review): the layout also has a quality code right after the
 -- temperature; consider filtering on it as well.
 validReadings = FILTER yearAndTemp BY year IS NOT NULL AND temp IS NOT NULL AND temp != 9999;
 tempByYear = GROUP validReadings BY year;
 -- Presumably in tenths of a degree Celsius, as in the raw records; confirm.
 avgByYear = FOREACH tempByYear GENERATE group, AVG(validReadings.temp) AS averageTemp;

 -- NOTE(review): the output directory is inside the directory that is loaded
 -- above; remove it (or store elsewhere) before re-running the script.
 STORE avgByYear INTO '/user/hadoop/trabalho/output';
diff --git a/steps.sh b/steps.sh
 [cloudera@quickstart pos]$ gunzip 1901.gz
 [cloudera@quickstart pos]$ gunzip 1902.gz
 [cloudera@quickstart pos]$ hadoop fs -mkdir /user/hadoop/trabalho
 [cloudera@quickstart pos]$ hadoop fs -put ~/Downloads/190* /user/hadoop/trabalho
 [cloudera@quickstart pos]$ hadoop fs -ls /user/hadoop/trabalho
 Found 2 items
 -rw-r--r--   1 cloudera supergroup      73867 2018-04-27 21:15 /user/hadoop/trabalho/1901.gz
 -rw-r--r--   1 cloudera supergroup      74105 2018-04-27 21:15 /user/hadoop/trabalho/1902.gz
 [cloudera@quickstart pos]$
	-- NOTES:
	-- double dash denotes comments
	-- $ denotes shell command
	-- everything else is Pig Latin, executed in Grunt
	-- Data set downloaded from http://www.ncdc.noaa.gov

	-- Load data into Hadoop
	-- Copy the local input.txt into HDFS (a relative destination lands in the user's HDFS home directory)
	$ hadoop fs -put ./input.txt input.txt
	-- List the HDFS home directory to confirm the upload
	$ hadoop fs -ls
	-- Print the end of the uploaded file as a quick sanity check
	$ hadoop fs -tail hdfs://localhost.localdomain:8020/user/cloudera/input.txt
	-- Start the Grunt shell; the Pig Latin statements below are entered there
	$ pig

	-- Read every .txt file in the working HDFS directory as comma-separated
	-- records and attach a typed, 22-column schema.
	rawData = LOAD '*.txt' USING PigStorage(',') AS (
	    station:int, wban:int, date:chararray,
	    temp:double, temp_count:int,
	    dewp:double, dewp_count:int,
	    slp:double, slp_count:int,
	    stp:double, stp_count:int,
	    visibility:double, visibility_count:int,
	    wind:double, wind_count:int,
	    wind_max:double, wind_gust:int,
	    temp_max:chararray, temp_min:chararray,
	    precipitation:chararray, snow:double, frshtt:chararray
	);
	-- Print ten records to check that the schema was applied as expected.
	rawDataSample = LIMIT rawData 10;
	DUMP rawDataSample;

	-- You should now see 10 sample records with the correct schema.
	-- Let's do some manipulations

	-- Top ten snow readings: keep rows whose snow value is below 999.9
	-- (presumably the data set's missing-value marker; confirm against the
	-- NOAA format notes), sort descending, keep ten, then project
	-- station, the four-digit year pulled from the date, and the snow value.
	snowDays = FILTER rawData BY snow < 999.9;
	snowDaysOrdered = ORDER snowDays BY snow DESC;
	snowDaysOrderedLimited = LIMIT snowDaysOrdered 10;
	snowSummary = FOREACH snowDaysOrderedLimited GENERATE
	    station,
	    REGEX_EXTRACT(date, '(\\d{4})', 1) AS year,
	    snow;
	DUMP snowSummary;


	-- Average temperature per year, lowest average first.
	-- The filtered relation is called validTemps (not temp) so that the alias
	-- does not shadow the temp column it is filtered on and projected from.
	validTemps = FILTER rawData BY temp < 999.9;
	tempWithYear = FOREACH validTemps GENERATE REGEX_EXTRACT(date, '(\\d{4})', 1) AS year, temp;
	tempByYear = GROUP tempWithYear BY year;
	-- Show the schema of the grouped relation. The statement is terminated with
	-- a semicolon like every other statement, so the next line parses on its own.
	DESCRIBE tempByYear;
	avgByYear = FOREACH tempByYear GENERATE group, AVG(tempWithYear.temp) AS averageTemp;
	-- ORDER sorts ascending when no direction is given.
	avgByYearOrdered = ORDER avgByYear BY averageTemp;
	-- Print the execution plan, then run it and print the result.
	EXPLAIN avgByYearOrdered;
	DUMP avgByYearOrdered;
	-- Average temperature per year from fixed-width weather records.
	-- Each input row is loaded as one string and sliced by character offset:
	-- characters 15-18 are the year, characters 87-91 the signed temperature.
	raw = LOAD '/user/hadoop/trabalho/' AS (line:chararray);
	yearAndTemp = FOREACH raw GENERATE (INT) SUBSTRING(line, 15, 19) AS year:int, (INT) SUBSTRING(line, 87, 92) AS temp:int;
	-- Drop rows that must not enter the average: rows too short or malformed
	-- (the substring/cast yields null) and rows carrying 9999, the NCDC record
	-- layout's "temperature missing" marker.
	-- NOTE(review): the layout also has a quality code right after the
	-- temperature; consider filtering on it as well.
	validReadings = FILTER yearAndTemp BY year IS NOT NULL AND temp IS NOT NULL AND temp != 9999;
	tempByYear = GROUP validReadings BY year;
	-- Presumably in tenths of a degree Celsius, as in the raw records; confirm.
	avgByYear = FOREACH tempByYear GENERATE group, AVG(validReadings.temp) AS averageTemp;

	-- NOTE(review): the output directory is inside the directory that is loaded
	-- above; remove it (or store elsewhere) before re-running the script.
	STORE avgByYear INTO '/user/hadoop/trabalho/output';
	[cloudera@quickstart pos]$ gunzip 1901.gz
	[cloudera@quickstart pos]$ gunzip 1902.gz
	[cloudera@quickstart pos]$ hadoop fs -mkdir /user/hadoop/trabalho
	[cloudera@quickstart pos]$ hadoop fs -put ~/Downloads/190* /user/hadoop/trabalho
	[cloudera@quickstart pos]$ hadoop fs -ls /user/hadoop/trabalho
	Found 2 items
	-rw-r--r-- 1 cloudera supergroup 73867 2018-04-27 21:15 /user/hadoop/trabalho/1901.gz
	-rw-r--r-- 1 cloudera supergroup 74105 2018-04-27 21:15 /user/hadoop/trabalho/1902.gz
	[cloudera@quickstart pos]$