@ottomata
Last active September 20, 2017 16:58
# Read `pyspark --help` if you need to run pyspark in YARN cluster mode for big data.
# By default it runs locally; you can still access Hive and HDFS that way.
$ pyspark
...
>>> from pyspark.sql import HiveContext
>>> sqlContext = HiveContext(sc)
>>> q = "SELECT event_timestamp, page_id, revision_id, event_user_id, event_user_groups, event_user_revision_count, event_user_age FROM nettrom_articlecreations.creation_data"
>>> results = sqlContext.sql(q)
>>> results.take(2)
[Row(event_timestamp=u'2009-01-13 08:13:30', page_id=11083573, revision_id=263767586, event_user_id=7573850, event_user_groups=u'', event_user_revision_count=6, event_user_age=14267429), Row(event_timestamp=u'2009-01-01 16:41:50', page_id=20914853, revision_id=261281429, event_user_id=85042, event_user_groups=u'', event_user_revision_count=52329, event_user_age=141442898)]
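If you'd rather stay in pyspark instead of using the Hive export below, you can write the result out as TSV yourself by mapping each Row to a tab-joined string and saving the RDD as text. The row-formatting step is plain Python; a minimal sketch (field names taken from the query above, and `results.map(row_to_tsv).saveAsTextFile(...)` is the assumed usage):

```python
def row_to_tsv(row):
    # Format one result row as a TSV line. Field order matches the SELECT
    # above. Applying this over the whole result would look like:
    #   results.map(row_to_tsv).saveAsTextFile('/tmp/tsv_otto1')
    # (saveAsTextFile writes one part-file per partition, like the Hive export).
    fields = [row.event_timestamp, row.page_id, row.revision_id,
              row.event_user_id, row.event_user_groups,
              row.event_user_revision_count, row.event_user_age]
    return '\t'.join(str(f) for f in fields)
```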
# Preview the TSV output written by the Hive query below:
$ hdfs dfs -cat /tmp/tsv_otto1/* | head -n 10
# If you need the files locally, copy them out of HDFS:
$ hdfs dfs -get /tmp/tsv_otto1
-- Adapted from https://stackoverflow.com/questions/18129581/how-do-i-output-the-results-of-a-hiveql-query-to-csv
USE nettrom_articlecreations;
INSERT OVERWRITE DIRECTORY '/tmp/tsv_otto1'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
SELECT event_timestamp, page_id, revision_id, event_user_id,
CONCAT_WS(",", event_user_groups) AS user_groups,
event_user_revision_count, event_user_age
FROM creation_data
LIMIT 100;
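Each line of the exported part files is a tab-separated record, with the user groups comma-joined by `CONCAT_WS` above (an empty string means no groups). A minimal sketch of parsing one line back into a dict, assuming the field order of the SELECT:

```python
# Field order follows the SELECT in the Hive query above.
FIELDS = ['event_timestamp', 'page_id', 'revision_id', 'event_user_id',
          'user_groups', 'event_user_revision_count', 'event_user_age']

def parse_tsv_line(line):
    # Split the tab-separated record and rebuild user_groups as a list,
    # undoing the CONCAT_WS(",", ...) join from the export query.
    record = dict(zip(FIELDS, line.rstrip('\n').split('\t')))
    groups = record['user_groups']
    record['user_groups'] = groups.split(',') if groups else []
    return record
```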