AWS EMR test run (gist: masayuki5160/082d147814d84864e028)
Last active: August 29, 2015 14:20
one two two three three
three four four four four
#!/usr/bin/python
import sys
import re

def main(argv):
    line = sys.stdin.readline()
    # Compile a regular expression that identifies distinct words.
    pattern = re.compile("[a-zA-Z][a-zA-Z0-9]*")
    # Loop through the input, line by line, until the end of the file.
    # For each line, print each distinct word in a format that identifies
    # the type (LongValueSum), the word (converted to lower case), and the
    # value "1", indicating that it has been found one time. This reduces
    # the input from a block of text to counts of individual words; the
    # counts of each word are then rolled together by Hadoop's built-in
    # aggregate function, used as the reducer for this job flow, to
    # produce the output sums.
    try:
        while line:
            for word in pattern.findall(line):
                print("LongValueSum:" + word.lower() + "\t" + "1")
            line = sys.stdin.readline()
    except EOFError:
        return None

if __name__ == "__main__":
    main(sys.argv)
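To check the mapper logic without a Hadoop cluster, the same regex-and-emit step can be simulated in plain Python, rolling the emitted pairs together the way Hadoop's built-in aggregate reducer would. This is a minimal local sketch, not part of the job flow; the sample lines are the test input above.

```python
import re
from collections import Counter

# Same word pattern as the mapper script above.
pattern = re.compile("[a-zA-Z][a-zA-Z0-9]*")

def map_line(line):
    """Emit (key, 1) pairs exactly as the streaming mapper prints them."""
    return [("LongValueSum:" + w.lower(), 1) for w in pattern.findall(line)]

def aggregate(pairs):
    """Sum values per key, as Hadoop's aggregate reducer would."""
    counts = Counter()
    for key, value in pairs:
        counts[key] += value
    return dict(counts)

sample = ["one two two three three", "three four four four four"]
pairs = [p for line in sample for p in map_line(line)]
print(aggregate(pairs))
# {'LongValueSum:one': 1, 'LongValueSum:two': 2, 'LongValueSum:three': 3, 'LongValueSum:four': 4}
```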
Installing Hadoop and MapReduce with Python
http://qiita.com/pika_shi/items/7fc686a55a3ac6399b04
Elastic MapReduce with Python
http://analysis.blog.jp.klab.com/archives/24077923.html
Processing data with streaming
http://docs.aws.amazon.com/ja_jp/ElasticMapReduce/latest/DeveloperGuide/UseCase_Streaming.html
//======================//
// About MapReduce
//======================//
MapReduce is a technique (something like a design pattern) for processing large amounts of data distributed across multiple machines.
The basic idea is simply to split the work into two kinds of processing:
map: processing applied to each line of data
reduce: aggregation of the mapped results
First, prepare an input file:
$ mkdir inputs
$ echo "a b b c c c" > inputs/input.txt
The mapper emits output like the following:
a 1
b 1
b 1
c 1
c 1
c 1
The reducer counts up each word emitted by the mapper and outputs something like:
a 1
b 2
c 3
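The worked example above (map emits one `word 1` pair per word; reduce sums the pairs per word) can be written out as a short Python sketch:

```python
from itertools import groupby

def mapper(line):
    # map: one (word, 1) pair per word on the line
    return [(word, 1) for word in line.split()]

def reducer(pairs):
    # shuffle: sort pairs so equal keys are adjacent,
    # then reduce: sum the values within each key group
    pairs = sorted(pairs)
    return {key: sum(v for _, v in group)
            for key, group in groupby(pairs, key=lambda p: p[0])}

pairs = mapper("a b b c c c")
print(pairs)           # [('a', 1), ('b', 1), ('b', 1), ('c', 1), ('c', 1), ('c', 1)]
print(reducer(pairs))  # {'a': 1, 'b': 2, 'c': 3}
```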
//======================//
// End of the MapReduce overview
//======================//
A page for first-time users of EMR (Amazon Elastic MapReduce)
http://mgi.hatenablog.com/entry/2014/05/04/085148
Amazon EMR: How to use ElasticMapReduce, Part 1
http://recipe.kc-cloud.jp/archives/1145
Amazon EMR: How to use ElasticMapReduce, Part 2
http://recipe.kc-cloud.jp/archives/1152
Amazon EMR: How to use ElasticMapReduce, Part 3
http://recipe.kc-cloud.jp/archives/1209
Submitting a streaming step
http://docs.aws.amazon.com/ja_jp/ElasticMapReduce/latest/DeveloperGuide/CLI_CreateStreaming.html
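The streaming-step docs linked above amount to passing mapper, reducer, input, and output arguments to hadoop-streaming. Below is a hedged sketch that only builds such a step definition as a dict (nothing is submitted to AWS); the S3 paths are hypothetical placeholders, and `command-runner.jar` assumes a recent EMR release.

```python
def make_streaming_step(name, mapper_uri, reducer, input_uri, output_uri):
    """Build an EMR streaming step definition of the kind the linked docs describe."""
    return {
        "Name": name,
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "hadoop-streaming",
                "-files", mapper_uri,
                "-mapper", mapper_uri.rsplit("/", 1)[-1],
                "-reducer", reducer,
                "-input", input_uri,
                "-output", output_uri,
            ],
        },
    }

step = make_streaming_step(
    "wordcount",
    "s3://my-bucket/wordSplitter.py",  # hypothetical path, not a real bucket
    "aggregate",                       # Hadoop's built-in aggregate reducer
    "s3://my-bucket/inputs/",
    "s3://my-bucket/output/",
)
print(step["HadoopJarStep"]["Args"])
```

Such a dict could then be passed as one element of `Steps=` to boto3's `emr` client `add_job_flow_steps` call, which is the programmatic counterpart of the CLI workflow in the linked page.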