Skip to content

Instantly share code, notes, and snippets.

@natbusa
Last active August 29, 2015 14:02
Show Gist options
  • Save natbusa/04d8fcf4a094bef02619 to your computer and use it in GitHub Desktop.
wikipedia live search demo
@app.route('/word/<keyword>')
def fetch_word(keyword):
    """Return, as a JSON array, the page details for every hit on *keyword*.

    Looks the keyword up in the inverted index, then fetches the full page
    record for each matching URL.
    """
    db = get_cassandra()
    hits = db.fetchWordResults(keyword)
    pages = [db.fetchPageDetails(hit["url"]) for hit in hits]
    return Response(json.dumps(pages), status=200,
                    mimetype="application/json")
# Run the Flask development server when executed as a script.
if __name__ == '__main__':
    app.run()
> cat enwiki-latest-abstracts.xml | ./mapper.py
element 008930 http://en.wikipedia.org/wiki/Gold
with 008930 http://en.wikipedia.org/wiki/Gold
symbol 008930 http://en.wikipedia.org/wiki/Gold
atomic 008930 http://en.wikipedia.org/wiki/Gold
number 008930 http://en.wikipedia.org/wiki/Gold
dense 008930 http://en.wikipedia.org/wiki/Gold
soft 008930 http://en.wikipedia.org/wiki/Gold
malleable 008930 http://en.wikipedia.org/wiki/Gold
ductile 008930 http://en.wikipedia.org/wiki/Gold
doc = ET.fromstring(doc)
# xml extraction stuff happening here ...
#extract words from title and abstract
words = [w for w in txt.split() if w not in STOPWORDS and len(w) > 2]
#relevance algorithm
relevance = len(abstract) * len(links)
#mapper output to cassandra wikipedia.pages table
cassandra_client.insertPage(url, title, abstract, length, refs)
#emit unique the key-value pairs
emitted = list()
for word in words:
if word not in emitted:
print '%s\t%06d\t%s' % (word, relevance, url)
emitted.append(word)
> cat enwiki-latest-abstracts.xml | ./mapper.py | ./reducer.py
ductile 008930 http://en.wikipedia.org/wiki/Gold
ductile 008452 http://en.wikipedia.org/wiki/Hydroforming
ductile 007930 http://en.wikipedia.org/wiki/Liquid_metal_embrittlement
def emit_ranking(n=100):
    """Persist the top-*n* ranked URLs for the word currently being reduced
    into the wikipedia.inverted table.

    NOTE(review): this fragment depends on module-level globals
    (``sorted_dict``, ``current_word``, ``relevance``, ``url``) that are
    populated by the read loop, and the trailing ``# ...`` shows the body is
    truncated here — confirm against the full reducer before changing it.
    """
    global sorted_dict
    for i in range(n):
        # presumably each iteration should emit the next-best entry drawn
        # from sorted_dict — TODO confirm; as written it reuses the same
        # (current_word, relevance, url) values every pass.
        cassandra_client.insertWord(current_word, relevance, url)
    # …
def readLoop():
    """Reducer driver: consume the mapper's tab-separated records from STDIN.

    Hadoop Streaming delivers records sorted by key, so all lines for one
    word arrive consecutively; when the word changes, the accumulated
    ranking for the previous word is flushed via ``emit_ranking()``.

    NOTE(review): ``current_word`` and ``sorted_dict`` are module-level
    state initialized outside this fragment, and the ``else`` branch is
    truncated in this view (the per-word reset is not visible) — confirm
    against the full reducer.
    """
    # input comes from STDIN
    for line in sys.stdin:
        # parse the input we got from mapper.py; maxsplit=2 keeps any
        # tab characters inside the URL field intact
        word, relevance, url = line.split('\t', 2)
        if current_word == word :
            # same word as before: accumulate this hit, keyed by relevance
            sorted_dict[relevance] = url
        else:
            # key changed: flush the ranking for the word just finished
            if current_word:
                emit_ranking()
-- Keyspace for the wikipedia live-search demo; single-replica, suitable
-- for a one-node development cluster only.
CREATE KEYSPACE wikipedia WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};

-- One row per wikipedia page, keyed by its URL (written by the mapper).
CREATE TABLE wikipedia.pages (
url text,
title text,
abstract text,
length int,
refs int,
PRIMARY KEY (url)
);

-- Inverted index: rows for one keyword are clustered by relevance
-- (written by the reducer, read by the /word/<keyword> route).
-- NOTE(review): relevance is the only clustering column, so two pages
-- with the same keyword AND the same relevance score overwrite each
-- other — confirm that is acceptable for this demo.
CREATE TABLE wikipedia.inverted (
keyword text,
relevance int,
url text,
PRIMARY KEY ((keyword), relevance)
);
# Launch the inverted-index build as a Hadoop Streaming (2.4.0) job:
#   -files   ships mapper.py and reducer.py to every task node
#   stream.num.*.output.key.fields=1 makes the first tab-separated field
#            (the word) the shuffle/sort key
#   mapred.reduce.tasks=16 partitions the keyword space across 16 reducers
# NOTE(review): -jobconf is the legacy spelling (newer releases use
# -D key=value); kept here to match the hadoop-streaming-2.4.0 jar in use.
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.4.0.jar \
-files mapper.py,reducer.py \
-mapper ./mapper.py \
-reducer ./reducer.py \
-jobconf stream.num.map.output.key.fields=1 \
-jobconf stream.num.reduce.output.key.fields=1 \
-jobconf mapred.reduce.tasks=16 \
-input wikipedia-latest-abstract \
-output $HADOOP_OUTPUT_DIR
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment