Skip to content

Instantly share code, notes, and snippets.

@natbusa
Last active August 29, 2015 14:02
Show Gist options
  • Save natbusa/04d8fcf4a094bef02619 to your computer and use it in GitHub Desktop.
wikipedia live search demo
@app.route('/word/<keyword>')
def fetch_word(keyword):
    """Return, as a JSON array, the page details for every hit on *keyword*.

    Looks the keyword up in the inverted index, then fetches the full page
    record for each matching URL.
    """
    db = get_cassandra()
    hits = db.fetchWordResults(keyword)
    pages = [db.fetchPageDetails(hit["url"]) for hit in hits]
    return Response(json.dumps(pages), status=200,
                    mimetype="application/json")
# Run the Flask development server when executed as a script.
if __name__ == '__main__':
    app.run()
> cat enwiki-latest-abstracts.xml | ./mapper.py
element 008930 http://en.wikipedia.org/wiki/Gold
with 008930 http://en.wikipedia.org/wiki/Gold
symbol 008930 http://en.wikipedia.org/wiki/Gold
atomic 008930 http://en.wikipedia.org/wiki/Gold
number 008930 http://en.wikipedia.org/wiki/Gold
dense 008930 http://en.wikipedia.org/wiki/Gold
soft 008930 http://en.wikipedia.org/wiki/Gold
malleable 008930 http://en.wikipedia.org/wiki/Gold
ductile 008930 http://en.wikipedia.org/wiki/Gold
doc = ET.fromstring(doc)
# xml extraction stuff happening here ...
#extract words from title and abstract
words = [w for w in txt.split() if w not in STOPWORDS and len(w) > 2]
#relevance algorithm
relevance = len(abstract) * len(links)
#mapper output to cassandra wikipedia.pages table
cassandra_client.insertPage(url, title, abstract, length, refs)
#emit unique the key-value pairs
emitted = list()
for word in words:
if word not in emitted:
print '%s\t%06d\t%s' % (word, relevance, url)
emitted.append(word)
> cat enwiki-latest-abstracts.xml | ./mapper.py | ./reducer.py
ductile 008930 http://en.wikipedia.org/wiki/Gold
ductile 008452 http://en.wikipedia.org/wiki/Hydroforming
ductile 007930 http://en.wikipedia.org/wiki/Liquid_metal_embrittlement
def emit_ranking(n=100):
    """Persist the top-*n* ranked URLs for the word currently being reduced
    into the wikipedia.inverted table.

    NOTE(review): this fragment depends on module-level globals
    (``sorted_dict``, ``current_word``, ``relevance``, ``url``) that are
    populated by the read loop, and the trailing ``# ...`` shows the body is
    truncated here — confirm against the full reducer before changing it.
    """
    global sorted_dict
    for i in range(n):
        # presumably each iteration should emit the next-best entry drawn
        # from sorted_dict — TODO confirm; as written it reuses the same
        # (current_word, relevance, url) values every pass.
        cassandra_client.insertWord(current_word, relevance, url)
    # …
def readLoop():
    """Reducer driver: consume the mapper's tab-separated records from STDIN.

    Hadoop Streaming delivers records sorted by key, so all lines for one
    word arrive consecutively; when the word changes, the accumulated
    ranking for the previous word is flushed via ``emit_ranking()``.

    NOTE(review): ``current_word`` and ``sorted_dict`` are module-level
    state initialized outside this fragment, and the ``else`` branch is
    truncated in this view (the per-word reset is not visible) — confirm
    against the full reducer.
    """
    # input comes from STDIN
    for line in sys.stdin:
        # parse the input we got from mapper.py; maxsplit=2 keeps any
        # tab characters inside the URL field intact
        word, relevance, url = line.split('\t', 2)
        if current_word == word :
            # same word as before: accumulate this hit, keyed by relevance
            sorted_dict[relevance] = url
        else:
            # key changed: flush the ranking for the word just finished
            if current_word:
                emit_ranking()
-- Keyspace for the wikipedia live-search demo; single-replica, suitable
-- for a one-node development cluster only.
CREATE KEYSPACE wikipedia WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};

-- One row per wikipedia page, keyed by its URL (written by the mapper).
CREATE TABLE wikipedia.pages (
url text,
title text,
abstract text,
length int,
refs int,
PRIMARY KEY (url)
);

-- Inverted index: rows for one keyword are clustered by relevance
-- (written by the reducer, read by the /word/<keyword> route).
-- NOTE(review): relevance is the only clustering column, so two pages
-- with the same keyword AND the same relevance score overwrite each
-- other — confirm that is acceptable for this demo.
CREATE TABLE wikipedia.inverted (
keyword text,
relevance int,
url text,
PRIMARY KEY ((keyword), relevance)
);
# Launch the inverted-index build as a Hadoop Streaming (2.4.0) job:
#   -files   ships mapper.py and reducer.py to every task node
#   stream.num.*.output.key.fields=1 makes the first tab-separated field
#            (the word) the shuffle/sort key
#   mapred.reduce.tasks=16 partitions the keyword space across 16 reducers
# NOTE(review): -jobconf is the legacy spelling (newer releases use
# -D key=value); kept here to match the hadoop-streaming-2.4.0 jar in use.
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.4.0.jar \
-files mapper.py,reducer.py \
-mapper ./mapper.py \
-reducer ./reducer.py \
-jobconf stream.num.map.output.key.fields=1 \
-jobconf stream.num.reduce.output.key.fields=1 \
-jobconf mapred.reduce.tasks=16 \
-input wikipedia-latest-abstract \
-output $HADOOP_OUTPUT_DIR
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment