Skip to content

Instantly share code, notes, and snippets.

@danish-rehman
Last active July 2, 2016 09:10
Show Gist options
  • Save danish-rehman/4e636203fb2a5535425614a02118910f to your computer and use it in GitHub Desktop.
Save danish-rehman/4e636203fb2a5535425614a02118910f to your computer and use it in GitHub Desktop.
Spark : Bot stream
import random
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
verbs = ("runs", "hits", "jumps", "drives", "barfs")
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
while True:
num = random.randrange(0,5)
print nouns[num] + ' ' + verbs[num] + ' ' + adv[num] + ' ' + adj[num]

python bot_stream.py 2>&1 | nc -lk 127.0.0.1 9999

python network_wordcount.py 127.0.0.1 9999

"""
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.
Usage: network_wordcount.py <hostname> <port>
<hostname> and <port> describe the TCP server that Spark Streaming would connect to receive data.
To run this on your local machine, you need to first run a Netcat server
`$ nc -lk 9999`
and then run the example
`$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999`
"""
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="PythonStreamingNetworkWordCount")
ssc = StreamingContext(sc, 1)
lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
counts = lines.flatMap(lambda line: line.split(" "))\
.map(lambda word: (word, 1))\
.reduceByKey(lambda a, b: a+b)
counts.pprint()
ssc.start()
ssc.awaitTermination()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment