Spark - Commands
# Start pyspark with IPython as the driver
PYSPARK_DRIVER_PYTHON=ipython pyspark
# Verify input data
hdfs dfs -ls input/
# Read shows files
show_views_file = sc.textFile("input/join2_gennum?.txt")
# view first two lines
show_views_file.take(2)
# Parse shows files
def split_show_views(line):
    """
    Split and parse one line of the data set.
    line: 'show,views', a string from a gennum file
    """
    # split the input line into show and views on the comma
    show, views = line.split(",")
    # turn the count into an integer
    views = int(views)
    return (show, views)
show_views = show_views_file.map(split_show_views)
# view the result
show_views.collect()
# view just the first two lines
show_views.take(2)
# Read channel files
show_channel_file = sc.textFile("input/join2_genchan?.txt")
# view first two lines
show_channel_file.take(2)
# Parse channel files
def split_show_channel(line):
    """
    Split and parse one line of the data set.
    line: 'show,channel', a string from a genchan file
    """
    show, channel = line.split(",")
    return (show, channel)
show_channel = show_channel_file.map(split_show_channel)
# view the result
show_channel.take(2)
# Join the two data sets
# Either side can drive the join; only the order of the values in the resulting
# tuple changes. The code below uses the second form, which yields records of
# the shape ('show', ('channel', views)).
joined_dataset = show_views.join(show_channel)
joined_dataset = show_channel.join(show_views)
# view the result
joined_dataset.take(2)
# Extract channel as key
# want total viewers by channel
def extract_channel_views(show_views_channel):
    """
    Re-key each joined record by channel so that viewers can be totalled per channel.
    show_views_channel: ('show', ('channel', views))
    returns: ('channel', views), regardless of the show
    """
    channel, views = show_views_channel[1]
    return (channel, views)
channel_views = joined_dataset.map(extract_channel_views)
def sum_channel_viewers(a, b):
    return a + b
channel_views.reduceByKey(sum_channel_viewers).collect()
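# The same sum can also be written with operator.add from the standard library
# (a minimal sketch, equivalent to sum_channel_viewers):
from operator import add
channel_views.reduceByKey(add).collect()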
fileA = sc.textFile("input/join1_FileA.txt")
fileA.collect()
# Out[]: [u'able,991', u'about,11', u'burger,15', u'actor,22']
fileB = sc.textFile("input/join1_FileB.txt")
fileB.collect()
# Out[]:
# [u'Jan-01 able,5',
# u'Feb-02 about,3',
# u'Mar-03 about,8 ',
# u'Apr-04 able,13',
# u'Feb-22 actor,3',
# u'Feb-23 burger,5',
# u'Mar-08 burger,2',
# u'Dec-15 able,100']
def split_fileA(line):
    # split the input line into word and count on the comma
    splitter = line.split(",")
    # turn the count into an integer
    word = splitter[0]
    count = int(splitter[1])
    return (word, count)
test_line = "able,991"
split_fileA(test_line) # Out[]: ('able', 991)
fileA_data = fileA.map(split_fileA)
fileA_data.collect()
# Out[]: [(u'able', 991), (u'about', 11), (u'burger', 15), (u'actor', 22)]
def split_fileB(line):
    # split the input line into word, date and count_string
    splitter = line.split(",")
    splitter_word_date = splitter[0].split(" ")
    word = splitter_word_date[1]
    date = splitter_word_date[0]
    count_string = splitter[1]
    return (word, date + " " + count_string)
fileB_data = fileB.map(split_fileB)
fileB_data.collect()
# [(u'able', u'Jan-01 5'),
# (u'about', u'Feb-02 3'),
# (u'about', u'Mar-03 8'),
# (u'able', u'Apr-04 13'),
# (u'actor', u'Feb-22 3'),
# (u'burger', u'Feb-23 5'),
# (u'burger', u'Mar-08 2'),
# (u'able', u'Dec-15 100')]
fileB_joined_fileA = fileB_data.join(fileA_data)
fileB_joined_fileA.collect()
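# Each joined record has the shape (word, (fileB_value, fileA_count)),
# e.g. (u'able', (u'Jan-01 5', 991)); the ordering of collect() is not guaranteed.
# A hedged follow-up sketch (format_joined is illustrative, not part of the assignment):
# flatten each joined record into a printable line.
def format_joined(record):
    word, (date_count, total) = record
    return "%s %s %d" % (word, date_count, total)

fileB_joined_fileA.map(format_joined).take(2)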
# Install IPython
> sudo easy_install ipython==1.2.1
# Launch pyspark with IPython
> PYSPARK_DRIVER_PYTHON=ipython pyspark
# Check Spark version
In [1]: sc.version
# Example Output: u'1.3.0'
# RDD in PySpark
In [2]: integer_RDD = sc.parallelize(range(10), 3)
# Check partitions

# 1. 
# Gather all data on the driver:
In [3]: integer_RDD.collect()
# Example Output: Out: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# 2.
# Maintain the splitting into partitions
In [4]: integer_RDD.glom().collect()
# Example Output: Out: [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
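
# The partition count can also be read directly (a minimal sketch):
integer_RDD.getNumPartitions()
# Example Output: 3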
# 1.
# from local filesystem:
In [5]: text_RDD = sc.textFile("file:///home/cloudera/testfile2")

# 2.
# from HDFS
In [6]: text_RDD = sc.textFile("/user/cloudera/input/testfile1")
text_RDD.take(1) # outputs the first line
# coalesce

In [7]: sc.parallelize(range(10), 4).glom().collect()
# Example Output: [[0, 1], [2, 3], [4, 5], [6, 7, 8, 9]]

In [8]: sc.parallelize(range(10), 4).coalesce(2).glom().collect()
# Example Output: [[0, 1, 2, 3], [4, 5, 6, 7, 8, 9]]
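
# For comparison, repartition() can both increase and decrease the number of
# partitions, at the cost of a full shuffle (a minimal sketch):
sc.parallelize(range(10), 2).repartition(4).glom().collect()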
# flatMap
def split_words(line):
    return line.split()
words_flat_RDD = text_RDD.flatMap(split_words)
words_flat_RDD.collect()
# filter
def starts_with_a(word):
    return word.lower().startswith("a")
words_flat_RDD.filter(starts_with_a).collect()
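# Equivalent filter written inline with a lambda (a minimal sketch):
words_flat_RDD.filter(lambda word: word.lower().startswith("a")).collect()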
# Transformations of (K, V) pairs
def split_words(line):
    return line.split()

def create_pair(word):
    return (word, 1)

pairs_RDD = text_RDD.flatMap(split_words).map(create_pair)
pairs_RDD.collect()
# Out[]: [(u'A', 1),
# (u'long', 1),
# (u'time', 1),
# (u'ago', 1),
# (u'in', 1),
# (u'a',1),
# (u'galaxy', 1),
# (u'far',1),
# (u'far', 1),
# (u'away', 1)]
def sum_counts(a, b):
    return a + b
wordcounts_RDD = pairs_RDD.reduceByKey(sum_counts)
wordcounts_RDD.collect()
# Out[]:
# [(u'A', 1),
# (u'ago', 1),
# (u'a', 1),
# (u'far', 2),
# (u'long', 1),
# (u'galaxy', 1),
# (u'time', 1),
# (u'in', 1),
# (u'away', 1)]
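# A hedged extra step: rank words by descending count
wordcounts_RDD.sortBy(lambda pair: pair[1], ascending=False).take(3)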
# What is the mean of the "cool" column across all of the dataset?

yelp_df.select("cool").agg({"cool": "avg"}).collect()
# Using again the Yelp dataset, take into consideration only the records with a "review count" of 10 or more.
# What is the average of the "cool" column for venues with 4 "stars"? 
yelp_df_review_df = yelp_df.filter("review_count >= 10")
yelp_df_stars_df = yelp_df_review_df.filter("stars = 4")
yelp_df_stars_df.select("cool").agg({"cool": "avg"}).collect()
# Using again the Yelp dataset, take into consideration only the records with a "review count" of 10 or more and only records for which the venue is still open (see the "open" column).
# What is the average of the "cool" column for venues with 5 "stars"?

yelp_df_review_df = yelp_df.filter("review_count >= 10")
yelp_df_venue_open_df = yelp_df_review_df.filter("open=\"True\"")
yelp_df_five_stars_df = yelp_df_venue_open_df.filter("stars = 5")
yelp_df_five_stars_df.select("cool").agg({"cool": "avg"}).collect()
# Using again the Yelp dataset, take into consideration only the records with a "review count" of 10 or more and only records for which the venue is still open (see the "open" column).
# Count the records for each "state", which state has the 3rd highest number of reviews?
from pyspark.sql.functions import asc, desc

yelp_df_review_df = yelp_df.filter("review_count >= 10")
yelp_df_venue_open_df = yelp_df_review_df.filter("open=\"True\"")
yelp_df_groupby_state_df = yelp_df_venue_open_df.groupBy("state")
yelp_df_groupby_state_df.agg({"review_count":"sum"}).orderBy(desc("SUM(review_count)")).collect()
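# To inspect only the top three states (hedged: the generated aggregate column
# name may differ between Spark versions):
yelp_df_groupby_state_df.agg({"review_count": "sum"}) \
    .orderBy(desc("SUM(review_count)")).show(3)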
# Using again the Yelp dataset, but taking into consideration the complete dataset, what is the maximum number of rows per venue (identified by "business_id")?

yelp_df.groupBy("business_id","review_count").count().orderBy("count").collect()
# Register as a SQL temp table
yelp_df.registerTempTable("yelp")
# Filter
filtered_yelp = sqlCtx.sql("SELECT * FROM yelp WHERE useful >= 1")
filtered_yelp.count()
# Aggregation
sqlCtx.sql("SELECT MAX(useful) as max_useful FROM yelp").collect()
# Join

# Step 1
useful_perc_data.join(
  yelp_df,
  yelp_df.id == useful_perc_data.uid,
  "inner"
).select(useful_perc_data.uid, "useful_perc", "review_count")

# Step 2
useful_perc_data.registerTempTable("useful_perc_data")

# Step 3
sqlCtx.sql(
"""SELECT useful_perc_data.uid, useful_perc, review_count
FROM useful_perc_data INNER JOIN yelp
ON useful_perc_data.uid=yelp.id""")
# Copy hive-site.xml to the Spark conf directory
sudo cp /etc/hive/conf.dist/hive-site.xml /etc/spark/conf/
# Create dataframe from hive table
customers_df = sqlCtx.sql("SELECT * FROM customers")
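# Quick sanity checks on the Hive-backed DataFrame (a minimal sketch):
customers_df.printSchema()
customers_df.show(5)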
# Import yelp
yelp_df = sqlCtx.load(
    source="com.databricks.spark.csv",
    header='true',
    inferSchema='true',
    path='file:///usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv'
)
# Filter
yelp_df.filter(yelp_df.useful >= 1).count()
yelp_df.filter(yelp_df["useful"] >= 1).count()
yelp_df.filter("useful >= 1").count()
# Select
yelp_df.select("useful").agg({"useful": "max"}).collect()
# Take
yelp_df.select("id", "useful").take(5)
# Modify Column
yelp_df.select("id", yelp_df.useful/28*100).show(5)
yelp_df.select("id", (yelp_df.useful/28*100).cast("int")).show(5)
# Save as new dataframe
useful_perc_data = yelp_df.select(
  yelp_df["id"].alias("uid"), 
  (yelp_df.useful/28*100).cast("int").alias("useful_perc")
)
useful_perc_data.columns
# Ordering by column
from pyspark.sql.functions import asc, desc

useful_perc_data = yelp_df.select(
  yelp_df["id"].alias("uid"), 
  (yelp_df.useful/28*100).cast("int").alias("useful_perc")
).orderBy(desc("useful_perc"))

useful_perc_data.show(2)
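
# asc() is the ascending counterpart (a minimal sketch):
useful_perc_data.orderBy(asc("useful_perc")).show(2)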
# Join inputs

useful_perc_data.join(
  yelp_df,
  yelp_df.id == useful_perc_data.uid,
  "inner"
).cache().select(useful_perc_data.uid, "useful_perc", "review_count").show(5)
sarkarChanchal105 commented May 29, 2018

Hi Dimitar,

This question is related to the code: advanced-join-in-spark.py

I have a question for you about this Advanced Join in Spark (Coursera). I am using Python 3, and the automated grader is marking my answer as wrong. I suspect that I am missing something with the output format. I am running my code on a Windows machine on a standalone Spark cluster (version 2.2).

Here is the sample output I am getting.

[('CNO', 100), ('XYZ', 100), ('BOB', 100), ('ABC', 100), ('MAN', 100), ('DEF', 100), ('CAB', 100), ('NOX', 100), ('BAT', 100)]

Any idea what is wrong here?
