korkridake · November 23, 2018 16:08
diff --git a/PySpark_Tutorial_Map_Reduce_Filter_Concept.py b/PySpark_Tutorial_Map_Reduce_Filter_Concept.py
 # Show the Spark entry points. `sc` and `spark` are not defined in this file;
 # presumably the notebook/shell runtime injects them -- TODO confirm.
 print(sc)
 print(spark)
 # <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>

 # -------------------------------------------------------------------------------
 # Import PySpark Libraries
 # -------------------------------------------------------------------------------
 import datetime
 from datetime import datetime
 from pyspark.sql.functions import skewness, kurtosis
 from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
 #'udf' stands for 'user defined function', and is simply a wrapper for functions you write and 
 #want to apply to a column that knows how to iterate through pySpark dataframe columns. it should
 #be more clear after we use it below
 from pyspark.sql.functions import udf
 from pyspark.sql.functions import col
 from pyspark.sql.types import IntegerType
 from pyspark.sql.types import StringType
 from pyspark.sql.types import DateType
 from pyspark.sql import DataFrame
 from pyspark.sql import Row
 from functools import reduce

 # -------------------------------------------------------------------------------
 # lambda <args>: <expr>
 # -------------------------------------------------------------------------------
 def f(x):
     '''
     Signature: number --> number
     Author: @Korkrid Akepanidtaworn
     Description: Return the square of a scalar (x raised to the power 2).
     '''
     return x ** 2

 print(f(8))  # 64

 # The same squaring logic, written as a lambda expression:
 g = lambda x: x ** 2
 print(g(8))  # 64

 # The def-based function and the lambda behave identically and are interchangeable.
 # - A lambda has no "return" statement: its body is one expression whose value is returned.
 # - A lambda may appear anywhere a function is expected; binding it to a name is optional.
 # - Lambdas come from functional programming and the lambda calculus; being small, they usually fit on one line.
 # - Python's lambda differs from its functional-language counterparts, but it is well integrated and powerful.
 # NOTE: the assignment below rebinds the name `f` to a two-argument lambda.
 f = lambda x, y: (["PASS", x, y] if x > 3 and y < 100 else ["FAIL", x, y])
 print(f(4, 50))
 # ['PASS', 4, 50]

 def k(x, y):
     '''
     Signature: (number, number) --> list
     Author: @Korkrid Akepanidtaworn
     Description: Return ["PASS", x, y] if (x > 3 and y < 100), otherwise return ["FAIL", x, y].
     '''
     # Logical `and` (rather than bitwise `&`) states the intent directly.
     if x > 3 and y < 100:
         return ["PASS", x, y]
     return ["FAIL", x, y]

 print(k(4, 50))  # ['PASS', 4, 50]
 print(k(60, 130))  # ['FAIL', 60, 130]
 print(k(2, 150))  # ['FAIL', 2, 150]

 # -------------------------------------------------------------------------------
 # Challenge 1:
 # Write a Lambda function and use it to sort pairs by key using their names. You will be using the list.sort() method of a list. It modifies the list in-place (here, pairs) and
 # has a key parameter to specify a function to be called on each list element prior to making comparisons. The value of the key parameter is a function that takes 
 # a single argument and returns a key to use for sorting purposes. Define this function as a Lambda function.
 # -------------------------------------------------------------------------------
 pairs = [
     (1, 'one'),
     (2, 'two'),
     (3, 'three'),
     (4, 'four'),
     (5, 'five'),
     (6, 'six'),
     (7, 'jk rowling'),
 ]

 # Ascending by name: the key function selects the second element of each tuple.
 pairs.sort(key=lambda entry: entry[1])
 pairs
 # Out[20]:
 #   [(5, 'five'),
 #    (4, 'four'),
 #    (7, 'jk rowling'),
 #    (1, 'one'),
 #    (6, 'six'),
 #    (3, 'three'),
 #    (2, 'two')]

 # Descending by name.
 pairs.sort(key=lambda entry: entry[1], reverse=True)
 pairs
 # Out[27]:
 #   [(2, 'two'),
 #    (3, 'three'),
 #    (6, 'six'),
 #    (1, 'one'),
 #    (7, 'jk rowling'),
 #    (4, 'four'),
 #    (5, 'five')]

 # Descending by number (first element of each tuple).
 pairs.sort(key=lambda entry: entry[0], reverse=True)
 pairs
 # Out[25]:
 #   [(7, 'jk rowling'),
 #    (6, 'six'),
 #    (5, 'five'),
 #    (4, 'four'),
 #    (3, 'three'),
 #    (2, 'two'),
 #    (1, 'one')]

 # Ascending by number, which restores the initial order.
 pairs.sort(key=lambda entry: entry[0])
 pairs
 # [(1, 'one'),
 #  (2, 'two'),
 #  (3, 'three'),
 #  (4, 'four'),
 #  (5, 'five'),
 #  (6, 'six'),
 #  (7, 'jk rowling')]

 # -------------------------------------------------------------------------------
 # map, filter and reduce in python
 # Map takes a function f and an array as input parameters and outputs an array where f is applied to every element. In this respect, using map is equivalent to for loops.
 # For instance, to convert a list of temperatures in Celsius to a list of temperature in Kelvin:
 # -------------------------------------------------------------------------------
 temp_c = [10, 3, -5, 25, 1, 9, 29, -10, 5]
 temp_K = list(map(lambda x: x + 273.15, temp_c))
 temp_K  # already a list, so it needs no second list() call
 # Out[28]: [283.15, 276.15, 268.15, 298.15, 274.15, 282.15, 302.15, 263.15, 278.15]

 # map() is a function with two arguments:
 # r = map(func, seq)
 # The first argument func is a function and the second is an iterable (e.g. a list) seq. map() applies func to every element of seq.
 # In Python 3 it returns a lazy iterator, so wrap the call in list() to materialise the results.

 # Let's define a list of words: list_words = ["big", "small", "able", "about", "hairdresser", "laboratory"]
 # Use a map function to print the number of characters of each word:
 list_words = ['big', 'small', 'able', 'about', 'hairdresser', 'laboratory']
 list_num_char_for_each_word = list(map(lambda k: len(k), list_words))
 print(list_num_char_for_each_word)  # [3, 5, 4, 5, 11, 10]

 # or even shorter: pass the built-in directly instead of wrapping it in a lambda
 print(list(map(len, list_words)))  # [3, 5, 4, 5, 11, 10]

 # -------------------------------------------------------------------------------
 # Filter
 # -------------------------------------------------------------------------------
 # As the name suggests, filter can be used to filter your data. It tests each element of your input data and 
 # returns a subset of it for which a condition given by a function is TRUE. It does not modify your input data.
 numbers = range(-15, 15)
 less_than_zero = list(filter(lambda x: x < 0, numbers))
 print(less_than_zero)  # [-15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1]

 # Reuse numbers (a range can be iterated repeatedly) and extract all the odd numbers.
 # Python's % takes the sign of the divisor, so x % 2 == 1 also holds for negative odd numbers.
 odd_nums = list(filter(lambda x: x % 2 == 1, numbers))
 print(odd_nums)  # [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13]

 # Reuse numbers and extract all the even numbers:
 even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
 print(even_numbers)  # [-14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14]

 # -------------------------------------------------------------------------------
 # Reduce
 # -------------------------------------------------------------------------------
 # Reduce takes a function f and an array as input. The function f gets two input parameters that work on individual elements of the array. 
 # Reduce combines every two elements of the array using the function f. Let’s take an example:

 # we define a list of integers
 # Sample integers to fold together.
 numbers = [1, 4, 6, 2, 9, 10]


 def combine(x, y):
     '''
     Signature: (any, any) --> str
     Description: Render x and y as the text "(x, y)". The result is a string, not a tuple object.
     '''
     pieces = ("(", str(x), ", ", str(y), ")")
     return "".join(pieces)


 # Fold combine over numbers from left to right.
 from functools import reduce

 print(numbers)
 reduce(combine, numbers)
 # [1, 4, 6, 2, 9, 10]
 # Out[38]: '(((((1, 4), 6), 2), 9), 10)'

 # we define a list of integers
 # Fold the list into one nested-parenthesis string, this time with an inline lambda.
 numbers = [1, 4, 6, 2, 9, 10]

 from functools import reduce

 print(numbers)
 reduce(
     lambda acc, item: "(" + str(acc) + ", " + str(item) + ")",
     numbers,
 )
 # [1, 4, 6, 2, 9, 10]
 # Out[38]: '(((((1, 4), 6), 2), 9), 10)'

 # -----------------------------------------------------------------------------
 # Challenge 4:
 # Let’s define a string variable sentence:
 sentence = "Dis-moi ce que tu manges, je te dirai ce que tu es."
 # Count the words in sentence
 # -----------------------------------------------------------------------------
 import string
 # Delete every punctuation character; the hyphen goes too, so "Dis-moi" becomes the single word "Dismoi".
 no_punctuation = sentence.translate(
     str.maketrans("", "", string.punctuation)
 )

 # Emit a 1 for every whitespace-separated word, then add the ones together.
 reduce(
     lambda running_total, one: running_total + one,
     map(lambda _word: 1, no_punctuation.split()),
 )  # 12



 def wordCount(mystring):
     '''
     Signature: str --> int
     Author: Darrell White
     Description: Return the number of whitespace-separated words in a sentence.
                  Punctuation stays attached to its word, and digit groups count as words.
                  Empty or whitespace-only text yields 0.
                  A non-string argument yields the text "Not a string" instead of raising.
     Link: https://stackoverflow.com/questions/19410018/how-to-count-the-number-of-words-in-a-sentence-ignoring-numbers-punctuation-an
     '''
     if not isinstance(mystring, str):
         # Callers receive this marker value rather than an exception.
         return "Not a string"
     # split() without arguments collapses runs of whitespace and ignores leading/trailing
     # whitespace, so stray spaces never inflate the count and empty text counts as 0.
     return len(mystring.split())

 mystring = "The ones who see things differently. They're not fond of rules. And they have no respect for the status quo."
 print(wordCount(mystring))  # 20
	# Show the Spark entry points. `sc` and `spark` are not defined in this file;
	# presumably the notebook/shell runtime injects them -- TODO confirm.
	print(sc)
	print(spark)
	# <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>

	# -------------------------------------------------------------------------------
	# Import PySpark Libraries
	# -------------------------------------------------------------------------------
	import datetime
	from datetime import datetime
	from pyspark.sql.functions import skewness, kurtosis
	from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
	#'udf' stands for 'user defined function', and is simply a wrapper for functions you write and
	#want to apply to a column that knows how to iterate through pySpark dataframe columns. it should
	#be more clear after we use it below
	from pyspark.sql.functions import udf
	from pyspark.sql.functions import col
	from pyspark.sql.types import IntegerType
	from pyspark.sql.types import StringType
	from pyspark.sql.types import DateType
	from pyspark.sql import DataFrame
	from pyspark.sql import Row
	from functools import reduce

	# -------------------------------------------------------------------------------
	# lambda <args>: <expr>
	# -------------------------------------------------------------------------------
	def f(x):
	    '''
	    Signature: number --> number
	    Author: @Korkrid Akepanidtaworn
	    Description: Return the square of a scalar (x raised to the power 2).
	    '''
	    return x ** 2

	print(f(8))  # 64

	# The same squaring logic, written as a lambda expression:
	g = lambda x: x ** 2
	print(g(8))  # 64

	# The def-based function and the lambda behave identically and are interchangeable.
	# - A lambda has no "return" statement: its body is one expression whose value is returned.
	# - A lambda may appear anywhere a function is expected; binding it to a name is optional.
	# - Lambdas come from functional programming and the lambda calculus; being small, they usually fit on one line.
	# - Python's lambda differs from its functional-language counterparts, but it is well integrated and powerful.
	# NOTE: the assignment below rebinds the name `f` to a two-argument lambda.
	f = lambda x, y: (["PASS", x, y] if x > 3 and y < 100 else ["FAIL", x, y])
	print(f(4, 50))
	# ['PASS', 4, 50]

	def k(x, y):
	    '''
	    Signature: (number, number) --> list
	    Author: @Korkrid Akepanidtaworn
	    Description: Return ["PASS", x, y] if (x > 3 and y < 100), otherwise return ["FAIL", x, y].
	    '''
	    # Logical `and` (rather than bitwise `&`) states the intent directly.
	    if x > 3 and y < 100:
	        return ["PASS", x, y]
	    return ["FAIL", x, y]

	print(k(4, 50))  # ['PASS', 4, 50]
	print(k(60, 130))  # ['FAIL', 60, 130]
	print(k(2, 150))  # ['FAIL', 2, 150]

	# -------------------------------------------------------------------------------
	# Challenge 1:
	# Write a Lambda function and use it to sort pairs by key using their names. You will be using the list.sort() method of a list. It modifies the list in-place (here, pairs) and
	# has a key parameter to specify a function to be called on each list element prior to making comparisons. The value of the key parameter is a function that takes
	# a single argument and returns a key to use for sorting purposes. Define this function as a Lambda function.
	# -------------------------------------------------------------------------------
	pairs = [
	    (1, 'one'),
	    (2, 'two'),
	    (3, 'three'),
	    (4, 'four'),
	    (5, 'five'),
	    (6, 'six'),
	    (7, 'jk rowling'),
	]

	# Ascending by name: the key function selects the second element of each tuple.
	pairs.sort(key=lambda entry: entry[1])
	pairs
	# Out[20]:
	#   [(5, 'five'),
	#    (4, 'four'),
	#    (7, 'jk rowling'),
	#    (1, 'one'),
	#    (6, 'six'),
	#    (3, 'three'),
	#    (2, 'two')]

	# Descending by name.
	pairs.sort(key=lambda entry: entry[1], reverse=True)
	pairs
	# Out[27]:
	#   [(2, 'two'),
	#    (3, 'three'),
	#    (6, 'six'),
	#    (1, 'one'),
	#    (7, 'jk rowling'),
	#    (4, 'four'),
	#    (5, 'five')]

	# Descending by number (first element of each tuple).
	pairs.sort(key=lambda entry: entry[0], reverse=True)
	pairs
	# Out[25]:
	#   [(7, 'jk rowling'),
	#    (6, 'six'),
	#    (5, 'five'),
	#    (4, 'four'),
	#    (3, 'three'),
	#    (2, 'two'),
	#    (1, 'one')]

	# Ascending by number, which restores the initial order.
	pairs.sort(key=lambda entry: entry[0])
	pairs
	# [(1, 'one'),
	#  (2, 'two'),
	#  (3, 'three'),
	#  (4, 'four'),
	#  (5, 'five'),
	#  (6, 'six'),
	#  (7, 'jk rowling')]

	# -------------------------------------------------------------------------------
	# map, filter and reduce in python
	# Map takes a function f and an array as input parameters and outputs an array where f is applied to every element. In this respect, using map is equivalent to for loops.
	# For instance, to convert a list of temperatures in Celsius to a list of temperature in Kelvin:
	# -------------------------------------------------------------------------------
	temp_c = [10, 3, -5, 25, 1, 9, 29, -10, 5]
	temp_K = list(map(lambda x: x + 273.15, temp_c))
	temp_K  # already a list, so it needs no second list() call
	# Out[28]: [283.15, 276.15, 268.15, 298.15, 274.15, 282.15, 302.15, 263.15, 278.15]

	# map() is a function with two arguments:
	# r = map(func, seq)
	# The first argument func is a function and the second is an iterable (e.g. a list) seq. map() applies func to every element of seq.
	# In Python 3 it returns a lazy iterator, so wrap the call in list() to materialise the results.

	# Let's define a list of words: list_words = ["big", "small", "able", "about", "hairdresser", "laboratory"]
	# Use a map function to print the number of characters of each word:
	list_words = ['big', 'small', 'able', 'about', 'hairdresser', 'laboratory']
	list_num_char_for_each_word = list(map(lambda k: len(k), list_words))
	print(list_num_char_for_each_word)  # [3, 5, 4, 5, 11, 10]

	# or even shorter: pass the built-in directly instead of wrapping it in a lambda
	print(list(map(len, list_words)))  # [3, 5, 4, 5, 11, 10]

	# -------------------------------------------------------------------------------
	# Filter
	# -------------------------------------------------------------------------------
	# As the name suggests, filter can be used to filter your data. It tests each element of your input data and
	# returns a subset of it for which a condition given by a function is TRUE. It does not modify your input data.
	numbers = range(-15, 15)
	less_than_zero = list(filter(lambda x: x < 0, numbers))
	print(less_than_zero)  # [-15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1]

	# Reuse numbers (a range can be iterated repeatedly) and extract all the odd numbers.
	# Python's % takes the sign of the divisor, so x % 2 == 1 also holds for negative odd numbers.
	odd_nums = list(filter(lambda x: x % 2 == 1, numbers))
	print(odd_nums)  # [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13]

	# Reuse numbers and extract all the even numbers:
	even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
	print(even_numbers)  # [-14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14]

	# -------------------------------------------------------------------------------
	# Reduce
	# -------------------------------------------------------------------------------
	# Reduce takes a function f and an array as input. The function f gets two input parameters that work on individual elements of the array.
	# Reduce combines every two elements of the array using the function f. Let’s take an example:

	# we define a list of integers
	# Sample integers to fold together.
	numbers = [1, 4, 6, 2, 9, 10]


	def combine(x, y):
	    '''
	    Signature: (any, any) --> str
	    Description: Render x and y as the text "(x, y)". The result is a string, not a tuple object.
	    '''
	    return "(" + str(x) + ", " + str(y) + ")"


	# Fold combine over numbers from left to right.
	from functools import reduce

	print(numbers)
	reduce(combine, numbers)
	# [1, 4, 6, 2, 9, 10]
	# Out[38]: '(((((1, 4), 6), 2), 9), 10)'

	# we define a list of integers
	# Fold the list into one nested-parenthesis string, this time with an inline lambda.
	numbers = [1, 4, 6, 2, 9, 10]

	from functools import reduce

	print(numbers)
	reduce(
	    lambda acc, item: "(" + str(acc) + ", " + str(item) + ")",
	    numbers,
	)
	# [1, 4, 6, 2, 9, 10]
	# Out[38]: '(((((1, 4), 6), 2), 9), 10)'

	# -----------------------------------------------------------------------------
	# Challenge 4:
	# Let’s define a string variable sentence:
	sentence = "Dis-moi ce que tu manges, je te dirai ce que tu es."
	# Count the words in sentence
	# -----------------------------------------------------------------------------
	import string
	# Delete every punctuation character; the hyphen goes too, so "Dis-moi" becomes the single word "Dismoi".
	no_punctuation = sentence.translate(
	    str.maketrans("", "", string.punctuation)
	)

	# Emit a 1 for every whitespace-separated word, then add the ones together.
	reduce(
	    lambda running_total, one: running_total + one,
	    map(lambda _word: 1, no_punctuation.split()),
	)  # 12



	def wordCount(mystring):
	    '''
	    Signature: str --> int
	    Author: Darrell White
	    Description: Return the number of whitespace-separated words in a sentence.
	                 Punctuation stays attached to its word, and digit groups count as words.
	                 Empty or whitespace-only text yields 0.
	                 A non-string argument yields the text "Not a string" instead of raising.
	    Link: https://stackoverflow.com/questions/19410018/how-to-count-the-number-of-words-in-a-sentence-ignoring-numbers-punctuation-an
	    '''
	    if not isinstance(mystring, str):
	        # Callers receive this marker value rather than an exception.
	        return "Not a string"
	    # split() without arguments collapses runs of whitespace and ignores leading/trailing
	    # whitespace, so stray spaces never inflate the count and empty text counts as 0.
	    return len(mystring.split())

	mystring = "The ones who see things differently. They're not fond of rules. And they have no respect for the status quo."
	print(wordCount(mystring))  # 20