TApicella · May 11, 2017 15:37
diff --git a/RosettaCode- Hash Join.py b/RosettaCode- Hash Join.py
 '''
 http://rosettacode.org/wiki/Hash_join
 Implement the "hash join" algorithm, and demonstrate that it passes the test-case listed below.

 You should represent the tables as data structures that feel natural in your programming language.
 Guidance

 The "hash join" algorithm consists of two steps:

    Hash phase: Create a multimap from one of the two tables, mapping from each join column value to all the rows that contain it.
        The multimap must support hash-based lookup which scales better than a simple linear search, because that's the whole point of this algorithm.
        Ideally we should create the multimap for the smaller table, thus minimizing its creation time and memory size.
    Join phase: Scan the other table, and find matching rows by looking in the multimap created before.


 In pseudo-code, the algorithm could be expressed as follows:

 let A = the first input table (or ideally, the larger one)
 let B = the second input table (or ideally, the smaller one)
 let jA = the join column ID of table A
 let jB = the join column ID of table B
 let MB = a multimap for mapping from single values to multiple rows of table B (starts out empty)
 let C = the output table (starts out empty)

 for each row b in table B:
   place b in multimap MB under key b(jB)

 for each row a in table A:
   for each row b in multimap MB under key a(jA):
      let c = the concatenation of row a and row b
      place row c in table C


 TABLE_A
 Age,Name
 27,Jonah
 18,Alan
 28,Glory
 18,Popeye
 28,Alan

 TABLE_B
 Character,Nemesis
 Jonah,Whales
 Jonah,Spiders
 Alan,Ghosts
 Alan,Zombies
 Glory,Buffy

 Output:
 A.Age,A.Name,B.Character,B.Nemesis
 18,Alan,Alan,Ghosts
 18,Alan,Alan,Zombies
 28,Alan,Alan,Ghosts
 28,Alan,Alan,Zombies
 28,Glory,Glory,Buffy
 27,Jonah,Jonah,Spiders
 27,Jonah,Jonah,Whales

 '''

 def pPrint(table):
  str_table = ""
  for row in table:
    maxlen = 10
    printrow = []
    for i in range(len(row)):
      printrow.append(row[i].ljust(maxlen+1))
    str_row = str(printrow)
    str_row = str_row.replace("[", "")
    str_row = str_row.replace("]", "")
    str_row = str_row.replace(",", "")
    str_row = str_row.replace("\'", "")
    str_table+=str_row+"\n"
  print(str_table)

 def joinTables(A, B, jA, jB):
  MB = {}
  C = []
  #Assume headers are first row 
  header_B = B.pop(0)
  keycol_B = header_B.index(jB)
  header_A = A.pop(0)
  keycol_A = header_A.index(jA)
  
  for row in B:
    if row[keycol_B] not in MB:
      MB[row[keycol_B]] = []
    MB[row[keycol_B]].append(row)
  
  header_C = []
  for ha in header_A:
    header_C.append("A."+ha)
  for hb in header_B:
    header_C.append("B."+hb)
  C.append(header_C)
  
  #Sort by first column for aesthetics, via http://stackoverflow.com/questions/3216398/sorting-by-arbitrary-lambda
  A.sort(key=lambda x: x[0]) 
  for row in A:
    if row[keycol_A] in MB:
      for r in MB[row[keycol_A]]:
        newrow = row+r 
        C.append(newrow)
  
  return C

 #TABLE_A
 t1= [["Age" ,"Name"],
 ["27","Jonah"],
 ["18","Alan"],
 ["28","Glory"],
 ["18","Popeye"],
 ["28","Alan"]]

 k1 = "Name"

 #TABLE_B
 t2 = [["Character" ,"Nemesis"],
 ["Jonah","Whales"],
 ["Jonah","Spiders"],
 ["Alan","Ghosts"],
 ["Alan","Zombies"],
 ["Glory","Buffy"]]

 k2 = "Character"

 tA = t1 if (len(t1)*len(t1[0])) <= (len(t2)*len(t2[0])) else t2
 tB = t1 if tA==t2 else t2
 kA = k1 if tA==t1 else k2
 kB = k1 if kA==k2 else k2

 print("Table A")
 pPrint(tA)
 print("Table B")
 pPrint(tB)
 print("Table C")
 pPrint(joinTables(tA, tB, kA, kB))
	'''
	http://rosettacode.org/wiki/Hash_join
	Implement the "hash join" algorithm, and demonstrate that it passes the test-case listed below.

	You should represent the tables as data structures that feel natural in your programming language.
	Guidance

	The "hash join" algorithm consists of two steps:

	Hash phase: Create a multimap from one of the two tables, mapping from each join column value to all the rows that contain it.
	The multimap must support hash-based lookup which scales better than a simple linear search, because that's the whole point of this algorithm.
	Ideally we should create the multimap for the smaller table, thus minimizing its creation time and memory size.
	Join phase: Scan the other table, and find matching rows by looking in the multimap created before.


	In pseudo-code, the algorithm could be expressed as follows:

	let A = the first input table (or ideally, the larger one)
	let B = the second input table (or ideally, the smaller one)
	let jA = the join column ID of table A
	let jB = the join column ID of table B
	let MB = a multimap for mapping from single values to multiple rows of table B (starts out empty)
	let C = the output table (starts out empty)

	for each row b in table B:
	place b in multimap MB under key b(jB)

	for each row a in table A:
	for each row b in multimap MB under key a(jA):
	let c = the concatenation of row a and row b
	place row c in table C


	TABLE_A
	Age,Name
	27,Jonah
	18,Alan
	28,Glory
	18,Popeye
	28,Alan

	TABLE_B
	Character,Nemesis
	Jonah,Whales
	Jonah,Spiders
	Alan,Ghosts
	Alan,Zombies
	Glory,Buffy

	Output:
	A.Age,A.Name,B.Character,B.Nemesis
	18,Alan,Alan,Ghosts
	18,Alan,Alan,Zombies
	28,Alan,Alan,Ghosts
	28,Alan,Alan,Zombies
	28,Glory,Glory,Buffy
	27,Jonah,Jonah,Spiders
	27,Jonah,Jonah,Whales

	'''

	def pPrint(table):
	str_table = ""
	for row in table:
	maxlen = 10
	printrow = []
	for i in range(len(row)):
	printrow.append(row[i].ljust(maxlen+1))
	str_row = str(printrow)
	str_row = str_row.replace("[", "")
	str_row = str_row.replace("]", "")
	str_row = str_row.replace(",", "")
	str_row = str_row.replace("\'", "")
	str_table+=str_row+"\n"
	print(str_table)

	def joinTables(A, B, jA, jB):
	MB = {}
	C = []
	#Assume headers are first row
	header_B = B.pop(0)
	keycol_B = header_B.index(jB)
	header_A = A.pop(0)
	keycol_A = header_A.index(jA)

	for row in B:
	if row[keycol_B] not in MB:
	MB[row[keycol_B]] = []
	MB[row[keycol_B]].append(row)

	header_C = []
	for ha in header_A:
	header_C.append("A."+ha)
	for hb in header_B:
	header_C.append("B."+hb)
	C.append(header_C)

	#Sort by first column for aesthetics, via http://stackoverflow.com/questions/3216398/sorting-by-arbitrary-lambda
	A.sort(key=lambda x: x[0])
	for row in A:
	if row[keycol_A] in MB:
	for r in MB[row[keycol_A]]:
	newrow = row+r
	C.append(newrow)

	return C

	#TABLE_A
	t1= [["Age" ,"Name"],
	["27","Jonah"],
	["18","Alan"],
	["28","Glory"],
	["18","Popeye"],
	["28","Alan"]]

	k1 = "Name"

	#TABLE_B
	t2 = [["Character" ,"Nemesis"],
	["Jonah","Whales"],
	["Jonah","Spiders"],
	["Alan","Ghosts"],
	["Alan","Zombies"],
	["Glory","Buffy"]]

	k2 = "Character"

	tA = t1 if (len(t1)len(t1[0])) <= (len(t2)len(t2[0])) else t2
	tB = t1 if tA==t2 else t2
	kA = k1 if tA==t1 else k2
	kB = k1 if kA==k2 else k2

	print("Table A")
	pPrint(tA)
	print("Table B")
	pPrint(tB)
	print("Table C")
	pPrint(joinTables(tA, tB, kA, kB))