Last active
July 28, 2021 17:36
-
-
Save fabian-thomas/8fff16f65f0339ae70237b0eee8d56f6 to your computer and use it in GitHub Desktop.
BigDataEngineering - Plan Enumeration scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import sys | |
inputFile = open(sys.argv[1], 'r') # file to read data from | |
def sortAndRemoveNonAlpha(s): | |
return ''.join(sorted(list(''.join(filter(str.isalnum, s))))) | |
resultSizeMap = {} | |
selectivityMap = {} | |
readingState = 0 | |
currentProblems=[] | |
involvedRelations=[] | |
while True: | |
line = inputFile.readline() | |
if not line: | |
break | |
elif line.startswith('#'): | |
continue | |
elif line=='\n': | |
readingState+=1 | |
continue | |
if readingState==0: | |
problem=sortAndRemoveNonAlpha(line) | |
currentProblems.append(problem) | |
involvedRelations.append(problem) | |
resultSizeMap[problem]=int(''.join(filter(str.isdigit, inputFile.readline()))) | |
elif readingState==1: | |
selectivityMap[sortAndRemoveNonAlpha(line)]=float(inputFile.readline()) | |
while len(currentProblems)>1: | |
newProblems = set() | |
for problem in currentProblems: | |
for single in involvedRelations: | |
alreadyJoined=list(sortAndRemoveNonAlpha(problem)) | |
if not single in alreadyJoined: | |
selectivity=1 | |
for p in alreadyJoined: | |
pair=sortAndRemoveNonAlpha(p+single) | |
if pair in selectivityMap: | |
selectivity*=selectivityMap[pair] | |
newProblem=sortAndRemoveNonAlpha(problem+single) | |
result=int(resultSizeMap[single]*resultSizeMap[problem]*selectivity) | |
if newProblem in resultSizeMap: | |
resultSizeMap[newProblem]=min(resultSizeMap[newProblem], result) | |
else: | |
resultSizeMap[newProblem]=result | |
newProblems.add(newProblem) | |
currentProblems=newProblems | |
for e in resultSizeMap: | |
print(e) | |
print() | |
for e in resultSizeMap: | |
print(resultSizeMap[e]) | |
inputFile.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# relations with their table sizes | |
A | |
20000 | |
B | |
30000 | |
C | |
50000 | |
D | |
30000 | |
# relations that are joined by some predicate and their selectivity (exclude those that are joined by cross product) | |
AB | |
0.1 | |
AC | |
0.05 | |
BD | |
0.02 | |
CD | |
0.04 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import sys | |
inputFile = open(sys.argv[1], 'r') | |
def sortAndRemoveNonAlpha(s): | |
return ''.join(sorted(list(''.join(filter(str.isalnum, s))))) | |
resultSizeMap = {} | |
joined = set() | |
subProblems = [] | |
readingState = 0 | |
subProblemIndex = 0 | |
for line in inputFile.readlines(): | |
if line.startswith('#'): | |
continue | |
elif line=='\n': | |
readingState+=1 | |
continue | |
if readingState==0: | |
subProblems.append(sortAndRemoveNonAlpha(line)) | |
elif readingState==1: | |
p = subProblems[subProblemIndex] | |
resultSizeMap[p] = int(''.join(filter(str.isdigit, line))) | |
subProblemIndex+=1 | |
elif readingState==2: | |
joined.add(sortAndRemoveNonAlpha(line)) | |
costs = {} | |
lowestCosts = {} | |
involvedRelations = [] | |
currentProblems = [] | |
for entry in resultSizeMap: | |
if len(entry)==1: | |
costs[entry]=0 | |
lowestCosts[entry]=0 | |
currentProblems.append(entry) | |
involvedRelations.append(entry) | |
involvedRelations=sorted(involvedRelations) | |
currentProblems=sorted(currentProblems) | |
joinCostModel=lambda x, y : x + y | |
joinChar='⨝' | |
crossChar='×' | |
while len(currentProblems)>1: | |
prettyNewProblems = [] | |
for prettySubProblem in currentProblems: | |
subProblem=sortAndRemoveNonAlpha(prettySubProblem) | |
if costs[prettySubProblem]==lowestCosts[subProblem]: | |
for single in involvedRelations: | |
alreadyJoined=list(sortAndRemoveNonAlpha(subProblem)) | |
if not single in alreadyJoined: | |
cross=True | |
for p in alreadyJoined: | |
if sortAndRemoveNonAlpha(single+p) in joined: | |
cross=False | |
if cross: | |
prettyNewProblem='('+prettySubProblem+crossChar+single+')' | |
prettyNewProblemFlipped='('+single+crossChar+prettySubProblem+')' | |
else: | |
prettyNewProblem='('+prettySubProblem+joinChar+single+')' | |
prettyNewProblemFlipped='('+single+joinChar+prettySubProblem+')' | |
if not prettyNewProblemFlipped in prettyNewProblems: | |
cost=costs[prettySubProblem]+joinCostModel(resultSizeMap[subProblem], resultSizeMap[single]) | |
costs[prettyNewProblem]=cost | |
prettyNewProblems.append(prettyNewProblem) | |
newProblem=sortAndRemoveNonAlpha(prettyNewProblem) | |
if newProblem in lowestCosts: | |
lowestCosts[newProblem]=min(lowestCosts[newProblem], cost) | |
else: | |
lowestCosts[newProblem]=cost | |
currentProblems=prettyNewProblems | |
print("{:^20} {:^20} {:^20}".format("Subplan", "Costs", "Resultsize")) | |
for e in costs: | |
print("{:>20} {:>20,} {:>20,}".format(e, costs[e], resultSizeMap[sortAndRemoveNonAlpha(e)]).replace(',', '.')) | |
inputFile.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# possible subproblems | |
# may be extracted directly from sizes script | |
A | |
B | |
C | |
D | |
AB | |
AC | |
AD | |
BC | |
BD | |
CD | |
ABC | |
ABD | |
ACD | |
BCD | |
ABCD | |
# corresponding result sizes mapped by index to the subproblems above | |
# may be extracted directly from sizes script | |
20000 | |
30000 | |
50000 | |
30000 | |
60000000 | |
50000000 | |
600000000 | |
1500000000 | |
18000000 | |
60000000 | |
150000000000 | |
36000000000 | |
60000000000 | |
36000000000 | |
3600000000000 | |
# relations that are joined by some predicate (exclude those that are joined by cross product) | |
# should be the same as for the sizes script just without the selectivities | |
AB | |
AC | |
BD | |
CD |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment