fabian-thomas · July 28, 2021 17:36
diff --git a/sizes.py b/sizes.py
 #!/bin/python
 import sys

 inputFile = open(sys.argv[1], 'r')  # file to read data from

 def sortAndRemoveNonAlpha(s):
    return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))

 resultSizeMap = {}
 selectivityMap = {}

 readingState = 0
 currentProblems=[]
 involvedRelations=[]
 while True:
    line = inputFile.readline()

    if not line:
        break
    elif line.startswith('#'):
        continue
    elif line=='\n':
        readingState+=1
        continue

    if readingState==0:
        problem=sortAndRemoveNonAlpha(line)
        currentProblems.append(problem)
        involvedRelations.append(problem)
        resultSizeMap[problem]=int(''.join(filter(str.isdigit, inputFile.readline())))
    elif readingState==1:
        selectivityMap[sortAndRemoveNonAlpha(line)]=float(inputFile.readline())

 while len(currentProblems)>1:
    newProblems = set()
    for problem in currentProblems:
        for single in involvedRelations:
            alreadyJoined=list(sortAndRemoveNonAlpha(problem))
            if not single in alreadyJoined:
                selectivity=1
                for p in alreadyJoined:
                    pair=sortAndRemoveNonAlpha(p+single)
                    if pair in selectivityMap:
                        selectivity*=selectivityMap[pair]
                newProblem=sortAndRemoveNonAlpha(problem+single)
                result=int(resultSizeMap[single]*resultSizeMap[problem]*selectivity)
                if newProblem in resultSizeMap:
                    resultSizeMap[newProblem]=min(resultSizeMap[newProblem], result)
                else:
                    resultSizeMap[newProblem]=result
                    newProblems.add(newProblem)
    currentProblems=newProblems

 for e in resultSizeMap:
    print(e)
 print()
 for e in resultSizeMap:
    print(resultSizeMap[e])

 inputFile.close()
diff --git a/sizes_input_sample b/sizes_input_sample
 # relations with their table sizes
 A
 20000
 B
 30000
 C
 50000
 D
 30000

 # relations that are joined by some predicate and their selectivity (exclude those that are joined by cross product)
 AB
 0.1
 AC
 0.05
 BD
 0.02
 CD
 0.04
diff --git a/table.py b/table.py
 #!/bin/python
 import sys

 inputFile = open(sys.argv[1], 'r')

 def sortAndRemoveNonAlpha(s):
    return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))

 resultSizeMap = {}
 joined = set()

 subProblems = []
 readingState = 0
 subProblemIndex = 0
 for line in inputFile.readlines():
    if line.startswith('#'):
        continue
    elif line=='\n':
        readingState+=1
        continue

    if readingState==0:
        subProblems.append(sortAndRemoveNonAlpha(line))
    elif readingState==1:
        p = subProblems[subProblemIndex]
        resultSizeMap[p] = int(''.join(filter(str.isdigit, line)))
        subProblemIndex+=1
    elif readingState==2:
        joined.add(sortAndRemoveNonAlpha(line))

 costs = {}
 lowestCosts = {}
 involvedRelations = []
 currentProblems = []
 for entry in resultSizeMap:
    if len(entry)==1:
        costs[entry]=0
        lowestCosts[entry]=0
        currentProblems.append(entry)
        involvedRelations.append(entry)

 involvedRelations=sorted(involvedRelations)
 currentProblems=sorted(currentProblems)

 joinCostModel=lambda x, y : x + y

 joinChar='⨝'
 crossChar='×'
 while len(currentProblems)>1:
    prettyNewProblems = []
    for prettySubProblem in currentProblems:
        subProblem=sortAndRemoveNonAlpha(prettySubProblem)
        if costs[prettySubProblem]==lowestCosts[subProblem]:
            for single in involvedRelations:
                alreadyJoined=list(sortAndRemoveNonAlpha(subProblem))
                if not single in alreadyJoined:
                    cross=True
                    for p in alreadyJoined:
                        if sortAndRemoveNonAlpha(single+p) in joined:
                            cross=False
                    if cross:
                        prettyNewProblem='('+prettySubProblem+crossChar+single+')'
                        prettyNewProblemFlipped='('+single+crossChar+prettySubProblem+')'
                    else:
                        prettyNewProblem='('+prettySubProblem+joinChar+single+')'
                        prettyNewProblemFlipped='('+single+joinChar+prettySubProblem+')'
                    if not prettyNewProblemFlipped in prettyNewProblems:
                        cost=costs[prettySubProblem]+joinCostModel(resultSizeMap[subProblem], resultSizeMap[single])
                        costs[prettyNewProblem]=cost
                        prettyNewProblems.append(prettyNewProblem)
                        newProblem=sortAndRemoveNonAlpha(prettyNewProblem)
                        if newProblem in lowestCosts:
                            lowestCosts[newProblem]=min(lowestCosts[newProblem], cost)
                        else:
                            lowestCosts[newProblem]=cost
    currentProblems=prettyNewProblems

 print("{:^20} {:^20} {:^20}".format("Subplan", "Costs", "Resultsize"))
 for e in costs:
    print("{:>20} {:>20,} {:>20,}".format(e, costs[e], resultSizeMap[sortAndRemoveNonAlpha(e)]).replace(',', '.'))

 inputFile.close()
diff --git a/table_input_sample b/table_input_sample
 # possible subproblems
 # may be extracted directly from sizes script
 A
 B
 C
 D
 AB
 AC
 AD
 BC
 BD
 CD
 ABC
 ABD
 ACD
 BCD
 ABCD

 # corresponding result sizes mapped by index to the subproblems above
 # may be extracted directly from sizes script
 20000
 30000
 50000
 30000
 60000000
 50000000
 600000000
 1500000000
 18000000
 60000000
 150000000000
 36000000000
 60000000000
 36000000000
 3600000000000

 # relations that are joined by some predicate (exclude those that are joined by cross product)
 # should be the same as for the sizes script just without the selectivities
 AB
 AC
 BD
 CD
	#!/bin/python
	import sys

	inputFile = open(sys.argv[1], 'r') # file to read data from

	def sortAndRemoveNonAlpha(s):
	return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))

	resultSizeMap = {}
	selectivityMap = {}

	readingState = 0
	currentProblems=[]
	involvedRelations=[]
	while True:
	line = inputFile.readline()

	if not line:
	break
	elif line.startswith('#'):
	continue
	elif line=='\n':
	readingState+=1
	continue

	if readingState==0:
	problem=sortAndRemoveNonAlpha(line)
	currentProblems.append(problem)
	involvedRelations.append(problem)
	resultSizeMap[problem]=int(''.join(filter(str.isdigit, inputFile.readline())))
	elif readingState==1:
	selectivityMap[sortAndRemoveNonAlpha(line)]=float(inputFile.readline())

	while len(currentProblems)>1:
	newProblems = set()
	for problem in currentProblems:
	for single in involvedRelations:
	alreadyJoined=list(sortAndRemoveNonAlpha(problem))
	if not single in alreadyJoined:
	selectivity=1
	for p in alreadyJoined:
	pair=sortAndRemoveNonAlpha(p+single)
	if pair in selectivityMap:
	selectivity*=selectivityMap[pair]
	newProblem=sortAndRemoveNonAlpha(problem+single)
	result=int(resultSizeMap[single]resultSizeMap[problem]selectivity)
	if newProblem in resultSizeMap:
	resultSizeMap[newProblem]=min(resultSizeMap[newProblem], result)
	else:
	resultSizeMap[newProblem]=result
	newProblems.add(newProblem)
	currentProblems=newProblems

	for e in resultSizeMap:
	print(e)
	print()
	for e in resultSizeMap:
	print(resultSizeMap[e])

	inputFile.close()
	# relations with their table sizes
	A
	20000
	B
	30000
	C
	50000
	D
	30000

	# relations that are joined by some predicate and their selectivity (exclude those that are joined by cross product)
	AB
	0.1
	AC
	0.05
	BD
	0.02
	CD
	0.04
	#!/bin/python
	import sys

	inputFile = open(sys.argv[1], 'r')

	def sortAndRemoveNonAlpha(s):
	return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))

	resultSizeMap = {}
	joined = set()

	subProblems = []
	readingState = 0
	subProblemIndex = 0
	for line in inputFile.readlines():
	if line.startswith('#'):
	continue
	elif line=='\n':
	readingState+=1
	continue

	if readingState==0:
	subProblems.append(sortAndRemoveNonAlpha(line))
	elif readingState==1:
	p = subProblems[subProblemIndex]
	resultSizeMap[p] = int(''.join(filter(str.isdigit, line)))
	subProblemIndex+=1
	elif readingState==2:
	joined.add(sortAndRemoveNonAlpha(line))

	costs = {}
	lowestCosts = {}
	involvedRelations = []
	currentProblems = []
	for entry in resultSizeMap:
	if len(entry)==1:
	costs[entry]=0
	lowestCosts[entry]=0
	currentProblems.append(entry)
	involvedRelations.append(entry)

	involvedRelations=sorted(involvedRelations)
	currentProblems=sorted(currentProblems)

	joinCostModel=lambda x, y : x + y

	joinChar='⨝'
	crossChar='×'
	while len(currentProblems)>1:
	prettyNewProblems = []
	for prettySubProblem in currentProblems:
	subProblem=sortAndRemoveNonAlpha(prettySubProblem)
	if costs[prettySubProblem]==lowestCosts[subProblem]:
	for single in involvedRelations:
	alreadyJoined=list(sortAndRemoveNonAlpha(subProblem))
	if not single in alreadyJoined:
	cross=True
	for p in alreadyJoined:
	if sortAndRemoveNonAlpha(single+p) in joined:
	cross=False
	if cross:
	prettyNewProblem='('+prettySubProblem+crossChar+single+')'
	prettyNewProblemFlipped='('+single+crossChar+prettySubProblem+')'
	else:
	prettyNewProblem='('+prettySubProblem+joinChar+single+')'
	prettyNewProblemFlipped='('+single+joinChar+prettySubProblem+')'
	if not prettyNewProblemFlipped in prettyNewProblems:
	cost=costs[prettySubProblem]+joinCostModel(resultSizeMap[subProblem], resultSizeMap[single])
	costs[prettyNewProblem]=cost
	prettyNewProblems.append(prettyNewProblem)
	newProblem=sortAndRemoveNonAlpha(prettyNewProblem)
	if newProblem in lowestCosts:
	lowestCosts[newProblem]=min(lowestCosts[newProblem], cost)
	else:
	lowestCosts[newProblem]=cost
	currentProblems=prettyNewProblems

	print("{:^20} {:^20} {:^20}".format("Subplan", "Costs", "Resultsize"))
	for e in costs:
	print("{:>20} {:>20,} {:>20,}".format(e, costs[e], resultSizeMap[sortAndRemoveNonAlpha(e)]).replace(',', '.'))

	inputFile.close()
	# possible subproblems
	# may be extracted directly from sizes script
	A
	B
	C
	D
	AB
	AC
	AD
	BC
	BD
	CD
	ABC
	ABD
	ACD
	BCD
	ABCD

	# corresponding result sizes mapped by index to the subproblems above
	# may be extracted directly from sizes script
	20000
	30000
	50000
	30000
	60000000
	50000000
	600000000
	1500000000
	18000000
	60000000
	150000000000
	36000000000
	60000000000
	36000000000
	3600000000000

	# relations that are joined by some predicate (exclude those that are joined by cross product)
	# should be the same as for the sizes script just without the selectivities
	AB
	AC
	BD
	CD