# This file is part of EAP.
#
# EAP is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# EAP is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with EAP. If not, see <http://www.gnu.org/licenses/>.
import operator
import math
import random
import warnings  # suppress some warnings related to invalid values
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import timeit

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp
def evalSymbReg(individual, pset, X_train, y_train):
    # Transform the tree expression into a callable function
    func = gp.compile(expr=individual, pset=pset)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        y_pred = np.array([func(*x) for x in X_train])
    min_ = np.nanmin(y_pred)
    max_ = np.nanmax(y_pred)
    if not (np.isfinite(min_) and np.isfinite(max_)):
        return 0,
    # replace NaN/inf predictions with the midpoint of the finite range
    mid_ = (min_ + max_) / 2
    np.nan_to_num(y_pred, copy=False, nan=mid_, posinf=mid_, neginf=mid_)
    fit = r2_score(y_train, y_pred)
    if not np.isfinite(fit):
        fit = 0
    return fit,
# load data
df = pd.read_csv('./data/Poly-10.csv', sep=',')
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1234)
_, cols = X_train.shape

# build the primitive set (one input argument per data column)
pset = gp.PrimitiveSet("MAIN", cols)
pset.addPrimitive(np.add, 2, name="vadd")
pset.addPrimitive(np.subtract, 2, name="vsub")
pset.addPrimitive(np.multiply, 2, name="vmul")
pset.addPrimitive(np.divide, 2, name="vdiv")
pset.addPrimitive(np.negative, 1, name="vneg")
pset.addPrimitive(np.cos, 1, name="vcos")
pset.addPrimitive(np.sin, 1, name="vsin")
pset.addPrimitive(np.exp, 1, name="vexp")
pset.addPrimitive(np.log, 1, name="vlog")
pset.addEphemeralConstant("rand101", lambda: np.random.uniform(-1.0, 1.0))
creator.create("FitnessMin", base.Fitness, weights=(1.0,)) | |
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin) | |
maxHeight = 10
maxLength = 50

toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=maxHeight)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evalSymbReg, pset=pset, X_train=X_train, y_train=y_train)
toolbox.register("select", tools.selTournament, tournsize=5)

# set static height and length limits for all generated trees
limitHeight = gp.staticLimit(operator.attrgetter('height'), maxHeight)
limitLength = gp.staticLimit(len, maxLength)

mutOperators = [gp.mutUniform]

def mutOperator(*args, **kwargs):
    mut = np.random.choice(mutOperators)
    return mut(*args, **kwargs)

toolbox.register("mate", gp.cxOnePoint)
toolbox.decorate("mate", limitHeight)
toolbox.decorate("mate", limitLength)

toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register('mutate', mutOperator, expr=toolbox.expr_mut, pset=pset)
toolbox.decorate("mutate", limitHeight)
toolbox.decorate("mutate", limitLength)
def main():
    np.seterr(all='ignore')
    random.seed(318)

    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)

    pop = toolbox.population(n=1000)
    hof = tools.HallOfFame(1)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.nanmean)
    stats.register("std", np.nanstd)
    stats.register("min", np.nanmin)
    stats.register("max", np.nanmax)

    algorithms.eaSimple(pop, toolbox, cxpb=1, mutpb=0.25, ngen=100, stats=stats, halloffame=hof)
    pool.close()
    return pop, stats, hof

if __name__ == "__main__":
    print(timeit.timeit(stmt=main, number=1))
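A side note on the script: train_test_split also produces X_test and y_test (and mean_squared_error is imported) but they are never used. A minimal sketch of scoring the best-of-run individual on the held-out half, e.g. run under the __main__ guard in place of the timeit call, might look like this:

# sketch: evaluate the hall-of-fame individual on the unused test split
pop, stats, hof = main()
best = hof[0]
func = gp.compile(expr=best, pset=pset)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_pred = np.array([func(*x) for x in X_test])
print(best)
print("test R2: ", r2_score(y_test, y_pred))
print("test MSE:", mean_squared_error(y_test, y_pred))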
Hi,
Can you perhaps share your csv file?
Thanks in advance.
You can find my data and latest code here: https://github.com/foolnotion/deap-symreg
Thanks again. A suggestion/question: before going from one generation to the next, wouldn't it help to "simplify" the trees (i.e., the individuals)? For example, if we have a "-" node whose two leaves are both "x", we can simply replace the "-" with a "0" (another example: x/x). This should also help with not reaching Python's limit on the tree depth. Any ideas on how to do it?
For DEAP expressions I would look at sympy, but from a genetic programming perspective simplification/pruning is not great. Like in nature, genotypes include a lot of redundancy (cryptic genetic variation, other mechanisms for robustness/buffering). This actually increases the potential to evolve, what GP calls evolvability. Simplification would cancel this effect, as it would offer fewer targets for mutation or crossover to create some adaptive change.
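For reference, a rough sketch of what the sympy route could look like for the primitive set in the gist above; simplify_individual is a hypothetical helper, and it only goes one way (turning the simplified sympy expression back into a DEAP tree is not covered):

import sympy

def simplify_individual(individual, pset):
    # map the gist's primitive names onto sympy operations
    locals_ = {
        "vadd": lambda a, b: a + b,
        "vsub": lambda a, b: a - b,
        "vmul": lambda a, b: a * b,
        "vdiv": lambda a, b: a / b,
        "vneg": lambda a: -a,
        "vcos": sympy.cos,
        "vsin": sympy.sin,
        "vexp": sympy.exp,
        "vlog": sympy.log,
    }
    # DEAP names the terminals ARG0, ARG1, ... by default
    locals_.update({arg: sympy.Symbol(arg) for arg in pset.arguments})
    # str(individual) gives e.g. "vsub(ARG0, ARG0)", which simplifies to 0
    return sympy.simplify(sympy.sympify(str(individual), locals=locals_))

Calling simplify_individual(hof[0], pset) would then return the algebraically simplified form of the best individual.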
DEAP's GP support is not quite mature IMHO. Recombination operators (crossover, mutation) should be aware of tree length and depth limits and prevent the generation of offspring individuals exceeding them.
I agree with foolnotion about evolvability and a desire to keep 'dead genes' around, as complexity can pay off down the line. The above code utilizes length and depth operator limits in order to manage this growth over many generations:
limitLength = 100
limitHeight = 15
toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=limitHeight))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=limitHeight))
toolbox.decorate("mate", gp.staticLimit(key=len, max_value=limitLength))
toolbox.decorate("mutate", gp.staticLimit(key=len, max_value=limitLength))
Additionally, there are many other methods you can use on top of this to manage size (so many white papers to read), if individual size becomes an issue many generations into the evolution.
One example is using a double tournament:
toolbox.register("select", tools.selDoubleTournament,
fitness_size=7,
parsimony_size=1.4,
fitness_first=True)
https://deap.readthedocs.io/en/master/api/tools.html#deap.tools.selDoubleTournament
Another brute-force method may be messier: have your mutation operator randomly choose between the normal mutation and something like a shrink operator, with a probability that grows as the size approaches some defined limit that we don't necessarily want to converge to... this encourages evolvability up to a point that we happen to know already.
deap.gp.mutShrink
https://deap.readthedocs.io/en/master/api/tools.html#deap.gp.mutShrink
Here is an example of that... but this is sort of tricky to really tune, so be wary of slapping it into your code:
toolbox.register("mutShrink", gp.mutShrink)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutUniform", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
limitLength = 65
target_mean_length = 30
def mixed_mutation(ind):
    """
    The probability of shrinking grows with the individual's length:
    0.5 * (length - 30) / 65 for lengths [10, 20, 30, 40, 50, 60, 65]
    gives roughly [-0.154, -0.077, 0.0, 0.077, 0.154, 0.231, 0.269]
    (a negative value means shrinking never triggers).
    """
    proba_by_size = 0.5 * (float(len(ind) - target_mean_length) / float(limitLength))
    if random.random() < proba_by_size:
        ind, = toolbox.mutShrink(ind)
    else:
        ind, = toolbox.mutUniform(ind)
    return ind,
toolbox.register("mutate", mixed_mutation)
Here is a big update to the code; it was on Revision #3, I believe, when posted. More info with discussion here:
https://mail.google.com/mail/u/0/#inbox/FMfcgxwGCbBGsFnpFGwRTTTjnrVCHPTk