NicMcPhee · July 15, 2018 23:26
diff --git a/GECCO_tutorial_graph_database_demo.cypher b/GECCO_tutorial_graph_database_demo.cypher
 // Clear the DB for a clean start: delete every node together with all of
 // its relationships.
 MATCH (n) DETACH DELETE n;

 // Uniqueness constraints. Property keys are case-sensitive, so the Errors
 // constraint has to name `errors_vector` exactly as the load query writes
 // it; a constraint on `Errors_vector` would cover a property no node has.
 CREATE CONSTRAINT ON (i:Individual) ASSERT i.uuid IS UNIQUE;
 CREATE CONSTRAINT ON (e:Errors) ASSERT e.errors_vector IS UNIQUE;

 // Indexes on the properties the queries below filter by.
 CREATE INDEX ON :Individual(generation);
 CREATE INDEX ON :Errors(total_error);

 USING PERIODIC COMMIT
 // First pass over the run CSV: create one Individual per row and link it
 // to the Errors node holding its per-test-case errors (columns TC0..TC9).
 // (This comment sits after USING PERIODIC COMMIT so the statement still
 // begins with that hint.)
 LOAD CSV WITH HEADERS FROM
 'http://facultypages.morris.umn.edu/~mcphee/Research/GECCO2016_tutorial/push_regression_run_2.csv' AS line
 WITH line,
      [tc IN [line.TC0, line.TC1, line.TC2, line.TC3, line.TC4, line.TC5, line.TC6, line.TC7, line.TC8, line.TC9] | toInteger(tc)] AS errors_vector
 CREATE (individual:Individual {uuid: line.uuid})
 SET individual += {
     generation: toInteger(line.generation),
     location: toInteger(line.location),
     plush_genome_size: toInteger(line.`plush-genome-size`),
     push_program_size: toInteger(line.`push-program-size`),
     plush_genome: line.`plush-genome`
 }
 // MERGE, not CREATE: rows with the same error vector and total error
 // reuse a single Errors node.
 MERGE (errors:Errors {errors_vector: errors_vector, total_error: toInteger(line.`total-error`)})
 CREATE (individual)-[:HAS]->(errors)
 ;

 USING PERIODIC COMMIT
 // Second pass over the same CSV: every Individual now exists, so connect
 // each child to the parents listed (space-separated) in `parent-uuids`.
 LOAD CSV WITH HEADERS FROM
 'http://facultypages.morris.umn.edu/~mcphee/Research/GECCO2016_tutorial/push_regression_run_2.csv' AS line
 MATCH (child:Individual {uuid: line.uuid})
 UNWIND SPLIT(line.`parent-uuids`, ' ') AS parent_uuid
 // Entries that match no Individual simply produce no edge.
 MATCH (parent:Individual {uuid: parent_uuid})
 CREATE (parent)-[:PARENT_OF {genetic_operator: line.`genetic-operators`}]->(child)
 ;

 // Start every individual at 0 selections, so individuals that never
 // became a parent still carry the property.
 MATCH (n:Individual)
 SET n.num_selections = 0
 ;

 // Overwrite the default for every individual that is a parent: its
 // selection count is the number of its outgoing PARENT_OF edges.
 MATCH (parent:Individual)-[edge:PARENT_OF]->(:Individual)
 WITH parent, count(edge) AS num_selections
 SET parent.num_selections = num_selections
 ;

 ////////////////////////////
 // Done setting up the DB //
 ////////////////////////////
 // Now some queries!      //
 ////////////////////////////

 // Open with the five generation graph nicely laid out.
 // Then find out how many nodes and edges there are.

 // Then show the Schema diagram

 // Were there winners? (Individuals whose Errors node has total_error 0.)
 MATCH (winner:Individual)-[:HAS]->(errors:Errors)
 WHERE errors.total_error = 0
 RETURN DISTINCT winner
 ;
 // Click open a few nodes to show how we can explore in the GUI

 // What do the last two generations look like?
 // Returns each winner together with each of its direct parents.
 MATCH (parent:Individual)-[:PARENT_OF]->(winner:Individual)-[:HAS]->(:Errors {total_error: 0})
 RETURN DISTINCT winner, parent
 ;

 // What do the errors look like in the last five generations?
 // Take each winner plus its ancestors up to four PARENT_OF hops back
 // (zero hops = the winner itself), then fetch each one's errors.
 MATCH (winner:Individual)-[:HAS]->(zero:Errors)
 WHERE zero.total_error = 0
 MATCH (ancestor:Individual)-[:PARENT_OF*0..4]->(winner)
 MATCH (ancestor)-[:HAS]->(errors:Errors)
 RETURN DISTINCT ancestor, errors
 ;

 // How many distinct ancestors were there in the first generation?
 // Walk exactly 39 PARENT_OF hops back from each winner, de-duplicate the
 // ancestors reached, and list them most-selected first.
 MATCH (ancestor:Individual)-[:PARENT_OF*39]->(winner:Individual)-[:HAS]->(:Errors {total_error: 0})
 WITH DISTINCT ancestor
 MATCH (ancestor)-[:HAS]->(errors:Errors)
 RETURN DISTINCT ancestor.uuid, ancestor.num_selections, errors
 ORDER BY ancestor.num_selections DESC
 ;

 // How many selections were there in the first generation (generation 0)?
 MATCH (n:Individual)
 WHERE n.generation = 0
 RETURN SUM(n.num_selections)
 ;

 // What was the average number of selections?
 // Ignore the last generation since no selections were made there
 MATCH (n:Individual)
 WHERE 39 > n.generation
 RETURN AVG(n.num_selections)
 ;

 // How many 10% hyperselections were there?
 // An individual counts when it was selected more than 14 times (the
 // cutoff used here); results are listed most-selected first.
 MATCH (n:Individual)-[:HAS]->(errors:Errors)
 WHERE n.num_selections > 14
 RETURN n.uuid, n.num_selections, n.generation, errors
 ORDER BY n.num_selections DESC
 ;

 // How many 10% *semantic* hyperselections were there in a single
 // generation? Group individuals by (error vector, generation) and keep
 // the groups whose members were selected more than 14 times in total.
 MATCH (member:Individual)-[:HAS]->(errors:Errors)
 WITH errors,
      member.generation AS gen,
      COUNT(DISTINCT member) AS num_individuals,
      SUM(member.num_selections) AS semantic_selections
 WHERE semantic_selections > 14
 RETURN gen, semantic_selections, num_individuals, errors
 ORDER BY semantic_selections DESC, num_individuals DESC, gen ASC
 ;

 // Same grouping, additionally requiring at least 10 individuals in the
 // group, and listed generation by generation.
 MATCH (member:Individual)-[:HAS]->(errors:Errors)
 WITH errors,
      member.generation AS gen,
      COUNT(DISTINCT member) AS num_individuals,
      SUM(member.num_selections) AS semantic_selections
 WHERE semantic_selections > 14 AND num_individuals >= 10
 RETURN gen, semantic_selections, num_individuals, errors
 ORDER BY gen ASC, semantic_selections DESC, num_individuals DESC
 ;

 // How often is there no change in errors from parent to child?
 // Counts distinct (parent, child) pairs that point at the same Errors node.
 MATCH (shared:Errors)<-[:HAS]-(parent:Individual)-[:PARENT_OF]->(child:Individual)
 MATCH (child)-[:HAS]->(shared)
 RETURN COUNT(DISTINCT [parent.uuid, child.uuid])
 ;

 ///////////////////////////////////////////////////

 // How many distinct errors were there? (One Errors node per distinct value.)
 MATCH (:Errors)
 RETURN count(*)
 ;

 // What errors had total error < 100? Listed lowest total first.
 MATCH (errors:Errors)
 WHERE 100 > errors.total_error
 RETURN errors
 ORDER BY errors.total_error
 ;

 // How many individuals had those errors & when were they introduced?
 // MIN(n.generation) is the earliest generation holding that Errors node.
 MATCH (n:Individual)-[:HAS]->(errors:Errors)
 WHERE 100 > errors.total_error
 RETURN errors, MIN(n.generation), count(n)
 ORDER BY errors.total_error
 ;

 // What's the maximum number of selections in each generation?
 // For generations 0..38: find the largest num_selections, then report the
 // errors of every individual in that generation that reached it.
 UNWIND RANGE(0, 38) AS gen
 MATCH (candidate:Individual)
 WHERE candidate.generation = gen
 WITH gen, MAX(candidate.num_selections) AS max_selections
 MATCH (most_selected:Individual)-[:HAS]->(errors:Errors)
 WHERE most_selected.generation = gen
   AND most_selected.num_selections = max_selections
 RETURN gen, max_selections, errors
 ORDER BY gen;

 // How often do things get worse before (immediately) getting better?
 // Looks for grandparent -> parent -> child chains in which the parent's
 // total error is above the grandparent's and the child's is below it.
 MATCH (gpe:Errors)<-[:HAS]-(grandparent:Individual)-[:PARENT_OF]->(parent:Individual)
 MATCH (parent)-[:HAS]->(pe:Errors)
 WHERE pe.total_error > gpe.total_error
 MATCH (parent)-[:PARENT_OF]->(child:Individual)
 MATCH (child)-[:HAS]->(ce:Errors)
 WHERE ce.total_error < gpe.total_error
 RETURN DISTINCT grandparent.generation, gpe.total_error, pe.total_error, ce.total_error
 ORDER BY grandparent.generation;

 // Find the errors that eventually led to a success: the errors of every
 // individual within 0..40 PARENT_OF hops of a winner, by generation.
 MATCH (n:Individual)-[:PARENT_OF*0..40]->(w:Individual)-[:HAS]-(:Errors {total_error: 0})
 MATCH (n)-[:HAS]->(e:Errors)
 RETURN DISTINCT n.generation, e
 ORDER BY n.generation;

 // Count how many times each errors appeared in a winner ancestry,
 // i.e. how many distinct ancestors carried each Errors node.
 MATCH (n:Individual)-[:PARENT_OF*0..40]->(w:Individual)-[:HAS]-(:Errors {total_error: 0})
 MATCH (n)-[:HAS]->(e:Errors)
 RETURN e, COUNT(DISTINCT n)
 ORDER BY COUNT(DISTINCT n) DESC;

 // How many distinct ancestors did the winner(s) have?
 // The zero-hop case means the winners themselves are counted too.
 MATCH (n:Individual)-[:PARENT_OF*0..40]->(:Individual)-[:HAS]-(:Errors {total_error: 0})
 RETURN COUNT(DISTINCT n);

 // Create LEADS_TO edges between parent and child semantics: one edge from
 // a parent's Errors node to its child's. MERGE keeps a single edge per
 // ordered pair of Errors nodes.
 MATCH (pe:Errors)-[:HAS]-(p:Individual)-[:PARENT_OF]->(c:Individual)
 MATCH (c)-[:HAS]-(ce:Errors)
 MERGE (pe)-[:LEADS_TO]->(ce);

 // Default every Errors node to zero outgoing LEADS_TO targets.
 MATCH (e:Errors)
 SET e.num_leads_to = 0;

 // Record, for each Errors node with outgoing LEADS_TO edges, how many
 // distinct Errors nodes it leads to.
 MATCH (source:Errors)-[:LEADS_TO]->(target:Errors)
 WITH source, COUNT(DISTINCT target) AS num_children
 SET source.num_leads_to = num_children;

 MATCH (pe:Errors)-[:LEADS_TO]->(e:Errors)
	// Clear the DB for a clean start: delete every node together with all of
	// its relationships.
	MATCH (n) DETACH DELETE n;

	// Uniqueness constraints. Property keys are case-sensitive, so the Errors
	// constraint has to name `errors_vector` exactly as the load query writes
	// it; a constraint on `Errors_vector` would cover a property no node has.
	CREATE CONSTRAINT ON (i:Individual) ASSERT i.uuid IS UNIQUE;
	CREATE CONSTRAINT ON (e:Errors) ASSERT e.errors_vector IS UNIQUE;

	// Indexes on the properties the queries below filter by.
	CREATE INDEX ON :Individual(generation);
	CREATE INDEX ON :Errors(total_error);

	USING PERIODIC COMMIT
	// First pass over the run CSV: create one Individual per row and link it
	// to the Errors node holding its per-test-case errors (columns TC0..TC9).
	// (This comment sits after USING PERIODIC COMMIT so the statement still
	// begins with that hint.)
	LOAD CSV WITH HEADERS FROM
	'http://facultypages.morris.umn.edu/~mcphee/Research/GECCO2016_tutorial/push_regression_run_2.csv' AS line
	WITH line,
	     [tc IN [line.TC0, line.TC1, line.TC2, line.TC3, line.TC4, line.TC5, line.TC6, line.TC7, line.TC8, line.TC9] | toInteger(tc)] AS errors_vector
	CREATE (individual:Individual {uuid: line.uuid})
	SET individual += {
	    generation: toInteger(line.generation),
	    location: toInteger(line.location),
	    plush_genome_size: toInteger(line.`plush-genome-size`),
	    push_program_size: toInteger(line.`push-program-size`),
	    plush_genome: line.`plush-genome`
	}
	// MERGE, not CREATE: rows with the same error vector and total error
	// reuse a single Errors node.
	MERGE (errors:Errors {errors_vector: errors_vector, total_error: toInteger(line.`total-error`)})
	CREATE (individual)-[:HAS]->(errors)
	;

	USING PERIODIC COMMIT
	// Second pass over the same CSV: every Individual now exists, so connect
	// each child to the parents listed (space-separated) in `parent-uuids`.
	LOAD CSV WITH HEADERS FROM
	'http://facultypages.morris.umn.edu/~mcphee/Research/GECCO2016_tutorial/push_regression_run_2.csv' AS line
	MATCH (child:Individual {uuid: line.uuid})
	UNWIND SPLIT(line.`parent-uuids`, ' ') AS parent_uuid
	// Entries that match no Individual simply produce no edge.
	MATCH (parent:Individual {uuid: parent_uuid})
	CREATE (parent)-[:PARENT_OF {genetic_operator: line.`genetic-operators`}]->(child)
	;

	// Start every individual at 0 selections, so individuals that never
	// became a parent still carry the property.
	MATCH (n:Individual)
	SET n.num_selections = 0
	;

	// Overwrite the default for every individual that is a parent: its
	// selection count is the number of its outgoing PARENT_OF edges.
	MATCH (parent:Individual)-[edge:PARENT_OF]->(:Individual)
	WITH parent, count(edge) AS num_selections
	SET parent.num_selections = num_selections
	;

	////////////////////////////
	// Done setting up the DB //
	////////////////////////////
	// Now some queries! //
	////////////////////////////

	// Open with the five generation graph nicely laid out.
	// Then find out how many nodes and edges there are.

	// Then show the Schema diagram

	// Were there winners? (Individuals whose Errors node has total_error 0.)
	MATCH (winner:Individual)-[:HAS]->(errors:Errors)
	WHERE errors.total_error = 0
	RETURN DISTINCT winner
	;
	// Click open a few nodes to show how we can explore in the GUI

	// What do the last two generations look like?
	// Returns each winner together with each of its direct parents.
	MATCH (parent:Individual)-[:PARENT_OF]->(winner:Individual)-[:HAS]->(:Errors {total_error: 0})
	RETURN DISTINCT winner, parent
	;

	// What do the errors look like in the last five generations?
	// Take each winner plus its ancestors up to four PARENT_OF hops back
	// (zero hops = the winner itself), then fetch each one's errors.
	MATCH (winner:Individual)-[:HAS]->(zero:Errors)
	WHERE zero.total_error = 0
	MATCH (ancestor:Individual)-[:PARENT_OF*0..4]->(winner)
	MATCH (ancestor)-[:HAS]->(errors:Errors)
	RETURN DISTINCT ancestor, errors
	;

	// How many distinct ancestors were there in the first generation?
	// Walk exactly 39 PARENT_OF hops back from each winner, de-duplicate the
	// ancestors reached, and list them most-selected first.
	MATCH (ancestor:Individual)-[:PARENT_OF*39]->(winner:Individual)-[:HAS]->(:Errors {total_error: 0})
	WITH DISTINCT ancestor
	MATCH (ancestor)-[:HAS]->(errors:Errors)
	RETURN DISTINCT ancestor.uuid, ancestor.num_selections, errors
	ORDER BY ancestor.num_selections DESC
	;

	// How many selections were there in the first generation (generation 0)?
	MATCH (n:Individual)
	WHERE n.generation = 0
	RETURN SUM(n.num_selections)
	;

	// What was the average number of selections?
	// Ignore the last generation since no selections were made there
	MATCH (n:Individual)
	WHERE 39 > n.generation
	RETURN AVG(n.num_selections)
	;

	// How many 10% hyperselections were there?
	// An individual counts when it was selected more than 14 times (the
	// cutoff used here); results are listed most-selected first.
	MATCH (n:Individual)-[:HAS]->(errors:Errors)
	WHERE n.num_selections > 14
	RETURN n.uuid, n.num_selections, n.generation, errors
	ORDER BY n.num_selections DESC
	;

	// How many 10% semantic hyperselections were there in a single
	// generation? Group individuals by (error vector, generation) and keep
	// the groups whose members were selected more than 14 times in total.
	MATCH (member:Individual)-[:HAS]->(errors:Errors)
	WITH errors,
	     member.generation AS gen,
	     COUNT(DISTINCT member) AS num_individuals,
	     SUM(member.num_selections) AS semantic_selections
	WHERE semantic_selections > 14
	RETURN gen, semantic_selections, num_individuals, errors
	ORDER BY semantic_selections DESC, num_individuals DESC, gen ASC
	;

	// Same grouping, additionally requiring at least 10 individuals in the
	// group, and listed generation by generation.
	MATCH (member:Individual)-[:HAS]->(errors:Errors)
	WITH errors,
	     member.generation AS gen,
	     COUNT(DISTINCT member) AS num_individuals,
	     SUM(member.num_selections) AS semantic_selections
	WHERE semantic_selections > 14 AND num_individuals >= 10
	RETURN gen, semantic_selections, num_individuals, errors
	ORDER BY gen ASC, semantic_selections DESC, num_individuals DESC
	;

	// How often is there no change in errors from parent to child?
	// Counts distinct (parent, child) pairs that point at the same Errors node.
	MATCH (shared:Errors)<-[:HAS]-(parent:Individual)-[:PARENT_OF]->(child:Individual)
	MATCH (child)-[:HAS]->(shared)
	RETURN COUNT(DISTINCT [parent.uuid, child.uuid])
	;

	///////////////////////////////////////////////////

	// How many distinct errors were there? (One Errors node per distinct value.)
	MATCH (:Errors)
	RETURN count(*)
	;

	// What errors had total error < 100? Listed lowest total first.
	MATCH (errors:Errors)
	WHERE 100 > errors.total_error
	RETURN errors
	ORDER BY errors.total_error
	;

	// How many individuals had those errors & when were they introduced?
	// MIN(n.generation) is the earliest generation holding that Errors node.
	MATCH (n:Individual)-[:HAS]->(errors:Errors)
	WHERE 100 > errors.total_error
	RETURN errors, MIN(n.generation), count(n)
	ORDER BY errors.total_error
	;

	// What's the maximum number of selections in each generation?
	// For generations 0..38: find the largest num_selections, then report the
	// errors of every individual in that generation that reached it.
	UNWIND RANGE(0, 38) AS gen
	MATCH (candidate:Individual)
	WHERE candidate.generation = gen
	WITH gen, MAX(candidate.num_selections) AS max_selections
	MATCH (most_selected:Individual)-[:HAS]->(errors:Errors)
	WHERE most_selected.generation = gen
	  AND most_selected.num_selections = max_selections
	RETURN gen, max_selections, errors
	ORDER BY gen;

	// How often do things get worse before (immediately) getting better?
	// Looks for grandparent -> parent -> child chains in which the parent's
	// total error is above the grandparent's and the child's is below it.
	MATCH (gpe:Errors)<-[:HAS]-(grandparent:Individual)-[:PARENT_OF]->(parent:Individual)
	MATCH (parent)-[:HAS]->(pe:Errors)
	WHERE pe.total_error > gpe.total_error
	MATCH (parent)-[:PARENT_OF]->(child:Individual)
	MATCH (child)-[:HAS]->(ce:Errors)
	WHERE ce.total_error < gpe.total_error
	RETURN DISTINCT grandparent.generation, gpe.total_error, pe.total_error, ce.total_error
	ORDER BY grandparent.generation;

	// Find the errors that eventually led to a success: the errors of every
	// individual within 0..40 PARENT_OF hops of a winner, by generation.
	MATCH (n:Individual)-[:PARENT_OF*0..40]->(w:Individual)-[:HAS]-(:Errors {total_error: 0})
	MATCH (n)-[:HAS]->(e:Errors)
	RETURN DISTINCT n.generation, e
	ORDER BY n.generation;

	// Count how many times each errors appeared in a winner ancestry,
	// i.e. how many distinct ancestors carried each Errors node.
	MATCH (n:Individual)-[:PARENT_OF*0..40]->(w:Individual)-[:HAS]-(:Errors {total_error: 0})
	MATCH (n)-[:HAS]->(e:Errors)
	RETURN e, COUNT(DISTINCT n)
	ORDER BY COUNT(DISTINCT n) DESC;

	// How many distinct ancestors did the winner(s) have?
	// The zero-hop case means the winners themselves are counted too.
	MATCH (n:Individual)-[:PARENT_OF*0..40]->(:Individual)-[:HAS]-(:Errors {total_error: 0})
	RETURN COUNT(DISTINCT n);

	// Create LEADS_TO edges between parent and child semantics: one edge from
	// a parent's Errors node to its child's. MERGE keeps a single edge per
	// ordered pair of Errors nodes.
	MATCH (pe:Errors)-[:HAS]-(p:Individual)-[:PARENT_OF]->(c:Individual)
	MATCH (c)-[:HAS]-(ce:Errors)
	MERGE (pe)-[:LEADS_TO]->(ce);

	// Default every Errors node to zero outgoing LEADS_TO targets.
	MATCH (e:Errors)
	SET e.num_leads_to = 0;

	// Record, for each Errors node with outgoing LEADS_TO edges, how many
	// distinct Errors nodes it leads to.
	MATCH (source:Errors)-[:LEADS_TO]->(target:Errors)
	WITH source, COUNT(DISTINCT target) AS num_children
	SET source.num_leads_to = num_children;

	MATCH (pe:Errors)-[:LEADS_TO]->(e:Errors)