iamarkdev · December 4, 2019 16:33 · iamarkdev · Dec 20, 2016 · jonathanGB · Dec 20, 2016
diff --git a/ByteSubstringsChallenge.js b/ByteSubstringsChallenge.js
 /*

 Preface:

 I'm issuing this challenge as I've hit a major roadblock in the development process
 of one of my personal side projects. I've spent the last 2 weeks exhaustively exploring
 solutions to the following problem and I'm at my wits end. I've taken a number of approaches
 using various tree data structures and clever algorithms with no success. 


 Problem:

 Given an input, find all possible byte substrings for the input and count how many 
 times each unique byte substring occurs throughout the input. Return all byte substrings 
 along with their occurence count that have an occurence count > 1.

 You'll notice the examples given below show all byte substrings including byte substrings that have
 an occurrence count of 1. This is mainly for illustration of the problem. Your solution
 should exclude byte substrings that only occur 1 time in the input.

 REQUIREMENTS
 - Your solution may be written in whatever language you're comfortable with.
 - Your solution should be able to compute a correct output more efficiently than O(n^2). 
 - Your solution should not exceed a memory usage limit of 32GB. 
 - Your solution must work with any file of any type that is <= 1GB in size. 
 - Your solution's output must exclude byte substrings than only have 1 occurrence in the input.
 - Your solution's output must be written to a new file in a transferrable data format such as JSON (output.json for example).
 - Your solution's output should be equivalent to an object of key value pairs as: (substring) => (# of occurrences).
 - Please provide a way to contact you in the event your solution is accepted as the winning solution.

 JUDGING CRITERIA
 - Correct output (See requirements and examples below to understand what's considered correct output). (Pass / fail)
 - Meets requirements (See above). (Pass / fail)
 - Execution time: I will benchmark all solutions on the same machine with the same input - the input will be a binary file.
 - Algorithm / Data Structure clarity and concision - if your solution has complex working parts, 
  please include comments where applicable to give better explain what's going on. 

 SUBMISSION
 Please comment with a link to a gist of your solution to submit it for consideration.

 PRIZE
 I am guaranteeing $500.00 USD that will be gifted through PayPal to the individual that provides 
 the solution that best fits the judging criteria and requirements listed above. 

 --

 I will be checking back periodically. Judging will be done on a rolling basis up until December 25th. 

 I will give feedback as comments to the gists you link to as solutions.

 A winning solution will be selected on December 25th, 2016. 

 If you have any questions, feel free to email me: [email protected] ...
 Or message me on Facebook: https://www.facebook.com/braydonbatungbacal

 -----------
 -----------
 -----------

 Given this example input: 0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC

 We would find the following byte substrings with the following occurrence counts.

 0xFE - 2 occurrences
 0xFF - 2 occurrences
 0xAB - 2 occurrences
 0x00 - 1 occurrence
 0xAC - 1 occurrence

 0xFE 0xFF - 2 occurrences
 0xFF 0xAB - 2 occurrences
 0xAB 0x00 - 1 occurrence
 0x00 0xFE - 1 occurrence
 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB - 2 occurrences
 0xFF 0xAB 0x00 - 1 occurrence
 0xAB 0x00 0xFE - 1 occurrence
 0x00 0xFE 0xFF - 1 occurrence
 0xFF 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB 0x00 - 1 occurrence
 0xFF 0xAB 0x00 0xFE - 1 occurrence
 0xAB 0x00 0xFE 0xFF - 1 occurrence
 0x00 0xFE 0xFF 0xAB - 1 occurrence
 0xFE 0xFF 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB 0x00 0xFE - 1 occurrence 
 0xFF 0xAB 0x00 0xFE 0xFF - 1 occurrence
 0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB 0x00 0xFE 0xFF - 1 occurrence
 0xFF 0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
 0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
 0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

 0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

 -

 The accepted output for the given input above that excludes all 1 occurrence byte substrings would be:

 0xFE - 2 occurrences
 0xFF - 2 occurrences
 0xAB - 2 occurrences
 0xFE 0xFF - 2 occurrences
 0xFE 0xFF 0xAB - 2 occurrences

 And then as JSON:

 {
  "FE" : 2,
  "FF" : 2,
  "AB" : 2,
  "FEFF" : 2,
  "FEFFAB" : 2
 }

 -----------

 Edge case handling

 Let's say we have the following input 0x00 0x00 0x00 0x00

 The correct matched byte substrings would be 

 0x00 - 4 occurrences
 0x00 0x00 - 3 occurrences
 0x00 0x00 0x00 - 2 occurrences

 {
  "00" : 4,
  "0000" : 3,
  "000000" : 2
 }

 -----------

 Example Input Files

 Here are links to two example input files. These files are well below the 1GB limit, but they represent
 the level of entropy and distribution of byte sequences you can likely expect from the final input file
 your solution will be benchmarked against.

 https://drive.google.com/file/d/0B1GeH8qXPbvBZTRuNWVJbUVNZGs/view?usp=sharing

 AND

 https://drive.google.com/file/d/0B1GeH8qXPbvBYmxMU1JSWjlPc2c/view?usp=sharing

 -----------

 The proposed solution below is O(n^2) where n is the length of the given input. 
 This is not ideal but it is enough to demonstrate expected output for a given input 
 for this problem, along with a brute force solution.

 Also note, while this example is using strings and characters - it is purely to
 demonstrate the generalness of the brute force algorithm used. Your solution will
 be required to work with an array of bytes.

 */

 function Occurrences() {
    this.sequences = {};
    this.longestSequence = '';
 }

 Occurrences.prototype.expand = function(byte) {
    this.longestSequence += byte;

    for (var offset = 0; offset < this.longestSequence.length; offset++) {
        var sequence = this.longestSequence.substr(offset);
        this.sequences[sequence] = this.sequences[sequence] + 1 || 1;
    }
 }


 var occurrences = new Occurrences();

 var input = "abcdabcefgabc";

 for (var i = 0; i < input.length; i++) {
    occurrences.expand(input[i]);
 }

 console.log(occurrences.sequences);

 /* 

 --------

 Given input for solution above: "abcdabcefgabc".

 Acceptable Output - This is excluding substrings that only occur 1 time in the input: 
 {
 "a":3,
 "ab":3,
 "b":3,
 "abc":3,
 "bc":3,
 "c":3
 }

 Complete Solution Output: 
 {
 "a":3,
 "ab":3,
 "b":3,
 "abc":3,
 "bc":3,
 "c":3,
 "abcd":1,
 "bcd":1,
 "cd":1,
 "d":1,
 "abcda":1,
 "bcda":1,
 "cda":1,
 "da":1,
 "abcdab":1,
 "bcdab":1,
 "cdab":1,
 "dab":1,
 "abcdabc":1,
 "bcdabc":1,
 "cdabc":1,
 "dabc":1,
 "abcdabce":1,
 "bcdabce":1,
 "cdabce":1,
 "dabce":1,
 "abce":1,
 "bce":1,
 "ce":1,
 "e":1,
 "abcdabcef":1,
 "bcdabcef":1,
 "cdabcef":1,
 "dabcef":1,
 "abcef":1,
 "bcef":1,
 "cef":1,
 "ef":1,
 "f":1,
 "abcdabcefg":1,
 "bcdabcefg":1,
 "cdabcefg":1,
 "dabcefg":1,
 "abcefg":1,
 "bcefg":1,
 "cefg":1,
 "efg":1,
 "fg":1,
 "g":1,
 "abcdabcefga":1,
 "bcdabcefga":1,
 "cdabcefga":1,
 "dabcefga":1,
 "abcefga":1,
 "bcefga":1,
 "cefga":1,
 "efga":1,
 "fga":1,
 "ga":1,
 "abcdabcefgab":1,
 "bcdabcefgab":1,
 "cdabcefgab":1,
 "dabcefgab":1,
 "abcefgab":1,
 "bcefgab":1,
 "cefgab":1,
 "efgab":1,
 "fgab":1,
 "gab":1,
 "abcdabcefgabc":1,
 "bcdabcefgabc":1,
 "cdabcefgabc":1,
 "dabcefgabc":1,
 "abcefgabc":1,
 "bcefgabc":1,
 "cefgabc":1,
 "efgabc":1,
 "fgabc":1,
 "gabc":1
 }

 */
	/*

	Preface:

	I'm issuing this challenge as I've hit a major roadblock in the development process
	of one of my personal side projects. I've spent the last 2 weeks exhaustively exploring
	solutions to the following problem and I'm at my wits end. I've taken a number of approaches
	using various tree data structures and clever algorithms with no success.


	Problem:

	Given an input, find all possible byte substrings for the input and count how many
	times each unique byte substring occurs throughout the input. Return all byte substrings
	along with their occurence count that have an occurence count > 1.

	You'll notice the examples given below show all byte substrings including byte substrings that have
	an occurrence count of 1. This is mainly for illustration of the problem. Your solution
	should exclude byte substrings that only occur 1 time in the input.

	REQUIREMENTS
	- Your solution may be written in whatever language you're comfortable with.
	- Your solution should be able to compute a correct output more efficiently than O(n^2).
	- Your solution should not exceed a memory usage limit of 32GB.
	- Your solution must work with any file of any type that is <= 1GB in size.
	- Your solution's output must exclude byte substrings than only have 1 occurrence in the input.
	- Your solution's output must be written to a new file in a transferrable data format such as JSON (output.json for example).
	- Your solution's output should be equivalent to an object of key value pairs as: (substring) => (# of occurrences).
	- Please provide a way to contact you in the event your solution is accepted as the winning solution.

	JUDGING CRITERIA
	- Correct output (See requirements and examples below to understand what's considered correct output). (Pass / fail)
	- Meets requirements (See above). (Pass / fail)
	- Execution time: I will benchmark all solutions on the same machine with the same input - the input will be a binary file.
	- Algorithm / Data Structure clarity and concision - if your solution has complex working parts,
	please include comments where applicable to give better explain what's going on.

	SUBMISSION
	Please comment with a link to a gist of your solution to submit it for consideration.

	PRIZE
	I am guaranteeing $500.00 USD that will be gifted through PayPal to the individual that provides
	the solution that best fits the judging criteria and requirements listed above.

	--

	I will be checking back periodically. Judging will be done on a rolling basis up until December 25th.

	I will give feedback as comments to the gists you link to as solutions.

	A winning solution will be selected on December 25th, 2016.

	If you have any questions, feel free to email me: [email protected] ...
	Or message me on Facebook: https://www.facebook.com/braydonbatungbacal

	-----------
	-----------
	-----------

	Given this example input: 0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC

	We would find the following byte substrings with the following occurrence counts.

	0xFE - 2 occurrences
	0xFF - 2 occurrences
	0xAB - 2 occurrences
	0x00 - 1 occurrence
	0xAC - 1 occurrence

	0xFE 0xFF - 2 occurrences
	0xFF 0xAB - 2 occurrences
	0xAB 0x00 - 1 occurrence
	0x00 0xFE - 1 occurrence
	0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB - 2 occurrences
	0xFF 0xAB 0x00 - 1 occurrence
	0xAB 0x00 0xFE - 1 occurrence
	0x00 0xFE 0xFF - 1 occurrence
	0xFF 0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB 0x00 - 1 occurrence
	0xFF 0xAB 0x00 0xFE - 1 occurrence
	0xAB 0x00 0xFE 0xFF - 1 occurrence
	0x00 0xFE 0xFF 0xAB - 1 occurrence
	0xFE 0xFF 0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB 0x00 0xFE - 1 occurrence
	0xFF 0xAB 0x00 0xFE 0xFF - 1 occurrence
	0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
	0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB 0x00 0xFE 0xFF - 1 occurrence
	0xFF 0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
	0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB - 1 occurrence
	0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

	0xFE 0xFF 0xAB 0x00 0xFE 0xFF 0xAB 0xAC - 1 occurrence

	-

	The accepted output for the given input above that excludes all 1 occurrence byte substrings would be:

	0xFE - 2 occurrences
	0xFF - 2 occurrences
	0xAB - 2 occurrences
	0xFE 0xFF - 2 occurrences
	0xFE 0xFF 0xAB - 2 occurrences

	And then as JSON:

	{
	"FE" : 2,
	"FF" : 2,
	"AB" : 2,
	"FEFF" : 2,
	"FEFFAB" : 2
	}

	-----------

	Edge case handling

	Let's say we have the following input 0x00 0x00 0x00 0x00

	The correct matched byte substrings would be

	0x00 - 4 occurrences
	0x00 0x00 - 3 occurrences
	0x00 0x00 0x00 - 2 occurrences

	{
	"00" : 4,
	"0000" : 3,
	"000000" : 2
	}

	-----------

	Example Input Files

	Here are links to two example input files. These files are well below the 1GB limit, but they represent
	the level of entropy and distribution of byte sequences you can likely expect from the final input file
	your solution will be benchmarked against.

	https://drive.google.com/file/d/0B1GeH8qXPbvBZTRuNWVJbUVNZGs/view?usp=sharing

	AND

	https://drive.google.com/file/d/0B1GeH8qXPbvBYmxMU1JSWjlPc2c/view?usp=sharing

	-----------

	The proposed solution below is O(n^2) where n is the length of the given input.
	This is not ideal but it is enough to demonstrate expected output for a given input
	for this problem, along with a brute force solution.

	Also note, while this example is using strings and characters - it is purely to
	demonstrate the generalness of the brute force algorithm used. Your solution will
	be required to work with an array of bytes.

	*/

	function Occurrences() {
	this.sequences = {};
	this.longestSequence = '';
	}

	Occurrences.prototype.expand = function(byte) {
	this.longestSequence += byte;

	for (var offset = 0; offset < this.longestSequence.length; offset++) {
	var sequence = this.longestSequence.substr(offset);
	this.sequences[sequence] = this.sequences[sequence] + 1 \|\| 1;
	}
	}


	var occurrences = new Occurrences();

	var input = "abcdabcefgabc";

	for (var i = 0; i < input.length; i++) {
	occurrences.expand(input[i]);
	}

	console.log(occurrences.sequences);

	/*

	--------

	Given input for solution above: "abcdabcefgabc".

	Acceptable Output - This is excluding substrings that only occur 1 time in the input:
	{
	"a":3,
	"ab":3,
	"b":3,
	"abc":3,
	"bc":3,
	"c":3
	}

	Complete Solution Output:
	{
	"a":3,
	"ab":3,
	"b":3,
	"abc":3,
	"bc":3,
	"c":3,
	"abcd":1,
	"bcd":1,
	"cd":1,
	"d":1,
	"abcda":1,
	"bcda":1,
	"cda":1,
	"da":1,
	"abcdab":1,
	"bcdab":1,
	"cdab":1,
	"dab":1,
	"abcdabc":1,
	"bcdabc":1,
	"cdabc":1,
	"dabc":1,
	"abcdabce":1,
	"bcdabce":1,
	"cdabce":1,
	"dabce":1,
	"abce":1,
	"bce":1,
	"ce":1,
	"e":1,
	"abcdabcef":1,
	"bcdabcef":1,
	"cdabcef":1,
	"dabcef":1,
	"abcef":1,
	"bcef":1,
	"cef":1,
	"ef":1,
	"f":1,
	"abcdabcefg":1,
	"bcdabcefg":1,
	"cdabcefg":1,
	"dabcefg":1,
	"abcefg":1,
	"bcefg":1,
	"cefg":1,
	"efg":1,
	"fg":1,
	"g":1,
	"abcdabcefga":1,
	"bcdabcefga":1,
	"cdabcefga":1,
	"dabcefga":1,
	"abcefga":1,
	"bcefga":1,
	"cefga":1,
	"efga":1,
	"fga":1,
	"ga":1,
	"abcdabcefgab":1,
	"bcdabcefgab":1,
	"cdabcefgab":1,
	"dabcefgab":1,
	"abcefgab":1,
	"bcefgab":1,
	"cefgab":1,
	"efgab":1,
	"fgab":1,
	"gab":1,
	"abcdabcefgabc":1,
	"bcdabcefgabc":1,
	"cdabcefgabc":1,
	"dabcefgabc":1,
	"abcefgabc":1,
	"bcefgabc":1,
	"cefgabc":1,
	"efgabc":1,
	"fgabc":1,
	"gabc":1
	}

	*/