parthvshah · October 19, 2022 00:22
diff --git a/dfai.py b/dfai.py
 """
 a) Images generated by the microscope

 . . . . . . . . . .
 . . . . . * . . . .
 . . . * * * * . . .
 . . * * * * * . . .
 * * * * * * * . . .
 . . * * * * * . . .
 . . * * * . . . . .
 . . . . * . . . . .
 . . . . . . . . . .
 . . . . . . . . . .

 (26% of image resolution)

 b) Images generated by the dye sensor

 . . . . . . . . . .
 . . . . . . . . . .
 . . . . . . . x . .
 . . . . . x . . . .
 x . . . . . . . . .
 . . . x . . . . . .
 . . . . . . . . . .
 . . . . . . . . . .
 . . . . . . . . x .
 . . . . . . . . . .

 (~11.5% of parasite)

 c) Overlay

 . . . . . . . . . .
 . . . . . * . . . .
 . . . * * * * x . .
 . . * * * x * . . .
 x * * * * * * . . .
 . . * x * * * . . .
 . . * * * . . . . .
 . . . . * . . . . .
 . . . . . . . . x .
 . . . . . . . . . .

 (+ve for cancer)

 For the creation of the microscope images, I am using a novel technique. I generate values 
 from a normal distribution and then bin them. Once binned, the frequencies are encoded as 1s 
 and they form a histogram. I mirror this and append it to form a blob. I then use values from 
 a uniform distribution to apply the dye.
 """

 import numpy as np

 from numpy.random import seed
 from numpy.random import normal

 import random

 import matplotlib.pyplot as plt

 DATA_DIR = "./data/"
 SIZE = 100000  # 100K


 def create_images(index):
    seed(index)
    random.seed(index)

    data = normal(loc=0, scale=1, size=int(SIZE * SIZE * 0.1))
    count, bins, ignored = plt.hist(data, SIZE)

    f1 = open(DATA_DIR + "microscope_" + str(index), "w+")
    f2 = open(DATA_DIR + "dye_" + str(index), "w+")

    for val in count:
        row = np.zeros(int(SIZE / 2))
        if int(val) == 0:
            row[:1] = 1
        else:
            row[: int(val)] = 1

        reversed_row = row[::-1]
        entire_row = np.concatenate((reversed_row, row), axis=0)
        f1.write("".join(str(int(x)) for x in list(entire_row)))
        f1.write("\n")

        if random.uniform(0, 1) < 0.35:
            for j in range(SIZE):
                if random.uniform(0, 1) < 0.25:
                    entire_row[j] = -1.0

        entire_row[entire_row == 1] = 0
        entire_row[entire_row == -1] = 1

        f2.write("".join(str(int(x)) for x in list(entire_row)))
        f2.write("\n")

    f1.close()
    f2.close()


 for i in range(10):
    print("Creating image", i + 1)
    create_images(i + 1)

 """
 1) Microscope image

    - Hamming code - Represent each row as a vector of pairs. The pairs consist of "1" or "0" 
    and the number of times it repeats. Eg. Row indexed as 1 is [('0', 5), ('1', 1), ('0', 4)]

    Space estimate - Each row would at most have 3 tuples of size two with the first element 
    taking 1 byte (char) and the second element taking 4 bytes (int). That is a total of 15 bytes
    per row. Given 1e5 rows, that's 1.5 MB per image. 

 2) Dye image
    
    - Coordinate pairs - Since each image is sparse, store the location of the dye as a vector 
    of pairs (x, y) of integers.

    Space estimate - This depends on the amount of dye affecting the cells. But each pair of
    coordinates takes 8 bytes (int). In my tests, this is approximately 14 MB per image.   

 """
 DATA_DIR = "./data/"


 def read_image(index, type):
    encoded_image = list()

    if type == "microscope":
        with open(DATA_DIR + "microscope_" + str(index), "r") as f:
            lines = f.readlines()
            for line in lines:
                line_image = list()
                character = 0
                count = 0
                for i in range(len(line) - 1):
                    character = line[i]
                    if line[i] != line[i + 1]:
                        line_image.append((character, count + 1))
                        count = 0
                    else:
                        count += 1
                encoded_image.append(line_image)

    if type == "dye":
        with open(DATA_DIR + "dye_" + str(index), "r") as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                for j, character in enumerate(line):
                    if character == "1":
                        encoded_image.append((i, j))

    return encoded_image


 """
 3) Detect cancer

 Check each pair of numbers from the dye representation in the microscope representation. If 
 an overlap in 1s, increase the count of cancerous cells. Advance pointers based on the size of the
 encoding pointer. Eg. (3, 5) in [('0', 2), ('1', 5), ('0', 3)] is going to be found in the
 second segment encoding 1s. Hence, it is found inside the parasite.

 Time complexity - The algorithm runs in O(N) where N is the number of points with dye. This
 comes up to 300s in my testing. 
 """


 def find_overlap(microscope, dye, threshold):
    number_points = 0
    number_lines = 0

    for point in dye:
        line = microscope[point[0]]

        current_char = line[0][0]
        running_sum = line[0][1]

        for i in range(1, len(line)):
            if running_sum > point[1]:
                current_char = line[i - 1][0]
                break
            else:
                running_sum += line[i][1]

        if current_char == "1":
            number_points += 1

    for line in microscope:
        for segment in line:
            if segment[0] == "1":
                number_lines += segment[1]

    affected = round(number_points / number_lines, 6)
    if round(number_points / number_lines, 6) >= threshold:
        return True, affected
    else:
        return False, affected


 import time

 for i in range(10):
    microscope_image = read_image(i + 1, "microscope")
    dye_image = read_image(i + 1, "dye")

    start = time.time()
    is_cancer, probability = find_overlap(microscope_image, dye_image, 0.1)
    end = time.time()
    print(i + 1, "-", is_cancer, probability, round(end - start, 4))

 """
 4) Storing images of cancerous parasites

 If we wish to only store images of cancerous parasites, we can adopt the same Hamming code
 representation. The dye can be encoded with any other single character such as 'd'. Eg.
 Row indexed 3 can be stored as [('0', 2), ('1', 3), ('d', 1), ('1', 1), ('0', 3)]

 5) Speed up

 The simplest way to speed up the current algorithm would be to parallelize the detection of 
 cancer. This would parallelize the reads too which take up a bulk of the time. Using the SIMD
 paradigm, distribute images amongst the processors available. A finer-grained parallelization
 would be to split the image row-wise and allow each processor to work on batches of rows. 

 I have used the Message Passing Interface and OpenMP in C to parallelize algorithms in the
 past. Superlinear speedups can be observed.
 """
	"""
	a) Images generated by the microscope

	. . . . . . . . . .
	. . . . . * . . . .
	. . . * * * * . . .
	. . * * * * * . . .
	* * * * * * * . . .
	. . * * * * * . . .
	. . * * * . . . . .
	. . . . * . . . . .
	. . . . . . . . . .
	. . . . . . . . . .

	(26% of image resolution)

	b) Images generated by the dye sensor

	. . . . . . . . . .
	. . . . . . . . . .
	. . . . . . . x . .
	. . . . . x . . . .
	x . . . . . . . . .
	. . . x . . . . . .
	. . . . . . . . . .
	. . . . . . . . . .
	. . . . . . . . x .
	. . . . . . . . . .

	(~11.5% of parasite)

	c) Overlay

	. . . . . . . . . .
	. . . . . * . . . .
	. . . * * * * x . .
	. . * * * x * . . .
	x * * * * * * . . .
	. . * x * * * . . .
	. . * * * . . . . .
	. . . . * . . . . .
	. . . . . . . . x .
	. . . . . . . . . .

	(+ve for cancer)

	For the creation of the microscope images, I am using a novel technique. I generate values
	from a normal distribution and then bin them. Once binned, the frequencies are encoded as 1s
	and they form a histogram. I mirror this and append it to form a blob. I then use values from
	a uniform distribution to apply the dye.
	"""

	import numpy as np

	from numpy.random import seed
	from numpy.random import normal

	import random

	import matplotlib.pyplot as plt

	DATA_DIR = "./data/"
	SIZE = 100000 # 100K


	def create_images(index):
	seed(index)
	random.seed(index)

	data = normal(loc=0, scale=1, size=int(SIZE * SIZE * 0.1))
	count, bins, ignored = plt.hist(data, SIZE)

	f1 = open(DATA_DIR + "microscope_" + str(index), "w+")
	f2 = open(DATA_DIR + "dye_" + str(index), "w+")

	for val in count:
	row = np.zeros(int(SIZE / 2))
	if int(val) == 0:
	row[:1] = 1
	else:
	row[: int(val)] = 1

	reversed_row = row[::-1]
	entire_row = np.concatenate((reversed_row, row), axis=0)
	f1.write("".join(str(int(x)) for x in list(entire_row)))
	f1.write("\n")

	if random.uniform(0, 1) < 0.35:
	for j in range(SIZE):
	if random.uniform(0, 1) < 0.25:
	entire_row[j] = -1.0

	entire_row[entire_row == 1] = 0
	entire_row[entire_row == -1] = 1

	f2.write("".join(str(int(x)) for x in list(entire_row)))
	f2.write("\n")

	f1.close()
	f2.close()


	for i in range(10):
	print("Creating image", i + 1)
	create_images(i + 1)

	"""
	1) Microscope image

	- Hamming code - Represent each row as a vector of pairs. The pairs consist of "1" or "0"
	and the number of times it repeats. Eg. Row indexed as 1 is [('0', 5), ('1', 1), ('0', 4)]

	Space estimate - Each row would at most have 3 tuples of size two with the first element
	taking 1 byte (char) and the second element taking 4 bytes (int). That is a total of 15 bytes
	per row. Given 1e5 rows, that's 1.5 MB per image.

	2) Dye image

	- Coordinate pairs - Since each image is sparse, store the location of the dye as a vector
	of pairs (x, y) of integers.

	Space estimate - This depends on the amount of dye affecting the cells. But each pair of
	coordinates takes 8 bytes (int). In my tests, this is approximately 14 MB per image.

	"""
	DATA_DIR = "./data/"


	def read_image(index, type):
	encoded_image = list()

	if type == "microscope":
	with open(DATA_DIR + "microscope_" + str(index), "r") as f:
	lines = f.readlines()
	for line in lines:
	line_image = list()
	character = 0
	count = 0
	for i in range(len(line) - 1):
	character = line[i]
	if line[i] != line[i + 1]:
	line_image.append((character, count + 1))
	count = 0
	else:
	count += 1
	encoded_image.append(line_image)

	if type == "dye":
	with open(DATA_DIR + "dye_" + str(index), "r") as f:
	lines = f.readlines()
	for i, line in enumerate(lines):
	for j, character in enumerate(line):
	if character == "1":
	encoded_image.append((i, j))

	return encoded_image


	"""
	3) Detect cancer

	Check each pair of numbers from the dye representation in the microscope representation. If
	an overlap in 1s, increase the count of cancerous cells. Advance pointers based on the size of the
	encoding pointer. Eg. (3, 5) in [('0', 2), ('1', 5), ('0', 3)] is going to be found in the
	second segment encoding 1s. Hence, it is found inside the parasite.

	Time complexity - The algorithm runs in O(N) where N is the number of points with dye. This
	comes up to 300s in my testing.
	"""


	def find_overlap(microscope, dye, threshold):
	number_points = 0
	number_lines = 0

	for point in dye:
	line = microscope[point[0]]

	current_char = line[0][0]
	running_sum = line[0][1]

	for i in range(1, len(line)):
	if running_sum > point[1]:
	current_char = line[i - 1][0]
	break
	else:
	running_sum += line[i][1]

	if current_char == "1":
	number_points += 1

	for line in microscope:
	for segment in line:
	if segment[0] == "1":
	number_lines += segment[1]

	affected = round(number_points / number_lines, 6)
	if round(number_points / number_lines, 6) >= threshold:
	return True, affected
	else:
	return False, affected


	import time

	for i in range(10):
	microscope_image = read_image(i + 1, "microscope")
	dye_image = read_image(i + 1, "dye")

	start = time.time()
	is_cancer, probability = find_overlap(microscope_image, dye_image, 0.1)
	end = time.time()
	print(i + 1, "-", is_cancer, probability, round(end - start, 4))

	"""
	4) Storing images of cancerous parasites

	If we wish to only store images of cancerous parasites, we can adopt the same Hamming code
	representation. The dye can be encoded with any other single character such as 'd'. Eg.
	Row indexed 3 can be stored as [('0', 2), ('1', 3), ('d', 1), ('1', 1), ('0', 3)]

	5) Speed up

	The simplest way to speed up the current algorithm would be to parallelize the detection of
	cancer. This would parallelize the reads too which take up a bulk of the time. Using the SIMD
	paradigm, distribute images amongst the processors available. A finer-grained parallelization
	would be to split the image row-wise and allow each processor to work on batches of rows.

	I have used the Message Passing Interface and OpenMP in C to parallelize algorithms in the
	past. Superlinear speedups can be observed.
	"""