Last active
November 5, 2021 15:43
-
-
Save mattsgithub/9f04ead3e51dabd8c17f9dc3355f2905 to your computer and use it in GitHub Desktop.
2D Hole Dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def get_disk_holes(r, r_hole, n_hole, allow_center=True):
    """
    Returns the disks (holes) that can be used to
    cut data from a disk of radius `r`.

    Holes are spaced evenly on a ring of radius `r / 2` around the
    origin. If `allow_center` is true, one of the `n_hole` holes is
    placed at the origin instead of on the ring.

    Args
        r: float
            Radius of the disk the holes are cut from
        r_hole: float
            Radius of every hole
        n_hole: int
            Total number of holes (including the centered one, if any)
        allow_center: bool
            If True, one hole is centered at the origin

    Returns
        DataFrame with one row per hole and the columns
        r_hole: radius of the hole
        r:      radius of the ring (`r / 2`), stored on every row,
                including the centered hole
        theta:  angle of the hole center on the ring
                (NaN for the centered hole, where it is undefined)
        x, y:   cartesian coordinates of the hole center
        The frame is empty when `n_hole < 1`.
    """
    columns = ['r_hole', 'r', 'theta', 'x', 'y']
    if n_hole < 1:
        return pd.DataFrame(columns=columns)

    ring_radius = r / 2.
    n_ring_holes = n_hole - int(bool(allow_center))

    theta = np.linspace(0, 2 * np.pi, num=n_ring_holes, endpoint=False)
    x = ring_radius * np.cos(theta)
    y = ring_radius * np.sin(theta)

    # Special case for centered hole (angle is undefined there).
    # It is stored as the last row.
    if allow_center:
        theta = np.append(theta, np.nan)
        x = np.append(x, 0.)
        y = np.append(y, 0.)

    # Scalars (the constants) are broadcast over all rows.
    return pd.DataFrame({'r_hole': r_hole,
                         'r': ring_radius,
                         'theta': theta,
                         'x': x,
                         'y': y},
                        columns=columns)
def get_sample_from_disk(n, r):
    """
    Samples `n` points from a disk of radius `r` centered
    at the origin.

    Returns a DataFrame with the polar coordinates (`r`, `theta`)
    and the cartesian coordinates (`x`, `y`) of every point.
    """
    # Scaling by the square root of a uniform variate (rather than
    # the variate itself) is what makes the density uniform over
    # the disk's area.
    radial = r * np.sqrt(np.random.random(size=n))
    angular = 2 * np.pi * np.random.random(size=len(radial))

    return pd.DataFrame().assign(
        r=radial,
        theta=angular,
        x=lambda frame: frame.r * np.cos(frame.theta),
        y=lambda frame: frame.r * np.sin(frame.theta),
    )
def get_2d_holes_dataset(n_neg=500,
                         n_pos=500,
                         n_hole=9,
                         r=1.,
                         r_hole=.1,
                         r_pos=.02,
                         allow_center=True):
    """
    Generates a complex topological dataset
    consisting of two manifolds.

    Negative examples (label 0) are sampled from a disk of radius `r`
    with `n_hole` holes cut out of it. Positive examples (label 1) are
    sampled from small disks of radius `r_pos` placed at the center of
    each hole.

    Args
        n_neg: int
            How many negative examples to sample
        n_pos: int
            How many positive examples to sample. They are split as
            evenly as possible across the holes
        n_hole: int
            Number of holes to generate (at least 1)
        r: float
            Radius of entire circle of which all data resides
            This is the disk from which negative examples
            are sampled from
        r_hole: float
            Radius of hole(s)
        r_pos: float
            Radius of disks for positive examples
        allow_center: bool
            If True, allow a hole to be created in the center

    Returns
        Shuffled DataFrame with the columns `r`, `theta`, `x`, `y`
        and `label`. `x` and `y` are always relative to the origin.
        NOTE: for positive rows `r` and `theta` are relative to the
        center of the hole the point was sampled in, because only
        `x` and `y` are translated. The index is not reset, so index
        labels may repeat.

    Raises
        ValueError: if `n_hole < 1`, if `r_hole > r / 6`, or if
            `r_pos > r_hole`. pandas also raises ValueError if fewer
            than `n_neg` oversampled points survive the hole removal.
    """
    # First, perform data checks
    if n_hole < 1:
        # Without this the even split below divides by zero
        raise ValueError('n_hole must be at least 1')

    # We can't allow for example, r_hole > r:
    # three hole diameters are required to fit within r
    max_stacked_holes = 3
    holes_max_width = max_stacked_holes * 2 * r_hole
    if holes_max_width > r:
        raise ValueError('r_hole must be no more than r/6')

    if r_pos > r_hole:
        raise ValueError('r_pos cannot be greater than r_hole')

    # Oversample. Will delete after
    # Need a smarter approach instead of factor of 5 approach:
    # the number of removed points is proportional to the number
    # of holes, so the expected loss per hole could be computed
    df_neg = get_sample_from_disk(n=5 * n_neg, r=r)
    df_neg['label'] = 0

    # These are the regions where negative
    # examples are forbidden
    df_holes = get_disk_holes(r, r_hole, n_hole,
                              allow_center=allow_center)
    hole_x = df_holes.x.to_numpy(dtype=float)
    hole_y = df_holes.y.to_numpy(dtype=float)
    hole_r = df_holes.r_hole.to_numpy(dtype=float)

    # Find the rows of df_neg that lie in any hole: compare every
    # point (rows) against every hole (columns) at once
    point_x = df_neg.x.to_numpy()[:, None]
    point_y = df_neg.y.to_numpy()[:, None]
    dist_sq = (point_x - hole_x) ** 2 + (point_y - hole_y) ** 2
    in_hole = (dist_sq <= hole_r ** 2).any(axis=1)

    # Drop points contained in holes, then trim the oversample
    df_neg = df_neg[~in_hole]
    df_neg = df_neg.sample(n=n_neg, replace=False)

    # Sample points inside each hole
    # We can't always get an even split: the first `remainder`
    # holes receive one extra point
    n_pos_per_disk, remainder = divmod(n_pos, n_hole)

    frames = [df_neg]
    for i, (center_x, center_y) in enumerate(zip(hole_x, hole_y)):
        n_here = n_pos_per_disk + (1 if i < remainder else 0)
        if n_here == 0:
            continue
        df_pos_disk = get_sample_from_disk(n_here, r=r_pos)

        # Translate disk to center of hole
        df_pos_disk['x'] = df_pos_disk.x + center_x
        df_pos_disk['y'] = df_pos_disk.y + center_y
        df_pos_disk['label'] = 1
        frames.append(df_pos_disk)

    # Concatenate once (instead of growing a frame inside the loop)
    # and leave out empty frames
    frames = [frame for frame in frames if len(frame)]
    if not frames:
        return df_neg

    # Shuffle so the classes are interleaved
    return pd.concat(frames).sample(frac=1.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment