Skip to content

Instantly share code, notes, and snippets.

@alpha-beta-soup
Last active April 2, 2023 22:52
Show Gist options
  • Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Save alpha-beta-soup/b18a8ff2c869b17bfc3f839bfe11c3f5 to your computer and use it in GitHub Desktop.
Stratified random sampling of GPKG files
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
def stratified_sample(
    df: pd.DataFrame,
    groupby_column: str,
    sampling_rate: float = 0.01,
    *,
    random_state=None,
) -> pd.DataFrame:
    """Return a random sample of *df* with (up to) the same number of rows per class.

    The overall budget is ``floor(len(df) * sampling_rate)`` rows, split evenly
    across the distinct non-null values of ``groupby_column``. Every class
    contributes at least one row, and never more rows than it contains, so the
    result may be larger or smaller than the nominal budget.

    Rows whose ``groupby_column`` value is missing (NaN/None) are never sampled
    and are not counted as a class.

    Args:
        df: Table to sample from. The returned object is produced with
            ``df.iloc``, so the frame's own type and columns are kept.
        groupby_column: Name of the column that defines the classes (strata).
        sampling_rate: Fraction of all rows to aim for, in the range (0, 1].
        random_state: Optional seed (or any value accepted by
            ``pandas.Series.sample``) that makes the draw reproducible.

    Returns:
        The sampled rows, ordered by class; within a class the order is random.
        An empty frame is returned when there is nothing to sample.

    Raises:
        ValueError: If ``sampling_rate`` is outside (0, 1].
        KeyError: If ``groupby_column`` is not a column of ``df``.
    """
    # Explicit exceptions instead of ``assert``: assertions vanish under ``-O``.
    if not 0.0 < sampling_rate <= 1.0:
        raise ValueError(f"sampling_rate must be in (0, 1], got {sampling_rate!r}")
    if groupby_column not in df.columns:
        raise KeyError(f"column {groupby_column!r} not found in the data frame")

    # Work on row *positions* so duplicate or non-integer index labels in
    # ``df`` cannot cause rows to be selected more than once.
    keys = df[groupby_column].reset_index(drop=True)

    # ``nunique`` ignores missing values, matching the groups formed below.
    num_classes = keys.nunique()
    if num_classes == 0:
        # Empty input (or only missing class labels): nothing to sample.
        return df.iloc[0:0]

    num_rows = int(len(df) * sampling_rate)
    num_rows_per_class = max(1, num_rows // num_classes)

    # Shuffle only the class labels once; the first k positions of each class
    # in the shuffled order form a uniform sample without replacement. This
    # avoids ``groupby.apply``, whose handling of the grouping column changed
    # across pandas versions.
    shuffled = keys.sample(frac=1.0, random_state=random_state)
    positions = [
        position
        for _, members in shuffled.groupby(shuffled, observed=True)
        for position in members.index[:num_rows_per_class]
    ]
    return df.iloc[positions]
if __name__ == '__main__':
    # Usage:   python stratified_sample.py input output groupby_column [sampling_rate]
    # Example: python stratified_sample.py input.gpkg output.gpkg lu_coden 0.005
    if len(sys.argv) < 4:
        # Fail with a readable message rather than an IndexError traceback.
        sys.exit(
            f"usage: {Path(sys.argv[0]).name} "
            "input output groupby_column [sampling_rate]"
        )

    # Named *_path so the builtin ``input`` is not shadowed.
    input_path, output_path = Path(sys.argv[1]), Path(sys.argv[2])
    groupby_column = sys.argv[3]
    # Parse the optional rate before touching the file so a malformed number
    # fails fast; when omitted, the function's own default applies.
    rate_args = (float(sys.argv[4]),) if len(sys.argv) > 4 else ()

    df = gpd.read_file(input_path)
    stratified_sample(df, groupby_column, *rate_args).to_file(output_path)
    sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment