Last active
June 24, 2020 16:01
-
-
Save simicd/fda4d5a4ebcd485a7d66c94d8ecaa013 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Read Palmer Station Penguin dataset from GitHub
import numpy as np
import pandas as pd

# Measurement columns that receive random jitter and, in ``df_nonan``,
# zero-filling. Defined once so the noise matrix width always matches.
NUMERIC_COLS = [
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]

# The URL is pinned to a specific commit so the raw data cannot change
# underneath this script.
df = pd.read_csv("https://raw.githubusercontent.com/allisonhorst/"
                 "palmerpenguins/47a3476d2147080e7ceccef4cf70105c808f2cbf/"
                 "data-raw/penguins_raw.csv")

# Increase dataset to 1m rows (sampling with replacement) and reset index
df = df.sample(1_000_000, replace=True).reset_index(drop=True)

# Update sample number (0 to 999'999)
df["Sample Number"] = df.index

# Add some random variation to numeric columns. The noise is unseeded, so
# values differ between runs; missing values stay missing (NaN + x is NaN).
df[NUMERIC_COLS] = df[NUMERIC_COLS] + np.random.rand(len(df), len(NUMERIC_COLS))

# Create dataframe where missing numeric values are filled with zero
df_nonan = df.copy()
df_nonan[NUMERIC_COLS] = df[NUMERIC_COLS].fillna(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment