devishot · February 13, 2025 08:06
diff --git a/california_housing_train__basic_statistics.txt b/california_housing_train__basic_statistics.txt
 longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
 count	17000.0	17000.0	17000.0	17000.0	17000.0	17000.0	17000.0	17000.0	17000.0
 mean	-119.6	35.6	28.6	2643.7	539.4	1429.6	501.2	3.9	207300.9
 std	2.0	2.1	12.6	2179.9	421.5	1147.9	384.5	1.9	115983.8
 min	-124.3	32.5	1.0	2.0	1.0	3.0	1.0	0.5	14999.0
 25%	-121.8	33.9	18.0	1462.0	297.0	790.0	282.0	2.6	119400.0
 50%	-118.5	34.2	29.0	2127.0	434.0	1167.0	409.0	3.5	180400.0
 75%	-118.0	37.7	37.0	3151.2	648.2	1721.0	605.2	4.8	265000.0
 max	-114.3	42.0	52.0	37937.0	6445.0	35682.0	6082.0	15.0	500001.0
diff --git a/numerical_data_bad_values.ipynb b/numerical_data_bad_values.ipynb
diff --git a/numerical_data_stats.ipynb b/numerical_data_stats.ipynb
 # @title Setup - Import relevant modules

 # Pull in the libraries the rest of this colab depends on.
 # Should a later section misbehave for technical reasons,
 # re-running this section is the first thing to try.

 import pandas as pd

 # Reporting granularity: set display.max_rows to 10 and render
 # floats with a single decimal place.
 pd.set_option("display.max_rows", 10)
 pd.set_option("display.float_format", "{:.1f}".format)


 #@title Import the dataset

 # Fetch the California housing training CSV used by this colab.

 training_df = pd.read_csv(
     "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
 )


 # Get statistics on the dataset.

 # describe() reports basic statistics for the data held in the dataframe.

 training_df.describe()


 # @title Solution (run this code block to view) { display-mode: "form" }

 # Prints the exercise answer: the columns suspected of containing
 # outliers and the two summary-statistic signals behind that call.
 # The message is a single triple-quoted literal, so its line breaks and
 # leading whitespace are emitted exactly as written.
 print("""The following columns might contain outliers:

  * total_rooms
  * total_bedrooms
  * population
  * households
  * possibly, median_income

 In all of those columns:

  * the standard deviation is almost as high as the mean
  * the delta between 75% and max is much higher than the
      delta between min and 25%.""")
	longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
	count 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0
	mean -119.6 35.6 28.6 2643.7 539.4 1429.6 501.2 3.9 207300.9
	std 2.0 2.1 12.6 2179.9 421.5 1147.9 384.5 1.9 115983.8
	min -124.3 32.5 1.0 2.0 1.0 3.0 1.0 0.5 14999.0
	25% -121.8 33.9 18.0 1462.0 297.0 790.0 282.0 2.6 119400.0
	50% -118.5 34.2 29.0 2127.0 434.0 1167.0 409.0 3.5 180400.0
	75% -118.0 37.7 37.0 3151.2 648.2 1721.0 605.2 4.8 265000.0
	max -114.3 42.0 52.0 37937.0 6445.0 35682.0 6082.0 15.0 500001.0
	# @title Setup - Import relevant modules

	# Pull in the libraries the rest of this colab depends on.
	# Should a later section misbehave for technical reasons,
	# re-running this section is the first thing to try.

	import pandas as pd

	# Reporting granularity: set display.max_rows to 10 and render
	# floats with a single decimal place.
	pd.set_option("display.max_rows", 10)
	pd.set_option("display.float_format", "{:.1f}".format)


	#@title Import the dataset

	# Fetch the California housing training CSV used by this colab.

	training_df = pd.read_csv(
	    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
	)


	# Get statistics on the dataset.

	# describe() reports basic statistics for the data held in the dataframe.

	training_df.describe()


	# @title Solution (run this code block to view) { display-mode: "form" }

	# Prints the exercise answer: the columns suspected of containing
	# outliers and the two summary-statistic signals behind that call.
	# The message is a single triple-quoted literal, so its line breaks and
	# leading whitespace are emitted exactly as written.
	print("""The following columns might contain outliers:

	* total_rooms
	* total_bedrooms
	* population
	* households
	* possibly, median_income

	In all of those columns:

	* the standard deviation is almost as high as the mean
	* the delta between 75% and max is much higher than the
	delta between min and 25%.""")