dragonhuntr · May 2, 2025 17:14
diff --git a/mis315 final notes 2 b/mis315 final notes 2
 # File operations

 ## Open file

 ```python
 with open('mytext.txt', 'r', encoding='utf-8') as f:
    content = f.read()
 ```

 ## Example

 ```python
 import matplotlib.pyplot as plt
 import seaborn as sns
 import string

 with open('mytext.txt', 'r', encoding='utf-8') as f:
    content = f.read()

 content = content.lower()

 for symbol in string.punctuation:
    content = content.replace(symbol, '')

 word_list = content.split()

 word_counts = {}
 for w in word_list:
    if w in word_counts:
        word_counts[w] += 1
    else:
        word_counts[w] = 1

 def get_count(item):
    """Return the count, i.e. the second element of a (word, count) pair."""
    count = item[1]
    return count

 top_words = sorted(word_counts.items(), key=get_count, reverse=True)[:10] # get top 10

 print(top_words)

 plt.figure(figsize=(10, 5))
 labels, values = zip(*top_words)
 sns.barplot(x=list(labels), y=list(values), palette='pastel')
 plt.xticks(rotation=40, ha='right')
 plt.tight_layout()
 plt.show()
 ```

 # OOP

 ## Class Definition

 ```python
 class ClassName:
    """Template class: one instance attribute and one method."""

    def __init__(self, value=None):  # Constructor (like JS `constructor`)
        # `value` is a parameter so that `ClassName()` runs without a
        # NameError; pass an argument to choose a different starting value.
        self.attribute = value       # Instance variable

    def method_name(self):           # Method (uses `self` like `this` in JS)
        # code block
        pass
 ```

 - `self`: Explicit reference to the instance (`this` in JS)  
 - `__init__`: Constructor method, runs when you create an object  

 ## Create and Use Object Instance

 ```python
 obj = ClassName()  
 obj.method_name()
 ```


 ## Examples

 ```python
 import pandas as pd  

 class PropertySales:
    """Total the 'Sale Price' column of the Properties.xlsx spreadsheet."""

    def __init__(self):
        # Total starts at 0 until calculate_total_sale_price() is called.
        self.sum = 0

    def calculate_total_sale_price(self):
        """Read Properties.xlsx and store the sum of its 'Sale Price' column."""
        sales_table = pd.read_excel('Properties.xlsx')
        sale_prices = sales_table['Sale Price']
        self.sum = sale_prices.sum()

    def get_sum(self):
        """Return the total currently stored in self.sum."""
        total = self.sum
        return total

 property_sales = PropertySales()  
 property_sales.calculate_total_sale_price()  
 total = property_sales.get_sum()  

 print(f"The total sale price is: {total}")  
 ```

 ```python
 # prop.py
 class Prop:
    """Load Properties.xlsx and keep the total of its 'Sale Price' column."""

    def __init__(self):
        # The double underscore name-mangles the attribute (private by
        # convention); it holds None until read_data() runs.
        self.__x = None

    def read_data(self):
        """Read the spreadsheet, print it, and store the 'Sale Price' sum."""
        import pandas as pd

        sheet = pd.read_excel('Properties.xlsx')
        print(sheet)
        sale_prices = sheet['Sale Price']
        self.__x = sale_prices.sum()  # keep only the total, not the table

    def get_answer(self):
        """Return the stored total (None if read_data() has not run yet)."""
        answer = self.__x
        return answer
 ```

 ```python
 # main.py
 import prop

 my_prop = prop.Prop()

 # You need to call the `read_data` method first to load the Excel and compute the sum
 my_prop.read_data()

 # Then print the answer
 print(my_prop.get_answer())
 ```

 - Every instance method must include `self` as the first parameter  
 - Use `:` after `def`, `class`, `if`, `for`, etc.  
 - Indentation matters — use consistent 4-space indents  
 - Class names should be in CamelCase by convention  

 ### What to do in exam
 1. Upload the `.xlsx` file to session storage  
 2. Upload your class definition `.py` file  
 3. Instantiate the class with `obj = ClassName()`  
 4. Call methods like `obj.method_name()`  
 5. Output results with `print()`

 # DataFrame (Pandas / Numpy / Seaborn)

 ## Libraries

 ```python
 import pandas as pd  
 import numpy as np  
 import seaborn as sns  
 import matplotlib.pyplot as plt
 ```

 ## Read Excel File

 ```python
 df = pd.read_excel('filename.xlsx')  
 # Example:
 df = pd.read_excel('Dogs.xlsx')  
 df = pd.read_excel('Properties.xlsx')
 ```

 ## Exploring DataFrame

 ```python
 pd.set_option('display.max_columns', None)   # Show all columns  
 pd.set_option('display.max_rows', None)      # Show all rows  
 pd.reset_option('display.max_columns')       # Reset columns view  
 pd.reset_option('display.max_rows')          # Reset rows view  

 df = pd.DataFrame(data)                      # Build a DataFrame from in-memory data (dict/list); not needed after read_excel()

 df.describe()                                # Summary stats (mean, min, max, etc.)  
 df.shape                                     # (rows, columns)  
 df.info()                                    # Data types & non-null counts  
 df.head()                                    # First 5 rows  
 df.index                                     # Row index
 ```

 ## Sorting & Filtering

 ```python
 df.sort_values('List Price')                          # Ascending  
 df.sort_values('List Price', ascending=False)         # Descending  

 df[df['List Price'] > 450000]                         # Filter rows  

 top5 = df.sort_values(by='Weight_KG', ascending=False).head(5)  
 # Sort by Weight and select top 5 dogs  
 ```

 ## Plotting with Seaborn

 ```python
 plt.figure(figsize=(12, 6))  
 sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright')  
 plt.tight_layout()  
 plt.show()
 ```

 ## Examples

 ```python
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt

 df = pd.read_excel('Dogs.xlsx')
 top5 = df.sort_values(by='Weight_KG', ascending=False).head(5)

 plt.figure(figsize=(12, 6))
 sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright')
 plt.tight_layout()
 plt.show()
 ```

 ```python
 import pandas as pd
 import numpy as np

 # Walk through the standard DataFrame inspection steps on Dogs.xlsx.

 # read excel file
 df = pd.read_excel('Dogs.xlsx')

 # describe
 print("Description of dataframe")
 print(df.describe())

 # shape
 print("\nShape of the dataframe")
 print(df.shape)

 # info
 # df.info() prints its report itself and returns None, so it is called
 # directly; wrapping it in print() would add a stray "None" line.
 print("\nInfo of the dataframe")
 df.info()

 # head
 print("\nHead of the dataframe")
 print(df.head())

 # index
 print("\nIndex of the dataframe")
 print(df.index)

 # sort ascending by weight
 print("\nSort ascending by weight")
 print(df.sort_values(by='Weight_KG', ascending=True))

 # sort descending by weight
 print("\nSort descending by weight")
 print(df.sort_values(by='Weight_KG', ascending=False))

 # list with weight over 25 KG
 print("\nList with weight over 25 KG")
 print(df[df['Weight_KG'] > 25])
 ```

 ### What to do in exam

 1. You'll be given a `.xlsx` spreadsheet in the exam folder  
 2. Upload it to session storage  
 3. Read it into a DataFrame using `pd.read_excel()`  
 4. Perform analysis: sorting, filtering, describing, shape, info  
 5. Plot a bar chart using seaborn if asked

 # Plotly Express (w/ Gapminder)

 ## Libraries

 ```python
 import pandas as pd  
 import numpy as np  
 import plotly.express as px
 ```

 ## Read and Filter DataFrame

 ```python
 # Read Excel  
 df = pd.read_excel('Properties.xlsx')  

 # Filter for sold properties only  
 sold_df = df[df['Sold'] == True]  

 # This keeps only the rows where the "Sold" column is True.
 # df.query("Sold == True") gives the same result.
 ```

 ## Plotly Scatter Plot

 ```python
 fig = px.scatter(  
   sold_df,  
   x='Date Sold',  
   y='List Price',  
   size='Square Feet',  
   color='Subdivision ID',  
   title='Sold Properties'  
 )  

 fig.show()
 ```
 - `x`: sets the x-axis — here it's `Date Sold`  
 - `y`: sets the y-axis — here it's `List Price`  
 - `size`: adjusts dot sizes based on square footage  
 - `color`: groups points by subdivision for visual clarity  
 - `title`: chart title  
 - `fig.show()`: renders the chart in notebook/browser

 ## Gapminder Pie Chart

 ```python
 gap = px.data.gapminder()  
 gap2007 = gap.query("year == 2007")  

 fig = px.pie(  
   gap2007,  
   values='pop',  
   names='continent',  
   title='Population Share by Continent (2007)'  
 )  

 fig.show()
 ```

 ```python
 df = px.data.gapminder().query("year == 1982").query("continent == 'Europe'")  
 df.loc[df['pop'] < 5e7, 'country'] = 'Other countries'  # Group small populations together  
 fig = px.pie(df, values='pop', names='country', title='Population of European continent')  
 fig.show()
 ```

 ```python
 df = px.data.gapminder()  
 fig = px.scatter_geo(  
    df.query("year == 2007"),  
    locations="iso_alpha",  
    color="continent",  
    hover_name="country",  
    size="pop",  
    projection="natural earth"  
 )  
 fig.show()
 ```

 ```python
 import plotly.express as px

 df_gapminder = px.data.gapminder()
 df_query = df_gapminder.query("year == 1982").query("continent == 'Europe'")

 df_query.loc[df_query['pop'] < 5e7, 'country'] = 'Other countries'

 df = df_query.groupby('country', as_index=False)['pop'].sum() # as_index can be added, True or False

 fig = px.pie(
    df,
    values='pop',
    names='country',
    title='Population of European continent'
 )

 fig.show()
 ```

 ### What to do in exam
 - Use `px.data.gapminder()` OR load a spreadsheet with `pd.read_excel()`  
 - Filter using `.query()` or boolean indexing  
 - Use `px.pie()`, `px.scatter()`, or `px.scatter_geo()`  
 - Assign correct arguments: `x`, `y`, `size`, `color`, `names`, `locations`, etc.  
 - Call `fig.show()` to display the chart  

 # Tokenization

 ## Example
 ```python
 import nltk
 import matplotlib.pyplot as plt
 import seaborn as sns

 nltk.download('punkt')

 with open('file.txt', 'r', encoding='utf-8') as file:
    text = file.read()

 # tokenize words
 words = nltk.word_tokenize(text)

 # frequency distribution
 fdist = nltk.FreqDist(words)

 # print frequency distribution
 print(fdist)
 print(fdist.most_common(10)) # 10 most common

 # plot top 10 words (not cumulative)
 plt.figure(figsize=(12, 6))
 fdist.plot(10, cumulative=False)
 plt.tight_layout()
 plt.show()
 ```

 ```python
 words = text.split()

 # frequency distribution
 fdist = nltk.FreqDist(words)

 # print frequency distribution
 print(fdist)
 print(fdist.most_common(10))
 ```
	# File operations

	## Open file

	```python
	# Open the file for reading as UTF-8; the `with` block closes it afterwards.
	with open('mytext.txt', 'r', encoding='utf-8') as f:
	    content = f.read()
	```

	## Example

	```python
	import matplotlib.pyplot as plt
	import seaborn as sns
	import string

	# Count word frequencies in mytext.txt and plot the 10 most common words.
	with open('mytext.txt', 'r', encoding='utf-8') as f:
	    content = f.read()

	# Lower-case first so "The" and "the" are counted as the same word.
	content = content.lower()

	# Strip every punctuation character before splitting into words.
	for symbol in string.punctuation:
	    content = content.replace(symbol, '')

	word_list = content.split()

	# Tally occurrences: word -> count.
	word_counts = {}
	for w in word_list:
	    if w in word_counts:
	        word_counts[w] += 1
	    else:
	        word_counts[w] = 1

	def get_count(item):
	    """Return the count, i.e. the second element of a (word, count) pair."""
	    return item[1]

	top_words = sorted(word_counts.items(), key=get_count, reverse=True)[:10] # get top 10

	print(top_words)

	plt.figure(figsize=(10, 5))
	# Split the (word, count) pairs into two parallel sequences for plotting.
	labels, values = zip(*top_words)
	sns.barplot(x=list(labels), y=list(values), palette='pastel')
	plt.xticks(rotation=40, ha='right')
	plt.tight_layout()
	plt.show()
	```

	# OOP

	## Class Definition

	```python
	class ClassName:
	    """Template class: one instance attribute and one method."""

	    def __init__(self, value=None):  # Constructor (like JS `constructor`)
	        # `value` is a parameter so that `ClassName()` runs without a
	        # NameError; pass an argument to choose a different starting value.
	        self.attribute = value       # Instance variable

	    def method_name(self):           # Method (uses `self` like `this` in JS)
	        # code block
	        pass
	```

	- `self`: Explicit reference to the instance (`this` in JS)
	- `__init__`: Constructor method, runs when you create an object

	## Create and Use Object Instance

	```python
	obj = ClassName()
	obj.method_name()
	```


	## Examples

	```python
	import pandas as pd

	class PropertySales:
	    """Total the 'Sale Price' column of the Properties.xlsx spreadsheet."""

	    def __init__(self):
	        # Total starts at 0 until calculate_total_sale_price() is called.
	        self.sum = 0

	    def calculate_total_sale_price(self):
	        """Read Properties.xlsx and store the sum of its 'Sale Price' column."""
	        df = pd.read_excel('Properties.xlsx') # reads the spreadsheet
	        self.sum = df['Sale Price'].sum()

	    def get_sum(self):
	        """Return the total currently stored in self.sum."""
	        return self.sum

	property_sales = PropertySales()
	property_sales.calculate_total_sale_price()
	total = property_sales.get_sum()

	print(f"The total sale price is: {total}")
	```

	```python
	# prop.py
	class Prop:
	    """Load Properties.xlsx and keep the total of its 'Sale Price' column."""

	    def __init__(self):
	        # The double underscore name-mangles the attribute (private by
	        # convention); it holds None until read_data() runs.
	        self.__x = None # Initialize __x properly

	    def read_data(self):
	        """Read the spreadsheet, print it, and store the 'Sale Price' sum."""
	        import pandas as pd
	        pr = pd.read_excel('Properties.xlsx')
	        print(pr)
	        self.__x = pr['Sale Price'].sum() # Store just the sum

	    def get_answer(self):
	        """Return the stored total (None if read_data() has not run yet)."""
	        return self.__x
	```

	```python
	# main.py
	import prop

	my_prop = prop.Prop()

	# You need to call the `read_data` method first to load the Excel and compute the sum
	my_prop.read_data()

	# Then print the answer
	print(my_prop.get_answer())
	```

	- Every instance method must include `self` as the first parameter
	- Use `:` after `def`, `class`, `if`, `for`, etc.
	- Indentation matters — use consistent 4-space indents
	- Class names should be in CamelCase by convention

	### What to do in exam
	1. Upload the `.xlsx` file to session storage
	2. Upload your class definition `.py` file
	3. Instantiate the class with `obj = ClassName()`
	4. Call methods like `obj.method_name()`
	5. Output results with `print()`

	# DataFrame (Pandas / Numpy / Seaborn)

	## Libraries

	```python
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	```

	## Read Excel File

	```python
	df = pd.read_excel('filename.xlsx')
	# Example:
	df = pd.read_excel('Dogs.xlsx')
	df = pd.read_excel('Properties.xlsx')
	```

	## Exploring DataFrame

	```python
	pd.set_option('display.max_columns', None) # Show all columns
	pd.set_option('display.max_rows', None) # Show all rows
	pd.reset_option('display.max_columns') # Reset columns view
	pd.reset_option('display.max_rows') # Reset rows view

	df = pd.DataFrame(data) # Build a DataFrame from in-memory data (dict/list); not needed after read_excel()

	df.describe() # Summary stats (mean, min, max, etc.)
	df.shape # (rows, columns)
	df.info() # Data types & non-null counts
	df.head() # First 5 rows
	df.index # Row index
	```

	## Sorting & Filtering

	```python
	df.sort_values('List Price') # Ascending
	df.sort_values('List Price', ascending=False) # Descending

	df[df['List Price'] > 450000] # Filter rows

	top5 = df.sort_values(by='Weight_KG', ascending=False).head(5)
	# Sort by Weight and select top 5 dogs
	```

	## Plotting with Seaborn

	```python
	plt.figure(figsize=(12, 6))
	sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright')
	plt.tight_layout()
	plt.show()
	```

	## Examples

	```python
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt

	df = pd.read_excel('Dogs.xlsx')
	top5 = df.sort_values(by='Weight_KG', ascending=False).head(5)

	plt.figure(figsize=(12, 6))
	sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright')
	plt.tight_layout()
	plt.show()
	```

	```python
	import pandas as pd
	import numpy as np

	# Walk through the standard DataFrame inspection steps on Dogs.xlsx.

	# read excel file
	df = pd.read_excel('Dogs.xlsx')

	# describe
	print("Description of dataframe")
	print(df.describe())

	# shape
	print("\nShape of the dataframe")
	print(df.shape)

	# info
	# df.info() prints its report itself and returns None, so it is called
	# directly; wrapping it in print() would add a stray "None" line.
	print("\nInfo of the dataframe")
	df.info()

	# head
	print("\nHead of the dataframe")
	print(df.head())

	# index
	print("\nIndex of the dataframe")
	print(df.index)

	# sort ascending by weight
	print("\nSort ascending by weight")
	print(df.sort_values(by='Weight_KG', ascending=True))

	# sort descending by weight
	print("\nSort descending by weight")
	print(df.sort_values(by='Weight_KG', ascending=False))

	# list with weight over 25 KG
	print("\nList with weight over 25 KG")
	print(df[df['Weight_KG'] > 25])
	```

	### What to do in exam

	1. You'll be given a `.xlsx` spreadsheet in the exam folder
	2. Upload it to session storage
	3. Read it into a DataFrame using `pd.read_excel()`
	4. Perform analysis: sorting, filtering, describing, shape, info
	5. Plot a bar chart using seaborn if asked

	# Plotly Express (w/ Gapminder)

	## Libraries

	```python
	import pandas as pd
	import numpy as np
	import plotly.express as px
	```

	## Read and Filter DataFrame

	```python
	# Read Excel
	df = pd.read_excel('Properties.xlsx')

	# Filter for sold properties only
	sold_df = df[df['Sold'] == True]

	# This keeps only the rows where the "Sold" column is True.
	# df.query("Sold == True") gives the same result.
	```

	## Plotly Scatter Plot

	```python
	fig = px.scatter(
	sold_df,
	x='Date Sold',
	y='List Price',
	size='Square Feet',
	color='Subdivision ID',
	title='Sold Properties'
	)

	fig.show()
	```
	- `x`: sets the x-axis — here it's `Date Sold`
	- `y`: sets the y-axis — here it's `List Price`
	- `size`: adjusts dot sizes based on square footage
	- `color`: groups points by subdivision for visual clarity
	- `title`: chart title
	- `fig.show()`: renders the chart in notebook/browser

	## Gapminder Pie Chart

	```python
	gap = px.data.gapminder()
	gap2007 = gap.query("year == 2007")

	fig = px.pie(
	gap2007,
	values='pop',
	names='continent',
	title='Population Share by Continent (2007)'
	)

	fig.show()
	```

	```python
	df = px.data.gapminder().query("year == 1982").query("continent == 'Europe'")
	df.loc[df['pop'] < 5e7, 'country'] = 'Other countries' # Group small populations together
	fig = px.pie(df, values='pop', names='country', title='Population of European continent')
	fig.show()
	```

	```python
	df = px.data.gapminder()
	fig = px.scatter_geo(
	df.query("year == 2007"),
	locations="iso_alpha",
	color="continent",
	hover_name="country",
	size="pop",
	projection="natural earth"
	)
	fig.show()
	```

	```python
	import plotly.express as px

	df_gapminder = px.data.gapminder()
	df_query = df_gapminder.query("year == 1982").query("continent == 'Europe'")

	df_query.loc[df_query['pop'] < 5e7, 'country'] = 'Other countries'

	df = df_query.groupby('country', as_index=False)['pop'].sum() # as_index can be added, True or False

	fig = px.pie(
	df,
	values='pop',
	names='country',
	title='Population of European continent'
	)

	fig.show()
	```

	### What to do in exam
	- Use `px.data.gapminder()` OR load a spreadsheet with `pd.read_excel()`
	- Filter using `.query()` or boolean indexing
	- Use `px.pie()`, `px.scatter()`, or `px.scatter_geo()`
	- Assign correct arguments: `x`, `y`, `size`, `color`, `names`, `locations`, etc.
	- Call `fig.show()` to display the chart

	# Tokenization

	## Example
	```python
	import nltk
	import matplotlib.pyplot as plt
	import seaborn as sns

	nltk.download('punkt')

	# Read the whole file as UTF-8 text; the `with` block closes it afterwards.
	with open('file.txt', 'r', encoding='utf-8') as file:
	    text = file.read()

	# tokenize words
	words = nltk.word_tokenize(text)

	# frequency distribution
	fdist = nltk.FreqDist(words)

	# print frequency distribution
	print(fdist)
	print(fdist.most_common(10)) # 10 most common

	# plot top 10 words (not cumulative)
	plt.figure(figsize=(12, 6))
	fdist.plot(10, cumulative=False)
	plt.tight_layout()
	plt.show()
	```

	```python
	words = text.split()

	# frequency distribution
	fdist = nltk.FreqDist(words)

	# print frequency distribution
	print(fdist)
	print(fdist.most_common(10))
	```