Created
May 2, 2025 17:14
-
-
Save dragonhuntr/12592c56aa77436cf1f2cad80440bb9d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # File operations | |
| ## Open file | |
| ```python | |
| with open('mytext.txt', 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| ``` | |
| ## Example | |
| ```python | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import string | |
| with open('mytext.txt', 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| content = content.lower() | |
| for symbol in string.punctuation: | |
| content = content.replace(symbol, '') | |
| word_list = content.split() | |
| word_counts = {} | |
| for w in word_list: | |
| if w in word_counts: | |
| word_counts[w] += 1 | |
| else: | |
| word_counts[w] = 1 | |
| def get_count(item): | |
| return item[1] | |
| top_words = sorted(word_counts.items(), key=get_count, reverse=True)[:10] # get top 10 | |
| print(top_words) | |
| plt.figure(figsize=(10, 5)) | |
| labels, values = zip(*top_words) | |
| sns.barplot(x=list(labels), y=list(values), palette='pastel') | |
| plt.xticks(rotation=40, ha='right') | |
| plt.tight_layout() | |
| plt.show() | |
| ``` | |
| # OOP | |
| ## Class Definition | |
| ```python | |
| class ClassName: | |
| def __init__(self): # Constructor (like JS `constructor`) | |
| self.attribute = value # Instance variable | |
| def method_name(self): # Method (uses `self` like `this` in JS) | |
| # code block | |
| pass | |
| ``` | |
| - `self`: Explicit reference to the instance (like `this` in JS) | |
| - `__init__`: Constructor (initializer) method, runs automatically when you create an object | |
| ## Create and Use Object Instance | |
| ```python | |
| obj = ClassName() | |
| obj.method_name() | |
| ``` | |
| ## Examples | |
| ```python | |
| import pandas as pd | |
| class PropertySales: | |
| def __init__(self): | |
| self.sum = 0 | |
| def calculate_total_sale_price(self): | |
| df = pd.read_excel('Properties.xlsx') # reads the spreadsheet | |
| self.sum = df['Sale Price'].sum() | |
| def get_sum(self): | |
| return self.sum | |
| property_sales = PropertySales() | |
| property_sales.calculate_total_sale_price() | |
| total = property_sales.get_sum() | |
| print(f"The total sale price is: {total}") | |
| ``` | |
| ```python | |
| # prop.py | |
| class Prop: | |
| def __init__(self): | |
| self.__x = None # Initialize __x properly | |
| def read_data(self): | |
| import pandas as pd | |
| pr = pd.read_excel('Properties.xlsx') | |
| print(pr) | |
| self.__x = pr['Sale Price'].sum() # Store just the sum | |
| def get_answer(self): | |
| return self.__x | |
| ``` | |
| ```python | |
| # main.py | |
| import prop | |
| my_prop = prop.Prop() | |
| # You need to call the `read_data` method first to load the Excel and compute the sum | |
| my_prop.read_data() | |
| # Then print the answer | |
| print(my_prop.get_answer()) | |
| ``` | |
| - Every instance method must include `self` as the first parameter | |
| - Use `:` after `def`, `class`, `if`, `for`, etc. | |
| - Indentation matters — use consistent 4-space indents | |
| - Class names should be in CamelCase by convention | |
| ### What to do in exam | |
| 1. Upload the `.xlsx` file to session storage | |
| 2. Upload your class definition `.py` file | |
| 3. Instantiate the class with `obj = ClassName()` | |
| 4. Call methods like `obj.method_name()` | |
| 5. Output results with `print()` | |
| # DataFrame (Pandas / Numpy / Seaborn) | |
| ## Libraries | |
| ```python | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| ``` | |
| ## Read Excel File | |
| ```python | |
| df = pd.read_excel('filename.xlsx') | |
| # Example: | |
| df = pd.read_excel('Dogs.xlsx') | |
| df = pd.read_excel('Properties.xlsx') | |
| ``` | |
| ## Exploring DataFrame | |
| ```python | |
| pd.set_option('display.max_columns', None) # Show all columns | |
| pd.set_option('display.max_rows', None) # Show all rows | |
| pd.reset_option('display.max_columns') # Reset columns view | |
| pd.reset_option('display.max_rows') # Reset rows view | |
| df = pd.DataFrame(data) # Re-wrap Excel into DataFrame | |
| df.describe() # Summary stats (mean, min, max, etc.) | |
| df.shape # (rows, columns) | |
| df.info() # Data types & non-null counts | |
| df.head() # First 5 rows | |
| df.index # Row index | |
| ``` | |
| ## Sorting & Filtering | |
| ```python | |
| df.sort_values('List Price') # Ascending | |
| df.sort_values('List Price', ascending=False) # Descending | |
| df[df['List Price'] > 450000] # Filter rows | |
| top5 = df.sort_values(by='Weight_KG', ascending=False).head(5) | |
| # Sort by Weight and select top 5 dogs | |
| ``` | |
| ## Plotting with Seaborn | |
| ```python | |
| plt.figure(figsize=(12, 6)) | |
| sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright') | |
| plt.tight_layout() | |
| plt.show() | |
| ``` | |
| ## Examples | |
| ```python | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| df = pd.read_excel('Dogs.xlsx') | |
| top5 = df.sort_values(by='Weight_KG', ascending=False).head(5) | |
| plt.figure(figsize=(12, 6)) | |
| sns.barplot(x='Name', y='Weight_KG', data=top5, palette='bright') | |
| plt.tight_layout() | |
| plt.show() | |
| ``` | |
| ```python | |
| import pandas as pd | |
| import numpy as np | |
| # read excel file | |
| df = pd.read_excel('Dogs.xlsx') | |
| # describe | |
| print("Description of dataframe") | |
| print(df.describe()) | |
| # shape | |
| print("\nShape of the dataframe") | |
| print(df.shape) | |
| # info | |
| print("\nInfo of the dataframe") | |
| print(df.info()) | |
| # head | |
| print("\nHead of the dataframe") | |
| print(df.head()) | |
| # index | |
| print("\nIndex of the dataframe") | |
| print(df.index) | |
| # sort ascending by weight | |
| print("\nSort ascending by weight") | |
| print(df.sort_values(by='Weight_KG', ascending=True)) | |
| # sort descending by weight | |
| print("\nSort descending by weight") | |
| print(df.sort_values(by='Weight_KG', ascending=False)) | |
| # list with weight over 25 KG | |
| print("\nList with weight over 25 KG") | |
| print(df[df['Weight_KG'] > 25]) | |
| ``` | |
| ### What to do in exam | |
| 1. You'll be given a `.xlsx` spreadsheet in the exam folder | |
| 2. Upload it to session storage | |
| 3. Read it into a DataFrame using `pd.read_excel()` | |
| 4. Perform analysis: sorting, filtering, describing, shape, info | |
| 5. Plot a bar chart using seaborn if asked | |
| # Plotly Express (w/ Gapminder) | |
| ## Libraries | |
| ```python | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| ``` | |
| ## Read and Filter DataFrame | |
| ```python | |
| # Read Excel | |
| df = pd.read_excel('Properties.xlsx') | |
| # Filter for sold properties only | |
| sold_df = df[df['Sold'] == True] | |
| # This keeps only the rows where the "Sold" column is True. You can also use df.query("Sold == True") for the same effect. | |
| ``` | |
| ## Plotly Scatter Plot | |
| ```python | |
| fig = px.scatter( | |
| sold_df, | |
| x='Date Sold', | |
| y='List Price', | |
| size='Square Feet', | |
| color='Subdivision ID', | |
| title='Sold Properties' | |
| ) | |
| fig.show() | |
| ``` | |
| - `x`: sets the x-axis — here it's `Date Sold` | |
| - `y`: sets the y-axis — here it's `List Price` | |
| - `size`: adjusts dot sizes based on square footage | |
| - `color`: groups points by subdivision for visual clarity | |
| - `title`: chart title | |
| - `fig.show()`: renders the chart in notebook/browser | |
| ## Gapminder Pie Chart | |
| ```python | |
| gap = px.data.gapminder() | |
| gap2007 = gap.query("year == 2007") | |
| fig = px.pie( | |
| gap2007, | |
| values='pop', | |
| names='continent', | |
| title='Population Share by Continent (2007)' | |
| ) | |
| fig.show() | |
| ``` | |
| ```python | |
| df = px.data.gapminder().query("year == 1982").query("continent == 'Europe'") | |
| df.loc[df['pop'] < 5e7, 'country'] = 'Other countries' # Group small populations together | |
| fig = px.pie(df, values='pop', names='country', title='Population of European continent') | |
| fig.show() | |
| ``` | |
| ```python | |
| df = px.data.gapminder() | |
| fig = px.scatter_geo( | |
| df.query("year == 2007"), | |
| locations="iso_alpha", | |
| color="continent", | |
| hover_name="country", | |
| size="pop", | |
| projection="natural earth" | |
| ) | |
| fig.show() | |
| ``` | |
| ```python | |
| import plotly.express as px | |
| df_gapminder = px.data.gapminder() | |
| df_query = df_gapminder.query("year == 1982").query("continent == 'Europe'") | |
| df_query.loc[df_query['pop'] < 5e7, 'country'] = 'Other countries' | |
| df = df_query.groupby('country', as_index=False)['pop'].sum() # as_index=False keeps 'country' as a regular column instead of the index | |
| fig = px.pie( | |
| df, | |
| values='pop', | |
| names='country', | |
| title='Population of European continent' | |
| ) | |
| fig.show() | |
| ``` | |
| ### What to do in exam | |
| - Use `px.data.gapminder()` OR load a spreadsheet with `pd.read_excel()` | |
| - Filter using `.query()` or boolean indexing | |
| - Use `px.pie()`, `px.scatter()`, or `px.scatter_geo()` | |
| - Assign correct arguments: `x`, `y`, `size`, `color`, `names`, `locations`, etc. | |
| - Call `fig.show()` to display the chart | |
| # Tokenization | |
| ## Example | |
| ```python | |
| import nltk | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| nltk.download('punkt') | |
| with open('file.txt', 'r', encoding='utf-8') as file: | |
| text = file.read() | |
| # tokenize words | |
| words = nltk.word_tokenize(text) | |
| # frequency distribution | |
| fdist = nltk.FreqDist(words) | |
| # print frequency distribution | |
| print(fdist) | |
| print(fdist.most_common(10)) # 10 most common | |
| # plot top 10 words (not cumulative) | |
| plt.figure(figsize=(12, 6)) | |
| fdist.plot(10, cumulative=False) | |
| plt.tight_layout() | |
| plt.show() | |
| ``` | |
| ```python | |
| words = text.split() | |
| # frequency distribution | |
| fdist = nltk.FreqDist(words) | |
| # print frequency distribution | |
| print(fdist) | |
| print(fdist.most_common(10)) | |
| ``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment