Code for the Medium post
numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2
Code for the Medium post
numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2
from google.cloud import bigquery

# Download one row per distinct (name, gender) pair from the public
# USA names dataset into a pandas DataFrame.
client = bigquery.Client()

# NOTE(review): COUNT(name) counts the rows in each (name, gender) group.
# `num_names` is not read by any of the code visible in this file, so it is
# informational only -- confirm whether a different aggregate was intended.
sql = """
SELECT
  name,
  gender,
  COUNT(name) AS num_names
FROM
  `bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
  name,
  gender
"""

names_df = client.query(sql).to_dataframe()
print(names_df.shape)
names_df.head()
def _encode_name(name, name_length):
    """Encode one lowercase name as exactly ``name_length`` float codes.

    Letters 'a'-'z' map to 1.0-26.0. Every other character (spaces,
    hyphens, digits, accented or non-Latin letters, ...) maps to 0.0, the
    same code used for padding, so no code ever exceeds 26.0.
    """
    codes = [
        ord(char) - 96.0 if 'a' <= char <= 'z' else 0.0
        for char in name[:name_length]
    ]
    # Right-pad short names with the padding code (what a space encodes to).
    return codes + [0.0] * (name_length - len(codes))


def preprocess(names_df, train=True, name_length=50):
    """Turn raw names (and optionally genders) into numeric model inputs.

    Args:
        names_df: DataFrame with a string column ``'name'`` and, when
            ``train`` is true, a ``'gender'`` column.
        train: When true, also encode ``'gender'`` as 0.0 for ``'F'`` and
            1.0 for any other value.
        name_length: Number of character codes per name; longer names are
            truncated and shorter ones padded with 0.0.

    Returns:
        A new DataFrame whose ``'name'`` column holds lists of
        ``name_length`` floats in the range 0.0-26.0. The input DataFrame
        is left unmodified.
    """
    # Work on a copy so the caller's DataFrame is not silently overwritten.
    names_df = names_df.copy()

    # Steps 1-4: lowercase, split into characters, pad/truncate, encode.
    # Codes are kept within 0-26 because the model in this file is built
    # with a 27-entry embedding (num_alphabets=27); larger indices would be
    # out of range for that layer.
    lowered = names_df['name'].str.lower()
    names_df['name'] = [_encode_name(name, name_length) for name in lowered]

    if train:
        # Step 5: Encode Gender to Numbers
        names_df['gender'] = [
            0.0 if gender == 'F' else 1.0
            for gender in names_df['gender']
        ]

    return names_df
# Encode the downloaded names and gender labels for training, then preview.
names_df = preprocess(names_df)
names_df.head()
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam


def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
    """Build and compile the character-level gender classifier.

    Args:
        num_alphabets: Size of the embedding vocabulary (number of distinct
            character codes the model accepts).
        name_length: Number of character codes per input name.
        embedding_dim: Width of each character embedding vector.

    Returns:
        A compiled ``Sequential`` model: embedding -> bidirectional LSTM ->
        single sigmoid unit, trained with binary cross-entropy and Adam.
    """
    model = Sequential([
        Embedding(num_alphabets, embedding_dim, input_length=name_length),
        Bidirectional(LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)),
        # One sigmoid output: a value in (0, 1) for the binary label.
        Dense(1, activation="sigmoid"),
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])

    return model
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Instantiate the model
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

# Step 2: Split Training and Test Data
# Each 'name' cell is a list of character codes; stacking them yields a
# 2-D array with one row per name.
X = np.asarray(names_df['name'].values.tolist())
y = np.asarray(names_df['gender'].values.tolist())

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Step 3: Train the model
# Stop once validation accuracy has not improved by at least 1e-3 for five
# consecutive epochs, and roll back to the best weights seen.
callbacks = [
    EarlyStopping(monitor='val_accuracy',
                  min_delta=1e-3,
                  patience=5,
                  mode='max',
                  restore_best_weights=True,
                  verbose=1),
]

# NOTE(review): the test split doubles as the validation set here, so early
# stopping selects weights using the same data later reported as "val".
# Consider a separate validation split if an unbiased test score is needed.
history = model.fit(x=X_train,
                    y=y_train,
                    batch_size=64,
                    epochs=50,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks)

# Step 4: Save the model
model.save('boyorgirl.h5')

# Step 5: Plot accuracies
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np

pred_model = load_model('boyorgirl.h5')

# Input names
names = ['Joe', 'Biden', 'Kamala', 'Harris']

# Convert to dataframe
pred_df = pd.DataFrame({'name': names})

# Preprocess (train=False: there is no 'gender' column to encode)
pred_df = preprocess(pred_df, train=False)

# Predictions
# The model emits one sigmoid output per name; squeeze drops that axis so
# `result` has one value per name.
result = pred_model.predict(np.asarray(
    pred_df['name'].values.tolist())).squeeze(axis=1)

# The outputs are sigmoid probabilities (not logits): above 0.5 is labelled
# 'Boy', matching the 1.0 encoding used for non-'F' genders in training.
pred_df['Boy or Girl?'] = [
    'Boy' if prob > 0.5 else 'Girl' for prob in result
]

# Report the probability of whichever label was chosen.
pred_df['Probability'] = [
    prob if prob > 0.5 else 1.0 - prob for prob in result
]

# Format the output
# Put back the original spellings in place of the encoded character lists.
pred_df['name'] = names
pred_df.rename(columns={'name': 'Name'}, inplace=True)
pred_df['Probability'] = pred_df['Probability'].round(2)
pred_df.drop_duplicates(inplace=True)
pred_df.head()
import pandas as pd
import plotly.express as px
import requests
import streamlit as st

# Get user inputs
names = st.text_input(
    "Names", help="Input the names you'd like to check separated with spaces or commas"
)

# Add a submit button
if st.button("Submit"):
    # Code to post the user inputs to the API and get the predictions
    # Paste the URL to your API here!
    api_url = "https://name-gender1.p.rapidapi.com/predict"
    headers = {
        "content-type": "application/json",
        "X-RapidAPI-Key": st.secrets["RAPID_API_KEY"],  # Enter your RAPID API Key here
        "X-RapidAPI-Host": "name-gender1.p.rapidapi.com",
    }

    with st.spinner("🥁 Drumroll..."):
        try:
            # A timeout keeps the app from hanging forever on a stalled API,
            # and raise_for_status surfaces HTTP errors before JSON parsing.
            response = requests.post(
                api_url, json=[names], headers=headers, timeout=30
            )
            response.raise_for_status()
        except requests.RequestException as err:
            st.error(f"Prediction request failed: {err}")
            st.stop()

    # The raw text box contents are sent as a single-element list; the
    # response's "response" field is loaded as three columns.
    predictions_df = pd.DataFrame(response.json()["response"])
    predictions_df.columns = ["Name", "Boy or Girl?", "Probability"]
    # Title-case the text columns only; numeric columns pass through.
    predictions_df = predictions_df.apply(
        lambda x: x.str.title() if x.dtype == "object" else x
    )

    fig = px.bar(
        predictions_df,
        x="Probability",
        y="Name",
        color="Boy or Girl?",
        orientation="h",
        color_discrete_map={"Boy": "dodgerblue", "Girl": "lightcoral"},
    )
    fig.update_layout(
        title={"text": "Confidence in Prediction", "x": 0.5},
        # Keep the bars in the same top-to-bottom order as the table rows.
        yaxis={
            "categoryorder": "array",
            "categoryarray": predictions_df["Name"].values.tolist(),
            "autorange": "reversed",
        },
        xaxis={"range": [0, 1]},
        font={"size": 14},
        # width=700
    )

    st.write("Predictions")
    st.dataframe(predictions_df)
    st.plotly_chart(fig, use_container_width=True)