This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Discard outliers: keep only rows with 50000 <= avg_salary < 200000.
df = df[(df['avg_salary'] >= 50000) & (df['avg_salary'] < 200000)]
| def salary_category(salary): | |
| if 50000 <= salary < 75000: | |
| return 1 | |
| if 75000 <= salary < 100000: | |
| return 2 | |
| if 100000 <= salary < 125000: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
df = pd.read_csv('data_cleaned.csv')

# Restrict the frame to the columns used for the pivot table.
df_pivots = df[[
    'Rating',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
    'Size',
    'Age',
    'Python',
    'R',
    'SQL',
    'AWS',
    'Excel',
    'GCP',
    'Azure',
    'Spark',
    'PyTorch',
    'TensorFlow',
    'Tableau',
    'Keras',
    'Job',
    'Seniority',
    'avg_salary',
]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
# Concatenate every job description into one space-separated string.
# Missing values are dropped and the remaining entries coerced to str first,
# because str.join raises TypeError on any non-string item (e.g. NaN).
words = " ".join(df['Job Description'].dropna().astype(str))
| def punctuation_stop(text): | |
| """remove punctuation and stop words""" | |
| filtered = [] | |
| stop_words = set(stopwords.words('english')) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from nltk.corpus import stopwords | |
# English stop words from the NLTK corpus, held in a set.
stopWords = {*stopwords.words('english')}

# Load data.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data.csv')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException | |
| from selenium import webdriver | |
| import time | |
| import pandas as pd | |
| def get_jobs(keyword, num_jobs, verbose): | |
| '''Gathers jobs as a dataframe, scraped from Glassdoor''' | |
| # Initializing the webdriver |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def predict(cls, test_data): | |
def char_clean(string):
    """Replace disallowed characters with spaces and collapse whitespace.

    Every character other than ASCII letters, digits, ``&``, ``+``, ``@``,
    ``.``, space and newline is replaced by a space.  All runs of
    whitespace (newlines included) are then collapsed to a single space
    and leading/trailing whitespace is removed.

    Args:
        string: Text to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    # Raw string so the pattern reaches `re` untouched; inside a character
    # class '.' is already literal and needs no backslash.
    new_string = re.sub(r'[^a-zA-Z0-9&+@ \n.]', ' ', string)
    # split() with no arguments splits on any whitespace, so newlines kept
    # by the pattern above are folded into single spaces here.
    return ' '.join(new_string.split())
| # Input dataframe as test_data. | |
| df = test_data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import spacy | |
| import random | |
| # Example training data. In this case, a spaCy model is being trained to perform an NER task. | |
| TRAIN_DATA = | |
| [ | |
| ('Amazon co ca', {'entities': [(0, 6, 'BRD')]}), | |
| ('AMZNMKTPLACE AMAZON CO', {'entities': [(13, 19, 'BRD')]}), | |
| ('APPLE COM BILL', {'entities': [(0, 5, 'BRD')]}), | |
| ('BOOKING COM New York City', {'entities': [(0, 7, 'BRD')]}), |
NewerOlder