import spacy
import random

# Example training data; here a spaCy model is being trained for an NER task.
TRAIN_DATA = [
    ('Amazon co ca', {'entities': [(0, 6, 'BRD')]}),
    ('AMZNMKTPLACE AMAZON CO', {'entities': [(13, 19, 'BRD')]}),
    ('APPLE COM BILL', {'entities': [(0, 5, 'BRD')]}),
    ('BOOKING COM New York City', {'entities': [(0, 7, 'BRD')]}),
    # ... additional examples truncated in the original snippet
]
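The snippet above only defines the examples. A minimal training-loop sketch follows, assuming the spaCy 2.x update API (spaCy 3.x expects Example objects instead of (text, annotations) pairs); the epoch count and dropout are arbitrary choices, not values from the original project.

nlp = spacy.blank('en')          # start from a blank English pipeline
ner = nlp.create_pipe('ner')     # add a fresh NER component
nlp.add_pipe(ner, last=True)

# Register every entity label that appears in the training data
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations['entities']:
        ner.add_label(label)

optimizer = nlp.begin_training()
for epoch in range(30):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
    print(epoch, losses)
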
import re  # used by char_clean below

def predict(cls, test_data):
    # Character-cleaning helper: replaces everything except letters, digits,
    # '&', '+', '@', whitespace and '.' with a space, then collapses repeated spaces.
    def char_clean(string):
        new_string = re.sub(r'[^a-zA-Z0-9&+@ \n\.]', ' ', string)
        new_string = ' '.join(new_string.split())
        return new_string

    # The input dataframe is passed in as test_data.
    df = test_data
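    # Hedged sketch of how the helper might be used inside predict; the
    # 'description' column name and the cls.nlp attribute are assumptions,
    # not part of the original snippet.
    df['clean_text'] = df['description'].apply(char_clean)
    df['entities'] = df['clean_text'].apply(
        lambda text: [(ent.text, ent.label_) for ent in cls.nlp(text).ents]
    )
    return df
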
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

def get_jobs(keyword, num_jobs, verbose):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.'''

    # Initializing the webdriver
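    # The function is cut off right after this comment; the continuation below
    # is a hedged sketch, assuming a locally installed chromedriver, Selenium's
    # standard Chrome constructor, and an illustrative Glassdoor search URL.
    options = webdriver.ChromeOptions()
    options.add_argument('--window-size=1120,1000')
    driver = webdriver.Chrome(options=options)

    driver.get('https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + keyword)
    time.sleep(5)

    jobs = []
    # ... scraping loop collecting up to num_jobs postings would go here ...
    return pd.DataFrame(jobs)

# Example call: df_jobs = get_jobs('data scientist', 100, verbose=False)
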
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # needed once before stopwords.words() can be used
stopWords = set(stopwords.words('english'))

df = pd.read_csv('data.csv')
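A short hedged example of how the stop-word set might be applied to the scraped job descriptions; the 'Job Description' column name is taken from the later snippets, and the helper itself is an assumption.

def remove_stopwords(text):
    tokens = re.findall(r'[a-zA-Z]+', str(text).lower())
    return ' '.join(t for t in tokens if t not in stopWords)

df['clean_description'] = df['Job Description'].apply(remove_stopwords)
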
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

words = " ".join(df['Job Description'])

def punctuation_stop(text):
    """Remove punctuation and stop words."""
    filtered = []
    stop_words = set(stopwords.words('english'))
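    # Hedged completion of the truncated helper: the token filter below is an
    # assumption, not the project's original body (word_tokenize also needs
    # nltk.download('punkt'), which a later snippet runs).
    for w in word_tokenize(text):
        if w.isalpha() and w.lower() not in stop_words:
            filtered.append(w.lower())
    return filtered

# Minimal word-cloud sketch using the helper above; the WordCloud parameters
# and the matplotlib display step are illustrative assumptions.
import matplotlib.pyplot as plt

wc = WordCloud(background_color='white', stopwords=STOPWORDS,
               max_words=200, width=1600, height=800)
wc.generate(' '.join(punctuation_stop(words)))
plt.figure(figsize=(16, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
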
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data_cleaned.csv')

# Select features for the pivot table
df_pivots = df[['Rating', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Size',
                'Age', 'Python', 'R', 'SQL', 'AWS', 'Excel', 'GCP', 'Azure', 'Spark',
                'PyTorch', 'TensorFlow', 'Tableau', 'Keras', 'Job', 'Seniority', 'avg_salary']]
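One hedged example of a pivot table and plot that these columns could feed; the choice of index, aggregation, and chart type is an assumption rather than the project's actual analysis.

pivot = pd.pivot_table(df_pivots, index='Job', values='avg_salary', aggfunc='mean')
pivot = pivot.sort_values('avg_salary', ascending=False)
print(pivot)

plt.figure(figsize=(10, 6))
sns.barplot(x=pivot.index, y=pivot['avg_salary'])
plt.xticks(rotation=45, ha='right')
plt.ylabel('Average salary')
plt.tight_layout()
plt.show()
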
# Remove outliers as defined
df = df[50000 <= df['avg_salary']]
df = df[df['avg_salary'] < 200000]

def salary_category(salary):
    if 50000 <= salary < 75000:
        return 1
    if 75000 <= salary < 100000:
        return 2
    if 100000 <= salary < 125000:
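        # Hedged completion: the function is cut off mid-branch above; the
        # upper buckets and the new column name below are assumptions that
        # simply continue the 25,000-wide pattern.
        return 3
    if 125000 <= salary < 150000:
        return 4
    return 5  # 150,000 up to the 200,000 cap kept after outlier removal

df['salary_category'] = df['avg_salary'].apply(salary_category)
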
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
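These imports point toward a scikit-learn Pipeline with a custom transformer and TF-IDF features. A minimal hedged sketch follows; the transformer body, the Ridge estimator, and the column names are assumptions, not the project's actual model.

from sklearn.linear_model import Ridge  # estimator choice is an assumption

class TextCleaner(BaseEstimator, TransformerMixin):
    """Illustrative transformer: lowercase the text and strip non-letters."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [re.sub('[^a-z ]', ' ', str(x).lower()) for x in X]

text_pipeline = Pipeline([
    ('clean', TextCleaner()),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('model', Ridge()),
])

df = pd.read_csv('data_cleaned.csv')  # assumed input, as in the earlier snippet
X_train, X_test, y_train, y_test = train_test_split(
    df['Job Description'], df['avg_salary'], test_size=0.2, random_state=42)
text_pipeline.fit(X_train, y_train)
print('R^2 on the held-out split:', text_pipeline.score(X_test, y_test))
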
import re
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')   # lemmatizer data
nltk.download('punkt')     # tokenizer data

lemma = WordNetLemmatizer()
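A short hedged example of lemmatizing a text column with the objects set up above; the input file and the 'Job Description' column name are assumptions carried over from the earlier snippets.

def lemmatize_text(text):
    tokens = word_tokenize(str(text).lower())
    return ' '.join(lemma.lemmatize(t) for t in tokens if t.isalpha())

df = pd.read_csv('data_cleaned.csv')  # assumed input
df['lemmatized'] = df['Job Description'].apply(lemmatize_text)
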
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Analyze the most frequent bi-grams in the interview questions of type
# Methodology with CountVectorizer.
def counter(Q_A, category, data, n_gram_min, n_gram_max):
    data = data[data[category] == 1]
    word_vectorizer = CountVectorizer(ngram_range=(n_gram_min, n_gram_max), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(data[Q_A])
    frequencies = sum(sparse_matrix).toarray()[0]
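    # Hedged completion: the function stops after computing per-term
    # frequencies above; the sorting step and return value below are
    # assumptions (get_feature_names_out assumes scikit-learn >= 1.0;
    # older versions use get_feature_names).
    vocab = word_vectorizer.get_feature_names_out()
    top = sorted(zip(vocab, frequencies), key=lambda x: x[1], reverse=True)[:20]
    return top

# Illustrative call, with assumed column names for the question text and
# the Methodology flag:
# top_bigrams = counter('Question', 'Methodology', df, 2, 2)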