import spacy
import random

# Example training data; here a spaCy model is being trained for an NER task.
TRAIN_DATA = [
    ('Amazon co ca', {'entities': [(0, 6, 'BRD')]}),
    ('AMZNMKTPLACE AMAZON CO', {'entities': [(13, 19, 'BRD')]}),
    ('APPLE COM BILL', {'entities': [(0, 5, 'BRD')]}),
    ('BOOKING COM New York City', {'entities': [(0, 7, 'BRD')]}),
    # ... additional examples truncated in the original snippet
]
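The snippet above only defines the examples. A minimal training-loop sketch follows, assuming the spaCy 2.x update API (spaCy 3.x expects Example objects instead of (text, annotations) pairs); the epoch count and dropout are arbitrary choices, not values from the original project.

nlp = spacy.blank('en')          # start from a blank English pipeline
ner = nlp.create_pipe('ner')     # add a fresh NER component
nlp.add_pipe(ner, last=True)

# Register every entity label that appears in the training data
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations['entities']:
        ner.add_label(label)

optimizer = nlp.begin_training()
for epoch in range(30):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
    print(epoch, losses)
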
import re  # used by char_clean below

def predict(cls, test_data):
    # Character-cleaning helper: replaces everything except letters, digits,
    # '&', '+', '@', whitespace and '.' with a space, then collapses repeated spaces.
    def char_clean(string):
        new_string = re.sub(r'[^a-zA-Z0-9&+@ \n\.]', ' ', string)
        new_string = ' '.join(new_string.split())
        return new_string

    # The input dataframe is passed in as test_data.
    df = test_data
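    # Hedged sketch of how the helper might be used inside predict; the
    # 'description' column name and the cls.nlp attribute are assumptions,
    # not part of the original snippet.
    df['clean_text'] = df['description'].apply(char_clean)
    df['entities'] = df['clean_text'].apply(
        lambda text: [(ent.text, ent.label_) for ent in cls.nlp(text).ents]
    )
    return df
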
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

def get_jobs(keyword, num_jobs, verbose):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.'''

    # Initializing the webdriver
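    # The function is cut off right after this comment; the continuation below
    # is a hedged sketch, assuming a locally installed chromedriver, Selenium's
    # standard Chrome constructor, and an illustrative Glassdoor search URL.
    options = webdriver.ChromeOptions()
    options.add_argument('--window-size=1120,1000')
    driver = webdriver.Chrome(options=options)

    driver.get('https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + keyword)
    time.sleep(5)

    jobs = []
    # ... scraping loop collecting up to num_jobs postings would go here ...
    return pd.DataFrame(jobs)

# Example call: df_jobs = get_jobs('data scientist', 100, verbose=False)
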
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # needed once before stopwords.words() can be used
stopWords = set(stopwords.words('english'))

df = pd.read_csv('data.csv')
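A short hedged example of how the stop-word set might be applied to the scraped job descriptions; the 'Job Description' column name is taken from the later snippets, and the helper itself is an assumption.

def remove_stopwords(text):
    tokens = re.findall(r'[a-zA-Z]+', str(text).lower())
    return ' '.join(t for t in tokens if t not in stopWords)

df['clean_description'] = df['Job Description'].apply(remove_stopwords)
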
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

words = " ".join(df['Job Description'])

def punctuation_stop(text):
    """Remove punctuation and stop words."""
    filtered = []
    stop_words = set(stopwords.words('english'))
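    # Hedged completion of the truncated helper: the token filter below is an
    # assumption, not the project's original body (word_tokenize also needs
    # nltk.download('punkt'), which a later snippet runs).
    for w in word_tokenize(text):
        if w.isalpha() and w.lower() not in stop_words:
            filtered.append(w.lower())
    return filtered

# Minimal word-cloud sketch using the helper above; the WordCloud parameters
# and the matplotlib display step are illustrative assumptions.
import matplotlib.pyplot as plt

wc = WordCloud(background_color='white', stopwords=STOPWORDS,
               max_words=200, width=1600, height=800)
wc.generate(' '.join(punctuation_stop(words)))
plt.figure(figsize=(16, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
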
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data_cleaned.csv')

# Select features for the pivot table
df_pivots = df[['Rating', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Size',
                'Age', 'Python', 'R', 'SQL', 'AWS', 'Excel', 'GCP', 'Azure', 'Spark',
                'PyTorch', 'TensorFlow', 'Tableau', 'Keras', 'Job', 'Seniority', 'avg_salary']]
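One hedged example of a pivot table and plot that these columns could feed; the choice of index, aggregation, and chart type is an assumption rather than the project's actual analysis.

pivot = pd.pivot_table(df_pivots, index='Job', values='avg_salary', aggfunc='mean')
pivot = pivot.sort_values('avg_salary', ascending=False)
print(pivot)

plt.figure(figsize=(10, 6))
sns.barplot(x=pivot.index, y=pivot['avg_salary'])
plt.xticks(rotation=45, ha='right')
plt.ylabel('Average salary')
plt.tight_layout()
plt.show()
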
# Remove outliers as defined
df = df[50000 <= df['avg_salary']]
df = df[df['avg_salary'] < 200000]

def salary_category(salary):
    if 50000 <= salary < 75000:
        return 1
    if 75000 <= salary < 100000:
        return 2
    if 100000 <= salary < 125000:
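        # Hedged completion: the function is cut off mid-branch above; the
        # upper buckets and the new column name below are assumptions that
        # simply continue the 25,000-wide pattern.
        return 3
    if 125000 <= salary < 150000:
        return 4
    return 5  # 150,000 up to the 200,000 cap kept after outlier removal

df['salary_category'] = df['avg_salary'].apply(salary_category)
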
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
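These imports point toward a scikit-learn Pipeline with a custom transformer and TF-IDF features. A minimal hedged sketch follows; the transformer body, the Ridge estimator, and the column names are assumptions, not the project's actual model.

from sklearn.linear_model import Ridge  # estimator choice is an assumption

class TextCleaner(BaseEstimator, TransformerMixin):
    """Illustrative transformer: lowercase the text and strip non-letters."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [re.sub('[^a-z ]', ' ', str(x).lower()) for x in X]

text_pipeline = Pipeline([
    ('clean', TextCleaner()),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('model', Ridge()),
])

df = pd.read_csv('data_cleaned.csv')  # assumed input, as in the earlier snippet
X_train, X_test, y_train, y_test = train_test_split(
    df['Job Description'], df['avg_salary'], test_size=0.2, random_state=42)
text_pipeline.fit(X_train, y_train)
print('R^2 on the held-out split:', text_pipeline.score(X_test, y_test))
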
import re
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')   # lemmatizer data
nltk.download('punkt')     # tokenizer data

lemma = WordNetLemmatizer()
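A short hedged example of lemmatizing a text column with the objects set up above; the input file and the 'Job Description' column name are assumptions carried over from the earlier snippets.

def lemmatize_text(text):
    tokens = word_tokenize(str(text).lower())
    return ' '.join(lemma.lemmatize(t) for t in tokens if t.isalpha())

df = pd.read_csv('data_cleaned.csv')  # assumed input
df['lemmatized'] = df['Job Description'].apply(lemmatize_text)
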
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Analyze the most frequent bi-grams in the interview questions of type
# Methodology with CountVectorizer.
def counter(Q_A, category, data, n_gram_min, n_gram_max):
    data = data[data[category] == 1]
    word_vectorizer = CountVectorizer(ngram_range=(n_gram_min, n_gram_max), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(data[Q_A])
    frequencies = sum(sparse_matrix).toarray()[0]
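    # Hedged completion: the function stops after computing per-term
    # frequencies above; the sorting step and return value below are
    # assumptions (get_feature_names_out assumes scikit-learn >= 1.0;
    # older versions use get_feature_names).
    vocab = word_vectorizer.get_feature_names_out()
    top = sorted(zip(vocab, frequencies), key=lambda x: x[1], reverse=True)[:20]
    return top

# Illustrative call, with assumed column names for the question text and
# the Methodology flag:
# top_bigrams = counter('Question', 'Methodology', df, 2, 2)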