Cheng ✨ kyoto-cheng

🎯

Focusing

🦊 Data Scientist & Data Engineer

kyoto-cheng / Salary_category.py

Created June 14, 2021 21:07

kyoto-cheng / Pivot_table.py

Created June 14, 2021 20:54

	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Read the cleaned dataset from CSV.
	df = pd.read_csv('data_cleaned.csv')

	# Columns kept for the pivot tables, grouped roughly by theme:
	# company attributes, skill flags, then job attributes and the salary.
	_pivot_columns = [
	    'Rating', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Size', 'Age',
	    'Python', 'R', 'SQL', 'AWS', 'Excel', 'GCP',
	    'Azure', 'Spark', 'PyTorch', 'TensorFlow', 'Tableau', 'Keras',
	    'Job', 'Seniority', 'avg_salary',
	]
	df_pivots = df[_pivot_columns]

kyoto-cheng / Word_cloud.py

Created June 14, 2021 19:54

	from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	# Join every value of the 'Job Description' column into one space-separated string.
	# NOTE(review): `df` is not defined in this snippet — presumably loaded earlier; str.join
	# also requires every entry to be a str, so missing values would need handling first.
	words = " ".join(df['Job Description'])

	def punctuation_stop(text):
	"""remove punctuation and stop words"""
	filtered = []
	stop_words = set(stopwords.words('english'))

kyoto-cheng / data_cleaning.py

Created June 14, 2021 19:21

	import pandas as pd
	import numpy as np
	import re
	from nltk.corpus import stopwords

	# English stop words from the NLTK corpus, stored as a set.
	stopWords = set(stopwords.words('english'))


	# Load data.csv into a DataFrame.
	df = pd.read_csv('data.csv')

kyoto-cheng / data_scraping.py

Created June 14, 2021 17:01

	from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
	from selenium import webdriver
	import time
	import pandas as pd

	def get_jobs(keyword, num_jobs, verbose):

	'''Gathers jobs as a dataframe, scraped from Glassdoor'''

	# Initializing the webdriver

kyoto-cheng / predict_spaCy.py

Last active May 30, 2021 18:03

	def predict(cls, test_data):

	# Character-cleaning helper: replaces every character outside an allowed set with a space.
	def char_clean(string):
	    """Replace disallowed characters with spaces and collapse whitespace.

	    Characters other than ASCII letters, digits, ``&``, ``+``, ``@``, ``.``,
	    space and newline are each replaced by a single space. The result is then
	    split on whitespace and re-joined with single spaces, so runs of
	    whitespace (including newlines) collapse to one space and leading or
	    trailing whitespace is dropped.

	    Args:
	        string: The text to clean.

	    Returns:
	        The cleaned, single-space-separated text.
	    """
	    # Raw string: the pattern contains ``\.``, which is an invalid escape
	    # sequence in a normal string literal and triggers a warning on import.
	    new_string = re.sub(r'[^a-zA-Z0-9&+@ \n\.]', ' ', string)
	    return ' '.join(new_string.split())

	# Input dataframe as test_data.
	df = test_data

kyoto-cheng / train_spaCy.py

Created May 30, 2021 17:07

	import spacy
	import random

	# Example training data. Here a spaCy model is trained for a named-entity recognition (NER) task.
	TRAIN_DATA =
	[
	('Amazon co ca', {'entities': [(0, 6, 'BRD')]}),
	('AMZNMKTPLACE AMAZON CO', {'entities': [(13, 19, 'BRD')]}),
	('APPLE COM BILL', {'entities': [(0, 5, 'BRD')]}),
	('BOOKING COM New York City', {'entities': [(0, 7, 'BRD')]}),