Severin Perez sevperez

Writer | Data Scientist

sevperez / search_relevance_6.py

Created October 14, 2020 08:41

	def search_tfidf_df(tfidf_df, text_df, query_string: str):
	"""
	- Parameters: tfidf_df (Pandas DataFrame) representing a tf-idf
	matrix, text_df (Pandas DataFrame) with a "text" column and rows
	that correspond to the tfidf_df, and query_string (string).
	- Returns: A new dataframe that only contains rows from text_df where
	the corresponding tf-idf value was greater than zero for each of
	the terms in query_string. Additional columns are added to show the
	tf-idf value for each term and the sum of the tf-idf values.
	"""

sevperez / search_relevance_5.py

Created October 14, 2020 08:40

	def document_frequency(td_df, term: str):
	    """
	    - Parameters: td_df (Pandas DataFrame) representing a term-document
	      matrix, and term (string) naming one of its columns.
	    - Returns: The document frequency of term, i.e. the number of rows
	      (documents) in td_df whose value in the term column is above zero.
	    """
	    # Boolean mask: True for every document that contains the term.
	    has_term = td_df[term] > 0
	    matching_docs = td_df.loc[has_term]
	    return len(matching_docs)

	def inverse_document_frequency(td_df, term: str):

sevperez / search_relevance_4.py

Created October 14, 2020 08:38

	def search_td_df(td_df, text_df, query_string: str):
	"""
	- Parameters: td_df (Pandas DataFrame) representing a term-document
	matrix, text_df (Pandas DataFrame) with a "text" column and rows
	that correspond to the td_df, and query_string (string).
	- Returns: A new dataframe that only contains rows from text_df where
	the "text" column had at least one occurrence of each term in
	query_string. Additional columns are added to show the count of
	each term and the total count of all terms.
	"""

sevperez / search_relevance_3.py

Created October 14, 2020 08:35

	def build_corpus(doc_list, dictionary):
	    """
	    - Parameters: doc_list (list of spaCy Document objects), dictionary
	      (Gensim Dictionary object).
	    - Returns: A list with one bag-of-words entry per document, each
	      produced by dictionary.doc2bow from that document's token texts
	      (tuples of (token_id, token_count)).
	    """
	    corpus = []
	    for doc in doc_list:
	        token_texts = get_token_texts(doc)
	        corpus.append(dictionary.doc2bow(token_texts))
	    return corpus

	def build_td_matrix(doc_list, dictionary):

sevperez / search_relevance_2.py

Created October 14, 2020 08:34

	# Load the spaCy pipeline named "en_core_web_md" and bind it to `nlp`.
	nlp = spacy.load("en_core_web_md")

	# tokenize documents
	def spacy_doc(model, text, lower=True):
	"""
	- Parameters: model (spaCy model), text (string), lower (bool).
	- Returns: A spaCy Document object processed using the provided
	model. Document is all lowercase if lower is True.
	"""

sevperez / search_relevance_1.py

Last active October 14, 2020 08:33

	def search_df_texts(df, query_string: str):
	    """
	    - Parameters: df (Pandas DataFrame), query_string (string). df must
	      contain a "text" column.
	    - Returns: A subset of df containing only rows where each
	      whitespace-separated term in query_string appears as a literal,
	      case-insensitive substring of df["text"]. Rows whose text is
	      missing never match a term; a query with no terms returns every row.
	    """
	    terms = query_string.lower().split()
	    # Lowercase the column once instead of once per term.
	    lowered = df["text"].str.lower()
	    # Start from "every row matches" and narrow it down term by term.
	    mask = np.ones(len(df), dtype=bool)
	    for term in terms:
	        # regex=False: terms are plain substrings, so input such as "c++"
	        # or "(" must not be parsed as a regular expression.
	        # na=False: missing text counts as a non-match instead of
	        # propagating NaN into the mask.
	        hits = lowered.str.contains(term, regex=False, na=False)
	        mask &= hits.to_numpy(dtype=bool)
	    return df[mask]

sevperez / writing_5.py

Created October 8, 2020 07:02

	def binary_search(items, target):
	left = 0
	right = len(items) - 1

	while left <= right:
	mid = (left + right) // 2

	if items[mid] == target:
	return True

sevperez / writing_4.py

Created October 8, 2020 07:01

	class Receipt:
	def __init__(self, item, cost):
	self.item = item
	self.cost = cost

	def receipt_msg(self):
	return f"{self.item}, ${round(self.cost, 2)}"

	def deliver(self):
	msg = self.receipt_msg()

sevperez / writing_3.py

Created October 8, 2020 07:01

	class Car:
	    """A car whose speed rises in fixed steps up to a maximum."""

	    def __init__(self, max_speed):
	        """
	        - Parameters: max_speed (number), the highest value
	          current_speed may reach.
	        """
	        self.max_speed = max_speed
	        self.current_speed = 0
	        self.acceleration_rate = 1

	    def accelerate(self):
	        """
	        Raise current_speed by acceleration_rate, never exceeding
	        max_speed. Does nothing once max_speed has been reached.
	        """
	        if self.current_speed < self.max_speed:
	            # Clamp so a step that does not evenly divide the remaining
	            # headroom cannot push the speed past max_speed.
	            self.current_speed = min(
	                self.current_speed + self.acceleration_rate,
	                self.max_speed,
	            )

sevperez / writing_2.py

Created October 8, 2020 07:00

	def double(num):
	    """Return num multiplied by two."""
	    doubled = num * 2
	    return doubled

	my_numbers = [1, 2, 3, 4, 5]

	# Apply double to every element, collecting the results in a new list.
	doubled_numbers = [double(value) for value in my_numbers]
	print(doubled_numbers)  # [2, 4, 6, 8, 10]