This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search_tfidf_df(tfidf_df, text_df, query_string: str): | |
""" | |
- Parameters: tfidf_df (Pandas DataFrame) representing a tf-idf | |
matrix, text_df (Pandas DataFrame) with a "text" column and rows | |
that correspond to the tfidf_df, and query_string (string). | |
- Returns: A new dataframe that only contains rows from text_df where | |
the corresponding tf-idf value was greater than zero for each of | |
the terms in query_string. Additional columns are added to show the | |
tf-idf value for each term and the sum of the tf-idf values. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def document_frequency(td_df, term: str):
    """
    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix, and term (string). term must be a column of td_df.
    - Returns: The document frequency value showing the number of
      documents in td_df where term occurs at least once.
    - Raises: KeyError if term is not a column of td_df.
    """
    # Summing the boolean mask counts the rows with a positive entry for
    # `term` without building a filtered copy of the whole matrix.
    return int((td_df[term] > 0).sum())
def inverse_document_frequency(td_df, term: str): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search_td_df(td_df, text_df, query_string: str): | |
""" | |
- Parameters: td_df (Pandas DataFrame) representing a term-document | |
matrix, text_df (Pandas DataFrame) with a "text" column and rows | |
that correspond to the td_df, and query_string (string). | |
- Returns: A new dataframe that only contains rows from text_df where | |
the "text" column had at least one occurrence of each term in | |
query_string. Additional columns are added to show the count of | |
each term and the total count of all terms. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def build_corpus(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A list of documents in bag-of-words format, containing
      tuples with (token_id, token_count) for each token in the text.
    """
    # NOTE(review): get_token_texts is not defined in this snippet;
    # presumably it returns the token strings of a doc -- confirm.
    return [dictionary.doc2bow(get_token_texts(doc)) for doc in doc_list]
def build_td_matrix(doc_list, dictionary): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load spaCy model
nlp = spacy.load("en_core_web_md")
# tokenize documents | |
def spacy_doc(model, text, lower=True): | |
""" | |
- Parameters: model (spaCy model), text (string), lower (bool). | |
- Returns: A spaCy Document object processed using the provided | |
model. Document is all lowercase if lower is True. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search_df_texts(df, query_string: str):
    """
    - Parameters: df (Pandas DataFrame), query_string (string). df must
      contain a "text" column.
    - Returns: A subset of df containing only rows where each term in
      query_string appeared as a substring in df["text"]. Matching is
      case-insensitive and literal (terms are not interpreted as regular
      expressions). Rows with missing text never match a term. An empty
      or whitespace-only query_string returns every row.
    """
    # split() with no argument drops the empty terms that repeated,
    # leading, or trailing whitespace would otherwise produce.
    terms = query_string.lower().split()
    # Lowercase the column once instead of once per term.
    lowered = df["text"].str.lower()
    mask = np.ones(len(df), dtype=bool)
    for term in terms:
        # regex=False keeps characters such as "+" or "(" literal;
        # na=False makes rows with missing text count as non-matching.
        found = lowered.str.contains(term, regex=False, na=False)
        mask &= found.to_numpy(dtype=bool)
    return df[mask]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def binary_search(items, target): | |
left = 0 | |
right = len(items) - 1 | |
while left <= right: | |
mid = (left + right) // 2 | |
if items[mid] == target: | |
return True | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Receipt: | |
def __init__(self, item, cost): | |
self.item = item | |
self.cost = cost | |
def receipt_msg(self): | |
return f"{self.item}, ${round(self.cost, 2)}" | |
def deliver(self): | |
msg = self.receipt_msg() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Car:
    """
    - Parameters: max_speed (number), the highest speed the car may reach.
    - Attributes: max_speed, current_speed (starts at 0), and
      acceleration_rate (speed gained per accelerate() call, starts at 1).
    """

    def __init__(self, max_speed):
        self.max_speed = max_speed
        self.current_speed = 0
        self.acceleration_rate = 1

    def accelerate(self):
        """
        - Returns: None. Raises current_speed by acceleration_rate, never
          going above max_speed. Does nothing once max_speed is reached.
        """
        if self.current_speed < self.max_speed:
            # Clamp so a step that does not evenly divide the remaining
            # gap cannot push the speed past the maximum.
            self.current_speed = min(
                self.current_speed + self.acceleration_rate,
                self.max_speed,
            )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def double(num):
    """
    - Parameters: num (any value that supports multiplication by an int).
    - Returns: num multiplied by 2.
    """
    return num * 2
my_numbers = [1, 2, 3, 4, 5]
# map() applies double to each element; list() collects the results.
doubled_numbers = list(map(double, my_numbers))
print(doubled_numbers)  # [2, 4, 6, 8, 10]
NewerOlder