Created
October 14, 2020 08:41
-
-
Save sevperez/4434037aa3637e19242fd624b89ca7f4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search_tfidf_df(tfidf_df, text_df, query_string: str):
    """Return the rows of ``text_df`` that match every term of a query.

    Args:
        tfidf_df: Pandas DataFrame representing a tf-idf matrix, with one
            column per term. Its index must line up with ``text_df``.
        text_df: Pandas DataFrame with a "text" column whose rows correspond
            to the rows of ``tfidf_df``.
        query_string: Whitespace-separated search terms. Terms are
            lower-cased and de-duplicated (first occurrence wins) before
            matching, so the tf-idf columns are assumed to be lower-case.

    Returns:
        A new DataFrame containing only the rows of ``text_df`` whose tf-idf
        value is greater than zero for *each* query term. One column per
        term shows its tf-idf value and a ``tfidf_sum`` column holds their
        sum; rows are sorted by ``tfidf_sum`` in descending order. If the
        query is empty, or any term is absent from ``tfidf_df``, no row can
        match and an empty DataFrame with the same columns is returned.
    """
    # split() without an argument collapses runs of whitespace, so stray
    # spaces never produce an empty-string "term"; dict.fromkeys removes
    # repeated terms while keeping their order.
    terms = list(dict.fromkeys(query_string.lower().split()))

    if terms and all(term in tfidf_df.columns for term in terms):
        scores = tfidf_df[terms]
        matches = scores[(scores > 0).all(axis=1)]
    else:
        # A term outside the vocabulary has tf-idf 0 everywhere, so nothing
        # matches; build an empty frame that still has the expected columns.
        matches = tfidf_df.iloc[:0].reindex(columns=terms, fill_value=0.0)

    # assign() returns a new frame, avoiding a write into a filtered slice.
    matches = matches.assign(tfidf_sum=matches.sum(axis=1))

    full_df = text_df.merge(matches, left_index=True, right_index=True)
    return full_df.sort_values("tfidf_sum", ascending=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment