sevperez · October 14, 2020 08:40
diff --git a/search_relevance_5.py b/search_relevance_5.py
 def document_frequency(td_df, term: str):
     """
     Count the documents in which a term appears.

     - Parameters: td_df (Pandas DataFrame) representing a term-document
       matrix, and term (string) naming the column to inspect.
     - Returns: The number of rows of td_df whose value in the term
       column is greater than zero.
     """
     # Boolean mask: one entry per row, True where the term was counted.
     contains_term = td_df[term] > 0
     matching_rows = td_df[contains_term]
     return len(matching_rows.index)

 def inverse_document_frequency(td_df, term: str):
     """
     Compute the inverse document frequency (IDF) of a term.

     - Parameters: td_df (Pandas DataFrame) representing a term-document
       matrix, and term (string).
     - Returns: The inverse document frequency value for term, calculated
       as log10(N / dft) where N is the number of documents (rows) in
       td_df and dft is the document frequency value for term. Returns
       0.0 when term occurs in no document.
     """
     N = td_df.shape[0]
     dft = document_frequency(td_df, term)
     if dft == 0:
         # N / 0 is undefined; a term that never occurs carries no weight.
         return 0.0
     # Take the log of the ratio rather than dividing by the log: dividing
     # by log10(dft) is a division by zero whenever a term occurs in
     # exactly one document, which yields inf / NaN TF-IDF values.
     return np.log10(N / dft)
    
 def build_tfidf_df(td_df):
     """
     Weight every column of a term-document matrix by its term's IDF.

     - Parameters: td_df (Pandas DataFrame) representing a term-document
       matrix.
     - Returns: A term frequency-inverse document frequency (TF-IDF)
       matrix as a Pandas DataFrame, where each row is a document and
       each column is a token. Every column of td_df is multiplied by
       the value inverse_document_frequency returns for that column's
       name.
     """
     def scale_by_idf(term_counts):
         # apply hands over one column at a time; its name is the term.
         weight = inverse_document_frequency(td_df, term_counts.name)
         return term_counts * weight

     return td_df.apply(scale_by_idf)

 # Build the TF-IDF matrix for sotu_td_df (defined outside this excerpt;
 # presumably a term-document matrix -- confirm against its definition)
 # and keep the result in a module-level name.
 sotu_tfidf_df = build_tfidf_df(sotu_td_df)
	def document_frequency(td_df, term: str):
	    """
	    Count the documents in which a term appears.

	    - Parameters: td_df (Pandas DataFrame) representing a term-document
	      matrix, and term (string).
	    - Returns: The document frequency value showing the number of
	      documents (rows) in td_df where term occurs at least once.
	    """
	    # Keep only the rows whose count for term is positive, then count them.
	    return td_df[td_df[term] > 0].shape[0]

	def inverse_document_frequency(td_df, term: str):
	    """
	    Compute the inverse document frequency (IDF) of a term.

	    - Parameters: td_df (Pandas DataFrame) representing a term-document
	      matrix, and term (string).
	    - Returns: The inverse document frequency value for term, calculated
	      as log10(N / dft) where N is the number of documents (rows) in
	      td_df and dft is the document frequency value for term. Returns
	      0.0 when term occurs in no document.
	    """
	    N = td_df.shape[0]
	    dft = document_frequency(td_df, term)
	    if dft == 0:
	        # N / 0 is undefined; a term that never occurs carries no weight.
	        return 0.0
	    # Take the log of the ratio rather than dividing by the log: dividing
	    # by log10(dft) is a division by zero whenever a term occurs in
	    # exactly one document, which yields inf / NaN TF-IDF values.
	    return np.log10(N / dft)

	def build_tfidf_df(td_df):
	    """
	    Weight every column of a term-document matrix by its term's IDF.

	    - Parameters: td_df (Pandas DataFrame) representing a term-document
	      matrix.
	    - Returns: Returns a term frequency-inverse document frequency
	      (TF-IDF) matrix in the form of a Pandas DataFrame, where each row
	      is a document and each column is a token. Values in the dataframe
	      are TF-IDF values for the given document / token.
	    """
	    def calculate_tfidf(col, td_df):
	        # col is a single column of the matrix; its name is the term.
	        idf = inverse_document_frequency(td_df, col.name)
	        return col * idf

	    # Extra keyword arguments to apply are forwarded to calculate_tfidf.
	    return td_df.apply(calculate_tfidf, td_df=td_df)

	# Build the TF-IDF matrix for sotu_td_df (defined outside this excerpt;
	# presumably a term-document matrix -- confirm against its definition)
	# and keep the result in a module-level name.
	sotu_tfidf_df = build_tfidf_df(sotu_td_df)
No results found