rjurney · February 2, 2013 07:57
diff --git a/ntf_idf.pig b/ntf_idf.pig
 /*
  * tf_idf: score every (document, token) pair as normalized TF * IDF.
  *
  * Parameters:
  *   token_records - relation holding one record per token occurrence
  *   id_field      - name of the document id field in token_records
  *   token_field   - name of the token field in token_records
  *
  * Returns:
  *   out_relation  - ($id_field, token, score:double). The id and token are
  *                   declared chararray by the FLATTEN schema in token_usages.
  */
 DEFINE tf_idf(token_records, id_field, token_field) RETURNS out_relation {

  /* Calculate the term count per document: rows per (id, token) pair */
  doc_word_totals = foreach (group $token_records by ($id_field, $token_field)) generate
    FLATTEN(group) as ($id_field, token),
    COUNT_STAR($token_records) as doc_total;

  /* Calculate the document size: sum of all term counts in the document,
     repeated next to every (token, doc_total) of that document */
  pre_term_counts = foreach (group doc_word_totals by $id_field) generate
    group AS $id_field,
    FLATTEN(doc_word_totals.(token, doc_total)) as (token, doc_total),
    SUM(doc_word_totals.doc_total) as doc_size;

  /* Calculate the TF: term count divided by document size */
  term_freqs = foreach pre_term_counts generate $id_field as $id_field,
    token as token,
    ((double)doc_total / (double)doc_size) AS term_freq;

  /* Calculate Max Freq: the largest term frequency within each document.
     MAX yields the same value as sorting descending and keeping one row,
     without a per-group sort. */
  max_freqs = foreach (group term_freqs by $id_field) generate
    group as $id_field,
    MAX(term_freqs.term_freq) as doc_max;

  /* Calculate NTF: 0.4 + (1 - 0.4) * (term_freq / doc_max).
     Columns of the join output must be addressed with the '::' disambiguator;
     'term_freqs.(field)' would instead be read as a scalar projection of the
     whole term_freqs relation and fail when it has more than one row. */
  n_term_freqs = join max_freqs by $id_field, term_freqs by $id_field;
  n_term_freqs = foreach n_term_freqs generate term_freqs::$id_field as $id_field,
    term_freqs::token as token,
    (double)((double)0.4 + (double)(1 - 0.4) * (double)((double)term_freqs::term_freq/(double)max_freqs::doc_max)) as n_term_freq:double;

  /* Get count of documents using each token, for idf.
     Each (id, token) pair occurs once in n_term_freqs, so the group size
     is the number of documents containing the token. */
  token_usages = foreach (group n_term_freqs by token) generate
    FLATTEN(n_term_freqs) as ($id_field:chararray, token:chararray, n_term_freq:double),
    COUNT_STAR(n_term_freqs) as num_docs_with_token;

  /* Get document count: number of distinct ids in the input */
  just_ids = foreach $token_records generate $id_field;
  just_ids = DISTINCT just_ids;
  ndocs = foreach (group just_ids all) generate COUNT_STAR(just_ids) as total_docs;

  /* Note the use of Pig Scalars to calculate idf:
     ndocs is a single-row relation, so ndocs.total_docs is a scalar. */
  $out_relation = foreach token_usages {
    idf    = LOG((double)ndocs.total_docs/(double)num_docs_with_token);
    tf_idf = (double)n_term_freq * idf;
    generate $id_field as $id_field,
      token as token,
      (double)tf_idf as score:double;
  };
 };
	/*
	 * tf_idf: score every (document, token) pair as normalized TF * IDF.
	 *
	 * Parameters:
	 *   token_records - relation holding one record per token occurrence
	 *   id_field      - name of the document id field in token_records
	 *   token_field   - name of the token field in token_records
	 *
	 * Returns:
	 *   out_relation  - ($id_field, token, score:double). The id and token are
	 *                   declared chararray by the FLATTEN schema in token_usages.
	 */
	DEFINE tf_idf(token_records, id_field, token_field) RETURNS out_relation {

	  /* Calculate the term count per document: rows per (id, token) pair */
	  doc_word_totals = foreach (group $token_records by ($id_field, $token_field)) generate
	    FLATTEN(group) as ($id_field, token),
	    COUNT_STAR($token_records) as doc_total;

	  /* Calculate the document size: sum of all term counts in the document,
	     repeated next to every (token, doc_total) of that document */
	  pre_term_counts = foreach (group doc_word_totals by $id_field) generate
	    group AS $id_field,
	    FLATTEN(doc_word_totals.(token, doc_total)) as (token, doc_total),
	    SUM(doc_word_totals.doc_total) as doc_size;

	  /* Calculate the TF: term count divided by document size */
	  term_freqs = foreach pre_term_counts generate $id_field as $id_field,
	    token as token,
	    ((double)doc_total / (double)doc_size) AS term_freq;

	  /* Calculate Max Freq: the largest term frequency within each document.
	     MAX yields the same value as sorting descending and keeping one row,
	     without a per-group sort. */
	  max_freqs = foreach (group term_freqs by $id_field) generate
	    group as $id_field,
	    MAX(term_freqs.term_freq) as doc_max;

	  /* Calculate NTF: 0.4 + (1 - 0.4) * (term_freq / doc_max).
	     Columns of the join output must be addressed with the '::' disambiguator;
	     'term_freqs.(field)' would instead be read as a scalar projection of the
	     whole term_freqs relation and fail when it has more than one row. */
	  n_term_freqs = join max_freqs by $id_field, term_freqs by $id_field;
	  n_term_freqs = foreach n_term_freqs generate term_freqs::$id_field as $id_field,
	    term_freqs::token as token,
	    (double)((double)0.4 + (double)(1 - 0.4) * (double)((double)term_freqs::term_freq/(double)max_freqs::doc_max)) as n_term_freq:double;

	  /* Get count of documents using each token, for idf.
	     Each (id, token) pair occurs once in n_term_freqs, so the group size
	     is the number of documents containing the token. */
	  token_usages = foreach (group n_term_freqs by token) generate
	    FLATTEN(n_term_freqs) as ($id_field:chararray, token:chararray, n_term_freq:double),
	    COUNT_STAR(n_term_freqs) as num_docs_with_token;

	  /* Get document count: number of distinct ids in the input */
	  just_ids = foreach $token_records generate $id_field;
	  just_ids = DISTINCT just_ids;
	  ndocs = foreach (group just_ids all) generate COUNT_STAR(just_ids) as total_docs;

	  /* Note the use of Pig Scalars to calculate idf:
	     ndocs is a single-row relation, so ndocs.total_docs is a scalar. */
	  $out_relation = foreach token_usages {
	    idf = LOG((double)ndocs.total_docs/(double)num_docs_with_token);
	    tf_idf = (double)n_term_freq * idf;
	    generate $id_field as $id_field,
	      token as token,
	      (double)tf_idf as score:double;
	  };
	};