Skip to content

Instantly share code, notes, and snippets.

@zhangce
Created May 19, 2014 19:45
Show Gist options
  • Select an option

  • Save zhangce/14bc8c999e723130b910 to your computer and use it in GitHub Desktop.

Select an option

Save zhangce/14bc8c999e723130b910 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py
import sys, json
# TODO: Sample data and the input schema.
# Each stdin line is parsed as one JSON object describing a candidate pair.
# The keys read below are "relation_id", "words", "lemma",
# "p1.start_position", "p1.length", "p2.start_position" and "p2.length".
# NOTE(review): "words" and "lemma" are presumably token-aligned lists for
# one sentence, with the positions being token indices -- confirm upstream.

# Lemmas whose presence between the two mentions produces an
# "important_word=..." feature.
MARRIED_WORDS = ('marry', 'widow')


def extract_features(obj):
    """Return the set of feature strings for one candidate pair.

    ``obj`` is the decoded JSON object for one input row.  Only indexing and
    slicing are performed on ``obj["words"]`` and ``obj["lemma"]``; a
    ``KeyError`` or ``IndexError`` propagates if the row is malformed.
    """
    # Library/DSL??? This is a span, it should be an object.
    p1_start = obj["p1.start_position"]
    p1_end = p1_start + obj["p1.length"]
    p2_start = obj["p2.start_position"]
    p2_end = p2_start + obj["p2.length"]

    # The gap runs from the end of whichever mention ends first to the start
    # of whichever mention starts last.  If the spans overlap, left_idx is
    # greater than right_idx and both slices below are simply empty.
    left_idx = min(p1_end, p2_end)
    right_idx = max(p1_start, p2_start)

    # Features for this pair come in here
    features = set()

    # Feature 1: Find out if a lemma of marry occurs.
    # A better feature would ensure this is on the dependency path between
    # the two.
    lemma_between = obj["lemma"][left_idx:right_idx]
    for mw in MARRIED_WORDS:
        if mw in lemma_between:
            features.add("important_word=%s" % mw)

    # Feature 2: The number of words between the two phrases.
    # Intuition: if they are close by, the link may be stronger.
    words = obj["words"]
    num_between = len(words[left_idx:right_idx])
    if num_between < 5:
        features.add("num_words_between=%s" % num_between)
    else:
        features.add("many_words_between")

    # Feature 3: Check if the last name matches heuristically, i.e. the last
    # word of each span is the same string.
    if words[p1_end - 1] == words[p2_end - 1]:
        features.add("potential_last_name_match")

    # TODO: Add more features, look at dependency paths, etc
    return features


def main():
    """Read JSON rows from stdin and print one JSON object per feature.

    Each output line has the keys "relation_id" (copied from the input row)
    and "feature".
    """
    for row in sys.stdin:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on an empty string.
        if not row.strip():
            continue
        obj = json.loads(row)
        # Sorted so the output order is deterministic; set order is not.
        for feature in sorted(extract_features(obj)):
            # Parenthesised single-argument print works on Python 2 and 3.
            print(json.dumps({
                "relation_id": obj["relation_id"],
                "feature": feature
            }))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment