Created
August 1, 2014 22:26
diff --git a/webstruct/model.py b/webstruct/model.py
index f77ab36..941cd8f 100644
--- a/webstruct/model.py
+++ b/webstruct/model.py
@@ -111,6 +111,13 @@ class NER(object):
         Return annotated HTML data in WebAnnotator format.
         """
         html_tokens, tags = self.extract_raw(bytes_data)
+        return self.annotate_tokens(html_tokens, tags)
+
+    def annotate_tokens(self, html_tokens, tags, pretty_print=False):
+        """
+        Return annotated HTML data in WebAnnotator format; the input is HTML tokens and their tags.
+ """ | |
+ assert len(html_tokens) == len(tags) | |
tree = self.html_tokenizer.detokenize_single(html_tokens, tags) | |
tree = to_webannotator(tree, self.entity_colors) | |
return tostring(tree, pretty_print=pretty_print) | |
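
For illustration, a minimal usage sketch of the new annotate_tokens method. The trained ner instance, the bytes_data input, and the fix_tags helper are assumptions for this sketch, not part of the patch; the point of the split is that callers can now post-process predicted tags before rendering:

    # Hypothetical usage of NER.annotate_tokens (names below are illustrative).
    html_tokens, tags = ner.extract_raw(bytes_data)  # tokenize HTML and predict tags
    tags = fix_tags(tags)                            # any custom tag post-processing (assumed helper)
    annotated_html = ner.annotate_tokens(html_tokens, tags, pretty_print=True)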
diff --git a/webstruct/tests/test_utils.py b/webstruct/tests/test_utils.py
index a27b339..e76d348 100644
--- a/webstruct/tests/test_utils.py
+++ b/webstruct/tests/test_utils.py
@@ -1,8 +1,59 @@
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from webstruct.utils import human_sorted
+from __future__ import absolute_import
+import re
+from webstruct.utils import human_sorted, fuzzy_assign_bio_tags, merge_bio_tags
+from webstruct import HtmlTokenizer
+from webstruct.utils import html_document_fromstring
 
 def test_human_sorted():
     assert human_sorted(['5', '10', '7', '100']) == ['5', '7', '10', '100']
     assert human_sorted(['foo1', 'foo10', 'foo2']) == ['foo1', 'foo2', 'foo10']
+
+def test_fuzzy_assign_bio_tags():
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokenizer = HtmlTokenizer()
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg)'
+    choices = ['Postbus 22 4500AA Oostburg']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/>The Netherlands</br><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg\s+(the\s+)?ne(d|th)erlands?)'
+    choices = ['Postbus 22 4500AA Oostburg', 'Postbus 22 4500AA Oostburg the netherlands']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    # tokens == [u'013-Witgoedreparaties.nl', u'Postbus', u'27', u'4500', u'AA', u'Oostburg',
+    #            u'The', u'Netherlands', u'[email protected]', u'tel:', u'013-5444999']
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<title>013-witgoedreparaties.nl | 013-witgoedreparaties.nl | title </title>"""
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'013-Witgoedreparaties.nl'
+    choices = ['013-witgoedreparaties.nl']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ORG', choices)
+    # tokens == [u'013-witgoedreparaties.nl', u'|', u'013-witgoedreparaties.nl', u'|', u'title']
+    assert tags == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
+
+def test_merge_bio_tags():
+    tags = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR']
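
Read together, the tests hint at the intended workflow: run fuzzy_assign_bio_tags once per entity type over the same token stream, then reconcile the results with merge_bio_tags. A rough sketch under that assumption (the patterns and choices are illustrative, reusing values from the tests above):

    # Sketch: tag the same tokens for two entity types, then merge the sequences.
    addr_tags = fuzzy_assign_bio_tags(html_tokens, ur'(Postbus.*?Oostburg)', 'ADDR',
                                      ['Postbus 22 4500AA Oostburg'])
    org_tags = fuzzy_assign_bio_tags(html_tokens, ur'013-Witgoedreparaties\.nl', 'ORG',
                                     ['013-witgoedreparaties.nl'])
    tags = merge_bio_tags(addr_tags, org_tags)  # per position, the later non-'O' tag wins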
diff --git a/webstruct/utils.py b/webstruct/utils.py
index 167a005..5501645 100644
--- a/webstruct/utils.py
+++ b/webstruct/utils.py
@@ -6,7 +6,7 @@ from functools import partial
 from itertools import chain
 import lxml.html
 from lxml.etree import iterwalk
-
+from webstruct.sequence_encoding import IobEncoder
 
 def merge_dicts(*dicts):
     """
@@ -322,3 +322,38 @@ def train_test_split_noshuffle(*arrays, **options):
     return list(chain.from_iterable(
         (a[:-test_size], a[-test_size:]) for a in arrays
     ))
+
+def fuzzy_assign_bio_tags(html_tokens, pattern, entity, choices, threshold=0.9):
+    """Assign BIO tags using fuzzy string matching.
+
+    It first finds candidate spans with the given regex ``pattern``,
+    then compares each candidate's similarity to the ``choices``;
+    if the similarity to any of the ``choices`` exceeds ``threshold``,
+    the ``entity`` label is assigned to the corresponding tags.
+
+    This function is helpful when working with a partially labeled dataset.
+    """
+    tokens = [html_token.token for html_token in html_tokens]
+    iob_encoder = IobEncoder()
+
+    from fuzzywuzzy import process  # local import: fuzzywuzzy is only needed here
+    def repl(m):
+        if process.extractBests(m.group(0), choices, score_cutoff=threshold * 100):
+            return u' __START_{0}__ {1} __END_{0}__ '.format(entity, m.group(0))
+        return m.group(0)
+
+    text = re.sub(pattern, repl, u" ".join(tokens), flags=re.I | re.U | re.DOTALL)
+    tags = [tag for _, tag in iob_encoder.encode(text.split())]
+    assert len(html_tokens) == len(tags)
+    return tags
+
+def merge_bio_tags(*tag_list):
+    """Merge several BIO tag sequences position by position."""
+    def select_tag(x, y):
+        # the later non-'O' tag wins
+        if y != 'O':
+            return y
+        if x != 'O':
+            return x
+        return 'O'
+    return [reduce(select_tag, i) for i in zip(*tag_list)]
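
To make the merge semantics concrete, a small worked example (the tag sequences are made up): at each position, reduce(select_tag, ...) keeps the rightmost non-'O' tag, so later sequences override earlier ones.

    merge_bio_tags(['B-ORG', 'I-ORG',  'O'],
                   ['O',     'B-ADDR', 'O'])
    # -> ['B-ORG', 'B-ADDR', 'O']
    # position 0: the second sequence is 'O', so 'B-ORG' from the first survives;
    # position 1: 'B-ADDR' from the later sequence overrides 'I-ORG'.

Note that reduce is used as a builtin, which is Python 2 (consistent with the ur'' literals elsewhere in the patch); under Python 3 it would need from functools import reduce.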