@tpeng
Created August 1, 2014 22:26
diff --git a/webstruct/model.py b/webstruct/model.py
index f77ab36..941cd8f 100644
--- a/webstruct/model.py
+++ b/webstruct/model.py
@@ -111,6 +111,13 @@ class NER(object):
         Return annotated HTML data in WebAnnotator format.
         """
         html_tokens, tags = self.extract_raw(bytes_data)
+        return self.annotate_tokens(html_tokens, tags, pretty_print)
+
+    def annotate_tokens(self, html_tokens, tags, pretty_print=False):
+        """
+        Return annotated HTML data in WebAnnotator format; input is tokens and tags.
+        """
+        assert len(html_tokens) == len(tags)
         tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
         tree = to_webannotator(tree, self.entity_colors)
         return tostring(tree, pretty_print=pretty_print)
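
Note: a minimal usage sketch of the new annotate_tokens() entry point (not part of the patch; the `ner` and `bytes_data` names are placeholders for a trained webstruct NER instance and raw HTML bytes):

    html_tokens, tags = ner.extract_raw(bytes_data)
    # tags can be inspected or corrected here before rendering
    annotated_html = ner.annotate_tokens(html_tokens, tags, pretty_print=True)

Splitting annotate() this way lets callers render WebAnnotator output from tokens and tags produced elsewhere, e.g. from fuzzy_assign_bio_tags() below.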
diff --git a/webstruct/tests/test_utils.py b/webstruct/tests/test_utils.py
index a27b339..e76d348 100644
--- a/webstruct/tests/test_utils.py
+++ b/webstruct/tests/test_utils.py
@@ -1,8 +1,59 @@
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from webstruct.utils import human_sorted
+from __future__ import absolute_import
+import re
+from webstruct.utils import human_sorted, fuzzy_assign_bio_tags, merge_bio_tags
+from webstruct import HtmlTokenizer
+from webstruct.utils import html_document_fromstring
 
 def test_human_sorted():
     assert human_sorted(['5', '10', '7', '100']) == ['5', '7', '10', '100']
     assert human_sorted(['foo1', 'foo10', 'foo2']) == ['foo1', 'foo2', 'foo10']
+
+def test_fuzzy_assign_bio_tags():
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokenizer = HtmlTokenizer()
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg)'
+    choices = ['Postbus 22 4500AA Oostburg']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/>The Netherlands</br><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg\s+(the\s+)?ne(d|th)erlands?)'
+    choices = ['Postbus 22 4500AA Oostburg', 'Postbus 22 4500AA Oostburg the netherlands']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    # tokens == [u'013-Witgoedreparaties.nl', u'Postbus', u'27', u'4500', u'AA', u'Oostburg',
+    #            u'The', u'Netherlands', u'[email protected]', u'tel:', u'013-5444999']
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<title>013-witgoedreparaties.nl | 013-witgoedreparaties.nl | title </title>"""
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'013-Witgoedreparaties\.nl'
+    choices = ['013-witgoedreparaties.nl']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ORG', choices)
+    # tokens == [u'013-witgoedreparaties.nl', u'|', u'013-witgoedreparaties.nl', u'|', u'title']
+    assert tags == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
+
+def test_merge_bio_tags():
+    tags = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR']
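
The new tests should be picked up by any runner that discovers test_* functions, e.g. (command is illustrative):

    py.test webstruct/tests/test_utils.py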
diff --git a/webstruct/utils.py b/webstruct/utils.py
index 167a005..5501645 100644
--- a/webstruct/utils.py
+++ b/webstruct/utils.py
@@ -6,7 +6,7 @@ from functools import partial
 from itertools import chain
 import lxml.html
 from lxml.etree import iterwalk
-
+from webstruct.sequence_encoding import IobEncoder
 
 def merge_dicts(*dicts):
     """
@@ -322,3 +322,38 @@ def train_test_split_noshuffle(*arrays, **options):
     return list(chain.from_iterable(
         (a[:-test_size], a[-test_size:]) for a in arrays
     ))
+
+def fuzzy_assign_bio_tags(html_tokens, pattern, entity, choices, threshold=0.9):
+    """Assign BIO tags using fuzzy string matching.
+
+    It first finds candidate spans with the given regex ``pattern``, then
+    compares each candidate to the ``choices``; if the similarity to any of
+    the ``choices`` exceeds ``threshold``, the ``entity`` label is assigned
+    to the corresponding tags.
+
+    This function is helpful when working with a partially labeled dataset.
+    """
+    tokens = [html_token.token for html_token in html_tokens]
+    iob_encoder = IobEncoder()
+
+    from fuzzywuzzy import process  # lazy import: fuzzywuzzy stays an optional dependency
+    def repl(m):
+        if process.extractBests(m.group(0), choices, score_cutoff=threshold * 100):
+            return u' __START_{0}__ {1} __END_{0}__ '.format(entity, m.group(0))
+        return m.group(0)
+
+    text = re.sub(pattern, repl, u" ".join(tokens), flags=re.I | re.U | re.DOTALL)
+    tags = [tag for _, tag in iob_encoder.encode(text.split())]
+    assert len(html_tokens) == len(tags)
+    return tags
+
+def merge_bio_tags(*tag_list):
+    """Merge several BIO tag sequences; on conflicts the later sequence wins."""
+    def select_tag(x, y):
+        # a later non-'O' tag overrides an earlier one
+        if y != 'O':
+            return y
+        if x != 'O':
+            return x
+        return 'O'
+    return [reduce(select_tag, i) for i in zip(*tag_list)]
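
Taken together, the two helpers support a semi-supervised labeling flow: run fuzzy_assign_bio_tags() once per entity type against a list of known values, then combine the resulting sequences with merge_bio_tags(). A minimal sketch (the html, known_addresses and known_orgs variables are hypothetical placeholders, not part of this patch):

    from webstruct import HtmlTokenizer
    from webstruct.utils import (html_document_fromstring,
                                 fuzzy_assign_bio_tags, merge_bio_tags)

    html_tokens, _ = HtmlTokenizer().tokenize_single(
        html_document_fromstring(html))

    addr_tags = fuzzy_assign_bio_tags(html_tokens, ur'(Postbus.*?Oostburg)',
                                      'ADDR', known_addresses)
    org_tags = fuzzy_assign_bio_tags(html_tokens, ur'Witgoedreparaties\.nl',
                                     'ORG', known_orgs)
    tags = merge_bio_tags(addr_tags, org_tags)  # later sequences win on conflicts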