Created
August 1, 2014 22:26
diff --git a/webstruct/model.py b/webstruct/model.py
index f77ab36..941cd8f 100644
--- a/webstruct/model.py
+++ b/webstruct/model.py
@@ -111,6 +111,13 @@ class NER(object):
         Return annotated HTML data in WebAnnotator format.
         """
         html_tokens, tags = self.extract_raw(bytes_data)
+        return self.annotate_tokens(html_tokens, tags)
+
+    def annotate_tokens(self, html_tokens, tags, pretty_print=False):
+        """
+        Return annotated HTML data in WebAnnotator format; the input is HTML tokens and their tags.
+ """ | |
+ assert len(html_tokens) == len(tags) | |
tree = self.html_tokenizer.detokenize_single(html_tokens, tags) | |
tree = to_webannotator(tree, self.entity_colors) | |
return tostring(tree, pretty_print=pretty_print) | |
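
For illustration, a minimal usage sketch of the new annotate_tokens method. The trained ner instance, the bytes_data input, and the fix_tags helper are assumptions for this sketch, not part of the patch; the point of the split is that callers can now post-process predicted tags before rendering:

    # Hypothetical usage of NER.annotate_tokens (names below are illustrative).
    html_tokens, tags = ner.extract_raw(bytes_data)  # tokenize HTML and predict tags
    tags = fix_tags(tags)                            # any custom tag post-processing (assumed helper)
    annotated_html = ner.annotate_tokens(html_tokens, tags, pretty_print=True)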
diff --git a/webstruct/tests/test_utils.py b/webstruct/tests/test_utils.py
index a27b339..e76d348 100644
--- a/webstruct/tests/test_utils.py
+++ b/webstruct/tests/test_utils.py
@@ -1,8 +1,59 @@
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from webstruct.utils import human_sorted
+from __future__ import absolute_import
+import re
+from webstruct.utils import human_sorted, fuzzy_assign_bio_tags, merge_bio_tags
+from webstruct import HtmlTokenizer
+from webstruct.utils import html_document_fromstring
 
 def test_human_sorted():
     assert human_sorted(['5', '10', '7', '100']) == ['5', '7', '10', '100']
     assert human_sorted(['foo1', 'foo10', 'foo2']) == ['foo1', 'foo2', 'foo10']
+
+def test_fuzzy_assign_bio_tags():
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokenizer = HtmlTokenizer()
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg)'
+    choices = ['Postbus 22 4500AA Oostburg']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<div id="intro_contact">
+    013-Witgoedreparaties.nl<br/>Postbus 27<br/>
+    4500 AA Oostburg<br/>The Netherlands</br><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
+    </div>
+    """
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'(Postbus.*?Oostburg\s+(the\s+)?ne(d|th)erlands?)'
+    choices = ['Postbus 22 4500AA Oostburg', 'Postbus 22 4500AA Oostburg the netherlands']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ADDR', choices)
+    # tokens == [u'013-Witgoedreparaties.nl', u'Postbus', u'27', u'4500', u'AA', u'Oostburg',
+    #            u'The', u'Netherlands', u'[email protected]', u'tel:', u'013-5444999']
+    assert tags == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
+
+    html = """<title>013-witgoedreparaties.nl | 013-witgoedreparaties.nl | title </title>"""
+    html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
+    tokens = [t.token for t in html_tokens]
+    pattern = ur'013-Witgoedreparaties.nl'
+    choices = ['013-witgoedreparaties.nl']
+    tags = fuzzy_assign_bio_tags(html_tokens, pattern, 'ORG', choices)
+    # tokens == [u'013-witgoedreparaties.nl', u'|', u'013-witgoedreparaties.nl', u'|', u'title']
+    assert tags == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
+
+def test_merge_bio_tags():
+    tags = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']
+
+    tags = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'], ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR'])
+    assert tags == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'B-ADDR']
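
Read together, the tests hint at the intended workflow: run fuzzy_assign_bio_tags once per entity type over the same token stream, then reconcile the results with merge_bio_tags. A rough sketch under that assumption (the patterns and choices are illustrative, reusing values from the tests above):

    # Sketch: tag the same tokens for two entity types, then merge the sequences.
    addr_tags = fuzzy_assign_bio_tags(html_tokens, ur'(Postbus.*?Oostburg)', 'ADDR',
                                      ['Postbus 22 4500AA Oostburg'])
    org_tags = fuzzy_assign_bio_tags(html_tokens, ur'013-Witgoedreparaties\.nl', 'ORG',
                                     ['013-witgoedreparaties.nl'])
    tags = merge_bio_tags(addr_tags, org_tags)  # per position, the later non-'O' tag wins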
diff --git a/webstruct/utils.py b/webstruct/utils.py
index 167a005..5501645 100644
--- a/webstruct/utils.py
+++ b/webstruct/utils.py
@@ -6,7 +6,7 @@ from functools import partial
 from itertools import chain
 import lxml.html
 from lxml.etree import iterwalk
-
+from webstruct.sequence_encoding import IobEncoder
 
 def merge_dicts(*dicts):
     """
@@ -322,3 +322,38 @@ def train_test_split_noshuffle(*arrays, **options):
     return list(chain.from_iterable(
         (a[:-test_size], a[-test_size:]) for a in arrays
     ))
+
+def fuzzy_assign_bio_tags(html_tokens, pattern, entity, choices, threshold=0.9):
+    """Assign BIO tags using fuzzy string matching.
+
+    It first finds candidate spans with the given regex ``pattern``,
+    then compares each candidate's similarity to the ``choices``;
+    if the similarity to any of the ``choices`` exceeds ``threshold``,
+    the ``entity`` label is assigned to the corresponding tags.
+
+    This function is helpful when working with a partially labeled dataset.
+    """
+    tokens = [html_token.token for html_token in html_tokens]
+    iob_encoder = IobEncoder()
+
+    from fuzzywuzzy import process  # local import: fuzzywuzzy is only needed here
+    def repl(m):
+        if process.extractBests(m.group(0), choices, score_cutoff=threshold * 100):
+            return u' __START_{0}__ {1} __END_{0}__ '.format(entity, m.group(0))
+        return m.group(0)
+
+    text = re.sub(pattern, repl, u" ".join(tokens), flags=re.I | re.U | re.DOTALL)
+    tags = [tag for _, tag in iob_encoder.encode(text.split())]
+    assert len(html_tokens) == len(tags)
+    return tags
+
+def merge_bio_tags(*tag_list):
+    """Merge several BIO tag sequences position by position."""
+    def select_tag(x, y):
+        # the later non-'O' tag wins
+        if y != 'O':
+            return y
+        if x != 'O':
+            return x
+        return 'O'
+    return [reduce(select_tag, i) for i in zip(*tag_list)]
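
To make the merge semantics concrete, a small worked example (the tag sequences are made up): at each position, reduce(select_tag, ...) keeps the rightmost non-'O' tag, so later sequences override earlier ones.

    merge_bio_tags(['B-ORG', 'I-ORG',  'O'],
                   ['O',     'B-ADDR', 'O'])
    # -> ['B-ORG', 'B-ADDR', 'O']
    # position 0: the second sequence is 'O', so 'B-ORG' from the first survives;
    # position 1: 'B-ADDR' from the later sequence overrides 'I-ORG'.

Note that reduce is used as a builtin, which is Python 2 (consistent with the ur'' literals elsewhere in the patch); under Python 3 it would need from functools import reduce.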