Created
October 1, 2013 15:12
-
-
Save dpnova/6780025 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MyTemplateMaker(TemplateMaker):
    """``TemplateMaker`` variant whose annotations can target any attribute.

    Annotations are written as a ``data-scrapy-annotate`` attribute holding a
    JSON object of the form ``{"annotations": {<attribute>: <field>}}``.
    Both open tags and unpaired tags are eligible for annotation.
    """

    def annotate_fragment_attribute(self, index, field, attribute='content'):
        """Annotate the closest open/unpaired tag at or before ``index``.

        Walks ``self.htmlpage.parsed_body`` backwards starting at ``index``
        and, on the first open or unpaired ``HtmlTag`` found, inserts a
        ``data-scrapy-annotate`` attribute into ``self.htmlpage.body`` that
        maps ``attribute`` to ``field``. If no such tag exists the page is
        left unchanged.

        :param index: position in ``parsed_body`` to start searching from.
        :param field: name of the field the annotation extracts into.
        :param attribute: tag attribute to map to ``field``; a falsy value
            (e.g. ``None``) falls back to ``"content"``.
        :raises FragmentAlreadyAnnotated: if the selected tag already has a
            ``data-scrapy-annotate`` attribute.
        """
        page = self.htmlpage
        taggable = (HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG)
        for fragment in page.parsed_body[index::-1]:
            if not isinstance(fragment, HtmlTag):
                continue
            if fragment.tag_type not in taggable:
                continue
            if 'data-scrapy-annotate' in fragment.attributes:
                fstr = page.fragment_data(fragment)
                raise FragmentAlreadyAnnotated(
                    "Fragment already annotated: %s" % fstr)

            annotation = {'annotations': {attribute or "content": field}}
            # JSON uses double quotes, which would terminate the
            # double-quoted HTML attribute value early; encode them as
            # entities so the attribute stays well-formed.
            value = json.dumps(annotation).replace('"', '&quot;')
            markup = ' data-scrapy-annotate="%s"' % value

            # The original code inserts at ``end - 1``, i.e. just before the
            # tag's final character (assumed to be the closing '>').
            insert_at = fragment.end - 1
            # For a self-closing tag ("<img ... />") step back over the '/'
            # as well, so the new attribute lands inside the tag rather than
            # after the slash.
            if (fragment.tag_type == HtmlTagType.UNPAIRED_TAG
                    and page.body[insert_at - 1:insert_at] == '/'):
                insert_at -= 1
            page.body = page.body[:insert_at] + markup + page.body[insert_at:]
            break

    def annotate_attribute(self, field, score_func, best_match=True,
                           attribute=None):
        """Annotate a field on the fragment(s) selected by ``score_func``.

        ``score_func`` is a callable that receives two arguments: (fragment,
        htmlpage) and returns a relevancy score (float) indicating how
        relevant the fragment is. 0 means the fragment is irrelevant; higher
        scores mean the fragment is more relevant.

        For every selected fragment, the closest open or unpaired tag (to the
        left) is annotated so that ``attribute`` maps to ``field``. When
        ``attribute`` is ``None`` the ``content`` attribute is used.

        If ``best_match`` is ``True``, only the first index returned by
        ``self.select`` is annotated (presumably the best-scoring fragment --
        confirm against ``select``). Otherwise, all returned fragments are
        annotated.

        :raises FragmentNotFound: if ``self.select`` returns no indexes.
        :raises FragmentAlreadyAnnotated: if a selected tag is already
            annotated.
        """
        indexes = self.select(score_func)
        if not indexes:
            raise FragmentNotFound("Fragment not found annotating %r using: %s" %
                                   (field, score_func))
        if best_match:
            # Slice instead of ``del`` so the sequence returned by
            # ``select`` is not mutated in place.
            indexes = indexes[:1]
        for i in indexes:
            self.annotate_fragment_attribute(i, field, attribute)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment