Skip to content

Instantly share code, notes, and snippets.

@dpnova
Created October 1, 2013 15:12
Show Gist options
  • Save dpnova/6780025 to your computer and use it in GitHub Desktop.
Save dpnova/6780025 to your computer and use it in GitHub Desktop.
class MyTemplateMaker(TemplateMaker):
    """TemplateMaker that can annotate an arbitrary attribute of a tag.

    The annotation is written into the page body as a
    ``data-scrapy-annotate`` attribute whose value is a JSON document of the
    form ``{"annotations": {<attribute>: <field>}}``.
    """

    def annotate_fragment_attribute(self, index, field, attribute='content'):
        """Annotate the closest tag at or before ``index`` with ``field``.

        ``self.htmlpage.parsed_body`` is scanned backwards starting at
        ``index``; the first opening or unpaired (self-closing) tag found
        receives the ``data-scrapy-annotate`` attribute and the scan stops.
        If no such tag exists, the page is left untouched.

        ``attribute`` names the tag attribute that ``field`` is mapped to;
        any false value (``None``, ``''``) falls back to ``"content"``.

        Raises ``FragmentAlreadyAnnotated`` if the tag found already carries
        a ``data-scrapy-annotate`` attribute.
        """
        page = self.htmlpage
        taggable = (HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG)
        for fragment in page.parsed_body[index::-1]:
            if not isinstance(fragment, HtmlTag):
                continue
            if fragment.tag_type not in taggable:
                continue
            if 'data-scrapy-annotate' in fragment.attributes:
                fstr = page.fragment_data(fragment)
                raise FragmentAlreadyAnnotated(
                    "Fragment already annotated: %s" % fstr)

            annotation = {'annotations': {attribute or "content": field}}
            # The JSON goes inside a double-quoted HTML attribute, so its own
            # double quotes must be entity-escaped or they would terminate
            # the attribute value early.
            escaped = json.dumps(annotation).replace('"', '&quot;')
            markup = ' data-scrapy-annotate="%s"' % escaped

            # Assumes ``fragment.end`` is one past the tag's closing '>', so
            # ``end - 1`` is the position of that '>' -- TODO confirm against
            # the HtmlTag implementation.
            insert_at = fragment.end - 1
            # For a self-closing tag ("<img ... />") step back over the '/'
            # as well, otherwise the attribute would land between '/' and
            # '>' and yield malformed markup.
            if (fragment.tag_type == HtmlTagType.UNPAIRED_TAG
                    and page.body[insert_at - 1:insert_at] == '/'):
                insert_at -= 1

            page.body = page.body[:insert_at] + markup + page.body[insert_at:]
            break

    def annotate_attribute(self, field, score_func, best_match=True,
                           attribute=None):
        """Annotate a field on a tag attribute.

        ``score_func`` is a callable that receives two arguments: (fragment,
        htmlpage) and returns a relevancy score (float) indicating how
        relevant the fragment is. 0 means the fragment is irrelevant; higher
        scores mean the fragment is more relevant.

        For every selected fragment, the closest opening or unpaired tag (to
        the left) is annotated, mapping ``attribute`` to ``field``. When
        ``attribute`` is ``None`` the annotation targets ``"content"``.

        If ``best_match`` is ``True``, only the best fragment is annotated.
        Otherwise, all fragments (with a positive relevancy) are annotated.

        Raises ``FragmentNotFound`` if ``score_func`` selects no fragment.
        """
        # ``select`` is inherited; presumably it returns fragment indexes
        # ordered best-first -- verify against the base class.
        indexes = self.select(score_func)
        if not indexes:
            raise FragmentNotFound(
                "Fragment not found annotating %r using: %s"
                % (field, score_func))
        if best_match:
            # Keep only the first (best) candidate.
            del indexes[1:]
        for i in indexes:
            self.annotate_fragment_attribute(i, field, attribute)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment