rolisz · May 24, 2022 12:17
diff --git a/manual_patterns.py b/manual_patterns.py
 import copy
 from typing import Union, Iterable, Optional, List

 import spacy
 from prodigy import recipe, log, get_stream
 from prodigy.models.matcher import PatternMatcher
 from prodigy.types import RecipeSettingsType
 from prodigy.util import get_labels


 @recipe(
    "textcat.manual_patterns",
    # fmt: off
    dataset=("Dataset to save annotations to", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline or blank:lang (e.g. blank:en)", "positional", None, str),
    labels=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
    patterns=("Path to match patterns file", "option", "pt", str),
    # fmt: on
 )
 def manual(
    dataset: str,
    source: Union[str, Iterable[dict]],
    spacy_model: str,
    labels: Optional[List[str]] = None,
    patterns: Optional[str] = None,
 ) -> RecipeSettingsType:
    """
    Manually annotate categories that apply to a text. If more than one label
    is specified, categories are added as multiple choice options. If the
    --exclusive flag is set, categories become mutually exclusive, meaning that
    only one can be selected during annotation.
    """
    log("RECIPE: Starting recipe textcat.manual", locals())
    log(f"RECIPE: Annotating with {len(labels)} labels", labels)
    stream = get_stream(source, rehash=True, dedup=True, input_key="text")
    nlp = spacy.load(spacy_model)

    matcher = PatternMatcher(
        nlp,
        prior_correct=5.0,
        prior_incorrect=5.0,
        label_span=False,
        label_task=True,
        filter_labels=labels,
        combine_matches=True,
        task_hash_keys=("label",),
    )
    matcher = matcher.from_disk(patterns)
    stream = add_suggestions(stream, matcher, labels)

    return {
        "view_id": "choice",
        "dataset": dataset,
        "stream": stream,
        "config": {
            "labels": labels,
            "choice_style": "multiple",
            "choice_auto_accept": False,
            "exclude_by": "task",
            "auto_count_stream": True,
        },
    }


 def add_suggestions(stream, matcher, labels):
    texts = (eg for score, eg in matcher(stream))
    options = [{"id": label, "text": label} for label in labels]

    for eg in texts:
        task = copy.deepcopy(eg)

        task["options"] = options
        if 'label' in task:
            task["accept"] = [task['label']]
            del task['label']
        yield task
	import copy
	from typing import Union, Iterable, Optional, List

	import spacy
	from prodigy import recipe, log, get_stream
	from prodigy.models.matcher import PatternMatcher
	from prodigy.types import RecipeSettingsType
	from prodigy.util import get_labels


	@recipe(
	"textcat.manual_patterns",
	# fmt: off
	dataset=("Dataset to save annotations to", "positional", None, str),
	source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
	spacy_model=("Loadable spaCy pipeline or blank:lang (e.g. blank:en)", "positional", None, str),
	labels=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
	patterns=("Path to match patterns file", "option", "pt", str),
	# fmt: on
	)
	def manual(
	dataset: str,
	source: Union[str, Iterable[dict]],
	spacy_model: str,
	labels: Optional[List[str]] = None,
	patterns: Optional[str] = None,
	) -> RecipeSettingsType:
	"""
	Manually annotate categories that apply to a text. If more than one label
	is specified, categories are added as multiple choice options. If the
	--exclusive flag is set, categories become mutually exclusive, meaning that
	only one can be selected during annotation.
	"""
	log("RECIPE: Starting recipe textcat.manual", locals())
	log(f"RECIPE: Annotating with {len(labels)} labels", labels)
	stream = get_stream(source, rehash=True, dedup=True, input_key="text")
	nlp = spacy.load(spacy_model)

	matcher = PatternMatcher(
	nlp,
	prior_correct=5.0,
	prior_incorrect=5.0,
	label_span=False,
	label_task=True,
	filter_labels=labels,
	combine_matches=True,
	task_hash_keys=("label",),
	)
	matcher = matcher.from_disk(patterns)
	stream = add_suggestions(stream, matcher, labels)

	return {
	"view_id": "choice",
	"dataset": dataset,
	"stream": stream,
	"config": {
	"labels": labels,
	"choice_style": "multiple",
	"choice_auto_accept": False,
	"exclude_by": "task",
	"auto_count_stream": True,
	},
	}


	def add_suggestions(stream, matcher, labels):
	texts = (eg for score, eg in matcher(stream))
	options = [{"id": label, "text": label} for label in labels]

	for eg in texts:
	task = copy.deepcopy(eg)

	task["options"] = options
	if 'label' in task:
	task["accept"] = [task['label']]
	del task['label']
	yield task