Skip to content

Instantly share code, notes, and snippets.

@halfak
Created March 12, 2020 20:32
Show Gist options
  • Select an option

  • Save halfak/867c3a5e4b7cbdbca62c9316c4985d06 to your computer and use it in GitHub Desktop.

Select an option

Save halfak/867c3a5e4b7cbdbca62c9316c4985d06 to your computer and use it in GitHub Desktop.
import logging
import re
from .extractor import TemplateExtractor
# Module-level logger, named after this module so log records can be traced
# back to it.
logger = logging.getLogger(__name__)
def from_template(template):
    """
    Yield ``(project, label)`` pairs found in a "marca de projeto" template.

    :Parameters:
        template : template object
            Must expose a ``name`` attribute; the whole object is handed to
            `extract_labels` to find the project/label pairs.

    :Returns:
        A generator of ``(project, label)`` tuples.  Nothing is yielded when
        the template is not a "marca de projeto" template or when it holds
        fewer than two project/label matches.
    """
    # Compare the *normalized* name (lower-cased, underscores -> spaces) and
    # ignore surrounding whitespace, so that spellings such as
    # "Marca de projeto" or "marca_de_projeto\n" are recognised too.
    project_name = normalize_project_name(template.name).strip()
    if project_name != "marca de projeto":
        return

    # The first match is deliberately skipped -- presumably because it pairs
    # the template name itself with the first parameter
    # (e.g. "marca de projeto|2"); TODO confirm against real talk pages.
    # Slicing an empty or one-element list simply yields nothing.
    for project, label in extract_labels(template)[1:]:
        yield (project, label)
# Matches one "<project>|<label>" pair inside a template's wikitext: a run of
# characters that are not "|", "{" or "}", followed by a pipe and a single
# label character (a digit 0-5 or "*").
PROJECT_LABEL = re.compile(r"([^\|\{\{\}\}]+)\|([0-5\*])", re.I)


def extract_labels(template):
    """
    Return every ``(project, label)`` pair matched in a template's text.

    :Parameters:
        template : `str` or any object whose ``str()`` is its wikitext
            The template to scan.

    :Returns:
        A `list` of ``(project, label)`` tuples, in order of appearance.
        The list is empty when nothing matches.
    """
    # `re` only accepts str/bytes; converting explicitly lets callers pass a
    # parsed template object instead of hitting a TypeError.
    wikitext = str(template)
    return [(match.group(1), match.group(2))
            for match in PROJECT_LABEL.finditer(wikitext)]
def normalize_project_name(template_name):
    """
    Lower-case a template name and turn its underscores into spaces.

    :Parameters:
        template_name : `str`-like
            The name to normalize; must provide ``lower()``.

    :Returns:
        The normalized name.
    """
    lowered = template_name.lower()
    return lowered.replace("_", " ")
# Template extractor for ptwiki: restricted to namespace 1 and driven by this
# module's `from_template` generator.
ptwiki = TemplateExtractor(
    __name__,
    doc="""
articlequality.extractors.ptwiki
++++++++++++++++++++++++++++++++

This extractor looks for instances of the "marca de projeto" template on
article talk pages (namespace = 1) and reads `project|label` pairs from the
template's text, where `label` is a digit from 0 to 5 or "*".  The first
pair matched in a template is skipped.
""",
    namespaces={1},
    from_template=from_template
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment