Created
March 12, 2020 20:32
-
-
Save halfak/867c3a5e4b7cbdbca62c9316c4985d06 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import logging | |
| import re | |
| from .extractor import TemplateExtractor | |
| logger = logging.getLogger(__name__) | |
def from_template(template):
    """
    Yield ``(project, label)`` pairs found in a "marca de projeto" template.

    The template name is normalized with ``normalize_project_name`` before
    it is compared, so differently-cased or underscore-separated spellings
    of the name are recognised as well.  Templates with any other name
    yield nothing.

    :Parameters:
        template : object exposing a ``name`` attribute
            The template to inspect.  It is also handed to
            ``extract_labels`` to find the pairs.

    :Returns:
        A generator of ``(project, label)`` tuples.
    """
    project_name = normalize_project_name(template.name)
    if project_name != "marca de projeto":
        return

    labels = extract_labels(template)
    # The first pair is deliberately skipped.
    # NOTE(review): presumably the first match pairs the template name
    # itself with the first positional value -- confirm against real
    # talk-page markup.
    for project, label in labels[1:]:
        yield (project, label)
# Matches one "<project>|<label>" pair: a run of characters other than
# "|", "{" or "}", followed by a pipe and a single label character
# ("0" through "5", or "*").
PROJECT_LABEL = re.compile(r"([^\|\{\{\}\}]+)\|([0-5\*])", re.I)


def extract_labels(template):
    """
    Return every ``(project, label)`` pair found in `template`.

    `template` is converted with ``str()`` before matching, so both plain
    strings and objects whose ``str()`` renders the markup to search are
    accepted (``re`` raises ``TypeError`` when given a non-string object).

    :Parameters:
        template : str or object convertible with ``str()``
            The markup to scan.

    :Returns:
        A list of ``(project, label)`` tuples in order of appearance; empty
        when nothing matches.
    """
    return [(match.group(1), match.group(2))
            for match in PROJECT_LABEL.finditer(str(template))]
def normalize_project_name(template_name):
    """
    Return a canonical form of a template name for comparison.

    The name is converted with ``str()``, underscores are replaced with
    spaces, surrounding whitespace is removed and the result is lower-cased.
    Stripping matters because a name taken from markup can be followed by a
    newline or spaces before the first ``|``.

    :Parameters:
        template_name : str or object convertible with ``str()``
            The raw template name.

    :Returns:
        The normalized name as a `str`.
    """
    return str(template_name).replace("_", " ").strip().lower()
# Extractor instance for Portuguese Wikipedia; restricted to namespace 1 and
# driven by this module's ``from_template`` generator.
ptwiki = TemplateExtractor(
    __name__,
    doc="""
articlequality.extractors.ptwiki
++++++++++++++++++++++++++++++++

This extractor looks for instances of the "marca de projeto" template on
article talk pages (namespace = 1) and reads its pipe-separated
"<project>|<label>" pairs to obtain a `project` and its label.
""",
    namespaces={1},
    from_template=from_template
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment