This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python | |
Python 3.5.1+ (default, Mar 30 2016, 22:46:26) | |
[GCC 5.3.1 20160330] on linux | |
Type "help", "copyright", "credits" or "license" for more information. | |
>>> from deltas.tokenizers import wikitext_split | |
>>> wikitext_split.regex.pattern | |
"(?P<comment_start><!--)|(?P<comment_end>-->)|(?P<url>((bitcoin|geo|magnet|mailto|news|sips?|tel|urn)\\:|((|ftp|ftps|git|gopher|https?|ircs?|mms|nntp|redis|sftp|ssh|svn|telnet|worldwind|xmpp)\\:)?\\/\\/)[^\\s/$.?#].[^\\s]*)|(?P<entity>&[a-z][a-z0-9]*;)|(?P<cjk>[\\u4E00-\\u62FF\\u6300-\\u77FF\\u7800-\\u8CFF\\u8D00-\\u9FCC\\u3400-\\u4DFF\\U00020000-\\U000215FF\\U00021600-\\U000230FF\\U00023100-\\U000245FF\\U00024600-\\U000260FF\\U00026100-\\U000275FF\\U00027600-\\U000290FF\\U00029100-\\U0002A6DF\\uF900-\\uFAFF\\U0002F800-\\U0002FA1F\\u3041-\\u3096\\u30A0-\\u30FF\\u3400-\\u4DB5\\u4E00-\\u9FCB\\uF900-\\uFA6A\\u2E80-\\u2FD5\\uFF5F-\\uFF9F\\u31F0-\\u31FF\\u3220-\\u3243\\u3280-\\u337F])|(?P<ref_open><ref\\b[^>/]*>)|(?P<ref_close></ref\\b[^>]*>)|(?P<ref_singleton><ref\\b[^>/]*/>)|( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0 - Biography | |
1 - Sports | |
2 - etc. | |
[1,1,0,0,0,0,0,0] - Athlete biography | |
[0,0,1,0,0,0,0,0] | |
64 UBIGINT - Unsigned INT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import mwapi | |
import textstat | |
# Open an API session against English Wikipedia.
session = mwapi.Session(
    "https://en.wikipedia.org",
    user_agent="[email protected]",
)

# Ask for the main-slot revision content of the "Alan Turing" page.
doc = session.get(
    action='query',
    prop='revisions',
    rvprop='content',
    titles='Alan Turing',
    rvslots="main",
    formatversion=2,
)

# First page -> first revision -> main slot -> raw content.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']

# Timestamp taken immediately before the readability computation.
start = time.time()
print("flesch_reading_ease", textstat.flesch_reading_ease(text))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import re | |
from .extractor import TemplateExtractor | |
logger = logging.getLogger(__name__) | |
def from_template(template): | |
project_name = normalize_project_name(template.name) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python | |
Python 3.5.1+ (default, Mar 30 2016, 22:46:26) | |
[GCC 5.3.1 20160330] on linux | |
Type "help", "copyright", "credits" or "license" for more information. | |
>>> import re | |
>>> labels_re = r"([^\|\{\{\}\}]+)\|([0-5*])" | |
>>> my_template = "{{Marca de projeto|3|Biografias|4|Políticos|4|Brasil|3|WP Offline|2|bot=4/20111127|rev=20170714}}" | |
>>> [(m.group(1), m.group(2)) for m in re.finditer(labels_re, my_template)] | |
[('Marca de projeto', '3'), ('Biografias', '4'), ('Políticos', '4'), ('Brasil', '3'), ('WP Offline', '2')] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mwparserfromhell

# Sample wikitext holding three templates: one without arguments, one with
# arbitrary positional arguments, and a "Marca de projeto" template whose
# arguments alternate between a name and a single-digit value.
example = """
{{foo bar baz}}
{{I am a random template|7|Foo bar|8}}
{{Marca de projeto|3|Biografias|4|Políticos|4|Brasil|3|WP Offline|2|bot=4/20111127|rev=20170714}}"""

# Parse the sample and collect every template node found in it.
templates = list(mwparserfromhell.parse(example).filter_templates())
def from_template(template): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import mwapi | |
from revscoring.dependencies import solve | |
from revscoring.languages import english | |
from articlequality.feature_lists import enwiki | |
# NOTE: no user_agent is passed here, so mwapi sends its default User-Agent
# and logs a warning; pass user_agent=... with contact details for real use.
session = mwapi.Session("https://en.wikipedia.org")

# Request the "main" slot explicitly. Without rvslots the API answers in a
# deprecated legacy format and attaches a warning to the response.
doc = session.get(action='query', prop='revisions', rvprop='content',
                  titles='Alan Turing', rvslots="main", formatversion=2)

# With rvslots set, the content sits under the revision's slots mapping.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import mwapi | |
from revscoring.dependencies import solve | |
from revscoring.features import wikitext | |
from articlequality.feature_lists.enwiki import text_complexity | |
# NOTE: no user_agent is passed here, so mwapi sends its default User-Agent
# and logs a warning; pass user_agent=... with contact details for real use.
session = mwapi.Session("https://en.wikipedia.org")

# Request the "main" slot explicitly. Without rvslots the API answers in a
# deprecated legacy format and attaches a warning to the response.
doc = session.get(action='query', prop='revisions', rvprop='content',
                  titles='Alan Turing', rvslots="main", formatversion=2)

# With rvslots set, the content sits under the revision's slots mapping.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Three aggregate features built from `section_flesches` (presumably one
# Flesch score per section -- confirm against its definition): the minimum,
# the maximum and the mean, each registered under its own feature name.
min_section_flesch = aggregators.min(
    section_flesches, name="wikitext.revisions.sections.min_flesch")

max_section_flesch = aggregators.max(
    section_flesches, name="wikitext.revisions.sections.max_flesch")

mean_section_flesch = aggregators.mean(
    section_flesches, name="wikitext.revisions.sections.mean_flesch")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python 3.5.1+ (default, Mar 30 2016, 22:46:26) | |
[GCC 5.3.1 20160330] on linux | |
Type "help", "copyright", "credits" or "license" for more information. | |
>>> import mwapi | |
>>> from revscoring.languages import english | |
>>> from revscoring.dependencies import solve | |
>>> doc = mwapi.Session("https://en.wikipedia.org").get(action="query", prop="revisions", titles="Alan Turing", rvprop="content", formatversion=2) | |
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message. | |
The following query raised warnings: {'format': 'json', 'prop': 'revisions', 'rvprop': 'content', 'titles': 'Alan Turing', 'formatversion': 2, 'action': 'query'} | |
- revisions -- {'warnings': 'Because "rvslots" was not specified, a legacy format has been used for the output. This format is deprecated, and in the future the new format will always be used.'} |