-
-
Save yfe404/4378f5c0e5be902d93be522ddc247e6b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from revscoring.features import wikitext
from revscoring.languages import english

# Feature lists built from revision-level wikitext features and English
# language features, combined at the bottom into `draft_quality`.

# Short private alias for the revision-level wikitext feature namespace.
_rev = wikitext.revision

# Character-level features. `chars` comes first because it is the denominator
# of the character ratios further down.
char_based = [
    _rev.chars,
    _rev.whitespace_chars,
    _rev.markup_chars,
    _rev.cjk_chars,
    _rev.entity_chars,
    _rev.url_chars,
    _rev.word_chars,
    _rev.uppercase_word_chars,
    _rev.punctuation_chars,
    _rev.break_chars,
    _rev.longest_repeated_char,
]

# Token-level features. `tokens` comes first because it is the denominator of
# the token ratios further down.
token_based = [
    _rev.tokens,
    _rev.numbers,
    _rev.whitespaces,
    _rev.markups,
    _rev.cjks,
    _rev.entities,
    _rev.urls,
    _rev.words,
    _rev.uppercase_words,
    _rev.punctuations,
    _rev.breaks,
    _rev.longest_token,
    _rev.longest_word,
]

# English-specific features: badword and informal matches, plus stopword and
# non-stopword features.
lang_based = [
    english.badwords.revision.matches,
    english.stopwords.revision.stopwords,
    english.stopwords.revision.non_stopwords,
    english.informals.revision.matches,
]

# Features taken from the `wikitext.revision` attributes for content chars,
# headings, links, tags and templates.
parse_based = [
    _rev.content_chars,
    _rev.headings,
    _rev.external_links,
    _rev.wikilinks,
    _rev.tags,
    _rev.ref_tags,
    _rev.templates,
]

# NOTE(review): every ratio below divides by a count feature (chars, tokens or
# words). Confirm how revscoring's feature division behaves when that
# denominator is zero, e.g. for an empty revision text.

# Each character feature except `chars` itself, divided by `chars`.
char_normalized = [feature / _rev.chars for feature in char_based[1:]]

# Each token feature except `tokens` itself, divided by `tokens`.
token_normalized = [feature / _rev.tokens for feature in token_based[1:]]

# Each language feature divided by `words`.
lang_normalized = [feature / _rev.words for feature in lang_based]

# Full feature list: raw groups first, then their normalized counterparts.
draft_quality = [
    *char_based,
    *token_based,
    *parse_based,
    *lang_based,
    *char_normalized,
    *token_normalized,
    *lang_normalized,
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| >>> from pprint import pprint | |
| >>> | |
| >>> from revscoring.datasources import revision_oriented as ro | |
| >>> from revscoring.dependencies import solve | |
| >>> | |
| >>> | |
| >>> # Sample wikitext that is supplied as the revision text below. | |
| >>> text = """ | |
| ... {{Infobox|foo}} | |
| ... I am an article | |
| ... | |
| ... == Header! == | |
| ... * list | |
| ... * items | |
| ... * [[stuff|and a link]]<ref>some stuff</ref> | |
| ... """ | |
| >>> # Solve every feature in draft_quality, passing the sample text in via the cache, | |
| >>> # and print each feature paired with its solved value. | |
| >>> pprint(list(zip(draft_quality, solve(draft_quality, cache={ro.revision.text: text})))) | |
| [(<feature.wikitext.revision.chars>, 139), | |
| (<feature.wikitext.revision.whitespace_chars>, 27), | |
| (<feature.wikitext.revision.markup_chars>, 12), | |
| (<feature.wikitext.revision.cjk_chars>, 0), | |
| (<feature.wikitext.revision.entity_chars>, 0), | |
| (<feature.wikitext.revision.url_chars>, 0), | |
| (<feature.wikitext.revision.word_chars>, 59), | |
| (<feature.wikitext.revision.uppercase_word_chars>, 0), | |
| (<feature.wikitext.revision.punctuation_chars>, 25), | |
| (<feature.wikitext.revision.break_chars>, 0), | |
| (<feature.wikitext.revision.longest_repeated_char>, 3), | |
| (<feature.len(<datasource.tokenized(datasource.revision.text)>)>, 64), | |
| (<feature.len(<datasource.wikitext.revision.numbers>)>, 0), | |
| (<feature.len(<datasource.wikitext.revision.whitespaces>)>, 27), | |
| (<feature.len(<datasource.wikitext.revision.markups>)>, 6), | |
| (<feature.len(<datasource.wikitext.revision.cjks>)>, 0), | |
| (<feature.len(<datasource.wikitext.revision.entities>)>, 0), | |
| (<feature.len(<datasource.wikitext.revision.urls>)>, 0), | |
| (<feature.len(<datasource.wikitext.revision.words>)>, 15), | |
| (<feature.len(<datasource.wikitext.revision.uppercase_words>)>, 0), | |
| (<feature.len(<datasource.wikitext.revision.punctuations>)>, 9), | |
| (<feature.len(<datasource.wikitext.revision.breaks>)>, 0), | |
| (<feature.max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>)>, | |
| 7), | |
| (<feature.max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>)>, | |
| 7), | |
| (<feature.wikitext.revision.content_chars>, 102), | |
| (<feature.wikitext.revision.headings>, 0), | |
| (<feature.wikitext.revision.external_links>, 0), | |
| (<feature.wikitext.revision.wikilinks>, 1), | |
| (<feature.wikitext.revision.tags>, 1), | |
| (<feature.wikitext.revision.ref_tags>, 1), | |
| (<feature.wikitext.revision.templates>, 1), | |
| (<feature.len(<datasource.english.badwords.revision.matches>)>, 0), | |
| (<feature.len(<datasource.english.stopwords.revision.stopwords>)>, 6), | |
| (<feature.len(<datasource.english.stopwords.revision.non_stopwords>)>, 9), | |
| (<feature.len(<datasource.english.informals.revision.matches>)>, 1), | |
| (<feature.(wikitext.revision.whitespace_chars / wikitext.revision.chars)>, | |
| 0.19424460431654678), | |
| (<feature.(wikitext.revision.markup_chars / wikitext.revision.chars)>, | |
| 0.08633093525179857), | |
| (<feature.(wikitext.revision.cjk_chars / wikitext.revision.chars)>, 0.0), | |
| (<feature.(wikitext.revision.entity_chars / wikitext.revision.chars)>, 0.0), | |
| (<feature.(wikitext.revision.url_chars / wikitext.revision.chars)>, 0.0), | |
| (<feature.(wikitext.revision.word_chars / wikitext.revision.chars)>, | |
| 0.4244604316546763), | |
| (<feature.(wikitext.revision.uppercase_word_chars / wikitext.revision.chars)>, | |
| 0.0), | |
| (<feature.(wikitext.revision.punctuation_chars / wikitext.revision.chars)>, | |
| 0.17985611510791366), | |
| (<feature.(wikitext.revision.break_chars / wikitext.revision.chars)>, 0.0), | |
| (<feature.(wikitext.revision.longest_repeated_char / wikitext.revision.chars)>, | |
| 0.02158273381294964), | |
| (<feature.(len(<datasource.wikitext.revision.numbers>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.wikitext.revision.whitespaces>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.421875), | |
| (<feature.(len(<datasource.wikitext.revision.markups>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.09375), | |
| (<feature.(len(<datasource.wikitext.revision.cjks>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.wikitext.revision.entities>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.wikitext.revision.urls>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.wikitext.revision.words>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.234375), | |
| (<feature.(len(<datasource.wikitext.revision.uppercase_words>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.wikitext.revision.punctuations>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.140625), | |
| (<feature.(len(<datasource.wikitext.revision.breaks>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.0), | |
| (<feature.(max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.109375), | |
| (<feature.(max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, | |
| 0.109375), | |
| (<feature.(len(<datasource.english.badwords.revision.matches>) / len(<datasource.wikitext.revision.words>))>, | |
| 0.0), | |
| (<feature.(len(<datasource.english.stopwords.revision.stopwords>) / len(<datasource.wikitext.revision.words>))>, | |
| 0.4), | |
| (<feature.(len(<datasource.english.stopwords.revision.non_stopwords>) / len(<datasource.wikitext.revision.words>))>, | |
| 0.6), | |
| (<feature.(len(<datasource.english.informals.revision.matches>) / len(<datasource.wikitext.revision.words>))>, | |
| 0.06666666666666667)] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment