Created
July 29, 2020 15:03
-
-
Save Ladsgroup/a5ee3c7b1f8209496a35e54a0d77f744 to your computer and use it in GitHub Desktop.
features based on kmeans clustering of properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle

import numpy as np
from revscoring import Feature
from revscoring.datasources import \
    revision_oriented as revision_oriented_datasources
from revscoring.datasources.datasource import Datasource
from revscoring.features import wikibase as wikibase_
from revscoring.features import modifiers
from revscoring.features.feature_vector import FeatureVector
from revscoring.features.meta import aggregators, bools
from revscoring.features.modifiers import not_

from . import wikibase, wikimedia

name = "wikidatawiki"
# Load the pre-trained clustering model used by `_process_pids`.
#
# NOTE(review): the path is an absolute, developer-specific location; it is
# kept unchanged here, but should presumably become configurable.
# NOTE(security): `pickle.load` executes arbitrary code embedded in the file.
# Only ever point this at a model file from a trusted source.
with open(
        '/home/amsa/articlequality/articlequality/feature_lists/kmeans.pickle',
        'rb') as filehandler:
    # The `with` block guarantees the handle is closed once the model is
    # loaded (previously it was left open for the life of the process).
    kmeans = pickle.load(filehandler)
IMPORTANT_LANG_CODES = {'en', 'de', 'ar', 'zh', 'es', 'pt', 'ru', 'fr'}
"""
Language codes for important languages which are described in
https://www.wikidata.org/wiki/Wikidata:Item_quality#Translations
"""

# Same codes in alphabetical order, so that features generated from them
# always come out in a stable order (a set has no defined ordering).
IMPORTANT_LANG_CODES_LIST = sorted(IMPORTANT_LANG_CODES)
class properties:
    """
    Wikidata property identifiers, named by their English description
    """
    # Identifier paired with `items.HUMAN` to build the `is_human` feature.
    INSTANCE_OF = 'P31'
    # Identifiers whose presence/absence drives the birthday, dead and
    # `is_blp` features.
    DATE_OF_BIRTH = 'P569'
    DATE_OF_DEATH = 'P570'
class items:
    """
    Wikidata item identifiers, named by their English description
    """
    # Value matched against `properties.INSTANCE_OF` by the `is_human`
    # feature.
    HUMAN = 'Q5'
def _process_references(entity):
    """
    Flatten every reference attached to any statement of `entity` into a
    single list.

    Walks `entity.properties` (property id -> statements) and, for each
    statement, `statement.references` (property id -> references), keeping
    the references in iteration order.  The property ids themselves are not
    used.
    """
    collected = []
    for _statement_pid, statements in entity.properties.items():
        for statement in statements:
            for _reference_pid, refs in statement.references.items():
                collected.extend(refs)
    return collected
def _process_pids(entity):
    """
    Encode the property ids present on `entity` as a 0/1 vector and run it
    through the module-level `kmeans` model.

    Each key of `entity.properties` is expected to look like ``'P<number>'``;
    slot ``<number>`` of a 10000-element vector is set to 1 when that
    property is present.  Property numbers that do not fit in the vector are
    ignored.

    :Returns:
        Whatever ``kmeans.transform`` returns for the single-row input
        ``[vector]``.

    NOTE(review): the vector length presumably has to match the number of
    input dimensions the pickled model was trained with -- confirm before
    changing it.
    NOTE(review): ``transform`` is given one row and its result is returned
    as-is (not unwrapped); verify that this is the shape the consuming
    `FeatureVector` expects.
    """
    item_keys = np.zeros(10000)
    for pid in entity.properties:
        index = int(pid[1:])  # strip the leading 'P'
        # Valid slots are 0 .. len - 1.  The previous `>` comparison let
        # index == len through and raised IndexError on the assignment.
        if index >= len(item_keys):
            continue
        item_keys[index] = 1
    return kmeans.transform([item_keys])
# Feature vector produced by `_process_pids` from the revision's entity.
# NOTE(review): it is declared here but is not part of `local_wiki` in this
# module.
pids_transformed = FeatureVector(
    name + ".revision.pids_transformed",
    _process_pids,
    returns=float,
    depends_on=[wikibase_.revision.datasources.entity],
)

# Every reference found on any statement of the revision's entity.
references = Datasource(
    name + ".revision.references",
    _process_references,
    depends_on=[wikibase_.revision.datasources.entity],
)
def _process_wikimedia_references(references):
    """
    Keep only the references that point at a Wikimedia project.

    A reference is kept when its `datatype` is ``'wikibase-entityid'`` and
    the id of its `datavalue` is listed in `wikimedia.PROJECT_QIDS`.  The
    datavalue is only inspected for references of that datatype.
    """
    wikimedia_refs = []
    for ref in references:
        if ref.datatype != 'wikibase-entityid':
            continue
        if ref.datavalue.id in wikimedia.PROJECT_QIDS:
            wikimedia_refs.append(ref)
    return wikimedia_refs
# Subset of `references` selected by `_process_wikimedia_references`.
wikimedia_references = Datasource(
    name + ".revision.wikimedia_references",
    _process_wikimedia_references,
    depends_on=[references],
)
def _process_unique_references(references):
    """
    Collapse `references` into a set of ``(property, str(datavalue))``
    pairs, so that references sharing both values are counted once.
    """
    distinct = set()
    for ref in references:
        distinct.add((ref.property, str(ref.datavalue)))
    return distinct
# De-duplicated view of `references`, see `_process_unique_references`.
unique_references = Datasource(
    name + ".revision.unique_references",
    _process_unique_references,
    depends_on=[references],
)
def _process_complete_translations(item_labels, item_descriptions):
    """
    Return the keys that appear in both `item_labels` and
    `item_descriptions`, i.e. the languages that have a label *and* a
    description.
    """
    labelled = item_labels.keys()
    described = item_descriptions.keys()
    return labelled & described
# Languages for which the revision has both a label and a description.
complete_translations = Datasource(
    name + ".revision.complete_translations",
    _process_complete_translations,
    depends_on=[
        wikibase_.revision.datasources.labels,
        wikibase_.revision.datasources.descriptions,
    ],
)
def _process_important_complete_translations(complete_translations):
    """
    Restrict `complete_translations` to the codes listed in
    `IMPORTANT_LANG_CODES`.
    """
    important = complete_translations & IMPORTANT_LANG_CODES
    return important
# `complete_translations` narrowed down to the important language codes.
important_complete_translations = Datasource(
    name + ".revision.important_complete_translations",
    _process_important_complete_translations,
    depends_on=[complete_translations],
)
def _process_important_label_translations(item_labels):
    """
    Return the keys of `item_labels` that are listed in
    `IMPORTANT_LANG_CODES`.
    """
    label_langs = item_labels.keys()
    return label_langs & IMPORTANT_LANG_CODES
# Important language codes for which the revision has a label.
important_label_translations = Datasource(
    name + ".revision.important_label_translations",
    _process_important_label_translations,
    depends_on=[wikibase_.revision.datasources.labels],
)
def _process_important_description_translations(item_descriptions):
    """
    Return the keys of `item_descriptions` that are listed in
    `IMPORTANT_LANG_CODES`.
    """
    description_langs = item_descriptions.keys()
    return description_langs & IMPORTANT_LANG_CODES
# Important language codes for which the revision has a description.
important_description_translations = Datasource(
    name + ".revision.important_description_translations",
    _process_important_description_translations,
    depends_on=[wikibase_.revision.datasources.descriptions],
)
# Counting features derived from the reference datasources declared in this
# module.  Each string literal below documents the assignment that precedes
# it.

references_count = aggregators.len(references)
"`int` : A count of all sources in the revision"

wikimedia_references_count = aggregators.len(wikimedia_references)
"`int` : A count of all sources which come from Wikimedia projects"

# Defined as the difference of the two counts above.
external_references_count = references_count - wikimedia_references_count
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"
def _process_item_completeness(current_properties, properties_suggested):
    """
    Compute the rating-weighted share of suggested properties that the item
    already has.

    :Parameters:
        current_properties
            Mapping whose keys are the property ids present on the item.
        properties_suggested
            Iterable of mappings, each with an ``'id'`` and a ``'rating'``
            (anything `float()` accepts).

    :Returns:
        Sum of the ratings of suggested properties that are present, divided
        by the sum of all suggested ratings; ``0.0`` when that total is zero
        (including when nothing is suggested).
    """
    present_pids = set(current_properties.keys())
    total_weight = 0.0
    matched_weight = 0.0
    for suggestion in properties_suggested:
        weight = float(suggestion['rating'])
        total_weight += weight
        if suggestion['id'] in present_pids:
            matched_weight += weight
    if not total_weight:
        # Avoid dividing by zero when there is nothing to compare against.
        return 0.0
    return matched_weight / total_weight
# Float feature computed by `_process_item_completeness` from the revision's
# properties and the properties suggested for its page.
item_completeness = Feature(
    name + '.revision.page.item_completeness',
    _process_item_completeness,
    returns=float,
    depends_on=[
        wikibase_.revision.datasources.properties,
        revision_oriented_datasources.revision.page.suggested.properties,
    ],
)
# Status

# Built from the INSTANCE_OF property paired with the HUMAN item.
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.revision.is_human',
)

# Built from the DATE_OF_BIRTH property.
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name=name + '.revision.has_birthday',
)

# Built from the DATE_OF_DEATH property.
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name=name + '.revision.dead',
)

# Combines "has a birthday" with "not dead".
# NOTE(review): `is_human` is not part of this condition -- presumably
# intentional, confirm.
is_blp = has_birthday.and_(not_(dead))
# One `item_in_set` feature per important language code, for labels,
# descriptions and complete (label + description) translations.  The sorted
# code list is used so the features are always generated in the same order.

important_label_translation_features = [
    bools.item_in_set(lang_code, important_label_translations)
    for lang_code in IMPORTANT_LANG_CODES_LIST
]

important_description_translation_features = [
    bools.item_in_set(lang_code, important_description_translations)
    for lang_code in IMPORTANT_LANG_CODES_LIST
]

important_complete_translation_features = [
    bools.item_in_set(lang_code, important_complete_translations)
    for lang_code in IMPORTANT_LANG_CODES_LIST
]
# Features specific to this wiki: the per-language translation features
# followed by the status, reference and completeness features.
local_wiki = (
    important_label_translation_features
    + important_description_translation_features
    + important_complete_translation_features
    + [
        is_human,
        is_blp,
        aggregators.len(complete_translations),
        references_count,
        wikimedia_references_count,
        # The ratios divide by max(count, 1) so that a revision without
        # references does not divide by zero.
        wikimedia_references_count / modifiers.max(references_count, 1),
        external_references_count,
        external_references_count / modifiers.max(references_count, 1),
        unique_references_count,
        unique_references_count / modifiers.max(references_count, 1),
        item_completeness
    ]
)

# Full feature list: the `wikibase.item` features plus the local ones.
item_quality = wikibase.item + local_wiki
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment