Created
February 26, 2018 20:14
-
-
Save Ladsgroup/3efbe704382b3299c00bb7be03dbfb2f to your computer and use it in GitHub Desktop.
wikidata and enwiktionary batch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- ################################### Wikidata ################################## | |
? ------ -- | |
+ ############################# Wikidata ################################ | |
- | |
- # wikidatawiki.balanced_revisions.20k_2015.json is checked into the repo | |
- | |
- datasets/wikidatawiki.autolabeled_revisions.20k_2015.json: \ | |
- datasets/wikidatawiki.balanced_revisions.20k_2015.json | |
- cat $< | \ | |
- ./utility autolabel --host=https://wikidata.org \ | |
- --trusted-groups=abusefilter,arbcom,bureaucrat,checkuser,rollbacker,sysop,bot \ | |
- --trusted-edits=1000 \ | |
- --verbose > $@ | |
datasets/wikidatawiki.human_labeled_revisions.5k_2016.json: | |
./utility fetch_labels \ | |
https://labels.wmflabs.org/campaigns/wikidatawiki/19/ > $@ | |
datasets/wikidatawiki.labeled_revisions.20k_2015.json: \ | |
datasets/wikidatawiki.human_labeled_revisions.5k_2016.json \ | |
datasets/wikidatawiki.autolabeled_revisions.20k_2015.json | |
./utility merge_labels $^ > $@ | |
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json: \ | |
datasets/wikidatawiki.labeled_revisions.20k_2015.json | |
cat $< | \ | |
revscoring extract \ | |
- editquality.feature_lists.wikidatawiki.reverted \ | |
editquality.feature_lists.wikidatawiki.damaging \ | |
editquality.feature_lists.wikidatawiki.goodfaith \ | |
- --host https://wikidata.org \ | |
+ --host https://www.wikidata.org \ | |
? ++++ | |
+ --extractor $(max_extractors) \ | |
--verbose > $@ | |
tuning_reports/wikidatawiki.damaging.md: \ | |
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
cat $< | \ | |
revscoring tune \ | |
config/classifiers.params.yaml \ | |
editquality.feature_lists.wikidatawiki.damaging \ | |
damaging \ | |
roc_auc.labels.true \ | |
--label-weight "true=$(damaging_weight)" \ | |
--pop-rate "true=0.0008668694143782405" \ | |
--pop-rate "false=0.9991331305856218" \ | |
- --labels "true,false" \ | |
--center --scale \ | |
- --cv-timeout=60 \ | |
? ^ | |
+ --cv-timeout 60 \ | |
? ^ | |
--debug > $@ | |
models/wikidatawiki.damaging.gradient_boosting.model: \ | |
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
cat $< | \ | |
revscoring cv_train \ | |
revscoring.scoring.models.GradientBoosting \ | |
editquality.feature_lists.wikidatawiki.damaging \ | |
damaging \ | |
--version=$(damaging_major_minor).0 \ | |
+ -p 'learning_rate=0.01' \ | |
-p 'max_depth=7' \ | |
- -p 'learning_rate=0.01' \ | |
- -p 'max_features="log2"' \ | |
? - - | |
+ -p 'max_features=log2' \ | |
-p 'n_estimators=700' \ | |
--label-weight "true=$(damaging_weight)" \ | |
- --labels "true,false" \ | |
+ --pop-rate "true=0.0008668694143782405" \ | |
+ --pop-rate "false=0.9991331305856218" \ | |
--center --scale > $@ | |
tuning_reports/wikidatawiki.goodfaith.md: \ | |
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
cat $< | \ | |
revscoring tune \ | |
config/classifiers.params.yaml \ | |
editquality.feature_lists.wikidatawiki.goodfaith \ | |
goodfaith \ | |
roc_auc.labels.true \ | |
--label-weight "false=$(goodfaith_weight)" \ | |
--pop-rate "true=0.9998525516181488" \ | |
--pop-rate "false=0.00014744838185121178" \ | |
- --labels "true,false" \ | |
+ --center --scale \ | |
- --cv-timeout=60 \ | |
? ^ | |
+ --cv-timeout 60 \ | |
? ^ | |
--debug > $@ | |
models/wikidatawiki.goodfaith.gradient_boosting.model: \ | |
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
cat $< | \ | |
revscoring cv_train \ | |
revscoring.scoring.models.GradientBoosting \ | |
editquality.feature_lists.wikidatawiki.goodfaith \ | |
goodfaith \ | |
--version=$(goodfaith_major_minor).0 \ | |
+ -p 'learning_rate=0.1' \ | |
-p 'max_depth=5' \ | |
- -p 'learning_rate=0.1' \ | |
- -p 'max_features="log2"' \ | |
? - - | |
+ -p 'max_features=log2' \ | |
-p 'n_estimators=300' \ | |
--label-weight "false=$(goodfaith_weight)" \ | |
- --labels "true,false" \ | |
+ --pop-rate "true=0.9998525516181488" \ | |
+ --pop-rate "false=0.00014744838185121178" \ | |
--center --scale > $@ | |
wikidatawiki_models: \ | |
- models/wikidatawiki.damaging.gradient_boosting.model \ | |
? - | |
+ models/wikidatawiki.damaging.gradient_boosting.model \ | |
- models/wikidatawiki.goodfaith.gradient_boosting.model | |
? - | |
+ models/wikidatawiki.goodfaith.gradient_boosting.model | |
wikidatawiki_tuning_reports: \ | |
- tuning_reports/wikidatawiki.damaging.md \ | |
? - | |
+ tuning_reports/wikidatawiki.damaging.md \ | |
- tuning_reports/wikidatawiki.goodfaith.md | |
? - | |
+ tuning_reports/wikidatawiki.goodfaith.md | |
? | |
############################# English Wiktionary ################################ | |
datasets/enwiktionary.sampled_revisions.200k_2016.json: | |
wget -qO- https://quarry.wmflabs.org/run/97131/output/0/json-lines?download=true > $@ | |
datasets/enwiktionary.autolabeled_revisions.200k_2016.json: \ | |
datasets/enwiktionary.sampled_revisions.200k_2016.json | |
cat $< | \ | |
./utility autolabel --host=https://en.wiktionary.org \ | |
--trusted-groups=sysop,oversight,bot,rollbacker,checkuser,abusefilter,bureaucrat \ | |
--trusted-edits=1000 \ | |
+ --revert-radius=3 \ | |
+ --revert-window=48 \ | |
--verbose > $@ | |
- datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json: \ | |
? ------------ ^ ^^ ^^^ | |
+ datasets/enwiktionary.revisions_for_review.5k_2016.json: \ | |
? ^^^^^^ + ^ ^ | |
datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
+ ( \ | |
- cat $< | \ | |
+ cat $< | \ | |
? + | |
- grep -P '"rev_id": [0-9]+[02468],' > $@ | |
+ grep '"needs_review": true' | \ | |
+ shuf -n 2500; \ | |
+ cat $< | \ | |
+ grep '"needs_review": false' | \ | |
+ shuf -n 2500 \ | |
+ ) | shuf > $@ | |
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json: \ | |
- datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json | |
? --------- | |
+ datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
? + | |
- cat $< | \ | |
+ shuf -n 20000 $< | \ | |
revscoring extract \ | |
editquality.feature_lists.enwiktionary.reverted \ | |
--host https://en.wiktionary.org \ | |
--extractor $(max_extractors) \ | |
--verbose > $@ | |
- | |
- datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json: \ | |
- datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
- ( \ | |
- cat $< | \ | |
- grep '"reverted_for_damage": false' | shuf -n 20000; \ | |
- cat $< | \ | |
- grep '"reverted_for_damage": true' \ | |
- ) | shuf > $@ | |
tuning_reports/enwiktionary.reverted.md: \ | |
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json | |
cat $< | \ | |
revscoring tune \ | |
config/classifiers.params.yaml \ | |
editquality.feature_lists.enwiktionary.reverted \ | |
reverted_for_damage \ | |
roc_auc.labels.true \ | |
--label-weight "true=$(reverted_weight)" \ | |
--pop-rate "true=0.004778273117085203" \ | |
--pop-rate "false=0.9952217268829148" \ | |
--center --scale \ | |
- --cv-timeout=60 \ | |
? ^ | |
+ --cv-timeout 60 \ | |
? ^ | |
--debug > $@ | |
models/enwiktionary.reverted.rf.model: \ | |
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json | |
cat $< | \ | |
revscoring cv_train \ | |
revscoring.scoring.models.RandomForest \ | |
editquality.feature_lists.enwiktionary.reverted \ | |
reverted_for_damage \ | |
--version=$(reverted_major_minor).0 \ | |
- -p 'criterion="entropy"' \ | |
? - - | |
+ -p 'criterion=entropy' \ | |
- -p 'max_features="log2"' \ | |
? - - | |
+ -p 'max_features=log2' \ | |
+ -p 'min_samples_leaf=3' \ | |
-p 'n_estimators=320' \ | |
- -p 'min_samples_leaf=3' \ | |
--label-weight "true=$(reverted_weight)" \ | |
--pop-rate "true=0.004778273117085203" \ | |
--pop-rate "false=0.9952217268829148" \ | |
--center --scale > $@ | |
- | |
- | |
- datasets/enwiktionary.human_labeled_revisions.5k_2016.json: | |
- ./utility fetch_labels \ | |
- https://labels.wmflabs.org/campaigns/enwiktionary/59/ > $@ | |
- | |
- datasets/enwiktionary.labeled_revisions.100k_2016.json: \ | |
- datasets/enwiktionary.human_labeled_revisions.5k_2016.json \ | |
- datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json | |
- ./utility merge_labels $^ > $@ | |
- | |
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json: \ | |
- datasets/enwiktionary.labeled_revisions.100k_2016.json | |
- cat $< | \ | |
- revscoring extract \ | |
- editquality.feature_lists.enwiktionary.goodfaith \ | |
- editquality.feature_lists.enwiktionary.damaging \ | |
- --host https://en.wiktionary.org \ | |
- --extractor $(max_extractors) \ | |
- --verbose > $@ | |
- | |
- tuning_reports/enwiktionary.damaging.md: \ | |
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json | |
- cat $< | \ | |
- revscoring tune \ | |
- config/classifiers.params.yaml \ | |
- editquality.feature_lists.enwiktionary.damaging \ | |
- damaging \ | |
- roc_auc.labels.true \ | |
- --label-weight "true=$(damaging_weight)" \ | |
- --fixme \ | |
- --cv-timeout=60 \ | |
- --debug > $@ | |
- | |
- tuning_reports/enwiktionary.goodfaith.md: \ | |
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json | |
- cat $< | \ | |
- revscoring tune \ | |
- config/classifiers.params.yaml \ | |
- editquality.feature_lists.enwiktionary.goodfaith \ | |
- goodfaith \ | |
- roc_auc.labels.true \ | |
- --label-weight "false=$(goodfaith_weight)" \ | |
- --fixme \ | |
- --cv-timeout=60 \ | |
- --debug > $@ | |
enwiktionary_models: \ | |
models/enwiktionary.reverted.rf.model | |
enwiktionary_tuning_reports: \ | |
tuning_reports/enwiktionary.reverted.md | |
- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment