Created
February 26, 2018 20:14
-
-
Save Ladsgroup/3efbe704382b3299c00bb7be03dbfb2f to your computer and use it in GitHub Desktop.
wikidata and enwiktionary batch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| - ################################### Wikidata ################################## | |
| ? ------ -- | |
| + ############################# Wikidata ################################ | |
| - | |
| - # wikidatawiki.balanced_revisions.20k_2015.json is checked into the repo | |
| - | |
| - datasets/wikidatawiki.autolabeled_revisions.20k_2015.json: \ | |
| - datasets/wikidatawiki.balanced_revisions.20k_2015.json | |
| - cat $< | \ | |
| - ./utility autolabel --host=https://wikidata.org \ | |
| - --trusted-groups=abusefilter,arbcom,bureaucrat,checkuser,rollbacker,sysop,bot \ | |
| - --trusted-edits=1000 \ | |
| - --verbose > $@ | |
| datasets/wikidatawiki.human_labeled_revisions.5k_2016.json: | |
| ./utility fetch_labels \ | |
| https://labels.wmflabs.org/campaigns/wikidatawiki/19/ > $@ | |
| datasets/wikidatawiki.labeled_revisions.20k_2015.json: \ | |
| datasets/wikidatawiki.human_labeled_revisions.5k_2016.json \ | |
| datasets/wikidatawiki.autolabeled_revisions.20k_2015.json | |
| ./utility merge_labels $^ > $@ | |
| datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json: \ | |
| datasets/wikidatawiki.labeled_revisions.20k_2015.json | |
| cat $< | \ | |
| revscoring extract \ | |
| - editquality.feature_lists.wikidatawiki.reverted \ | |
| editquality.feature_lists.wikidatawiki.damaging \ | |
| editquality.feature_lists.wikidatawiki.goodfaith \ | |
| - --host https://wikidata.org \ | |
| + --host https://www.wikidata.org \ | |
| ? ++++ | |
| + --extractor $(max_extractors) \ | |
| --verbose > $@ | |
| tuning_reports/wikidatawiki.damaging.md: \ | |
| datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
| cat $< | \ | |
| revscoring tune \ | |
| config/classifiers.params.yaml \ | |
| editquality.feature_lists.wikidatawiki.damaging \ | |
| damaging \ | |
| roc_auc.labels.true \ | |
| --label-weight "true=$(damaging_weight)" \ | |
| --pop-rate "true=0.0008668694143782405" \ | |
| --pop-rate "false=0.9991331305856218" \ | |
| - --labels "true,false" \ | |
| --center --scale \ | |
| - --cv-timeout=60 \ | |
| ? ^ | |
| + --cv-timeout 60 \ | |
| ? ^ | |
| --debug > $@ | |
| models/wikidatawiki.damaging.gradient_boosting.model: \ | |
| datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
| cat $< | \ | |
| revscoring cv_train \ | |
| revscoring.scoring.models.GradientBoosting \ | |
| editquality.feature_lists.wikidatawiki.damaging \ | |
| damaging \ | |
| --version=$(damaging_major_minor).0 \ | |
| + -p 'learning_rate=0.01' \ | |
| -p 'max_depth=7' \ | |
| - -p 'learning_rate=0.01' \ | |
| - -p 'max_features="log2"' \ | |
| ? - - | |
| + -p 'max_features=log2' \ | |
| -p 'n_estimators=700' \ | |
| --label-weight "true=$(damaging_weight)" \ | |
| - --labels "true,false" \ | |
| + --pop-rate "true=0.0008668694143782405" \ | |
| + --pop-rate "false=0.9991331305856218" \ | |
| --center --scale > $@ | |
| tuning_reports/wikidatawiki.goodfaith.md: \ | |
| datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
| cat $< | \ | |
| revscoring tune \ | |
| config/classifiers.params.yaml \ | |
| editquality.feature_lists.wikidatawiki.goodfaith \ | |
| goodfaith \ | |
| roc_auc.labels.true \ | |
| --label-weight "false=$(goodfaith_weight)" \ | |
| --pop-rate "true=0.9998525516181488" \ | |
| --pop-rate "false=0.00014744838185121178" \ | |
| - --labels "true,false" \ | |
| + --center --scale \ | |
| - --cv-timeout=60 \ | |
| ? ^ | |
| + --cv-timeout 60 \ | |
| ? ^ | |
| --debug > $@ | |
| models/wikidatawiki.goodfaith.gradient_boosting.model: \ | |
| datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json | |
| cat $< | \ | |
| revscoring cv_train \ | |
| revscoring.scoring.models.GradientBoosting \ | |
| editquality.feature_lists.wikidatawiki.goodfaith \ | |
| goodfaith \ | |
| --version=$(goodfaith_major_minor).0 \ | |
| + -p 'learning_rate=0.1' \ | |
| -p 'max_depth=5' \ | |
| - -p 'learning_rate=0.1' \ | |
| - -p 'max_features="log2"' \ | |
| ? - - | |
| + -p 'max_features=log2' \ | |
| -p 'n_estimators=300' \ | |
| --label-weight "false=$(goodfaith_weight)" \ | |
| - --labels "true,false" \ | |
| + --pop-rate "true=0.9998525516181488" \ | |
| + --pop-rate "false=0.00014744838185121178" \ | |
| --center --scale > $@ | |
| wikidatawiki_models: \ | |
| - models/wikidatawiki.damaging.gradient_boosting.model \ | |
| ? - | |
| + models/wikidatawiki.damaging.gradient_boosting.model \ | |
| - models/wikidatawiki.goodfaith.gradient_boosting.model | |
| ? - | |
| + models/wikidatawiki.goodfaith.gradient_boosting.model | |
| wikidatawiki_tuning_reports: \ | |
| - tuning_reports/wikidatawiki.damaging.md \ | |
| ? - | |
| + tuning_reports/wikidatawiki.damaging.md \ | |
| - tuning_reports/wikidatawiki.goodfaith.md | |
| ? - | |
| + tuning_reports/wikidatawiki.goodfaith.md | |
| ? | |
| ############################# English Wiktionary ################################ | |
| datasets/enwiktionary.sampled_revisions.200k_2016.json: | |
| wget -qO- https://quarry.wmflabs.org/run/97131/output/0/json-lines?download=true > $@ | |
| datasets/enwiktionary.autolabeled_revisions.200k_2016.json: \ | |
| datasets/enwiktionary.sampled_revisions.200k_2016.json | |
| cat $< | \ | |
| ./utility autolabel --host=https://en.wiktionary.org \ | |
| --trusted-groups=sysop,oversight,bot,rollbacker,checkuser,abusefilter,bureaucrat \ | |
| --trusted-edits=1000 \ | |
| + --revert-radius=3 \ | |
| + --revert-window=48 \ | |
| --verbose > $@ | |
| - datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json: \ | |
| ? ------------ ^ ^^ ^^^ | |
| + datasets/enwiktionary.revisions_for_review.5k_2016.json: \ | |
| ? ^^^^^^ + ^ ^ | |
| datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
| + ( \ | |
| - cat $< | \ | |
| + cat $< | \ | |
| ? + | |
| - grep -P '"rev_id": [0-9]+[02468],' > $@ | |
| + grep '"needs_review": true' | \ | |
| + shuf -n 2500; \ | |
| + cat $< | \ | |
| + grep '"needs_review": false' | \ | |
| + shuf -n 2500 \ | |
| + ) | shuf > $@ | |
| datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json: \ | |
| - datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json | |
| ? --------- | |
| + datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
| ? + | |
| - cat $< | \ | |
| + shuf -n 20000 $< | \ | |
| revscoring extract \ | |
| editquality.feature_lists.enwiktionary.reverted \ | |
| --host https://en.wiktionary.org \ | |
| --extractor $(max_extractors) \ | |
| --verbose > $@ | |
| - | |
| - datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json: \ | |
| - datasets/enwiktionary.autolabeled_revisions.200k_2016.json | |
| - ( \ | |
| - cat $< | \ | |
| - grep '"reverted_for_damage": false' | shuf -n 20000; \ | |
| - cat $< | \ | |
| - grep '"reverted_for_damage": true' \ | |
| - ) | shuf > $@ | |
| tuning_reports/enwiktionary.reverted.md: \ | |
| datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json | |
| cat $< | \ | |
| revscoring tune \ | |
| config/classifiers.params.yaml \ | |
| editquality.feature_lists.enwiktionary.reverted \ | |
| reverted_for_damage \ | |
| roc_auc.labels.true \ | |
| --label-weight "true=$(reverted_weight)" \ | |
| --pop-rate "true=0.004778273117085203" \ | |
| --pop-rate "false=0.9952217268829148" \ | |
| --center --scale \ | |
| - --cv-timeout=60 \ | |
| ? ^ | |
| + --cv-timeout 60 \ | |
| ? ^ | |
| --debug > $@ | |
| models/enwiktionary.reverted.rf.model: \ | |
| datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json | |
| cat $< | \ | |
| revscoring cv_train \ | |
| revscoring.scoring.models.RandomForest \ | |
| editquality.feature_lists.enwiktionary.reverted \ | |
| reverted_for_damage \ | |
| --version=$(reverted_major_minor).0 \ | |
| - -p 'criterion="entropy"' \ | |
| ? - - | |
| + -p 'criterion=entropy' \ | |
| - -p 'max_features="log2"' \ | |
| ? - - | |
| + -p 'max_features=log2' \ | |
| + -p 'min_samples_leaf=3' \ | |
| -p 'n_estimators=320' \ | |
| - -p 'min_samples_leaf=3' \ | |
| --label-weight "true=$(reverted_weight)" \ | |
| --pop-rate "true=0.004778273117085203" \ | |
| --pop-rate "false=0.9952217268829148" \ | |
| --center --scale > $@ | |
| - | |
| - | |
| - datasets/enwiktionary.human_labeled_revisions.5k_2016.json: | |
| - ./utility fetch_labels \ | |
| - https://labels.wmflabs.org/campaigns/enwiktionary/59/ > $@ | |
| - | |
| - datasets/enwiktionary.labeled_revisions.100k_2016.json: \ | |
| - datasets/enwiktionary.human_labeled_revisions.5k_2016.json \ | |
| - datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json | |
| - ./utility merge_labels $^ > $@ | |
| - | |
| - datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json: \ | |
| - datasets/enwiktionary.labeled_revisions.100k_2016.json | |
| - cat $< | \ | |
| - revscoring extract \ | |
| - editquality.feature_lists.enwiktionary.goodfaith \ | |
| - editquality.feature_lists.enwiktionary.damaging \ | |
| - --host https://en.wiktionary.org \ | |
| - --extractor $(max_extractors) \ | |
| - --verbose > $@ | |
| - | |
| - tuning_reports/enwiktionary.damaging.md: \ | |
| - datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json | |
| - cat $< | \ | |
| - revscoring tune \ | |
| - config/classifiers.params.yaml \ | |
| - editquality.feature_lists.enwiktionary.damaging \ | |
| - damaging \ | |
| - roc_auc.labels.true \ | |
| - --label-weight "true=$(damaging_weight)" \ | |
| - --fixme \ | |
| - --cv-timeout=60 \ | |
| - --debug > $@ | |
| - | |
| - tuning_reports/enwiktionary.goodfaith.md: \ | |
| - datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json | |
| - cat $< | \ | |
| - revscoring tune \ | |
| - config/classifiers.params.yaml \ | |
| - editquality.feature_lists.enwiktionary.goodfaith \ | |
| - goodfaith \ | |
| - roc_auc.labels.true \ | |
| - --label-weight "false=$(goodfaith_weight)" \ | |
| - --fixme \ | |
| - --cv-timeout=60 \ | |
| - --debug > $@ | |
| enwiktionary_models: \ | |
| models/enwiktionary.reverted.rf.model | |
| enwiktionary_tuning_reports: \ | |
| tuning_reports/enwiktionary.reverted.md | |
| - |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment