Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created February 26, 2018 20:14
Show Gist options
  • Save Ladsgroup/3efbe704382b3299c00bb7be03dbfb2f to your computer and use it in GitHub Desktop.
Save Ladsgroup/3efbe704382b3299c00bb7be03dbfb2f to your computer and use it in GitHub Desktop.
wikidata and enwiktionary batch
- ################################### Wikidata ##################################
? ------ --
+ ############################# Wikidata ################################
-
- # wikidatawiki.balanced_revisions.20k_2015.json is checked into the repo
-
- datasets/wikidatawiki.autolabeled_revisions.20k_2015.json: \
- datasets/wikidatawiki.balanced_revisions.20k_2015.json
- cat $< | \
- ./utility autolabel --host=https://wikidata.org \
- --trusted-groups=abusefilter,arbcom,bureaucrat,checkuser,rollbacker,sysop,bot \
- --trusted-edits=1000 \
- --verbose > $@
# Fetch the labels of the wikidatawiki campaign (id 19) from labels.wmflabs.org
# with `./utility fetch_labels` and redirect its stdout into the target ($@).
# No prerequisites are declared, so make only runs this when the file is missing.
datasets/wikidatawiki.human_labeled_revisions.5k_2016.json:
./utility fetch_labels \
https://labels.wmflabs.org/campaigns/wikidatawiki/19/ > $@
# Combine the human-labeled and the autolabeled revision files into one file:
# `$^` hands both prerequisites to `./utility merge_labels`, and its stdout is
# redirected into the target ($@).
datasets/wikidatawiki.labeled_revisions.20k_2015.json: \
datasets/wikidatawiki.human_labeled_revisions.5k_2016.json \
datasets/wikidatawiki.autolabeled_revisions.20k_2015.json
./utility merge_labels $^ > $@
# Pipe the labeled revisions (first prerequisite, $<) into `revscoring extract`
# for the wikidatawiki damaging and goodfaith feature lists; stdout goes to $@.
# $(max_extractors) is not defined in this excerpt.
# Diff shown below: the `reverted` feature list is dropped, the host changes
# from https://wikidata.org to https://www.wikidata.org, and
# `--extractor $(max_extractors)` is added.
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json: \
datasets/wikidatawiki.labeled_revisions.20k_2015.json
cat $< | \
revscoring extract \
- editquality.feature_lists.wikidatawiki.reverted \
editquality.feature_lists.wikidatawiki.damaging \
editquality.feature_lists.wikidatawiki.goodfaith \
- --host https://wikidata.org \
+ --host https://www.wikidata.org \
? ++++
+ --extractor $(max_extractors) \
--verbose > $@
# Write the wikidatawiki "damaging" tuning report: the w_cache dataset ($<) is
# piped into `revscoring tune` together with config/classifiers.params.yaml,
# the damaging feature list, the `damaging` label and `roc_auc.labels.true`;
# stdout is redirected into $@.
# $(damaging_weight) is not defined in this excerpt.
# Diff shown below: `--labels "true,false"` is removed and `--cv-timeout=60`
# becomes `--cv-timeout 60` (space instead of `=`).
tuning_reports/wikidatawiki.damaging.md: \
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.wikidatawiki.damaging \
damaging \
roc_auc.labels.true \
--label-weight "true=$(damaging_weight)" \
--pop-rate "true=0.0008668694143782405" \
--pop-rate "false=0.9991331305856218" \
- --labels "true,false" \
--center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
# Build the wikidatawiki "damaging" model: the w_cache dataset ($<) is piped
# into `revscoring cv_train` with revscoring.scoring.models.GradientBoosting,
# the damaging feature list and the `damaging` label; stdout goes to $@.
# $(damaging_major_minor) and $(damaging_weight) are not defined in this excerpt.
# Diff shown below: `learning_rate` moves ahead of `max_depth`, `max_features`
# loses its inner double quotes ("log2" -> log2), and `--labels "true,false"`
# is replaced by the two `--pop-rate` options.
models/wikidatawiki.damaging.gradient_boosting.model: \
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
editquality.feature_lists.wikidatawiki.damaging \
damaging \
--version=$(damaging_major_minor).0 \
+ -p 'learning_rate=0.01' \
-p 'max_depth=7' \
- -p 'learning_rate=0.01' \
- -p 'max_features="log2"' \
? - -
+ -p 'max_features=log2' \
-p 'n_estimators=700' \
--label-weight "true=$(damaging_weight)" \
- --labels "true,false" \
+ --pop-rate "true=0.0008668694143782405" \
+ --pop-rate "false=0.9991331305856218" \
--center --scale > $@
# Write the wikidatawiki "goodfaith" tuning report: the w_cache dataset ($<) is
# piped into `revscoring tune` together with config/classifiers.params.yaml,
# the goodfaith feature list, the `goodfaith` label and `roc_auc.labels.true`;
# stdout is redirected into $@. Here the label weight is applied to `false`.
# $(goodfaith_weight) is not defined in this excerpt.
# Diff shown below: `--labels "true,false"` is replaced by `--center --scale`
# and `--cv-timeout=60` becomes `--cv-timeout 60`.
tuning_reports/wikidatawiki.goodfaith.md: \
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.wikidatawiki.goodfaith \
goodfaith \
roc_auc.labels.true \
--label-weight "false=$(goodfaith_weight)" \
--pop-rate "true=0.9998525516181488" \
--pop-rate "false=0.00014744838185121178" \
- --labels "true,false" \
+ --center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
# Build the wikidatawiki "goodfaith" model: the w_cache dataset ($<) is piped
# into `revscoring cv_train` with revscoring.scoring.models.GradientBoosting,
# the goodfaith feature list and the `goodfaith` label; stdout goes to $@.
# $(goodfaith_major_minor) and $(goodfaith_weight) are not defined in this excerpt.
# Diff shown below: `learning_rate` moves ahead of `max_depth`, `max_features`
# loses its inner double quotes ("log2" -> log2), and `--labels "true,false"`
# is replaced by the two `--pop-rate` options.
models/wikidatawiki.goodfaith.gradient_boosting.model: \
datasets/wikidatawiki.labeled_revisions.w_cache.20k_2015.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
editquality.feature_lists.wikidatawiki.goodfaith \
goodfaith \
--version=$(goodfaith_major_minor).0 \
+ -p 'learning_rate=0.1' \
-p 'max_depth=5' \
- -p 'learning_rate=0.1' \
- -p 'max_features="log2"' \
? - -
+ -p 'max_features=log2' \
-p 'n_estimators=300' \
--label-weight "false=$(goodfaith_weight)" \
- --labels "true,false" \
+ --pop-rate "true=0.9998525516181488" \
+ --pop-rate "false=0.00014744838185121178" \
--center --scale > $@
# Aggregate target with no recipe: requesting it builds both wikidatawiki
# models (damaging and goodfaith) listed as prerequisites.
# Diff shown below: each `-`/`+` pair differs only in one leading character
# that is not visible in this rendering (presumably whitespace -- confirm
# against the raw file).
wikidatawiki_models: \
- models/wikidatawiki.damaging.gradient_boosting.model \
? -
+ models/wikidatawiki.damaging.gradient_boosting.model \
- models/wikidatawiki.goodfaith.gradient_boosting.model
? -
+ models/wikidatawiki.goodfaith.gradient_boosting.model
wikidatawiki_tuning_reports: \
- tuning_reports/wikidatawiki.damaging.md \
? -
+ tuning_reports/wikidatawiki.damaging.md \
- tuning_reports/wikidatawiki.goodfaith.md
? -
+ tuning_reports/wikidatawiki.goodfaith.md
?
############################# English Wiktionary ################################
# Download the output of Quarry run 97131 in its json-lines form: `wget -qO-`
# writes the response body to stdout, which is redirected into the target ($@).
# No prerequisites are declared, so make only runs this when the file is missing.
# NOTE(review): the URL is unquoted although it contains `?`, a shell glob
# character -- consider quoting it; confirm the intended shell first.
datasets/enwiktionary.sampled_revisions.200k_2016.json:
wget -qO- https://quarry.wmflabs.org/run/97131/output/0/json-lines?download=true > $@
# Pipe the sampled revisions ($<) into `./utility autolabel` against
# https://en.wiktionary.org, passing the listed trusted groups and a
# trusted-edits value of 1000; stdout is redirected into $@.
# Diff shown below: `--revert-radius=3` and `--revert-window=48` are added.
datasets/enwiktionary.autolabeled_revisions.200k_2016.json: \
datasets/enwiktionary.sampled_revisions.200k_2016.json
cat $< | \
./utility autolabel --host=https://en.wiktionary.org \
--trusted-groups=sysop,oversight,bot,rollbacker,checkuser,abusefilter,bureaucrat \
--trusted-edits=1000 \
+ --revert-radius=3 \
+ --revert-window=48 \
--verbose > $@
# Build a sample from the autolabeled revisions ($<).
# Diff shown below: the target is renamed from
# `...autolabeled_revisions.evens.100k_2016.json` to
# `...revisions_for_review.5k_2016.json`, and the recipe changes from keeping
# lines whose "rev_id" ends in an even digit (grep -P) to a subshell that takes
# up to 2500 random lines matching '"needs_review": true' plus up to 2500
# matching '"needs_review": false', then shuffles the combined output into $@.
- datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json: \
? ------------ ^ ^^ ^^^
+ datasets/enwiktionary.revisions_for_review.5k_2016.json: \
? ^^^^^^ + ^ ^
datasets/enwiktionary.autolabeled_revisions.200k_2016.json
+ ( \
- cat $< | \
+ cat $< | \
? +
- grep -P '"rev_id": [0-9]+[02468],' > $@
+ grep '"needs_review": true' | \
+ shuf -n 2500; \
+ cat $< | \
+ grep '"needs_review": false' | \
+ shuf -n 2500 \
+ ) | shuf > $@
# Run `revscoring extract` for the enwiktionary `reverted` feature list against
# https://en.wiktionary.org and redirect stdout into $@.
# $(max_extractors) is not defined in this excerpt.
# Diff shown below: the prerequisite changes from the `weighted.20k` file to
# the full `200k` autolabeled file, and `cat $<` is replaced by
# `shuf -n 20000 $<`, so 20000 random lines are sampled inside this rule.
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json: \
- datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json
? ---------
+ datasets/enwiktionary.autolabeled_revisions.200k_2016.json
? +
- cat $< | \
+ shuf -n 20000 $< | \
revscoring extract \
editquality.feature_lists.enwiktionary.reverted \
--host https://en.wiktionary.org \
--extractor $(max_extractors) \
--verbose > $@
-
- datasets/enwiktionary.autolabeled_revisions.weighted.20k_2016.json: \
- datasets/enwiktionary.autolabeled_revisions.200k_2016.json
- ( \
- cat $< | \
- grep '"reverted_for_damage": false' | shuf -n 20000; \
- cat $< | \
- grep '"reverted_for_damage": true' \
- ) | shuf > $@
# Write the enwiktionary "reverted" tuning report: the w_cache dataset ($<) is
# piped into `revscoring tune` together with config/classifiers.params.yaml,
# the reverted feature list, the `reverted_for_damage` label and
# `roc_auc.labels.true`; stdout is redirected into $@.
# $(reverted_weight) is not defined in this excerpt.
# Diff shown below: `--cv-timeout=60` becomes `--cv-timeout 60`.
tuning_reports/enwiktionary.reverted.md: \
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.enwiktionary.reverted \
reverted_for_damage \
roc_auc.labels.true \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.004778273117085203" \
--pop-rate "false=0.9952217268829148" \
--center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
# Build the enwiktionary "reverted" model: the w_cache dataset ($<) is piped
# into `revscoring cv_train` with revscoring.scoring.models.RandomForest, the
# reverted feature list and the `reverted_for_damage` label; stdout goes to $@.
# $(reverted_major_minor) and $(reverted_weight) are not defined in this excerpt.
# Diff shown below: `criterion` and `max_features` lose their inner double
# quotes ("entropy" -> entropy, "log2" -> log2) and `min_samples_leaf` moves
# ahead of `n_estimators`.
models/enwiktionary.reverted.rf.model: \
datasets/enwiktionary.autolabeled_revisions.w_cache.20k_2016.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.RandomForest \
editquality.feature_lists.enwiktionary.reverted \
reverted_for_damage \
--version=$(reverted_major_minor).0 \
- -p 'criterion="entropy"' \
? - -
+ -p 'criterion=entropy' \
- -p 'max_features="log2"' \
? - -
+ -p 'max_features=log2' \
+ -p 'min_samples_leaf=3' \
-p 'n_estimators=320' \
- -p 'min_samples_leaf=3' \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.004778273117085203" \
--pop-rate "false=0.9952217268829148" \
--center --scale > $@
-
-
- datasets/enwiktionary.human_labeled_revisions.5k_2016.json:
- ./utility fetch_labels \
- https://labels.wmflabs.org/campaigns/enwiktionary/59/ > $@
-
- datasets/enwiktionary.labeled_revisions.100k_2016.json: \
- datasets/enwiktionary.human_labeled_revisions.5k_2016.json \
- datasets/enwiktionary.autolabeled_revisions.evens.100k_2016.json
- ./utility merge_labels $^ > $@
-
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json: \
- datasets/enwiktionary.labeled_revisions.100k_2016.json
- cat $< | \
- revscoring extract \
- editquality.feature_lists.enwiktionary.goodfaith \
- editquality.feature_lists.enwiktionary.damaging \
- --host https://en.wiktionary.org \
- --extractor $(max_extractors) \
- --verbose > $@
-
- tuning_reports/enwiktionary.damaging.md: \
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json
- cat $< | \
- revscoring tune \
- config/classifiers.params.yaml \
- editquality.feature_lists.enwiktionary.damaging \
- damaging \
- roc_auc.labels.true \
- --label-weight "true=$(damaging_weight)" \
- --fixme \
- --cv-timeout=60 \
- --debug > $@
-
- tuning_reports/enwiktionary.goodfaith.md: \
- datasets/enwiktionary.labeled_revisions.w_cache.100k_2016.json
- cat $< | \
- revscoring tune \
- config/classifiers.params.yaml \
- editquality.feature_lists.enwiktionary.goodfaith \
- goodfaith \
- roc_auc.labels.true \
- --label-weight "false=$(goodfaith_weight)" \
- --fixme \
- --cv-timeout=60 \
- --debug > $@
# Aggregate target with no recipe: requesting it builds the single
# enwiktionary model listed as its prerequisite (the reverted RF model).
enwiktionary_models: \
models/enwiktionary.reverted.rf.model
# Aggregate target with no recipe: requesting it builds the single
# enwiktionary tuning report listed as its prerequisite (reverted).
enwiktionary_tuning_reports: \
tuning_reports/enwiktionary.reverted.md
-
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment