Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created February 26, 2018 19:26
Show Gist options
  • Select an option

  • Save Ladsgroup/7c28db9b95db6727715e163963fabde9 to your computer and use it in GitHub Desktop.

Select an option

Save Ladsgroup/7c28db9b95db6727715e163963fabde9 to your computer and use it in GitHub Desktop.
huwiki templating
- ############################### Hungarian Wikipedia ###########################
? --
+ ############################# Hungarian Wikipedia ################################
? +++++
# Fetch the JSON-lines output of Quarry run 79645 into the target file.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character. The download goes to a temporary file first so
# that a failed wget never leaves a truncated target that looks up to date.
datasets/huwiki.sampled_revisions.40k_2016.json:
	wget -qO- 'http://quarry.wmflabs.org/run/79645/output/0/json-lines?download=true' > $@.tmp
	mv -f $@.tmp $@
# Pipe the sampled revisions through `./utility autolabel` against
# https://hu.wikipedia.org and write the labeled output to the target.
# Users in the listed --trusted-groups, or with --trusted-edits=1000, are
# passed as the trust criteria.
# NOTE(review): lines starting with "+ " are diff markers from the gist's
# comparison (the --revert-radius / --revert-window flags were added in the
# newer version); they are not Makefile syntax.
datasets/huwiki.autolabeled_revisions.40k_2016.json: \
datasets/huwiki.sampled_revisions.40k_2016.json
cat $< | \
./utility autolabel --host=https://hu.wikipedia.org \
--trusted-groups=sysop,oversight,trusted,bot,rollbacker,checkuser,abusefilter,bureaucrat \
--trusted-edits=1000 \
+ --revert-radius=3 \
+ --revert-window=48 \
--verbose > $@
+
# Build the review set from the autolabeled revisions: up to 2500 random
# lines containing '"needs_review": true' plus up to 2500 random lines
# containing '"needs_review": false', shuffled together into the target.
# NOTE(review): every line of this rule carries a "+ " diff marker, i.e. the
# rule is present at this position only in the newer side of the comparison.
+ datasets/huwiki.revisions_for_review.5k_2016.json: \
+ datasets/huwiki.autolabeled_revisions.40k_2016.json
+ ( \
+ cat $< | \
+ grep '"needs_review": true' | \
+ shuf -n 2500; \
+ cat $< | \
+ grep '"needs_review": false' | \
+ shuf -n 2500 \
+ ) | shuf > $@
# Run `revscoring extract` with the huwiki "reverted" feature list against
# https://hu.wikipedia.org. The autolabeled revisions are read on stdin and
# the extractor output is written to the target. $(max_extractors) is defined
# outside this fragment.
datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json: \
		datasets/huwiki.autolabeled_revisions.40k_2016.json
	revscoring extract \
		editquality.feature_lists.huwiki.reverted \
		--host https://hu.wikipedia.org \
		--extractor $(max_extractors) \
		--verbose < $< > $@
# Run `revscoring tune` over the feature-extracted dataset using the
# parameter grid in config/classifiers.params.yaml, with reverted_for_damage
# as the label and roc_auc.labels.true as the statistic argument, and write
# the report to the target. $(reverted_weight) is defined outside this
# fragment.
# NOTE(review): the "- " / "+ " pair near the end is a diff hunk recording the
# change from `--cv-timeout=60` to `--cv-timeout 60`; the "? " lines are diff
# guide lines. None of these markers are Makefile syntax.
tuning_reports/huwiki.reverted.md: \
datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.huwiki.reverted \
reverted_for_damage \
roc_auc.labels.true \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.014812583163867339" \
--pop-rate "false=0.9851874168361326" \
--center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
# Train the huwiki "reverted" model with `revscoring cv_train`, using
# revscoring.scoring.models.RandomForest on the feature-extracted dataset,
# and write the model to the target. $(reverted_major_minor) and
# $(reverted_weight) are defined outside this fragment.
# NOTE(review): "- " / "+ " / "? " prefixes are diff markers, not Makefile
# syntax. The hunks record: criterion="entropy" -> criterion=entropy,
# removal of max_features="log2", and min_samples_leaf=13 moving ahead of
# n_estimators=320.
models/huwiki.reverted.rf.model: \
datasets/huwiki.autolabeled_revisions.w_cache.40k_2016.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.RandomForest \
editquality.feature_lists.huwiki.reverted \
reverted_for_damage \
--version=$(reverted_major_minor).1 \
- -p 'criterion="entropy"' \
? - -
+ -p 'criterion=entropy' \
- -p 'max_features="log2"' \
+ -p 'min_samples_leaf=13' \
-p 'n_estimators=320' \
- -p 'min_samples_leaf=13' \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.014812583163867339" \
--pop-rate "false=0.9851874168361326" \
--center --scale > $@
-
# Older position of the review-set rule: up to 2500 random
# '"needs_review": true' lines plus up to 2500 random
# '"needs_review": false' lines, shuffled into the target.
# NOTE(review): every line carries a "- " diff marker, i.e. the rule was
# removed from this position; an identical "+ "-marked copy appears earlier
# in the comparison.
- datasets/huwiki.revisions_for_review.5k_2016.json: \
- datasets/huwiki.autolabeled_revisions.40k_2016.json
- ( \
- cat $< | \
- grep '"needs_review": true' | \
- shuf -n 2500; \
- cat $< | \
- grep '"needs_review": false' | \
- shuf -n 2500 \
- ) | shuf > $@
# Aggregate target: brings the huwiki model file(s) listed as prerequisites
# up to date. Declared phony because it names no file; without this, a stray
# file called `huwiki_models` would make the target appear up to date.
.PHONY: huwiki_models
huwiki_models: \
		models/huwiki.reverted.rf.model
# Aggregate target: brings the huwiki tuning report(s) listed as
# prerequisites up to date. Declared phony because it names no file; without
# this, a stray file called `huwiki_tuning_reports` would make the target
# appear up to date.
.PHONY: huwiki_tuning_reports
huwiki_tuning_reports: \
		tuning_reports/huwiki.reverted.md
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment