Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created February 21, 2018 22:58
Show Gist options
  • Save Ladsgroup/aba5473316fffc2cf32ef0fe392eb297 to your computer and use it in GitHub Desktop.
Edge_cases_part_I
- ############################# Norwegian Wikipedia #############################
+ ############################# Norwegian Wikipedia ################################
? +++
datasets/nowiki.sampled_revisions.100k_2015.json:
wget -qO- https://quarry.wmflabs.org/run/67250/output/0/json-lines?download=true > $@
datasets/nowiki.autolabeled_revisions.100k_2015.json: \
datasets/nowiki.sampled_revisions.100k_2015.json
cat $< | \
./utility autolabel --host=https://no.wikipedia.org \
--trusted-groups=sysop,oversight,bot,rollbacker,checkuser,abusefilter,bureaucrat \
--trusted-edits=1000 \
+ --revert-radius=3 \
+ --revert-window=48 \
--verbose > $@
- datasets/nowiki.revisions_to_review.5k_2015.json: \
? ^
+ datasets/nowiki.revisions_for_review.5k_2015.json: \
? ^ +
datasets/nowiki.autolabeled_revisions.100k_2015.json
( \
- cat $< | \
? ^
+ cat $< | \
? ^
- grep '"needs_review": true' | \
? ^
+ grep '"needs_review": true' | \
? ^
- shuf -n 2500; \
? ^
+ shuf -n 2500; \
? ^
- cat $< | \
? ^
+ cat $< | \
? ^
- grep '"needs_review": false' | \
? ^
+ grep '"needs_review": false' | \
? ^
- shuf -n 2500 \
? ^
+ shuf -n 2500 \
? ^
) | shuf > $@
-
datasets/nowiki.autolabeled_revisions.w_cache.40k_2015.json: \
datasets/nowiki.autolabeled_revisions.100k_2015.json
shuf -n 40000 $< | \
revscoring extract \
editquality.feature_lists.nowiki.reverted \
--host https://no.wikipedia.org \
--extractor $(max_extractors) \
--verbose > $@
tuning_reports/nowiki.reverted.md: \
datasets/nowiki.autolabeled_revisions.w_cache.40k_2015.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.nowiki.reverted \
reverted_for_damage \
roc_auc.labels.true \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.019061539539679838" \
--pop-rate "false=0.9809384604603202" \
--center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
models/nowiki.reverted.gradient_boosting.model: \
datasets/nowiki.autolabeled_revisions.w_cache.40k_2015.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
editquality.feature_lists.nowiki.reverted \
reverted_for_damage \
--version=$(reverted_major_minor).1 \
+ -p 'learning_rate=0.01' \
-p 'max_depth=7' \
- -p 'learning_rate=0.01' \
- -p 'max_features="log2"' \
? - -
+ -p 'max_features=log2' \
-p 'n_estimators=500' \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.019061539539679838" \
--pop-rate "false=0.9809384604603202" \
--center --scale > $@
nowiki_models: \
- models/nowiki.reverted.gradient_boosting.model
? -
+ models/nowiki.reverted.gradient_boosting.model
nowiki_tuning_reports: \
- tuning_reports/nowiki.reverted.md
? -
+ tuning_reports/nowiki.reverted.md
- ############################### Vietnamese Wikipedia ###########################
? --
+ ############################# Vietnamese Wikipedia ################################
? +++++
datasets/viwiki.sampled_revisions.500k_2015.json:
wget -qO- http://quarry.wmflabs.org/run/65793/output/0/json-lines?download=true > $@
datasets/viwiki.autolabeled_revisions.500k_2015.json: \
datasets/viwiki.sampled_revisions.500k_2015.json
cat $< | \
./utility autolabel --host=https://vi.wikipedia.org \
--trusted-groups=checkuser,bureaucrat,sysop,eliminator,bot \
--trusted-edits=1000 \
+ --revert-radius=3 \
+ --revert-window=48 \
--verbose > $@
- datasets/viwiki.revisions_to_review.5k_2015.json: \
? ^
+ datasets/viwiki.revisions_for_review.5k_2015.json: \
? ^ +
datasets/viwiki.autolabeled_revisions.500k_2015.json
+ ( \
+ cat $< | \
- (cat $< | grep '"needs_review": true' | \
? ---------
+ grep '"needs_review": true' | \
shuf -n 2500; \
+ cat $< | \
- cat $< | grep '"needs_review": false' | \
? ---------
+ grep '"needs_review": false' | \
shuf -n 2500 \
) | shuf > $@
datasets/viwiki.autolabeled_revisions.w_cache.100k_2015.json: \
datasets/viwiki.autolabeled_revisions.500k_2015.json
- cat $< | shuf -n 100000 | \
? ---------
+ shuf -n 100000 $< | \
? +++
revscoring extract \
editquality.feature_lists.viwiki.reverted \
--host https://vi.wikipedia.org \
--extractor $(max_extractors) \
--verbose > $@
tuning_reports/viwiki.reverted.md: \
datasets/viwiki.autolabeled_revisions.w_cache.100k_2015.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
editquality.feature_lists.viwiki.reverted \
reverted_for_damage \
roc_auc.labels.true \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.019211042993949594" \
--pop-rate "false=0.9807889570060504" \
--center --scale \
- --cv-timeout=60 \
? ^
+ --cv-timeout 60 \
? ^
--debug > $@
models/viwiki.reverted.gradient_boosting.model: \
datasets/viwiki.autolabeled_revisions.w_cache.100k_2015.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
editquality.feature_lists.viwiki.reverted \
reverted_for_damage \
--version=$(reverted_major_minor).0 \
+ -p 'learning_rate=0.01' \
-p 'max_depth=7' \
- -p 'learning_rate=0.01' \
- -p 'max_features="log2"' \
? - -
+ -p 'max_features=log2' \
-p 'n_estimators=700' \
--label-weight "true=$(reverted_weight)" \
--pop-rate "true=0.019211042993949594" \
--pop-rate "false=0.9807889570060504" \
--center --scale > $@
viwiki_models: \
- models/viwiki.reverted.gradient_boosting.model
? -
+ models/viwiki.reverted.gradient_boosting.model
viwiki_tuning_reports: \
- tuning_reports/viwiki.reverted.md
? -
+ tuning_reports/viwiki.reverted.md
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment