Created
January 4, 2012 04:10
-
-
Save whym/1558442 to your computer and use it in GitHub Desktop.
download Wikimedia rev diff dumps, giving a different limit rate depending on day/night, and output md5sum to stdout
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby
# download Wikimedia rev diff dumps, giving a different limit rate depending on day/night, and output md5sum to stdout
require 'open-uri'
require 'optparse'
require 'time'
# One-line usage example; installed below as the banner printed by --help.
USAGE = <<'END'.freeze
usage: download.rb http://dumps.wikimedia.org/enwiki/20111201/ --day-limit-rate=500k > checksums.txt
END
########################################
require 'optparse'
# Settings filled in from the command line:
#   daylimit  - rate string interpolated into wget's --limit-rate (default '500k')
#   overwrite - true when --overwrite was given, otherwise nil
#   verbose   - true when --verbose was given, otherwise nil
OPT = Struct.new(:daylimit, :overwrite, :verbose).new('500k', nil, nil)
parser = OptionParser.new do |opts|
  opts.banner = USAGE
  opts.on('--day-limit-rate STR', String) { |v| OPT.daylimit = v }
  opts.on('--overwrite') { OPT.overwrite = true }
  opts.on('--verbose') { OPT.verbose = true }
end
# parse! removes the recognised switches from ARGV, leaving only positional
# arguments for the code that follows.
parser.parse!
# Base URL of the dump directory, taken from the first positional argument.
stem = ARGV.shift
# Source of "<md5> <filename>" lines: the remote md5sums listing when a URL is
# given, otherwise the listing embedded after __END__.
sumsfile = nil
if stem
  # Strip trailing slashes so the joined URL below has no "//" in its path.
  stem = stem.sub(%r{/+\z}, '')
  # "http://host/<wiki>/<date>" splits into ["http:", "", host, wiki, date].
  _, _, _, wiki, date = stem.split('/')
  # Kernel#open no longer handles URLs as of Ruby 3.0; URI.open is open-uri's
  # explicit entry point.
  sumsfile = URI.open("#{stem}/#{wiki}-#{date}-md5sums.txt")
else
  stem = 'http://dumps.wikimedia.org/simplewiki/20111227/'
  sumsfile = DATA
end
# Walk the md5sums listing. For each full-history bz2 dump: download it unless
# a local copy already exists (or --overwrite was given), let md5sum print the
# checksum of the local file, then echo the expected listing line to stdout.
sumsfile.each do |line|
  _md5, path = line.split
  next if path !~ /pages-meta-history.*bz2/

  # The hour is re-read for every file, so a long run picks up the change.
  # From 09:00 local time until midnight the download is rate-limited.
  rate_args = Time.now.hour >= 9 ? ["--limit-rate=#{OPT.daylimit}"] : []

  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  if File.exist?(path) && !OPT.overwrite
    STDERR.puts "skipping #{path}"
  else
    url = "#{stem.chomp('/')}/#{path}"
    # Argument-list form of system: no shell is involved, so file names taken
    # from the remote listing cannot inject shell syntax.
    warn "download failed: #{url}" unless system('wget', *rate_args, url)
  end

  system('md5sum', path)
  print line
end
__END__
e2e907563741758ee5b7b3c98031aac2 simplewiki-20111227-site_stats.sql.gz
1693b4a15f534f348654185deb8977e9 simplewiki-20111227-image.sql.gz
c3c8b798f35a444704419f76c034f31d simplewiki-20111227-oldimage.sql.gz
d3088c64cbad3083a3f3b084685ec954 simplewiki-20111227-pagelinks.sql.gz
aa1a5522a1c31513ee801c5bda29dfb4 simplewiki-20111227-categorylinks.sql.gz
06259073cc5d1c9f9ad1a5cd60ae8935 simplewiki-20111227-imagelinks.sql.gz
b954f127095ccad40ee9f1f411f2b2c2 simplewiki-20111227-templatelinks.sql.gz
e658b07e57af11917ace09b43acb9cc4 simplewiki-20111227-externallinks.sql.gz
1ce30868ac5a2d13b623ae211c214307 simplewiki-20111227-langlinks.sql.gz
5916f9ba8e555144ee979d5f1d747579 simplewiki-20111227-interwiki.sql.gz
1263f238aea042868febb851f6479074 simplewiki-20111227-user_groups.sql.gz
c9e2b6a34c7e86b33404a3a8cf6cbbb0 simplewiki-20111227-category.sql.gz
6bbbcf6a95287e10902b5cca247af219 simplewiki-20111227-page.sql.gz
c27aea4c465dda6425ad131217953075 simplewiki-20111227-page_restrictions.sql.gz
26911fb22db9403c48810c7df9c80b6d simplewiki-20111227-page_props.sql.gz
d69611a63bd0bc00b17b2db7f0ebbf53 simplewiki-20111227-protected_titles.sql.gz
1270336018b68bd4efbe373b58157582 simplewiki-20111227-redirect.sql.gz
0b0f9f242095ad73d87c818c84bb5820 simplewiki-20111227-iwlinks.sql.gz
8f78537fc6c969f5f128e8af05678814 simplewiki-20111227-all-titles-in-ns0.gz
b5cc88514d980de69244ca9d2e74a8d3 simplewiki-20111227-abstract.xml
80069e36b1c9b79fe5b2a8eb93172b8e simplewiki-20111227-stub-meta-history.xml.gz
74998e54c10bf2134475bcec79926b00 simplewiki-20111227-stub-meta-current.xml.gz
ada569807b78e11735e91796e054d7e9 simplewiki-20111227-stub-articles.xml.gz
67189ca7bb5e46a27938afb9e57120bb simplewiki-20111227-pages-articles.xml.bz2
d7a428d466e9d281fcf9deb0ccf3bdc3 simplewiki-20111227-pages-meta-current.xml.bz2
d34e23baf3c5b5d2b53fb8731c3d668e simplewiki-20111227-pages-logging.xml.gz
46fe85b5e3cbc8e434a433a32ea688c6 simplewiki-20111227-pages-meta-history.xml.bz2
10a6dee9ba4d34fe239b5171ac85a460 simplewiki-20111227-pages-meta-history.xml.7z
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment