Google search scraper to list all results likely to be MediaWiki installations
#!/usr/bin/env ruby
# encoding: utf-8
##################################################################################
# Google search scraper to list all results likely to be MediaWiki installations #
#                                                                                #
# CC-0, ArchiveTeam/WikiTeam, 2013                                               #
#                                                                                #
##################################################################################
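# Usage: pass the value for Google's site: operator (a domain or a TLD) as the first
# command-line argument; every entry URL found is appended to MediaWikiPowered.txt.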
require 'rubygems'
require 'mechanize'
require 'uri'
require 'cgi'
domains = Array.new # Deduplicated list of wiki entry URLs found so far
a = Mechanize.new { |agent|
  agent.user_agent_alias = 'Linux Firefox' # With some user agents (e.g. Konqueror) Google returns HTML that breaks the //h3/a search
}
prng = Random.new # Used to randomize clicks and sleep intervals below
search_result = a.get('http://www.google.it/') # Alternative? webhp?num=30&complete=0&hl=it
search_form = search_result.form('f')
search_form.q = 'link:mediawiki.org site:' + ARGV[0]
# Other queries to use:
# "Powered by MediaWiki" -site:wikia.com -wikimedia
# "Magnus Manske, Brion Vibber, Lee Daniel Crocker"
# link:mediawiki.org -site:wikia.com
# "w/api.php" -site:wikipedia.org -site:wikia.com -site:wiktionary.org
# allinurl:"wiki/api.php"
# "meta name generator" MediaWiki -site:wikia.com -wikimedia
# "Main Page" site:http:sourceforge.net/apps/mediawiki/
# inurl:index.php site:http:sourceforge.net/apps/mediawiki/
# "content=Main Page"
# "index.php?title=Main_Page"
# mw.config.set "wgCanonicalNamespace" -site:wikia.com -wikimedia
# "ResourceLoaderDynamicStyles" -site:wikia.com -wikimedia
# "MediaWiki has been successfully installed"
# "MediaWiki wurde erfolgreich installiert"
# "MediaWiki ha sido instalado con éxito"
# "已成功安装MediaWiki"
# "MediaWiki a été installé avec succès"
# "Вики-движок MediaWiki успешно установлен"
# "MediaWiki のインストールに成功しました"
# "Powered by MediaWiki" -intitle:MediaWiki http://p.defau.lt/?2pccpYrwic_YmNH2UKsY5A
# All the above with allintext:/intext: ?
# Some/all of the above with site:<TLD> http://ftp.isc.org/www/survey/reports/2013/07/bynum.txt
# ...
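# To try one of the queries above, assign it to search_form.q (set above) in place of the default query.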
search_result = a.submit(search_form, search_form.buttons.first)
# Keep clicking "Next" until the link disappears or Google blocks us. TODO: test the exit conditions more thoroughly
loop do
  search_result.search("//h3/a").each do |link|
    # The result URLs are in h3 headers and passed through google.com/url?q=
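    # CGI.parse treats the whole href as a query string, so the first key comes out
    # as '/url?q' and its (unescaped) value is the actual target URL of the result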
    target = CGI.parse(link['href'])['/url?q'][0]
    unless target.nil?
      # Take each result URI provided
      begin
        uri = URI.parse(target)
      rescue URI::InvalidURIError
        puts "Skipped invalid URI: " + target
        next # Move on to the next result
      end
      # Try to extract the entry URL to MediaWiki: index.php if we're lucky, otherwise the article path
      # We could be smarter and open the URL to follow the link rel=EditURI, but that feature is too recent to rely on
      unless uri.query.nil?
        # If there are query parameters, perhaps we're lucky: keep scheme, host and path and drop the query string
        # TODO: This looks silly
        entry = uri.scheme + '://' + uri.host + uri.path
      else
        # If there are none, the wiki is probably using short URLs or some other rewriting:
        # the last part must be the page title, so remove it
        entry = target.split("/")[0..-2].join("/")
      end
      unless domains.include?(entry)
        domains << entry
        print '.'
      end
      # A human would probably click a result every now and then
      if prng.rand(0..3.0) < 1 then
        begin
          trash = a.get('http://google.com' + link['href'])
        rescue Exception
          # Nothing to do; we don't care at all
        end
      end
    end
  end
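  # Wait a few minutes between result pages so the crawl looks more human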
  sleep(prng.rand(150..300.0))
  begin
    # Click the "Next" link; replace the text with the one used by your Google interface language ("Avanti" is Italian)
    search_result = search_result.link_with(:text => 'Avanti').click
  rescue NoMethodError
    begin
      # Fall back to the link that repeats the search including the previously omitted results.
      # The leading space is needed for the Italian link text to match!
      search_result = search_result.link_with(:text => ' ripetere la ricerca includendo i risultati omessi').click
    rescue NoMethodError
      break
    end
  rescue Mechanize::ResponseCodeError
    # Raised by Mechanize on HTTP error responses, e.g. a 503 when Google blocks us
    puts "We got a 503, party is over"
    break
  end
  break if search_result.nil?
end
# Append every entry URL found, one per line, to the output file (created if it doesn't exist yet)
output = File.open("MediaWikiPowered.txt", "a")
domains.each do |domain|
  output.puts domain
end
output.close
Upload this to the listofwikis directory?