Created
August 14, 2014 05:06
-
-
Save lanceliao/e127d10dd6096bb5c064 to your computer and use it in GitHub Desktop.
Generate a list of dnsmasq (with ipset) rules for blocked Alexa Top 1000 domains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#coding=utf-8
#
# Generate a list of dnsmasq (with ipset) rules for
# Censorship of Alexa Top 1000 Domains in China
#
# Copyright (C) 2014 http://www.shuyz.com
#
try:
    import urllib2
except ImportError:
    # Python 3 provides the same urlopen() call under urllib.request
    import urllib.request as urllib2
import re
import os
import datetime

# the url of search result; %s is filled with the page number
baseurl = 'https://en.greatfire.org/search/alexa-top-1000-domains?page=%s'
# one match per result row: group 1 is the link text (the domain),
# group 2 is the percentage shown in the last cell
pattern = r'<a href=".*">(.*?)<\/a><\/td><td>.*<\/td><td class=".*\d+%;">(\d+%)<\/td>'
mydnsip = '127.0.0.1'
mydnsport = '1053'
outfile = 'gfw_alexa1000.conf'
# number of result pages to fetch (pages 0 .. pagecount - 1)
pagecount = 11


def fetch_page(page):
    """Download result page number `page` and return its content as text.

    Raises whatever urlopen() raises on network failure or timeout.
    """
    url = baseurl % page
    print('fetching page ' + url)
    # The formatted url must be requested; the bare template still holds
    # a literal '%s' and would return the same response for every page.
    content = urllib2.urlopen(url, timeout=15).read()
    if not isinstance(content, str):
        # Python 3 returns bytes, but the str pattern needs text
        content = content.decode('utf-8', 'replace')
    print('page content fetched, analysing...')
    return content


def parse_items(content):
    """Return the list of (domain, percentage) tuples found in `content`."""
    return re.findall(pattern, content)


def format_rules(domain):
    """Return the dnsmasq 'server=' and 'ipset=' lines for `domain`."""
    return [
        'server=/.%s/%s#%s\n' % (domain, mydnsip, mydnsport),
        'ipset=/.%s/gfw_alexa1000\n' % domain,
    ]


def rules_for_items(items, seen):
    """Return config lines for the new, blocked domains in `items`.

    `seen` is a set of domains already handled; every domain met here is
    added to it so duplicate records are reported and skipped. A domain
    whose percentage is '0%' is remembered but produces no rule.
    """
    lines = []
    for domain, percent in items:
        if domain in seen:
            print(domain + ' exists.')
            continue
        seen.add(domain)
        print(domain + ' is ' + percent + ' blocked.')
        if percent != '0%':
            lines.extend(format_rules(domain))
    return lines


def main():
    """Fetch every result page and write the dnsmasq rules to `outfile`."""
    # remember all blocked domains, in case of duplicate records
    seen = set()
    rules = []
    for page in range(pagecount):
        rules.extend(rules_for_items(parse_items(fetch_page(page)), seen))

    # The file is opened only after all pages were fetched, so a failed
    # download leaves an existing config untouched; 'with' closes it even
    # if a write fails.
    with open(outfile, 'w') as fs:
        fs.write('# GFW blocked Alexa Top 1000 Domains for dnsmasq\n')
        fs.write('# updated on ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '\n')
        fs.write('#\n')
        fs.writelines(rules)
    print('done!')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment