This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def down_page_callable(**kw): | |
try: | |
_task_queue = kw["task_queue"] | |
_url_list = kw["url_list"] | |
_log_queue = kw["log_queue"] | |
_deeph = kw["deeph"] | |
_filter_exts = kw["filter_exts"], | |
_root = kw["root"] | |
except KeyError, ke: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
import httplib | |
import urllib2 | |
import socket | |
import traceback | |
import re | |
import BeautifulSoup | |
import threading | |
from threadpool import ThreadPool | |
from Queue import Queue, Empty |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FileWrapper: | |
"""Wrapper to convert file-like objects to iterables""" | |
def __init__(self, filelike, blksize=8192): | |
self.filelike = filelike | |
self.blksize = blksize | |
if hasattr(filelike,'close'): | |
self.close = filelike.close |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
socket = require("socket") | |
httpcode = require("httpcode") | |
require("mime") | |
getMime = mime.getMime | |
assert(getMime) | |
function main( p ) | |
local port |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mime = {} | |
local mime_table = | |
{ | |
[".html"] = { | |
["mime"] = "text/html", | |
["bin"] = false | |
}, | |
[".xml"] = { | |
["mime"] = "application/xml", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function main() | |
regex = [[false"><a title="(.-)"%starget=".-"%shref="(.-)%.html"]] | |
f = io.open("test.html", "r") | |
html = f:read("*a") | |
f:close() | |
s_idx, e_idx, title, url = string.find(html, regex, 0) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- trim implementations | |
--- Strip leading and trailing whitespace from a string (gsub variant).
-- The anchored pattern spans the whole string: leading whitespace, a lazy
-- capture of the middle, then trailing whitespace. The entire match is
-- replaced by the capture alone.
-- @param s string to trim
-- @return the trimmed string (a single value)
function trim1(s)
  -- The outer parentheses truncate gsub's two results (string, count)
  -- down to just the string.
  return (s:gsub("^%s*(.-)%s*$", "%1"))
end
--- Strip leading and trailing whitespace from a string (match variant).
-- From PiL2 20.4.
-- The anchored pattern skips leading whitespace, lazily captures the middle,
-- and lets the trailing whitespace be consumed before the end anchor; the
-- capture is returned directly.
-- @param s string to trim
-- @return the trimmed string
function trim2(s)
  return s:match("^%s*(.-)%s*$")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
w3m/0.4.1 | |
Emacs-w3m/1.4.4 w3m/0.5.1+cvs-1.946 | |
Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; MSN Companion 2.0; 800x600; Compaq) | |
DonutP; Windows98SE | |
MoonBrowser (version 0.41 Beta4) | |
Moonbrowser - IE based browser (Japan) | |
Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; MSN Companion 2.0; 800x600; Compaq) | |
Mozilla/1.1 (compatible; MSPIE 2.0; Windows CE) | |
Mozilla/1.22 (compatible; MSIE 2.0d; Windows NT) | |
Mozilla/2.0 (compatible; MSIE 2.1; Mac_PowerPC) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Candidate User-Agent strings, one per line of the "user_agent" file
# (path is relative to the current working directory). Each entry is the
# text before the first newline with surrounding whitespace stripped.
# The file is opened in a `with` block so the handle is closed as soon as
# the list has been built instead of being left to the garbage collector.
with open("user_agent", "r") as ua_file:
    UA_LIST = [a.split("\n")[0].strip() for a in ua_file.readlines()]
def random_value(lst):
    """Return one element of ``lst`` chosen uniformly at random.

    The previous implementation returned ``random.randint(0, len(lst))``,
    which is an integer index rather than a list element, and whose
    inclusive upper bound could equal ``len(lst)`` (one past the last
    valid index).

    Raises:
        IndexError: if ``lst`` is empty (raised by ``random.choice``).
    """
    return random.choice(lst)
def down_page(url): | |
global UA_LIST | |
headers = { "User-Agent" : random_value(UA_LIST)} | |
req = urllib2.Request(url, headers=headers) | |
response = urllib2.urlopen(req) | |
the_page = response.read() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#coding: utf-8 | |
#author: shiweifu | |
#mail : [email protected] | |
import logging |
OlderNewer