Skip to content

Instantly share code, notes, and snippets.

@shiweifu
shiweifu / gist:2850877
Created June 1, 2012 09:55
down_page_callable
def down_page_callable(**kw):
try:
_task_queue = kw["task_queue"]
_url_list = kw["url_list"]
_log_queue = kw["log_queue"]
_deeph = kw["deeph"]
_filter_exts = kw["filter_exts"],
_root = kw["root"]
except KeyError, ke:
@shiweifu
shiweifu / fetchpage2.py
Created June 2, 2012 14:18
Spider using a thread pool, by shiweifu
# -*- coding: UTF-8 -*-
import httplib
import urllib2
import socket
import traceback
import re
import BeautifulSoup
import threading
from threadpool import ThreadPool
from Queue import Queue, Empty
@shiweifu
shiweifu / util.py
Created June 18, 2012 08:43
Wrapper to convert file-like objects to iterables
class FileWrapper:
"""Wrapper to convert file-like objects to iterables"""
def __init__(self, filelike, blksize=8192):
self.filelike = filelike
self.blksize = blksize
if hasattr(filelike,'close'):
self.close = filelike.close
@shiweifu
shiweifu / lweb.lua
Created June 22, 2012 06:21
lua static webserver
-- Load dependencies; the "socket" and "httpcode" modules are stored in globals.
socket = require("socket")
httpcode = require("httpcode")
-- The return value of require("mime") is discarded, and the next line reads a
-- global named `mime`, so the loaded module must define that global itself.
-- NOTE(review): confirm this resolves to the project's own mime module and not
-- another module of the same name on the package path.
require("mime")
getMime = mime.getMime
-- Fail at load time if the mime module did not provide getMime.
assert(getMime)
function main( p )
local port
@shiweifu
shiweifu / mime.lua
Created June 22, 2012 06:22
lua static webserver
mime = {}
local mime_table =
{
[".html"] = {
["mime"] = "text/html",
["bin"] = false
},
[".xml"] = {
["mime"] = "application/xml",
@shiweifu
shiweifu / test.lua
Created July 3, 2012 11:46
Lua parser for Youku videos
function main()
regex = [[false"><a title="(.-)"%starget=".-"%shref="(.-)%.html"]]
f = io.open("test.html", "r")
html = f:read("*a")
f:close()
s_idx, e_idx, title, url = string.find(html, regex, 0)
-- trim implementations
-- Return s with leading and trailing whitespace removed.
-- gsub yields (result, substitution_count); binding only the first value to a
-- local drops the count, so the caller receives exactly one return value.
function trim1(s)
  local stripped = s:gsub("^%s*(.-)%s*$", "%1")
  return stripped
end
-- from PiL2 20.4
-- Return s with leading and trailing whitespace removed.
-- The pattern anchors both ends and lazily captures the text between the
-- surrounding whitespace runs; that single capture is the result.
function trim2(s)
  local core = s:match("^%s*(.-)%s*$")
  return core
end
@shiweifu
shiweifu / user_agent
Created July 12, 2012 02:24
一堆ua
w3m/0.4.1
Emacs-w3m/1.4.4 w3m/0.5.1+cvs-1.946
Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; MSN Companion 2.0; 800x600; Compaq)
DonutP; Windows98SE
MoonBrowser (version 0.41 Beta4)
Moonbrowser - IE based browser (Japan)
Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; MSN Companion 2.0; 800x600; Compaq)
Mozilla/1.1 (compatible; MSPIE 2.0; Windows CE)
Mozilla/1.22 (compatible; MSIE 2.0d; Windows NT)
Mozilla/2.0 (compatible; MSIE 2.1; Mac_PowerPC)
@shiweifu
shiweifu / define_ua.py
Created July 12, 2012 07:57
自定义user agent
# Candidate User-Agent strings, read one per line from the "user_agent" file in
# the current working directory. The file is opened in a `with` block so the
# handle is closed as soon as the list is built, and blank lines are skipped so
# an empty string is never offered as a User-Agent.
with open("user_agent", "r") as _ua_file:
    UA_LIST = [line.strip() for line in _ua_file if line.strip()]
def random_value(lst):
    """Return one element of ``lst`` chosen uniformly at random.

    The result is an element of the sequence (not an index), so it can be
    used directly, e.g. as a ``User-Agent`` header value.

    Raises:
        IndexError: if ``lst`` is empty.
    """
    # random.choice picks a valid element; random.randint(0, len(lst)) would
    # yield an integer index whose inclusive upper bound is out of range.
    return random.choice(lst)
def down_page(url):
global UA_LIST
headers = { "User-Agent" : random_value(UA_LIST)}
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
the_page = response.read()
@shiweifu
shiweifu / mylog.py
Created July 13, 2012 05:41
日志封装
#!/usr/bin/env python
#coding: utf-8
#author: shiweifu
#mail : [email protected]
import logging