@dervn
Created February 23, 2012 10:05
Script to get content from xxx.com
#!/usr/bin/env python
# encoding: utf-8
"""
tb1.py
Created by dn on 2011-07-24.
Copyright (c) 2011 shubz. All rights reserved.
"""
import re
import time
import urllib2
from HTMLParser import HTMLParser
# Connection is the legacy pymongo client class (pre-MongoClient).
from pymongo.connection import Connection

conn = Connection('localhost', 27017, network_timeout=5)
tb1 = conn['shubz']['tb1']

# urllib2 needs a full URL, scheme included.
DOMAIN = 'http://xxx.com'

def urlopen(url):
    # Spoof a desktop Chrome User-Agent so the site serves the normal page.
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/15.0.912.41 Safari/535.7'
    }
    req = urllib2.Request(url=url, headers=headers)
    return urllib2.urlopen(req)

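# A quick smoke test of the fetch helper (hypothetical URL; assumes the
# site is reachable):
#
#   html = urlopen('http://xxx.com/qiyia/list_15_1.html').read()
#   print len(html)
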
def get_list(url, cate_id):
    print 'list of url: ' + url
    html = urlopen(url).read()
    # Each chapter link looks like:
    # <a target="_blank" href="..." class="title">...</a>
    _re = re.compile(r'<a target="_blank" href="(?P<url>.*?)" class="title">(?P<title>.*?)</a>', re.DOTALL)
    _res = _re.findall(html)
    for var in _res:
        url = DOMAIN + var[0]
        title = var[1]
        # Chapter URLs end in /<date>/<id>.html; pull both numbers out.
        _re_url = re.compile(r'.*/(?P<date>\d+)/(?P<id>\d+)\.html', re.DOTALL)
        _res_url = _re_url.search(url)
        id = int(_res_url.group('id'))
        date = int(_res_url.group('date'))
        if tb1.find_one({"id": id}) is None:
            chapter = {
                "id": id,
                "date": date,
                "cate_id": cate_id,
                u"title": unicode(title, "gbk", "ignore"),
                u"url": url,
                u"created_at": time.time()
            }
            tb1.insert(chapter)
            print 'insert ' + str(id)
        else:
            print 'insert ' + str(id) + ' exists!'

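# For a link like http://xxx.com/qiyia/2011/12345.html (made-up numbers),
# get_list would store a document shaped like:
#
#   {"id": 12345, "date": 2011, "cate_id": 15,
#    "title": u"...", "url": "http://xxx.com/qiyia/2011/12345.html",
#    "created_at": 1311500000.0}
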
def get_content(url):
    c = urlopen(url).read()
    print "url read end"
    # The chapter body sits inside <TD height=30>...</TD>.
    _re = re.compile(r'<TD height=30>(?P<content>.*?)</TD>', re.DOTALL)
    _res = _re.search(c)
    content = _res.group('content')
    content = unicode(content, "gbk", "ignore")
    # Strip site-specific markup and watermarks, then convert <br> tags to
    # newlines before removing whatever tags remain.  The URL-removal
    # patterns are non-greedy so they don't eat real content up to a later
    # ".com" on the same line.
    content = content.replace(u' color=#0000ff size=4 face=新宋体', '')
    content = re.sub(r"<FONT.*?>", "", content)
    content = re.sub(r"http.*?\.com", "", content)
    content = re.sub(r"www.*?\.com", "", content)
    content = content.replace(u'X<mz5Q34!   本帖子来', '')
    content = content.replace('<pstyle="text-align:center">', '')
    content = content.replace(u'<metacontent="10;url=http://2.11/Html/chang/42762489250.html"http-equiv="refresh"/>', '')
    content = content.replace('<DI', '')
    content = content.replace('</DI', '')
    content = content.replace('V>', '')
    content = content.replace('<br><br>', '\n')
    content = content.replace('.<br>', '\n')
    content = content.replace('<br>', '\n')
    content = content.replace('<BR><BR>', '\n')
    content = content.replace('<BR>', '\n')
    return strip_tags(content).strip()

def update_content(id):
    print id
    row = tb1.find_one({"id": id})
    if row and row.get('content') is None:
        try:
            url = row.get("url")
            print url
            content = get_content(url)
            tb1.update({"id": id}, {"$set": {"content": content}})
            print 'update ' + str(id)
        except Exception:
            print 'error of url: ' + url

def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    # Collect only the text nodes: route HTMLParser's data callback
    # straight into a list, then join the pieces.
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)

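# Example of what strip_tags yields (input is made up):
#
#   strip_tags('<p>foo <b>bar</b> baz</p>')  # -> 'foo bar baz'
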
def main():
    # [directory name, category id, unused, number of list pages]
    catelog = [['qiyia', 15, '', 104],
               ['xiaoyuan', 16, '', 85],
               ['changpian', 19, '', 67],
               ['wuxia', 20, '', 63],
               ['lingleix', 21, '', 88]]
    page = 2
    for cls in catelog:
        _range = cls[3] + 1
        if page > 0:
            # Only crawl the first (page - 1) list pages per category.
            _range = page
        for num in range(1, _range):
            url = "%s/%s/list_%s_%s.html" % (DOMAIN, cls[0], str(cls[1]), str(num))
            get_list(url, cls[1])
            time.sleep(5)

def main_for_update():
    catelog_ids = [15, 16, 14, 13, 19, 20, 21]
    for cate_id in catelog_ids:
        print cate_id
        results = tb1.find({"cate_id": cate_id})
        for item in results:
            # Backfill the body for chapters that only have list metadata.
            if item.get("content") is None:
                update_content(item.get("id"))

if __name__ == "__main__":
    main()
    # find() returns a cursor; compare its count, not the cursor object.
    if tb1.find({"content": None}).count() > 0:
        main_for_update()
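
# Running `python tb1.py` crawls the category list pages first, then
# backfills the body text for any chapter still missing "content".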