Last active
December 23, 2015 14:39
-
-
Save fffonion/6650539 to your computer and use it in GitHub Desktop.
a vely vely ugly scrlipt for converting dm123 xinfan jieshao to BBCode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
__version__=1.4 | |
#1.2 修正补全图片url时出现的bug | |
#1.3 htmlescape | |
#1.4 flash的BBCOD优化,调整大小;字体调整为雅黑,标题放大 | |
import urllib2,re,win32clipboard as clipboard,time,win32con,os,sys | |
import random | |
import httplib2 | |
import datetime | |
baseurl='http://www.dm123.cn' | |
def html2bbcode(str):
    """Convert an HTML fragment into BBCode markup.

    ``<img>`` tags become ``[img]url[/img]`` (a ``src`` that does not start
    with ``http`` is prefixed with ``baseurl``), ``<embed>`` tags become
    ``[flash w=720 h=405]url[/flash]``, ``<p>``/``</p>``/``</embed>`` tags
    are dropped, HTML entities are decoded through ``htmlescape`` and every
    remaining ``<``/``>`` is rewritten to ``[``/``]``.  Runs of blank lines
    are collapsed and ``[br/]``/``[br /]`` are removed.

    :param str: HTML source text.
    :return: the converted BBCode text.
    """
    text = str
    # Tags that are removed outright.
    ignorelist = ['<p.*?>', '</p>', '</embed>']
    for src in re.findall('<img.+src="(.+)".*?/>', text):
        fullurl = src if src.startswith('http') else baseurl + '/' + src
        # The captured URL is data, not a pattern: escape it so '.', '?' etc.
        # match literally, and use a callable replacement so backslashes in
        # the URL are not interpreted as group references.
        text = re.sub('<img.*?' + re.escape(src) + '.*?>',
                      lambda m, url=fullurl: '[img]' + url + '[/img]',
                      text)
    # embed object
    for src in re.findall('<embed.*src="(.*?)".*?>', text):
        text = re.sub('<embed.*?' + re.escape(src) + '.*?>',
                      lambda m, url=src: '[flash w=720 h=405]' + url + '[/flash]',
                      text)
    for pattern in ignorelist:
        text = re.sub(pattern, '', text)
    text = htmlescape(text)
    text = text.replace('<', '[')
    text = text.replace('>', ']')
    # Collapse blank lines.  The result of replace() must be assigned back,
    # otherwise this loop never terminates.
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    text = text.replace('[br/]', '')
    text = text.replace('[br /]', '')
    for heading in ['STAFF', 'CAST', 'PV']:
        text = text.replace('【%s】' % heading, '[h3]%s[/h3]' % heading)
    return text
def add_format(str):
    """Wrap *str* in a font tag and expand the title/description markers.

    ``#title`` / ``#/title`` are replaced by an opening and closing
    size/color/bold BBCode group whose color is picked at random from a fixed
    palette; ``#desc`` / ``#/desc`` are removed.

    :param str: text containing the ``#title``/``#desc`` markers.
    :return: the formatted BBCode text.
    """
    palette = ['#66ccff', 'Red', 'Orange', 'Indigo', 'Green', 'Yellow Green',
               'Teal', 'Pink', 'Dark Olive', 'Dark Slate']
    title_open = '[size=4][color=' + random.choice(palette) + '][b]'
    markers = (
        ('#title', title_open),
        ('#/title', '[/b][/color][/size]'),
        ('#desc', ''),
        ('#/desc', ''),
    )
    formatted = '[font="Microsoft YaHei,微软雅黑;文泉驿微米黑"]' + str + '[/font]'
    for marker, bbcode in markers:
        formatted = formatted.replace(marker, bbcode)
    return formatted
def htmlescape(str):
    """Decode HTML character references found in *str*.

    Named references are looked up in a small table; names that are not in
    the table are replaced by ``'?'``.  Numeric references (``&#NNN;`` and
    ``&#xHH;``) are converted to the corresponding character, except for
    code points 12539 and 65381, which are replaced by ``'.'``.  A numeric
    reference that cannot be converted is replaced by ``'?'``.

    :param str: text possibly containing ``&...;`` references.
    :return: the text with the references replaced.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    # NOTE(review): 'copy' and 'reg' map to '?' in the original table,
    # presumably because the real symbols were lost in a GBK round-trip;
    # left unchanged -- confirm before replacing them.
    named = {'amp': '&', 'nbsp': ' ', 'quot': '"', 'lt': '<', 'gt': '>',
             'copy': '?', 'reg': '?', 'ldquo': '“', 'rdquo': '”',
             'mdash': '—', 'bull': '•', 'hellip': '…', 'permil': '‰'}

    def replc(match):
        body = match.group(2)
        if match.group(1) != '#':
            return named.get(body, '?')
        try:
            if body[:1] in ('x', 'X'):
                code = int(body[1:], 16)
            else:
                code = int(body)
        except ValueError:
            # '&#' followed by something that is not a number.
            return '?'
        if code in (12539, 65381):
            return '.'
        try:
            return to_char(code)
        except (ValueError, OverflowError):
            # Code point outside the range this interpreter can represent.
            return '?'

    htmlre = re.compile(r"&(#?)(\d{1,5}|\w{1,8}|[a-z]+);")
    return htmlre.sub(replc, str)
def setClipboard(str):
    """Replace the clipboard contents with *str* in ``CF_TEXT`` format.

    :param str: the data handed to ``SetClipboardData``.
    """
    clipboard.OpenClipboard()
    try:
        clipboard.EmptyClipboard()
        clipboard.SetClipboardData(win32con.CF_TEXT, str)
    finally:
        # Always release the clipboard, even if setting the data failed;
        # otherwise it stays open and later OpenClipboard calls fail.
        clipboard.CloseClipboard()
def makeNum(num):
    """Return the Chinese numeral for an integer between 0 and 99.

    ``0`` yields an empty string, ``10`` yields ``'十'`` (no leading
    ``'一'``) and e.g. ``21`` yields ``'二十一'``.

    :param num: integer in the range 0..99.
    :return: the numeral as a string.
    :raises ValueError: if *num* is outside 0..99.
    """
    # Only two-digit numbers are supported.
    if not 0 <= num <= 99:
        raise ValueError('makeNum only supports 0..99, got %r' % (num,))
    chn = ['', '一', '二', '三', '四', '五', '六', '七', '八', '九']
    # divmod floors on every Python version, so the results stay valid
    # list indexes.
    tens, ones = divmod(num, 10)
    result = ''
    if tens > 0:
        if tens > 1:
            result = chn[tens]
        result += '十'
    result += chn[ones]
    return result
if __name__=='__main__':
    # Python 2 only: make GBK the default codec used for the implicit
    # str <-> unicode conversions performed below.
    reload(sys)
    sys.setdefaultencoding('gbk')
    ht=httplib2.Http()
    hd={'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0','Connection': 'Keep-Alive','Accept-Encoding':'gzip'}
    yearmon=int(time.strftime('%Y%m',time.localtime(time.time())))
    cur_year,cur_month=divmod(yearmon,100)
    # Quarter of the current month (0..3); it selects the season label and
    # the month used in the data URL.
    season=(cur_month-1)//3
    seasonch=['春','夏','秋','冬'][season]
    mon=['4','7','10','1'][season]
    # A target month of January belongs to the following year.
    year=str(cur_year+(1 if mon=='1' else 0))
    print('使用前先看readme嗯~【度娘网盘上有\n'+'-'*60)
    print('现在是'+str(cur_year)+'年'+str(cur_month)+'月 -> '+'下载'+year+'年'+seasonch+'季新番数据')
    mon=raw_input('要指定其他季度请在此输入(1,4,7,10),按回车继续:') or mon
    mon="%.2d" % (int(mon))
    contenturl='http://www.dm123.cn/data/'+year+'/'+year+mon
    print('下载首页...')
    resp,content=ht.request(contenturl,headers=hd)
    if int(resp['status'])>=400:
        raw_input('您太超前了,还木有'+mon+'月新番介绍,按回车退出……')
        sys.exit(0)
    res=re.findall('#C3C3C3(.+)</table',content,re.DOTALL)[0]
    rows=re.findall('tr >(.*?)</tr',res,re.DOTALL)
    entries=[]
    if os.path.exists('elem.txt'):
        # Reuse the records cached by a previous run instead of downloading.
        with open('elem.txt','r') as cache:
            ss=cache.read().decode('utf-8').split('SEPSEPSEPSEP\n\n')
        for s in ss:
            # maxsplit=4: the description is the last field and may itself
            # contain commas; a plain split would discard such records.
            fields=s.split(',',4)
            if len(fields)==5:
                elemdict={'name':'','time':'','url':'','thumb':'','desc':'','postid':''}
                elemdict['time'],elemdict['url'],elemdict['name'],elemdict['thumb'],elemdict['desc']=fields
                entries.append(elemdict)
    else:
        for row in rows:
            elemdict={'name':'','time':'','url':'','thumb':'','desc':'','postid':''}
            elemdict['time']='-'.join(re.findall(r'\d+', re.findall('2" >(.+)</td',row)[0]))
            elemdict['url'],elemdict['name'],thumb=re.findall('f="(.+)" target="_blank">(.+)</a>.+<img height="74" src="(.+)" width="20',row)[0]
            elemdict['thumb']=baseurl+thumb
            page=ht.request(elemdict['url'],headers=hd)[1]
            elemdict['desc']=html2bbcode(re.findall('div id="nrzw">(.*?)</div',page,re.DOTALL)[0])
            print('Get '+elemdict['name'])
            # Append each record to the cache as soon as it is fetched.
            with open('elem.txt','a') as cache:
                cache.write(('%s,%s,%s,%s,%sSEPSEPSEPSEP\n\n'%(elemdict['time'],elemdict['url'],elemdict['name'],elemdict['thumb'],elemdict['desc'])).encode('utf-8'))
            entries.append(elemdict)
    raw_input('将要生成各番组简介,全部完成后将生成索引页,按回车继续……')
    pid_prompt='输入其楼层pid,回车自动+1:'
    for i,entry in enumerate(entries):
        s=add_format('#title%s#/title\n#desc%s#/desc'%(entry['name'],entry['desc']))
        try:
            s=s.decode('gb2312','ignore').encode('gb2312')
            setClipboard(htmlescape(s))
        except Exception:
            # Best effort: dump the text to a file when it cannot be placed
            # on the clipboard.
            print('encoding error, see temp.txt')
            with open('temp.txt','w') as dump:
                dump.write(s.encode('utf-8').replace('\r\n','\n'))
        print(entry['name']+' 简介已复制到剪贴板')
        pid=raw_input(pid_prompt)
        if not pid and i>0:
            pid=str(int(entries[i-1]['postid'])+1)
        # The first entry has no predecessor to increment from, so keep
        # asking until an id is actually entered.
        while not pid:
            pid=raw_input(pid_prompt)
        entry['postid']=pid
        print('\n'+'-'*60+'\n')
    baseid=raw_input('输入帖子id:')
    if not os.path.exists('_indexes'):
        os.mkdir('_indexes')
    # Truncate index files left over from a previous run; the loop below
    # only appends.
    for p in os.listdir('_indexes'):
        open(os.path.join('_indexes', p), 'w').close()
    for entry in entries:
        # Link format example:
        # http://www.kmgtp.org/forums.php?action=viewtopic&topicid=21427&page=p268050#pid268050
        indexstr='[url=http://www.kmgtp.org/forums.php?action=viewtopic&topicid='+baseid+\
            '&page=p'+entry['postid']+'#pid'+entry['postid']+'][img]'\
            +entry['thumb']+'[/img]\n[b]'+entry['name']+'[/b][/url]\n'
        # One index file per weekday number (strftime '%w'); entries whose
        # date cannot be parsed go to the "undefined" file.
        try:
            weekday=datetime.datetime.strptime(entry['time'], '%Y-%m-%d').strftime('%w')
        except ValueError:
            weekday='undefined'
        with open('_indexes/_%s.txt' % weekday,'a') as index_file:
            index_file.write(indexstr)
    # The index is written to files, not to the clipboard, so say so.
    print('索引已写入_indexes目录.')
    raw_input('木有了,按回车退出……')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:gbk | |
import win32clipboard as clipboard | |
import re | |
def setClipboard(str):
    """Replace the clipboard contents with *str* in ``CF_TEXT`` format.

    :param str: the data handed to ``SetClipboardData``.
    """
    clipboard.OpenClipboard()
    try:
        clipboard.EmptyClipboard()
        # win32con is not imported in this script; win32clipboard exposes
        # the same CF_TEXT constant.
        clipboard.SetClipboardData(clipboard.CF_TEXT, str)
    finally:
        # Always release the clipboard, even if setting the data failed.
        clipboard.CloseClipboard()
# Build span.txt: for every weekday index file write a heading followed by
# the index entries laid out three per row as BBCode [span] columns.
fname_list = ['0', '1', '2', '3', '4', '5', '6', 'undefined']
weekday_jp = ['日', '月', '火', '水', '木', '金', '土', '未定']
weekday_cn = ['日', '一', '二', '三', '四', '五' ,'六', '?']
weekday_color = ['Red', 'Orange', 'Yellow Green', 'Medium', 'Purple', 'Navy', 'Gray', 'Black']
# Image used to fill the second/third column of an incomplete row.
filler = '[img]http://ww2.sinaimg.cn/mw600/436919cbjw1e8tbx9n07nj205k022aa6.jpg[/img]'
# Opening in 'w' mode starts from an empty file; one handle is kept for the
# whole run instead of reopening the file for every write.
with open('span.txt', 'w') as out:
    for idx in range(8):
        out.write('[font="Microsoft YaHei,微软雅黑;文泉驿微米黑"]'
                  '[size=4][color=%s][b]%s[/b][/color][/size] 星期%s[/font]\n' % (
                      weekday_color[idx],
                      weekday_jp[idx],
                      weekday_cn[idx]))
        try:
            with open('_indexes/_%s.txt' % fname_list[idx]) as index_file:
                total = index_file.read().split('[/url]')
        except IOError:
            # No index file for this weekday: emit the heading only.
            total = []
        # Re-attach the separator that split() consumed.  The piece after
        # the last separator becomes a bare '[/url]', which is filtered out
        # below.
        cells = [(t + '[/url]').strip('\n') for t in total]
        # Deal the entries round-robin into three columns.
        col1, col2, col3 = cells[0::3], cells[1::3], cells[2::3]
        # The first column is never shorter than the other two.
        for row in range(len(col1)):
            s1 = col1[row]
            s2 = col2[row] if row < len(col2) else ''
            s3 = col3[row] if row < len(col3) else ''
            if not s1 or s1 == '[/url]':
                continue
            if not s2 or s2 == '[/url]':
                s2 = filler
            if not s3 or s3 == '[/url]':
                s3 = filler
            single = ('[span style="width:90%; margin:0 auto; overflow:auto; _display:inline-block;"]'
                      '[span style="width: 250px; float: left;"]\n' + s1 +
                      '[/span][span style="width: 260px; float: left;"]\n' + s2 +
                      '[/span][span]\n' + s3 + '[/span][/span]\n')
            out.write(single)
        out.write('\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment