Yhzhtk · December 18, 2015 21:39
diff --git a/AnalyKeys.py b/AnalyKeys.py
 # -*- coding: cp936 -*-
 '''
 Created on 2013-6-19
 @author: gdh
 '''

 import sys
 import urllib

 def GetCateKeys(fileName, path="d:\\meta\\",
                 metas=("bing", "jike", "sogou", "soso", "panguso", "youdao")):
     '''Group keywords by category and dump selected categories to files.

     Each line of ``fileName`` is split on whitespace; field 11 is used as
     the category and field 5 as the keyword.  Keywords are collected per
     category in first-seen order without duplicates.  For every category
     listed in ``metas`` the last 100 keywords are written, one per line,
     to ``path + category + ".txt"``.

     Args:
         fileName: Path of the whitespace-delimited input file.
         path: Prefix prepended to each output file name.  It is
             concatenated as-is, so it must end with a path separator.
         metas: Categories for which an output file is produced.

     Lines with fewer than 12 fields are reported on stdout and skipped.
     '''
     with open(fileName, "r") as f:
         lines = f.readlines()

     maps = {}
     for l in lines:
         fs = l.split()
         try:
             cate, key = fs[11], fs[5]
         except IndexError as e:
             # Malformed (too short) line: report it and carry on.
             print("%s %s" % (e, fs))
             continue
         keys = maps.setdefault(cate, [])
         if key not in keys:
             keys.append(key)

     for k in maps:
         if k in metas:
             # "with" guarantees the output is flushed and closed.
             with open(path + k + ".txt", 'w') as write:
                 for key in maps[k][-100:]:
                     write.write(key + "\n")
                
 def DecodeUrl(fileName, outName="d:/urls.txt"):
     '''Percent-encode every line of a file and write the results out.

     Each line of ``fileName`` (including its trailing newline) is passed
     through ``quote`` and written to ``outName`` followed by a newline.
     The number of input lines is printed at the end.

     NOTE(review): despite the name, this encodes rather than decodes;
     confirm with callers whether ``unquote`` was intended.

     Args:
         fileName: Path of the input file, one URL per line.
         outName: Path of the output file.
     '''
     # urllib.quote lives in urllib.parse on Python 3.
     try:
         quote = urllib.quote
     except AttributeError:
         from urllib.parse import quote

     with open(fileName, "r") as f:
         lines = f.readlines()

     # "with" guarantees the output is flushed and closed.
     with open(outName, 'w') as write:
         for l in lines:
             write.write(quote(l) + "\n")
     print('count：%d' % len(lines))


 if __name__ == '__main__':
     # The input file name is expected as the first command-line argument.
     args = sys.argv[1:]
     if not args:
         print("need fileName")
     else:
         GetCateKeys(args[0])
    
    
    
diff --git a/GetKeys.py b/GetKeys.py
 # -*- coding: utf-8 -*-
 '''
 解析url中的关键字，参数一是解析的文件，参数二是输出的中文关键字，最后打印行数
 @author: gdh
 '''

 import os
 import sys
 import urllib

 # Pick the helpers from wherever this Python version keeps them.
 try:
     from shlex import quote as _shquote            # Python 3
 except ImportError:
     from pipes import quote as _shquote            # Python 2
 try:
     from urllib import unquote as _unquote         # Python 2
 except ImportError:
     from urllib.parse import unquote as _unquote   # Python 3

 # Per input line: take the second field when splitting on "key=", cut it
 # at "&pageNo", keep the first whitespace-delimited token, then collapse
 # adjacent duplicates.  The file name is shell-quoted so that spaces or
 # metacharacters in it cannot break (or inject into) the pipeline.
 shstr = "cat " + _shquote(sys.argv[1]) + ''' | awk -F "key=" '{print $2}' | awk -F "&pageNo" '{print $1}' | awk  '{print $1}' | uniq '''
 res = os.popen(shstr).read()

 # splitlines() avoids the phantom empty entry that split("\n") yields
 # after the final newline, so the count printed below is the real one.
 lines = res.splitlines()

 # "with" guarantees the output is flushed and closed.
 with open(sys.argv[2], "w") as write:
     for line in lines:
         write.write(_unquote(line) + "\n")

 print(len(lines))

diff --git a/PostJson.py b/PostJson.py
 #coding=utf-8
 '''
 Export the posts under the Jekyll _posts directory as JSON, for use with Aliyun search.
 @author: gdh
 '''

 import os
 import re
 import json
 import time

 def object2dict(obj):
     '''Return a shallow copy of ``obj``'s attribute dictionary.

     Passed as the ``default`` hook of ``json.dumps`` so that plain objects
     serialize as the mapping of their instance attributes.
     '''
     return dict(obj.__dict__)

 class PostInfo(object):
     '''Search-index record for one Jekyll post.

     Instances carry only plain attributes so that they can be serialized
     through ``obj.__dict__``.
     '''

     def __init__(self, title, tags, content, rooturl, f):
         '''Build the record from a post's front matter and file name.

         Args:
             title: Post title.
             tags: Comma-separated tag names; blank entries are ignored.
             content: Post body text.
             rooturl: URL prefix prepended as-is to the dated post path.
             f: Post file name of the form ``YYYY-MM-DD-slug.html``.

         Raises:
             ValueError: If ``f`` does not start with a ``YYYY-MM-DD`` date.
         '''
         self.title = title
         # Tags are stored as a {name: 1} mapping; skip empty names such as
         # the one an empty tag list would otherwise produce.
         self.tag = {}
         for name in tags.split(","):
             name = name.strip()
             if name:
                 self.tag[name] = 1
         self.body = content
         self.id = f.replace("-", "").replace(".html", "")
         self.display_text = "open search"
         self.hit_num = "88888"

         m = re.match(r'\d{4}-\d{2}-\d{2}', f)
         if m is None:
             raise ValueError("post file name lacks a YYYY-MM-DD prefix: %r" % (f,))
         date = m.group()
         # 2013-06-19-slug.html -> 2013/06/19/slug.html; only the leading
         # date is rewritten, dashes inside the slug are kept.
         self.url = rooturl + f.replace(date + "-", date.replace("-", "/") + "/", 1)
         # Local-time midnight of the post date, as integer seconds.
         stamp = "%d" % time.mktime(time.strptime(date, '%Y-%m-%d'))
         self.update_timestamp = stamp
         self.create_timestamp = stamp
         self.type_id = "1"
         self.cat_id = [1, 2]
         self.author = "yhzhtk"
         self.grade = "10"
         self.source = "yhzhtk"
         self.boost = "1"

    
 def getPostInfo(path, rooturl="http://yhzhtk.info/"):
     '''Collect a PostInfo for every ".html" post found in ``path``.

     A file is taken only if it starts with the front matter
     ``layout: post`` / ``title: ...`` / ``tags: [...]`` (in that order);
     other files are silently skipped.  HTML tags, ``{% ... %}`` template
     tags and newlines are stripped from the body.

     Args:
         path: Directory holding the posts.
         rooturl: URL prefix passed on to PostInfo.

     Returns:
         List of PostInfo objects, ordered by file name.
     '''
     pattern = re.compile(r"---\nlayout: post\ntitle: ([^\n]*)\ntags: \[([^\]]*)\]\n---\n(.*)", re.DOTALL)
     infos = []
     # Join names with ``path`` instead of calling os.chdir(): this leaves
     # the process working directory alone and also works for relative paths.
     for f in sorted(os.listdir(path)):
         if not f.endswith(".html"):
             continue
         with open(os.path.join(path, f), "r") as fh:
             post = fh.read()
         match = pattern.match(post)
         if not match:
             continue
         title, tags, content = match.groups()
         content = re.sub("<[^>]*>", "", content)
         content = re.sub("{%[^%]*%}", "", content)
         content = content.replace("\n", "")
         infos.append(PostInfo(title, tags, content, rooturl, f))
     return infos

 def genPostJson(infos):
     '''Print the posts as a JSON array of "ADD" commands.

     Each element has the form ``{"fields": <post attributes>, "cmd": "ADD"}``.

     Args:
         infos: Iterable of objects serializable through ``object2dict``.

     Returns:
         The JSON text that was printed ("[]" when ``infos`` is empty).
     '''
     entries = []
     for info in infos:
         fields = json.dumps(info, ensure_ascii=False, default=object2dict)
         entries.append('''{"fields":''' + fields + ''', "cmd": "ADD"}''')
     # join() replaces the old leading-comma trick, whose guard tested the
     # json module instead of the accumulated string.
     jsonstr = "[" + ",".join(entries) + "]"
     print(jsonstr)
     return jsonstr

    
 if __name__ == '__main__':
     # Dump the author's local blog posts to stdout as JSON.
     genPostJson(getPostInfo(r"C:\Documents and Settings\yhzhtk\blog\_posts"))
	# -*- coding: cp936 -*-
	'''
	Created on 2013-6-19
	@author: gdh
	'''

	import sys
	import urllib

	def GetCateKeys(fileName, path="d:\\meta\\",
	                metas=("bing", "jike", "sogou", "soso", "panguso", "youdao")):
	    '''Group keywords by category and dump selected categories to files.

	    Each line of ``fileName`` is split on whitespace; field 11 is used as
	    the category and field 5 as the keyword.  Keywords are collected per
	    category in first-seen order without duplicates.  For every category
	    listed in ``metas`` the last 100 keywords are written, one per line,
	    to ``path + category + ".txt"``.

	    Args:
	        fileName: Path of the whitespace-delimited input file.
	        path: Prefix prepended to each output file name.  It is
	            concatenated as-is, so it must end with a path separator.
	        metas: Categories for which an output file is produced.

	    Lines with fewer than 12 fields are reported on stdout and skipped.
	    '''
	    with open(fileName, "r") as f:
	        lines = f.readlines()

	    maps = {}
	    for l in lines:
	        fs = l.split()
	        try:
	            cate, key = fs[11], fs[5]
	        except IndexError as e:
	            # Malformed (too short) line: report it and carry on.
	            print("%s %s" % (e, fs))
	            continue
	        keys = maps.setdefault(cate, [])
	        if key not in keys:
	            keys.append(key)

	    for k in maps:
	        if k in metas:
	            # "with" guarantees the output is flushed and closed.
	            with open(path + k + ".txt", 'w') as write:
	                for key in maps[k][-100:]:
	                    write.write(key + "\n")

	def DecodeUrl(fileName, outName="d:/urls.txt"):
	    '''Percent-encode every line of a file and write the results out.

	    Each line of ``fileName`` (including its trailing newline) is passed
	    through ``quote`` and written to ``outName`` followed by a newline.
	    The number of input lines is printed at the end.

	    NOTE(review): despite the name, this encodes rather than decodes;
	    confirm with callers whether ``unquote`` was intended.

	    Args:
	        fileName: Path of the input file, one URL per line.
	        outName: Path of the output file.
	    '''
	    # urllib.quote lives in urllib.parse on Python 3.
	    try:
	        quote = urllib.quote
	    except AttributeError:
	        from urllib.parse import quote

	    with open(fileName, "r") as f:
	        lines = f.readlines()

	    # "with" guarantees the output is flushed and closed.
	    with open(outName, 'w') as write:
	        for l in lines:
	            write.write(quote(l) + "\n")
	    print('count：%d' % len(lines))


	if __name__ == '__main__':
	    # The input file name is expected as the first command-line argument.
	    if sys.argv[1:]:
	        GetCateKeys(sys.argv[1])
	    else:
	        print("need fileName")
	# -*- coding: utf-8 -*-
	'''
	解析url中的关键字，参数一是解析的文件，参数二是输出的中文关键字，最后打印行数
	@author: gdh
	'''

	import os
	import sys
	import urllib

	# Pick the helpers from wherever this Python version keeps them.
	try:
	    from shlex import quote as _shquote            # Python 3
	except ImportError:
	    from pipes import quote as _shquote            # Python 2
	try:
	    from urllib import unquote as _unquote         # Python 2
	except ImportError:
	    from urllib.parse import unquote as _unquote   # Python 3

	# Per input line: take the second field when splitting on "key=", cut it
	# at "&pageNo", keep the first whitespace-delimited token, then collapse
	# adjacent duplicates.  The stages are joined with real shell pipes (a
	# backslash-escaped "|" would be handed to cat as a literal argument),
	# and the file name is shell-quoted so that spaces or metacharacters in
	# it cannot break (or inject into) the pipeline.
	shstr = "cat " + _shquote(sys.argv[1]) + ''' | awk -F "key=" '{print $2}' | awk -F "&pageNo" '{print $1}' | awk '{print $1}' | uniq '''
	res = os.popen(shstr).read()

	# splitlines() avoids the phantom empty entry that split("\n") yields
	# after the final newline, so the count printed below is the real one.
	lines = res.splitlines()

	# "with" guarantees the output is flushed and closed.
	with open(sys.argv[2], "w") as write:
	    for line in lines:
	        write.write(_unquote(line) + "\n")

	print(len(lines))
	#coding=utf-8
	'''
	Export the posts under the Jekyll _posts directory as JSON, for use with Aliyun search.
	@author: gdh
	'''

	import os
	import re
	import json
	import time

	def object2dict(obj):
	    '''Return a shallow copy of ``obj``'s attribute dictionary.

	    Passed as the ``default`` hook of ``json.dumps`` so that plain objects
	    serialize as the mapping of their instance attributes.
	    '''
	    d = {}
	    d.update(obj.__dict__)
	    return d

	class PostInfo(object):
	    '''Search-index record for one Jekyll post.

	    Instances carry only plain attributes so that they can be serialized
	    through ``obj.__dict__``.
	    '''

	    def __init__(self, title, tags, content, rooturl, f):
	        '''Build the record from a post's front matter and file name.

	        Args:
	            title: Post title.
	            tags: Comma-separated tag names; blank entries are ignored.
	            content: Post body text.
	            rooturl: URL prefix prepended as-is to the dated post path.
	            f: Post file name of the form ``YYYY-MM-DD-slug.html``.

	        Raises:
	            ValueError: If ``f`` does not start with a ``YYYY-MM-DD`` date.
	        '''
	        self.title = title
	        # Tags are stored as a {name: 1} mapping; skip empty names such as
	        # the one an empty tag list would otherwise produce.
	        self.tag = {}
	        for name in tags.split(","):
	            name = name.strip()
	            if name:
	                self.tag[name] = 1
	        self.body = content
	        self.id = f.replace("-", "").replace(".html", "")
	        self.display_text = "open search"
	        self.hit_num = "88888"

	        m = re.match(r'\d{4}-\d{2}-\d{2}', f)
	        if m is None:
	            raise ValueError("post file name lacks a YYYY-MM-DD prefix: %r" % (f,))
	        date = m.group()
	        # 2013-06-19-slug.html -> 2013/06/19/slug.html; only the leading
	        # date is rewritten, dashes inside the slug are kept.
	        self.url = rooturl + f.replace(date + "-", date.replace("-", "/") + "/", 1)
	        # Local-time midnight of the post date, as integer seconds.
	        stamp = "%d" % time.mktime(time.strptime(date, '%Y-%m-%d'))
	        self.update_timestamp = stamp
	        self.create_timestamp = stamp
	        self.type_id = "1"
	        self.cat_id = [1, 2]
	        self.author = "yhzhtk"
	        self.grade = "10"
	        self.source = "yhzhtk"
	        self.boost = "1"


	def getPostInfo(path, rooturl="http://yhzhtk.info/"):
	    '''Collect a PostInfo for every ".html" post found in ``path``.

	    A file is taken only if it starts with the front matter
	    ``layout: post`` / ``title: ...`` / ``tags: [...]`` (in that order);
	    other files are silently skipped.  HTML tags, ``{% ... %}`` template
	    tags and newlines are stripped from the body.

	    Args:
	        path: Directory holding the posts.
	        rooturl: URL prefix passed on to PostInfo.

	    Returns:
	        List of PostInfo objects, ordered by file name.
	    '''
	    # The title and tag groups need their "*" quantifiers: without them
	    # only one-character titles and tag lists could ever match.
	    pattern = re.compile(r"---\nlayout: post\ntitle: ([^\n]*)\ntags: \[([^\]]*)\]\n---\n(.*)", re.DOTALL)
	    infos = []
	    # Join names with ``path`` instead of calling os.chdir(): this leaves
	    # the process working directory alone and also works for relative paths.
	    for f in sorted(os.listdir(path)):
	        if not f.endswith(".html"):
	            continue
	        with open(os.path.join(path, f), "r") as fh:
	            post = fh.read()
	        match = pattern.match(post)
	        if not match:
	            continue
	        title, tags, content = match.groups()
	        content = re.sub("<[^>]*>", "", content)
	        content = re.sub("{%[^%]*%}", "", content)
	        content = content.replace("\n", "")
	        infos.append(PostInfo(title, tags, content, rooturl, f))
	    return infos

	def genPostJson(infos):
	    '''Print the posts as a JSON array of "ADD" commands.

	    Each element has the form ``{"fields": <post attributes>, "cmd": "ADD"}``.

	    Args:
	        infos: Iterable of objects serializable through ``object2dict``.

	    Returns:
	        The JSON text that was printed ("[]" when ``infos`` is empty).
	    '''
	    entries = []
	    for info in infos:
	        fields = json.dumps(info, ensure_ascii=False, default=object2dict)
	        entries.append('''{"fields":''' + fields + ''', "cmd": "ADD"}''')
	    # join() replaces the old leading-comma trick, whose guard tested the
	    # json module instead of the accumulated string.
	    jsonstr = "[" + ",".join(entries) + "]"
	    print(jsonstr)
	    return jsonstr


	if __name__ == '__main__':
	    # Dump the author's local blog posts to stdout as JSON.
	    path = r"C:\Documents and Settings\yhzhtk\blog\_posts"
	    infos = getPostInfo(path)
	    genPostJson(infos)