Skip to content

Instantly share code, notes, and snippets.

@Yhzhtk
Last active December 18, 2015 21:39
Show Gist options
  • Save Yhzhtk/5848852 to your computer and use it in GitHub Desktop.
Save Yhzhtk/5848852 to your computer and use it in GitHub Desktop.
AnalyKeys.py 一个简单的脚本程序。GetCateKeys 分析关键字的分类,并按分类保存。DecodeUrl 将URL中带有中文的转码成%形式。 PostJson.py 将jekyll中_posts目录下的文章导出为json格式,满足阿里云搜索导入的数据格式。 GetKeys.py 解析url中的关键字,参数一是解析的文件,参数二是输出的中文关键字,最后打印行数
# -*- coding: cp936 -*-
'''
Created on 2013-6-19
@author: gdh
'''
import sys
import urllib
def GetCateKeys(fileName, outDir="d:\\meta\\"):
    '''Group keywords by category and save the tracked categories to files.

    Every line of fileName is split on whitespace. The 12th field
    (index 11) is taken as the category and the 6th field (index 5) as the
    keyword. Keywords are collected per category without duplicates, in
    order of first appearance. Lines with fewer than 12 fields are
    reported on stdout and skipped.

    For each category listed in metas, the last 100 collected keywords are
    written one per line to "<outDir><category>.txt".

    fileName -- path of the whitespace-separated input file.
    outDir   -- output directory prefix; it is concatenated directly with
                the file name, so it must end with a path separator.
    '''
    with open(fileName, "r") as src:
        lines = src.readlines()

    maps = {}
    for line in lines:
        fs = line.split()
        try:
            cate = fs[11]
        except IndexError as e:
            # Too few fields on this line: report it and carry on.
            print("%s %s" % (e, fs))
            continue
        keys = maps.setdefault(cate, [])
        if fs[5] not in keys:
            keys.append(fs[5])

    metas = ("bing", "jike", "sogou", "soso", "panguso", "youdao")
    for cate in metas:
        if cate not in maps:
            continue
        # "with" guarantees the file is flushed and closed; the original
        # left every output handle open.
        with open(outDir + cate + ".txt", 'w') as out:
            for key in maps[cate][-100:]:
                out.write(key + "\n")
def DecodeUrl(fileName, outFile="d:/urls.txt"):
    '''Percent-encode the URLs listed in fileName and save them to outFile.

    Each input line is treated as one URL. Characters outside the URL
    character set (for example Chinese text or spaces) are converted to
    their %XX form, while URL delimiters such as ":", "/", "?", "&" and "="
    and existing "%" escapes are left untouched. One output line is written
    per input line, and the number of input lines is printed at the end.

    fileName -- path of the file holding one URL per line.
    outFile  -- path of the file the encoded URLs are written to.
    '''
    # Function-scope import so the block runs on both Python 2 and 3.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # Reserved/delimiter characters that must survive quoting; "%" is kept
    # so that already-encoded sequences are not encoded a second time.
    urlSafe = "/:?#[]@!$&'()*+,;=%~"

    with open(fileName, "r") as src:
        lines = src.readlines()

    with open(outFile, 'w') as out:
        for line in lines:
            # Drop the line terminator first, otherwise it would be encoded
            # as %0A (and %0D) and end up inside the URL.
            out.write(quote(line.rstrip("\r\n"), safe=urlSafe) + "\n")

    print('count:%d' % len(lines))
if __name__ == '__main__':
    # The file to analyse is expected as the first command-line argument.
    if len(sys.argv) < 2:
        print("need fileName")
    else:
        GetCateKeys(sys.argv[1])
# -*- coding: utf-8 -*-
'''
解析url中的关键字,参数一是解析的文件,参数二是输出的中文关键字,最后打印行数
@author: gdh
'''
import os
import sys
import urllib
def _extractKeys(lines):
    '''Return the still-encoded keywords found in an iterable of lines.

    For every line the text after the first "key=" is taken, cut at the
    first "&pageNo", and reduced to its first whitespace-separated word.
    Lines without "key=" yield an empty keyword. Consecutive duplicates are
    collapsed into a single entry.
    '''
    keys = []
    for line in lines:
        parts = line.rstrip("\n").split("key=")
        value = parts[1] if len(parts) > 1 else ""
        words = value.split("&pageNo")[0].split()
        key = words[0] if words else ""
        # Only adjacent repeats are dropped; the same key may reappear later.
        if not keys or keys[-1] != key:
            keys.append(key)
    return keys


def GetKeys(srcFile, dstFile):
    '''Decode the keywords of srcFile into dstFile and return their count.

    The keywords are extracted from the lines of srcFile, URL-decoded and
    written to dstFile one per line.

    srcFile -- path of the file whose lines contain "key=<keyword>" URLs.
    dstFile -- path of the file receiving the decoded keywords.
    '''
    # Function-scope import so the block runs on both Python 2 and 3.
    try:
        from urllib import unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote  # Python 3

    # Parsing in Python avoids passing the file name through a shell, where
    # spaces or metacharacters in it would be interpreted as commands.
    with open(srcFile, "r") as src:
        keys = _extractKeys(src)

    with open(dstFile, "w") as dst:
        for key in keys:
            dst.write(unquote(key) + "\n")
    return len(keys)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("usage: GetKeys.py <input file> <output file>")
        sys.exit(1)
    print(GetKeys(sys.argv[1], sys.argv[2]))
#coding=utf-8
'''
将jekyll中_posts目录下的文章导出为json格式,用于阿里云搜索
@author: gdh
'''
import os
import re
import json
import time
def object2dict(obj):
    '''Return a new dict holding a shallow copy of obj's instance attributes.'''
    return dict(obj.__dict__)
class PostInfo():
    '''Search-index record built from one Jekyll post file.

    The instance attributes are what gets serialised to JSON, so their
    names are the field names of the exported document.
    '''

    def __init__(self, title, tags, content, rooturl, f):
        '''Build the record.

        title   -- post title.
        tags    -- comma-separated tag names; blank entries are ignored.
        content -- post body text.
        rooturl -- site root URL, prepended to the post's relative URL.
        f       -- post file name, expected to look like
                   "YYYY-MM-DD-name.html".

        Raises ValueError when f does not start with a YYYY-MM-DD date.
        '''
        dateMatch = re.match('\\d{4}-\\d{2}-\\d{2}', f)
        if dateMatch is None:
            # Without this check the failure is an opaque AttributeError
            # on None that does not name the offending file.
            raise ValueError("post file name must start with YYYY-MM-DD: %r" % (f,))
        date = dateMatch.group()

        self.title = title
        # Tags are stored as a {name: 1} mapping. Empty names (from an
        # empty tag list or a trailing comma) are skipped so that no ""
        # key is emitted.
        self.tag = {}
        for name in tags.split(","):
            name = name.strip()
            if name:
                self.tag[name] = 1
        self.body = content
        self.id = f.replace("-", "").replace(".html", "")
        self.display_text = "open search"
        self.hit_num = "88888"
        # "2013-06-19-name.html" becomes "<rooturl>2013/06/19/name.html".
        self.url = rooturl + f.replace(date + "-", (date + "-").replace("-", "/"))
        # Midnight of the post date in local time, as a decimal string.
        stamp = "%d" % time.mktime(time.strptime(date, '%Y-%m-%d'))
        self.update_timestamp = stamp
        self.create_timestamp = stamp
        self.type_id = "1"
        self.cat_id = [1, 2]
        self.author = "yhzhtk"
        self.grade = "10"
        self.source = "yhzhtk"
        self.boost = "1"
def getPostInfo(path, rooturl="http://yhzhtk.info/"):
    '''Collect a PostInfo for every recognised post in a directory.

    Every "*.html" file directly inside path is read, in sorted file-name
    order. Files whose text starts with the front matter

        ---
        layout: post
        title: <title>
        tags: [<tags>]
        ---

    are converted; all other files are skipped. The body has HTML tags and
    "{% ... %}" template tags removed and its newlines deleted.

    path    -- directory containing the post files.
    rooturl -- site root URL passed on to PostInfo.

    Returns the list of PostInfo objects.
    '''
    pattern = re.compile(r"---\nlayout: post\ntitle: ([^\n]*)\ntags: \[([^\]]*)\]\n---\n(.*)", re.DOTALL)
    infos = []
    for f in sorted(os.listdir(path)):
        if not f.endswith(".html"):
            continue
        # Join with the directory instead of chdir-ing into it: this keeps
        # the process working directory intact and makes relative paths work.
        with open(os.path.join(path, f), "r") as src:
            post = src.read()
        match = pattern.match(post)
        if not match:
            continue
        title, tags, content = match.groups()
        content = re.sub("<[^>]*>", "", content)
        content = re.sub("{%[^%]*%}", "", content)
        content = content.replace("\n", "")
        infos.append(PostInfo(title, tags, content, rooturl, f))
    return infos
def genPostJson(infos):
    '''Print the posts as a JSON array of "ADD" commands and return it.

    Each element of infos is serialised with json.dumps (objects that json
    cannot handle natively are converted through object2dict) and wrapped
    as {"fields": <post>, "cmd": "ADD"}. The entries are joined into one
    JSON array; an empty infos yields "[]".

    infos -- iterable of post records.

    Returns the JSON text that was printed.
    '''
    entries = []
    for info in infos:
        fields = json.dumps(info, ensure_ascii=False, default=object2dict)
        entries.append('''{"fields":''' + fields + ''', "cmd": "ADD"}''')
    # Joining once avoids both the quadratic "+=" build-up and the
    # leading-comma trimming, whose guard tested the json module by mistake.
    jsonstr = "[" + ",".join(entries) + "]"
    print(jsonstr)
    return jsonstr
if __name__ == '__main__':
    # Directory holding the Jekyll posts to export.
    postsDir = r"C:\Documents and Settings\yhzhtk\blog\_posts"
    genPostJson(getPostInfo(postsDir))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment