Last active
December 18, 2015 21:39
-
-
Save Yhzhtk/5848852 to your computer and use it in GitHub Desktop.
AnalyKeys.py 一个简单的脚本程序。GetCateKeys 分析关键字的分类,并按分类保存。DecodeUrl 将URL中带有中文的转码成%形式。 PostJson.py 将jekyll中_posts目录下的文章导出为json格式,满足阿里云搜索导入的数据格式。 GetKeys.py 解析url中的关键字,参数一是待解析的文件,参数二是输出中文关键字的文件,最后打印行数。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: cp936 -*- | |
''' | |
Created on 2013-6-19 | |
@author: gdh | |
''' | |
import sys | |
import urllib | |
def GetCateKeys(fileName, path="d:\\meta\\",
                metas=("bing", "jike", "sogou", "soso", "panguso", "youdao")):
    '''Group keywords by category and save the selected categories to files.

    Every line of ``fileName`` is split on whitespace; field 11 is used as
    the category and field 5 as the keyword (both zero-based).  Each distinct
    keyword is recorded once per category, in first-seen order.  For every
    category that appears in ``metas``, the last 100 recorded keywords are
    written, one per line, to ``path + category + ".txt"``.

    Args:
        fileName: Path of the whitespace-delimited input file.
        path: Prefix prepended verbatim to every output file name, so it
            must end with a path separator.
        metas: Categories whose keywords are written out.

    Lines with fewer than 12 fields are reported on stdout and skipped.
    '''
    with open(fileName, "r") as f:
        lines = f.readlines()

    maps = {}   # category -> keywords in first-seen order
    seen = {}   # category -> set of keywords, for O(1) duplicate checks
    for l in lines:
        fs = l.split()
        try:
            cate, key = fs[11], fs[5]
        except IndexError as e:
            # Best effort: report the malformed line and keep going.
            print("%s %s" % (e, fs))
            continue
        known = seen.setdefault(cate, set())
        if key not in known:
            known.add(key)
            maps.setdefault(cate, []).append(key)

    for k in maps:
        if k in metas:
            # The context manager guarantees the file is flushed and closed.
            with open(path + k + ".txt", 'w') as out:
                for kw in maps[k][-100:]:
                    out.write(kw + "\n")
def DecodeUrl(fileName, outName="d:/urls.txt"):
    '''Percent-encode every line of a file and write the results to a file.

    Each input line is stripped of its line terminator, passed through
    ``quote`` with its default settings (so "/" is left unescaped) and
    written to ``outName`` followed by a newline.  The number of input lines
    is printed at the end.

    Args:
        fileName: Path of the input file, one URL per line.
        outName: Path of the output file; it is overwritten.
    '''
    # ``quote`` lives in ``urllib`` on Python 2 and ``urllib.parse`` on 3.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    with open(fileName, "r") as f:
        lines = f.readlines()

    with open(outName, 'w') as out:
        for l in lines:
            # Drop the terminator first so it is not encoded as "%0A".
            out.write(quote(l.rstrip("\r\n")) + "\n")
    print('count:%d' % len(lines))
if __name__ == '__main__':
    # The input file path is expected as the first command-line argument.
    if len(sys.argv) < 2:
        print("need fileName")
    else:
        GetCateKeys(sys.argv[1])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
解析url中的关键字,参数一是解析的文件,参数二是输出的中文关键字,最后打印行数 | |
@author: gdh | |
''' | |
import os | |
import sys | |
import urllib | |
# ``unquote`` lives in ``urllib`` on Python 2 and ``urllib.parse`` on 3.
try:
    from urllib import unquote
except ImportError:
    from urllib.parse import unquote


def _extract_keys(lines):
    '''Return the decoded "key=" values of ``lines``, adjacent repeats merged.

    For each line the text after the first "key=" (up to the next "key=",
    if any) is taken, cut at the first "&pageNo", and reduced to its first
    whitespace-separated token.  A line without "key=" yields an empty
    string.  Consecutive identical values are collapsed into one before
    the values are percent-decoded.
    '''
    keys = []
    for line in lines:
        parts = line.rstrip("\n").split("key=")
        value = parts[1] if len(parts) > 1 else ""
        tokens = value.split("&pageNo")[0].split()
        key = tokens[0] if tokens else ""
        # Only adjacent duplicates are dropped, like the ``uniq`` tool.
        if not keys or keys[-1] != key:
            keys.append(key)
    return [unquote(key) for key in keys]


def main(argv):
    '''Extract keywords from the file ``argv[1]`` into the file ``argv[2]``.

    One decoded keyword is written per line and the number of keywords
    written is printed.  The work is done in-process, so the input file name
    is never interpolated into a shell command.
    '''
    with open(argv[1], "r") as src:
        keys = _extract_keys(src)
    with open(argv[2], "w") as out:
        for key in keys:
            out.write(key + "\n")
    print(len(keys))


if __name__ == '__main__':
    main(sys.argv)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
''' | |
将jekyll中_posts目录下的文章导出为json格式,用于阿里云搜索
@author: gdh | |
''' | |
import os | |
import re | |
import json | |
import time | |
def object2dict(obj):
    '''Return a new dict holding a shallow copy of obj's instance attributes.'''
    return dict(obj.__dict__)
class PostInfo(object):
    '''Search-index record built from one post file.

    The instance attributes are the fields of the record; they are
    assigned in a fixed order so a dump of ``__dict__`` keeps that order
    on interpreters that preserve insertion order.
    '''

    def __init__(self, title, tags, content, rooturl, f):
        '''Build the record.

        Args:
            title: Post title.
            tags: Comma-separated tag names; blank entries are ignored.
            content: Post body text.
            rooturl: URL prefix the post path is appended to.
            f: Post file name; must start with a ``YYYY-MM-DD`` date.

        Raises:
            ValueError: If ``f`` does not start with a ``YYYY-MM-DD`` date,
                or the date cannot be parsed.
        '''
        date_match = re.match('\\d{4}-\\d{2}-\\d{2}', f)
        if date_match is None:
            raise ValueError(
                "post file name must start with YYYY-MM-DD: %r" % (f,))
        day = date_match.group()
        # Creation and update time are both derived from the file name date
        # (interpreted in local time by time.mktime).
        timestamp = "%d" % time.mktime(time.strptime(day, '%Y-%m-%d'))

        self.title = title
        self.tag = {}
        for raw in tags.split(","):
            name = raw.strip()
            # Skip blanks so "" or "a,,b" do not produce an empty tag name.
            if name:
                self.tag[name] = 1
        self.body = content
        self.id = f.replace("-", "").replace(".html", "")
        self.display_text = "open search"
        self.hit_num = "88888"
        # "2013-06-19-name.html" -> "<rooturl>2013/06/19/name.html"
        self.url = rooturl + f.replace(day + "-", (day + "-").replace("-", "/"))
        self.update_timestamp = timestamp
        self.create_timestamp = timestamp
        self.type_id = "1"
        self.cat_id = [1, 2]
        self.author = "yhzhtk"
        self.grade = "10"
        self.source = "yhzhtk"
        self.boost = "1"
def getPostInfo(path, rooturl="http://yhzhtk.info/"):
    '''Build a PostInfo for every matching ".html" post found in ``path``.

    A file is used only when it starts with the front matter block
    "---", "layout: post", "title: ...", "tags: [...]", "---".  HTML tags
    and "{% ... %}" template tags are stripped from the text that follows,
    and its newlines are removed.

    Files are opened by their full path, so the process working directory
    is left untouched and relative ``path`` values work.

    Args:
        path: Directory containing the post files.
        rooturl: URL prefix handed to PostInfo.

    Returns:
        A list of PostInfo objects, one per matching file.
    '''
    pattern = re.compile(r"---\nlayout: post\ntitle: ([^\n]*)\ntags: \[([^\]]*)\]\n---\n(.*)", re.DOTALL)
    infos = []
    for f in os.listdir(path):
        if not f.endswith(".html"):
            continue
        with open(os.path.join(path, f), "r") as fh:
            post = fh.read()
        match = pattern.match(post)
        if not match:
            # Not a post with the expected front matter; ignore it.
            continue
        title, tags, content = match.groups()
        content = re.sub("<[^>]*>", "", content)
        content = re.sub("{%[^%]*%}", "", content)
        content = content.replace("\n", "")
        infos.append(PostInfo(title, tags, content, rooturl, f))
    return infos
def genPostJson(infos):
    '''Print the posts as a JSON array of "ADD" commands and return it.

    Each element of ``infos`` is serialized with ``json.dumps`` (objects are
    converted through ``object2dict``) and wrapped as
    ``{"fields": <post>, "cmd": "ADD"}``.  An empty ``infos`` yields "[]".

    Args:
        infos: Iterable of objects to export.

    Returns:
        The JSON text that was printed.
    '''
    items = []
    for info in infos:
        fields = json.dumps(info, ensure_ascii=False, default=object2dict)
        items.append('''{"fields":''' + fields + ''', "cmd": "ADD"}''')
    # Joining avoids the leading-comma bookkeeping of repeated concatenation.
    jsonstr = "[" + ",".join(items) + "]"
    print(jsonstr)
    return jsonstr
if __name__ == '__main__':
    # Export every post found in the hard-coded local _posts directory.
    genPostJson(getPostInfo(r"C:\Documents and Settings\yhzhtk\blog\_posts"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment