Skip to content

Instantly share code, notes, and snippets.

@tianweidut
Created February 6, 2013 12:56
Show Gist options
  • Save tianweidut/4722360 to your computer and use it in GitHub Desktop.
Save tianweidut/4722360 to your computer and use it in GitHub Desktop.
CNKI 专利索引分析
# -*- coding:utf-8 -*-
'''
Created on 2011-12-17
@author: tianwei
'''
import os,sys
# Directory scanned for the input text files (Windows path, written with a
# trailing backslash).
objPath = 'F:\\txt\\'
def getFileList(path):
    """Return the names of the regular files directly inside *path*.

    Sub-directories are skipped and the names are returned without the
    directory prefix.  An empty *path* yields an empty list.  The path is
    handed to the OS unchanged (no backslash doubling, no hard-coded
    separator), so it works with or without a trailing separator and on
    any platform.

    Raises OSError if *path* cannot be listed.
    """
    directory = str(path)
    if directory == "":
        return []
    return [name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))]
# Names of the regular files found in the input directory.
files = getFileList(objPath)

# Per-author statistics, keyed by author name.  The "" entry is a seed that
# shows the shape of one entry.
parseRecord = {
    "": {
        "cnt": 0,
        # year -> list of affiliations; filled for non-first authors of
        # records that list several affiliations
        "location": {},
        # affiliation -> year of the record it was taken from
        "truelocation": {"": 0},
        # affiliation finally chosen for the author
        "finallocation": "",
        # record title -> [journal name, year]
        "title": {"": ["", 0]},
    },
}

# Fields of the record currently being read.
title = postface = ''
author, location = [], []
year = 0
def _yearRank(value):
    """Return *value* as an int for chronological comparison.

    Missing or non-numeric years rank as 0, i.e. older than any real year.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _noteTrueLocation(entry, place, year):
    """Store *place* as the author's affiliation if *year* is the newest seen.

    entry['truelocation'] maps affiliation -> year.  A newer candidate
    replaces the stored one instead of being added next to it; an older or
    equally old candidate is ignored.
    """
    known = entry['truelocation']
    if known:
        newest = max(_yearRank(seen) for seen in known.values())
        if _yearRank(year) <= newest:
            return
        known.clear()
    known[place] = year


def _commitRecord(records, title, author, location, postface, year):
    """Fold one parsed record into *records*, one entry per author.

    A record without a title is ignored; this also makes runs of several
    blank lines harmless.
    """
    if title == '':
        return
    for name in author:
        if name in records:
            # Author already known: one more record.
            records[name]['cnt'] += 1
        else:
            # First record seen for this author.
            records[name] = {"cnt":1,"location":{0:[]},"truelocation":{},"finallocation":"","title":{"":["",0]}}
        entry = records[name]
        entry['title'][title] = [postface, year]
        if not location:
            # No affiliation given for this record.
            continue
        if len(location) == 1 or name == author[0]:
            # A single affiliation applies to every author; with several
            # affiliations only the first author is matched to the first one.
            _noteTrueLocation(entry, location[0], year)
        else:
            # Other authors: remember all candidate affiliations by year.
            entry['location'][year] = location


# Read every input file.  A record is a run of "label:value" lines and is
# terminated by a blank line.
for fileStr in files:
    with open(os.path.join(objPath, fileStr), "r") as srcFile:
        list_files = srcFile.readlines()
    # A trailing separator makes sure the last record is committed even when
    # the file does not end with a blank line, and keeps its fields from
    # leaking into the next file.
    list_files.append("\n")
    for line in list_files:
        if line != "\n":
            # Still inside the current record.  Split on the first colon
            # only, so a value that itself contains ':' is kept whole.
            key, _, value = line.strip('\n').partition(':')
            value = value.strip()
            if key == '题名':      # title
                title = value
            elif key == '作者':    # authors, ';'-separated
                author = value.strip(';').split(';')
            elif key == '单位':    # affiliations, ';'-separated
                location = value.strip(';').split(';')
            elif key == '刊名':    # journal name
                postface = value
            elif key == '年':      # year
                year = value
        else:
            # Record finished: store it, then start a fresh one.
            _commitRecord(parseRecord, title, author, location, postface, year)
            title = ''
            author = []
            location = []
            postface = ''
            year = 0
# Settle each author's final affiliation.
for tmp in parseRecord:
    candidates = parseRecord[tmp]['truelocation']
    if candidates:
        # Take whichever affiliation the dict yields first.
        parseRecord[tmp]["finallocation"] = next(iter(candidates))
    # TODO: when there is no 'truelocation' candidate, choose from
    # 'location' instead: the affiliation that occurs most often,
    # otherwise the first one.  Not implemented yet.
# All files are processed: write one line per author to the result file,
# formatted as "<author>\t<record count>\t<affiliation(s)>".
# The with-block closes (and flushes) the file; the original `f.close`
# lacked the call parentheses and never closed it.
with open('C:\\result.txt', "w") as f:
    for tmp in parseRecord:
        f.write(tmp+'\t'+str(parseRecord[tmp]['cnt'])+'\t')
        # NOTE(review): if an author has several 'truelocation' entries they
        # are written back-to-back with no delimiter.
        for place in parseRecord[tmp]['truelocation']:
            f.write(str(place.strip('\n')))
        f.write('\n')
# Parenthesised form prints the same text on Python 2 and Python 3.
print('!*_*!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment