Created
February 6, 2013 12:56
-
-
Save tianweidut/4722360 to your computer and use it in GitHub Desktop.
CNKI 专利索引分析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*-
'''
Created on 2011-12-17
@author: tianwei
'''
import os,sys | |
# Directory holding the input text files. It is concatenated directly with
# each file name further down, so it must end with a path separator.
objPath = 'F:\\txt\\'
def getFileList(path):
    """Return the names of the regular files directly inside *path*.

    Sub-directories are skipped and the listing is not recursive.  Only the
    bare file names are returned (not full paths), in the order produced by
    ``os.listdir``.

    :param path: directory to list; converted with ``str()``.
    :returns: list of file names, or ``[]`` when *path* is the empty string.
    :raises OSError: propagated from ``os.listdir`` when the directory
        cannot be read.
    """
    folder = str(path)
    if folder == "":
        return []
    # os.path.join supplies the right separator for the platform.  The
    # previous hand-rolled version doubled every backslash and appended a
    # literal '\\', which corrupts UNC prefixes and fails outside Windows.
    return [name for name in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, name))]
# Names of the input files found in objPath.
files = getFileList(objPath)

# Per-author statistics, keyed by author name.  The empty-name entry is a
# placeholder that is present from the start.
parseRecord = {
    "": {
        "cnt": 0,
        # Possible addresses recorded when the author is not the first author.
        "location": {},
        # Address -> year, filled from single-address records and
        # first-author records.
        "truelocation": {"": 0},
        # The address finally chosen after comparison and analysis.
        "finallocation": "",
        "title": {"": ["", 0]},
    },
}

# Fields of the record currently being read.
title, postface = '', ''
author, location = [], []
year = 0
def _year_as_int(value):
    """Return *value* as an int so years compare numerically; 0 if not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _keep_latest_address(entry, place, when):
    """Make *place* the trusted address of *entry* when *when* is newer.

    ``entry['truelocation']`` maps an address to the year it was seen.  The
    older address is dropped on replacement, so the mapping never holds more
    than the most recent one.
    """
    trusted = entry['truelocation']
    if trusted:
        newest = max(_year_as_int(seen) for seen in trusted.values())
        if _year_as_int(when) <= newest:
            return
        trusted.clear()
    trusted[place] = when


def _store_record(records, rec_title, rec_authors, rec_places, rec_journal, rec_year):
    """Fold one finished record into *records* for each of its authors."""
    for name in rec_authors:
        if name in records:
            # Author already known: one more publication.
            records[name]['cnt'] += 1
        else:
            # First time this author is seen.
            records[name] = {"cnt":1,"location":{0:[]},"truelocation":{},"finallocation":"","title":{"":["",0]}}
        entry = records[name]
        entry['title'][rec_title] = [rec_journal, rec_year]
        if len(rec_places) == 1:
            # A single address is taken as this author's address.
            _keep_latest_address(entry, rec_places[0], rec_year)
        elif len(rec_places) > 1:
            if name == rec_authors[0]:
                # First author: the first listed address is used.
                _keep_latest_address(entry, rec_places[0], rec_year)
            else:
                # Other authors: remember every candidate address for the year.
                entry['location'][rec_year] = rec_places
        # No address at all: nothing to record.


# Read every input file.  A record is a run of "key:value" lines and ends at
# a blank line.
for fileStr in files:
    # Read-only access is enough; the context manager closes the file even
    # if reading fails.
    with open(os.path.join(objPath, fileStr), "r") as handle:
        list_files = handle.readlines()
    # The extra "\n" flushes the last record when the file does not end with
    # a blank line, and keeps its fields from leaking into the next file.
    for line in list_files + ["\n"]:
        if line != "\n":
            # Split on the first colon only, so values that themselves
            # contain ':' are kept whole; lines without a colon are ignored.
            key, sep, value = line.strip('\n').partition(':')
            if not sep:
                continue
            value = value.strip()
            if key == '题名':
                title = value
            elif key == '作者':
                author = value.strip(';').split(';')
            elif key == '单位':
                location = value.strip(';').split(';')
            elif key == '刊名':
                postface = str(value)
            elif key == '年':
                year = value
        else:
            # Blank line: the record is complete.  An empty title means this
            # is just an additional blank line, so there is nothing to store.
            if title != '':
                _store_record(parseRecord, title, author, location, postface, year)
            # Reset the fields for the next record.
            title = ''
            author = []
            location = []
            postface = ''
            year = 0
def _year_rank(value):
    """Return *value* as an int for ordering; 0 when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _most_listed_unit(candidates):
    """Return the unit listed most often in *candidates*, or "" if none.

    *candidates* maps a year to a list of units.  On a tie the unit seen
    first (walking the years in ascending order) wins.
    """
    tally = {}
    seen_order = []
    for when in sorted(candidates, key=_year_rank):
        for unit in candidates[when]:
            if not unit:
                continue
            if unit not in tally:
                tally[unit] = 0
                seen_order.append(unit)
            tally[unit] += 1
    best, best_count = "", 0
    for unit in seen_order:
        if tally[unit] > best_count:
            best, best_count = unit, tally[unit]
    return best


# Decide each author's final address.
for tmp in parseRecord:
    trusted = parseRecord[tmp]['truelocation']
    if trusted:
        # Take the address with the latest year; the address text breaks
        # ties so the choice does not depend on dict ordering.
        parseRecord[tmp]["finallocation"] = max(
            trusted, key=lambda place: (_year_rank(trusted[place]), place))
    else:
        # No trusted address: fall back to the unit that appears most often
        # among the candidate addresses, otherwise the first one.
        parseRecord[tmp]["finallocation"] = _most_listed_unit(parseRecord[tmp]['location'])
# All files have been processed: write one line per author in the form
# "<author>\t<publication count>\t<address>".
# The context manager guarantees the file is flushed and closed; the previous
# code referenced f.close without calling it.
with open('C:\\' + 'result.txt', "w") as f:
    for tmp in parseRecord:
        f.write(tmp + '\t' + str(parseRecord[tmp]['cnt']) + '\t')
        places = [str(l.strip('\n')) for l in parseRecord[tmp]['truelocation']]
        # Separate multiple addresses with ';' instead of running them
        # together; a single address is written exactly as before.
        f.write(';'.join(place for place in places if place))
        f.write('\n')
# Completion marker on stdout (call form prints the same text on Python 2 and 3).
print('!*_*!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment