tianweidut · February 6, 2013 12:56
diff --git a/gistfile1.py b/gistfile1.py
 # -*- coding:utf-8 -*-
 '''
 Created on 2011-12-17

 @author: tianwei

 '''

 import os,sys

 objPath = 'F:\\txt\\'

 def getFileList(path):
     """Return the names of the regular files located directly inside *path*.

     path -- directory to scan; it is converted with str() first.  An empty
             string yields an empty list instead of scanning the current
             directory.

     Only bare file names are returned (no directory prefix), in the order
     produced by os.listdir(); sub-directories are left out.  Errors raised
     by os.listdir() (for example a missing directory) propagate to the caller.
     """
     p = str(path)
     if p == "":
         return []
     # os.path.join supplies the right separator for the running platform, so
     # the directory string is used untouched: no backslash doubling (which
     # mangles UNC paths) and no hard-coded trailing '\\'.
     return [x for x in os.listdir(p) if os.path.isfile(os.path.join(p, x))]


 # Names of the record files found directly under objPath.
 files = getFileList(objPath)

 # Per-author table keyed by author name, seeded with an empty-name
 # placeholder entry.
 parseRecord = {
     "": {
         "cnt": 0,
         # possible addresses seen while the author was not first author
         "location": {},
         # trusted address candidates (original note, roughly: "apart from
         # changed affiliation addresses")
         "truelocation": {"": 0},
         # the address judged correct after comparison and analysis
         "finallocation": "",
         "title": {"": ["", 0]},
     },
 }

 # Fields of the record currently being read; they are filled in line by line.
 title = postface = ''
 author, location = [], []
 year = 0

 def _updateTrueLocation(record, place, year):
     """Store *place* as a trusted affiliation if it is the first or a newer one.

     record -- one author's entry of parseRecord.
     place  -- affiliation string to store.
     year   -- value stored alongside it; the raw string read from the file,
               or the int 0 when the record had no year line.
     """
     known = record['truelocation']
     # The comparison is made against the first stored key only, as the
     # original code did.  NOTE(review): comparing a year string with the
     # int 0 relies on Python 2 ordering rules.
     if not known or year > known[next(iter(known))]:
         known[place] = year


 def _commitRecord(title, author, location, postface, year):
     """Fold one finished record into parseRecord, once per listed author."""
     for tmp in author:
         if tmp in parseRecord:      # author already known
             parseRecord[tmp]['cnt'] = parseRecord[tmp]['cnt'] + 1
         else:                       # first appearance of this author
             parseRecord[tmp] = {"cnt":1,"location":{0:[]},"truelocation":{},"finallocation":"","title":{"":["",0]}}
         record = parseRecord[tmp]
         record['title'][title] = [postface, year]

         if not location:            # record carries no address at all
             continue
         if len(location) == 1 or tmp == author[0]:
             # A single address, or the first author of a multi-address
             # record: the first address is a trusted candidate.
             _updateTrueLocation(record, location[0], year)
         else:
             # Other authors: keep every possible affiliation, keyed by year.
             record['location'][year] = location


 # Read every record file.  A record is a run of "key:value" lines ended by
 # a blank line; finished records are merged into parseRecord.
 for fileStr in files:
     # Opened read-only ("r" instead of "r+") since nothing is written back;
     # the with-block closes the file even if parsing raises.
     with open(str(objPath+fileStr),"r") as recordFile:
         for line in recordFile:
             if line != "\n":
                 # Still inside the current record.  Split on the first ':'
                 # only, so values that contain a colon stay intact.
                 item = line.strip('\n').split(':', 1)
                 if len(item) < 2:
                     continue        # no "key:value" shape, nothing to read
                 key = item[0]
                 value = item[1].strip()
                 if key == '题名':
                     title = value
                 elif key == '作者':
                     author = value.strip(';').split(';')
                 elif key == '单位':
                     location = value.strip(';').split(';')
                 elif key == '刊名':
                     postface = str(value)
                 elif key == '年':
                     year = value
             elif title != '':
                 # Blank line after a record with a title: commit it.  The
                 # title test tells a real record end from repeated blank lines.
                 _commitRecord(title, author, location, postface, year)
                 title =''
                 author =[]
                 location = []
                 postface =''
                 year = 0
     # End of file: commit a last record that was not followed by a blank
     # line, then reset so no field leaks into the next file.
     if title != '':
         _commitRecord(title, author, location, postface, year)
     title =''
     author =[]
     location = []
     postface =''
     year = 0
    
 # Settle each author's final affiliation: when trusted candidates exist,
 # the first key of 'truelocation' is taken.
 for tmp in parseRecord:
     trusted = parseRecord[tmp]['truelocation']
     if len(trusted) == 0:
         # Not implemented: pick from 'location' the affiliation that occurs
         # most often, otherwise the first one.
         continue
     parseRecord[tmp]["finallocation"] = trusted.keys()[0]

 #所有的文件处理完成，进行最终的处理，并写入文件
 f = open(str('C:\\'+'result.txt'),"w")        
 for tmp in  parseRecord:
    f.write(tmp+'\t'+str(parseRecord[tmp]['cnt'])+'\t')
    for l in parseRecord[tmp]['truelocation']:
        f.write(str(l.strip('\n')))
    f.write('\n')
 f.close
 print '!*_*!'
	# -*- coding:utf-8 -*-
	'''
	Created on 2011-12-17

	@author: tianwei

	'''

	import os,sys

	objPath = 'F:\\txt\\'

	def getFileList(path):
	    """Return the names of the regular files located directly inside *path*.

	    path -- directory to scan; it is converted with str() first.  An empty
	            string yields an empty list instead of scanning the current
	            directory.

	    Only bare file names are returned (no directory prefix), in the order
	    produced by os.listdir(); sub-directories are left out.  Errors raised
	    by os.listdir() (for example a missing directory) propagate to the caller.
	    """
	    p = str(path)
	    if p == "":
	        return []
	    # os.path.join supplies the right separator for the running platform, so
	    # the directory string is used untouched: no backslash doubling (which
	    # mangles UNC paths) and no hard-coded trailing '\\'.
	    return [x for x in os.listdir(p) if os.path.isfile(os.path.join(p, x))]


	# Names of the record files found directly under objPath.
	files = getFileList(objPath)

	# Per-author table keyed by author name, seeded with an empty-name
	# placeholder entry.
	parseRecord = {
	    "": {
	        "cnt": 0,
	        # possible addresses seen while the author was not first author
	        "location": {},
	        # trusted address candidates (original note, roughly: "apart from
	        # changed affiliation addresses")
	        "truelocation": {"": 0},
	        # the address judged correct after comparison and analysis
	        "finallocation": "",
	        "title": {"": ["", 0]},
	    },
	}

	# Fields of the record currently being read; they are filled in line by line.
	title = postface = ''
	author, location = [], []
	year = 0

	def _updateTrueLocation(record, place, year):
	    """Store *place* as a trusted affiliation if it is the first or a newer one.

	    record -- one author's entry of parseRecord.
	    place  -- affiliation string to store.
	    year   -- value stored alongside it; the raw string read from the file,
	              or the int 0 when the record had no year line.
	    """
	    known = record['truelocation']
	    # The comparison is made against the first stored key only.
	    # NOTE(review): comparing a year string with the int 0 relies on
	    # Python 2 ordering rules.
	    if not known or year > known[next(iter(known))]:
	        known[place] = year


	def _commitRecord(title, author, location, postface, year):
	    """Fold one finished record into parseRecord, once per listed author."""
	    for tmp in author:
	        if tmp in parseRecord:      # author already known
	            parseRecord[tmp]['cnt'] = parseRecord[tmp]['cnt'] + 1
	        else:                       # first appearance of this author
	            parseRecord[tmp] = {"cnt":1,"location":{0:[]},"truelocation":{},"finallocation":"","title":{"":["",0]}}
	        record = parseRecord[tmp]
	        record['title'][title] = [postface, year]

	        if not location:            # record carries no address at all
	            continue
	        if len(location) == 1 or tmp == author[0]:
	            # A single address, or the first author of a multi-address
	            # record: the first address is a trusted candidate.
	            _updateTrueLocation(record, location[0], year)
	        else:
	            # Other authors: keep every possible affiliation, keyed by year.
	            record['location'][year] = location


	# Read every record file.  A record is a run of "key:value" lines ended by
	# a blank line; finished records are merged into parseRecord.
	for fileStr in files:
	    # Opened read-only ("r" instead of "r+") since nothing is written back;
	    # the with-block closes the file even if parsing raises.
	    with open(str(objPath+fileStr),"r") as recordFile:
	        for line in recordFile:
	            if line != "\n":
	                # Still inside the current record.  Split on the first ':'
	                # only, so values that contain a colon stay intact.
	                item = line.strip('\n').split(':', 1)
	                if len(item) < 2:
	                    continue        # no "key:value" shape, nothing to read
	                key = item[0]
	                value = item[1].strip()
	                if key == '题名':
	                    title = value
	                elif key == '作者':
	                    author = value.strip(';').split(';')
	                elif key == '单位':
	                    location = value.strip(';').split(';')
	                elif key == '刊名':
	                    postface = str(value)
	                elif key == '年':
	                    year = value
	            elif title != '':
	                # Blank line after a record with a title: commit it.  The
	                # title test tells a real record end from repeated blank lines.
	                _commitRecord(title, author, location, postface, year)
	                title =''
	                author =[]
	                location = []
	                postface =''
	                year = 0
	    # End of file: commit a last record that was not followed by a blank
	    # line, then reset so no field leaks into the next file.
	    if title != '':
	        _commitRecord(title, author, location, postface, year)
	    title =''
	    author =[]
	    location = []
	    postface =''
	    year = 0

	# Settle each author's final affiliation: when trusted candidates exist,
	# the first key of 'truelocation' is taken.
	for tmp in parseRecord:
	    num = parseRecord[tmp]['truelocation'].keys().__len__()
	    if num != 0:
	        parseRecord[tmp]["finallocation"]=(parseRecord[tmp]['truelocation'].keys())[0]
	    else:
	        # Not implemented: pick from 'location' the affiliation that occurs
	        # most often, otherwise the first one.
	        pass

	#所有的文件处理完成，进行最终的处理，并写入文件
	f = open(str('C:\\'+'result.txt'),"w")
	for tmp in parseRecord:
	f.write(tmp+'\t'+str(parseRecord[tmp]['cnt'])+'\t')
	for l in parseRecord[tmp]['truelocation']:
	f.write(str(l.strip('\n')))
	f.write('\n')
	f.close
	print '!_!'