Skip to content

Instantly share code, notes, and snippets.

@hxy9243
Last active November 14, 2021 05:50
Show Gist options
  • Save hxy9243/8a5e0b3bb932764c850d286add86bdc2 to your computer and use it in GitHub Desktop.
Save hxy9243/8a5e0b3bb932764c850d286add86bdc2 to your computer and use it in GitHub Desktop.
Get jieba to analyze pdf and word
# depend on third-party libraries, run the following command to install:
# sudo pip3 install jieba pdfplumber docx2txt
#
# run the script with:
# python3 reader.py <yourfile.pdf> <yourfile.doc>
#
# see more at: https://github.com/fxsjy/jieba
import sys
import re
import pdfplumber
import docx2txt
import jieba
import jieba.analyse
# Guess the publishing year by matching the pattern 19XX or 20XX.
# NOTE(review): the capture group only holds the century ('19'/'20');
# callers must use the full match (group 0) to get the 4-digit year.
FindYearRE = re.compile(r'(19|20)\d{2}')
def read_pdf(filepath):
    """Read a PDF file and return its text content as one string.

    Args:
        filepath: path to the .pdf file.

    Returns:
        All extractable page text concatenated, with the newlines that
        separated lines inside each page removed.
    """
    with pdfplumber.open(filepath) as pdf:
        txts = []
        for page in pdf.pages:
            txt = page.extract_text()
            # extract_text() can return None for pages with no extractable
            # text (e.g. scanned images); skip those instead of crashing
            # on None.split().
            if txt:
                txts += txt.split('\n')
        return ''.join(txts)
def read_word(filepath):
    """Extract the plain text of a .doc/.docx file via docx2txt."""
    content = docx2txt.process(filepath)
    return content
def jieba_analyze(text, topK=50):
    """Run jieba TF-IDF keyword extraction on *text*.

    Returns at most topK (word, weight) pairs, restricted to the
    part-of-speech tags listed below.
    """
    pos_filter = ('ns', 'n', 'vn', 'nr', 'x', 'nz')
    return jieba.analyse.extract_tags(
        text, topK=topK, withWeight=True, allowPOS=pos_filter)
def jieba_histogram(text, taglist):
    """Count how often each word in *taglist* appears among the
    jieba-segmented tokens of *text*.

    Returns a dict mapping every tag in taglist to its frequency
    (0 when the tag never occurs).
    """
    counts = dict.fromkeys(taglist, 0)
    for token in jieba.cut(text, cut_all=False):
        if token in counts:
            counts[token] += 1
    return counts
def dump_citespace(title='', year='', tags=None):
    """Format one record in the CiteSpace-style tagged text format.

    Args:
        title: document title (T1 field).
        year: publishing year, string or int (YR field).
        tags: list of keywords, joined with '; ' on the K1 field.
            Defaults to no tags.

    Returns:
        A three-line string with T1/YR/K1 fields.
    """
    # None sentinel instead of the original mutable default `tags=[]`;
    # an empty/None tags list still yields an empty K1 value.
    tagstr = '; '.join(tags) if tags else ''
    return (
        f'T1 {title}\n'
        f'YR {year}\n'
        f'K1 {tagstr}\n'
    )
def main():
    """Convert each file named on the command line to text, extract
    keywords with jieba, print a histogram, and append a CiteSpace
    record for each file to output.citespace.
    """
    # NOTE(review): enable_parallel is not supported on Windows — confirm
    # the target platform.
    jieba.enable_parallel(4)
    # `with` guarantees the output file is closed even if one input fails.
    with open('output.citespace', 'w') as outfile:
        for filepath in sys.argv[1:]:
            outtext = ''
            if filepath.endswith('.txt'):
                with open(filepath) as f:
                    outtext = f.read()
                title = filepath[:-len('.txt')]
            elif filepath.endswith('.pdf'):
                outtext = read_pdf(filepath)
                title = filepath[:-len('.pdf')]
            elif filepath.endswith('.doc'):
                outtext = read_word(filepath)
                title = filepath[:-len('.doc')]
            elif filepath.endswith('.docx'):
                outtext = read_word(filepath)
                title = filepath[:-len('.docx')]
            else:
                # The original fell through with `title` undefined and
                # raised NameError below; skip unsupported files instead.
                print(f'Skipping unsupported file {filepath}\n')
                continue
            # Keep a plain-text copy next to every converted pdf/doc input.
            if not filepath.endswith('txt'):
                with open(title + '.txt', 'w') as f:
                    f.write(outtext)
            # A very crude way of guessing the publishing year: the first
            # 19XX/20XX anywhere in the text.  search() scans the whole
            # string; the original used match(), which is anchored at
            # position 0 and almost never found a year.
            results = FindYearRE.search(outtext)
            year = results[0] if results else 0
            tags = jieba_analyze(outtext, 20)
            print(f'Dumping result text analysis of {filepath}\n')
            result = dump_citespace(title=title, year=year,
                                    tags=[tag for tag, _ in tags])
            print(result)
            print(f'Dumping histogram analysis of {filepath}\n')
            hist = jieba_histogram(outtext, [tag for tag, _ in tags])
            for key, freq in hist.items():
                print(f'{key}: {freq}')
            print()
            outfile.write(result)
            outfile.write('\n\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment