Skip to content

Instantly share code, notes, and snippets.

@hxy9243
Last active November 14, 2021 05:50
Show Gist options
  • Save hxy9243/8a5e0b3bb932764c850d286add86bdc2 to your computer and use it in GitHub Desktop.
Save hxy9243/8a5e0b3bb932764c850d286add86bdc2 to your computer and use it in GitHub Desktop.
Get jieba to analyze pdf and word
# depend on third-party libraries, run the following command to install:
# sudo pip3 install jieba pdfplumber docx2txt
#
# run the script with:
# python3 reader.py <yourfile.pdf> <yourfile.doc>
#
# see more at: https://github.com/fxsjy/jieba
import sys
import re
import pdfplumber
import docx2txt
import jieba
import jieba.analyse
# Guess the publishing year by matching the pattern 19XX or 20XX.
# NOTE(review): the capture group only holds the century ('19'/'20');
# callers must use the full match (group 0) to get the 4-digit year.
FindYearRE = re.compile(r'(19|20)\d{2}')
def read_pdf(filepath):
    """Read a PDF file and return its text content as one string.

    Args:
        filepath: path to the .pdf file.

    Returns:
        All extractable page text concatenated, with the newlines that
        separated lines inside each page removed.
    """
    with pdfplumber.open(filepath) as pdf:
        txts = []
        for page in pdf.pages:
            txt = page.extract_text()
            # extract_text() can return None for pages with no extractable
            # text (e.g. scanned images); skip those instead of crashing
            # on None.split().
            if txt:
                txts += txt.split('\n')
        return ''.join(txts)
def read_word(filepath):
    """Extract the plain text of a .doc/.docx file via docx2txt."""
    content = docx2txt.process(filepath)
    return content
def jieba_analyze(text, topK=50):
    """Run jieba TF-IDF keyword extraction on *text*.

    Returns at most topK (word, weight) pairs, restricted to the
    part-of-speech tags listed below.
    """
    pos_filter = ('ns', 'n', 'vn', 'nr', 'x', 'nz')
    return jieba.analyse.extract_tags(
        text, topK=topK, withWeight=True, allowPOS=pos_filter)
def jieba_histogram(text, taglist):
    """Count how often each word in *taglist* appears among the
    jieba-segmented tokens of *text*.

    Returns a dict mapping every tag in taglist to its frequency
    (0 when the tag never occurs).
    """
    counts = dict.fromkeys(taglist, 0)
    for token in jieba.cut(text, cut_all=False):
        if token in counts:
            counts[token] += 1
    return counts
def dump_citespace(title='', year='', tags=None):
    """Format one record in the CiteSpace-style tagged text format.

    Args:
        title: document title (T1 field).
        year: publishing year, string or int (YR field).
        tags: list of keywords, joined with '; ' on the K1 field.
            Defaults to no tags.

    Returns:
        A three-line string with T1/YR/K1 fields.
    """
    # None sentinel instead of the original mutable default `tags=[]`;
    # an empty/None tags list still yields an empty K1 value.
    tagstr = '; '.join(tags) if tags else ''
    return (
        f'T1 {title}\n'
        f'YR {year}\n'
        f'K1 {tagstr}\n'
    )
def main():
    """Convert each file named on the command line to text, extract
    keywords with jieba, print a histogram, and append a CiteSpace
    record for each file to output.citespace.
    """
    # NOTE(review): enable_parallel is not supported on Windows — confirm
    # the target platform.
    jieba.enable_parallel(4)
    # `with` guarantees the output file is closed even if one input fails.
    with open('output.citespace', 'w') as outfile:
        for filepath in sys.argv[1:]:
            outtext = ''
            if filepath.endswith('.txt'):
                with open(filepath) as f:
                    outtext = f.read()
                title = filepath[:-len('.txt')]
            elif filepath.endswith('.pdf'):
                outtext = read_pdf(filepath)
                title = filepath[:-len('.pdf')]
            elif filepath.endswith('.doc'):
                outtext = read_word(filepath)
                title = filepath[:-len('.doc')]
            elif filepath.endswith('.docx'):
                outtext = read_word(filepath)
                title = filepath[:-len('.docx')]
            else:
                # The original fell through with `title` undefined and
                # raised NameError below; skip unsupported files instead.
                print(f'Skipping unsupported file {filepath}\n')
                continue
            # Keep a plain-text copy next to every converted pdf/doc input.
            if not filepath.endswith('txt'):
                with open(title + '.txt', 'w') as f:
                    f.write(outtext)
            # A very crude way of guessing the publishing year: the first
            # 19XX/20XX anywhere in the text.  search() scans the whole
            # string; the original used match(), which is anchored at
            # position 0 and almost never found a year.
            results = FindYearRE.search(outtext)
            year = results[0] if results else 0
            tags = jieba_analyze(outtext, 20)
            print(f'Dumping result text analysis of {filepath}\n')
            result = dump_citespace(title=title, year=year,
                                    tags=[tag for tag, _ in tags])
            print(result)
            print(f'Dumping histogram analysis of {filepath}\n')
            hist = jieba_histogram(outtext, [tag for tag, _ in tags])
            for key, freq in hist.items():
                print(f'{key}: {freq}')
            print()
            outfile.write(result)
            outfile.write('\n\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment