Last active
November 14, 2021 05:50
-
-
Save hxy9243/8a5e0b3bb932764c850d286add86bdc2 to your computer and use it in GitHub Desktop.
Use jieba to extract text from PDF and Word files and analyze their keywords
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# depends on third-party libraries; run the following command to install:
#   sudo pip3 install jieba pdfplumber docx2txt
#
# run the script with:
#   python3 reader.py <yourfile.pdf> <yourfile.doc>
#
# see more at: https://github.com/fxsjy/jieba
import sys | |
import re | |
import pdfplumber | |
import docx2txt | |
import jieba | |
import jieba.analyse | |
# Crude publishing-year guess: the first 4-digit number of the form 19XX or
# 20XX found in the text.  NOTE(review): any such number matches (page
# numbers, citations), so this is best-effort only.
FindYearRE = re.compile(r'(19|20)\d{2}')
def read_pdf(filepath):
    '''Read a PDF file and return the text of all pages as one string.

    Lines are joined with no separator, which is the desired behavior for
    CJK text (no spurious spaces inside words split across lines).
    '''
    texts = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # BUG FIX: extract_text() returns None for pages with no
            # extractable text (e.g. scanned images); the original crashed
            # with AttributeError on .split().
            if text:
                texts.extend(text.split('\n'))
    return ''.join(texts)
def read_word(filepath):
    '''Extract and return the plain text of a Word document as a string.'''
    text = docx2txt.process(filepath)
    return text
def jieba_analyze(text, topK=50):
    '''Return the top *topK* keywords of *text* as (word, weight) pairs.

    Uses jieba's TF-IDF keyword extraction, restricted to the listed
    part-of-speech tags (place names, nouns, verbal nouns, person names,
    non-morphemes, and other proper nouns).
    '''
    return jieba.analyse.extract_tags(
        text,
        topK=topK,
        withWeight=True,
        allowPOS=('ns', 'n', 'vn', 'nr', 'x', 'nz'),
    )
def jieba_histogram(text, taglist):
    '''Count how often each word in *taglist* appears among the jieba
    tokens of *text*; returns a dict mapping tag -> frequency.'''
    counts = dict.fromkeys(taglist, 0)
    for token in jieba.cut(text, cut_all=False):
        if token in counts:
            counts[token] += 1
    return counts
def dump_citespace(title='', year='', tags=None):
    '''Format one record in CiteSpace-style tagged lines.

    :param title: document title (T1 line)
    :param year: publishing year (YR line)
    :param tags: iterable of keyword strings, joined with '; ' (K1 line)
    :returns: the formatted record as a string ending in a newline
    '''
    # BUG FIX: the original used a mutable default argument (tags=[]);
    # use None as the sentinel instead.
    tagstr = '; '.join(tags) if tags is not None else ''
    return (
        f'T1 {title}\n'
        f'YR {year}\n'
        f'K1 {tagstr}\n'
    )
def main():
    '''For each file named on the command line, extract its text, guess the
    publishing year, pull keywords with jieba, print a per-keyword frequency
    histogram, and append a CiteSpace-style record to output.citespace.
    '''
    jieba.enable_parallel(4)  # parallel segmentation; not supported on Windows
    # BUG FIX: the output file was never closed if an exception occurred;
    # a `with` block guarantees cleanup.
    with open('output.citespace', 'w') as outfile:
        for filepath in sys.argv[1:]:
            if filepath.endswith('.txt'):
                with open(filepath) as f:
                    outtext = f.read()
                title = filepath[:-len('.txt')]
            elif filepath.endswith('.pdf'):
                outtext = read_pdf(filepath)
                title = filepath[:-len('.pdf')]
            elif filepath.endswith('.docx'):
                outtext = read_word(filepath)
                title = filepath[:-len('.docx')]
            elif filepath.endswith('.doc'):
                outtext = read_word(filepath)
                title = filepath[:-len('.doc')]
            else:
                # BUG FIX: an unrecognized extension previously fell through
                # with `title` unbound and crashed with UnboundLocalError;
                # skip such files instead.
                print(f'Skipping unsupported file: {filepath}')
                continue

            # Cache the extracted text next to the source file.
            if not filepath.endswith('.txt'):
                with open(title + '.txt', 'w') as f:
                    f.write(outtext)

            # A very crude way of guessing the publishing year.
            # BUG FIX: re.match() only tests the start of the string, so the
            # year was essentially never found; search() scans the whole text.
            found = FindYearRE.search(outtext)
            year = found[0] if found else 0

            tags = jieba_analyze(outtext, 20)
            tagwords = [tag for tag, _ in tags]

            print(f'Dumping result text analysis of {filepath}\n')
            result = dump_citespace(title=title, year=year, tags=tagwords)
            print(result)

            print(f'Dumping histogram analysis of {filepath}\n')
            hist = jieba_histogram(outtext, tagwords)
            for key, freq in hist.items():
                print(f'{key}: {freq}')
            print()

            outfile.write(result)
            outfile.write('\n\n')
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment