Created
June 5, 2024 12:37
-
-
Save valuex/4b933c673fa7f3eeec879a88199a0fd8 to your computer and use it in GitHub Desktop.
Get PDF Title by Max Font Size
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pdfplumber | |
import os | |
def get_max_font_text(file_path):
    """Return the first-page text that is set in the largest font size.

    The characters on the first page whose font size equals the page's
    maximum are joined in the order pdfplumber reports them. This is a
    heuristic for guessing the document title.

    NOTE: for PDFs whose text cannot be copied out as readable characters,
    the result may consist of ``(cid:xx)`` placeholders instead of text.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        The concatenated largest-font characters, or an empty string when
        the PDF has no pages or its first page has no characters.
    """
    with pdfplumber.open(file_path) as pdf:
        if not pdf.pages:
            return ""
        first_page_chars = pdf.pages[0].chars
        if not first_page_chars:
            # Nothing to measure (e.g. an image-only page); max() would
            # raise ValueError on an empty sequence.
            return ""
        max_font_size = max(char["size"] for char in first_page_chars)
        # Look up each char's size first and then compare it. The original
        # compared the key string itself with the number inside the
        # subscript, which raises TypeError on Python 3.
        return "".join(
            char["text"]
            for char in first_page_chars
            if char["size"] >= max_font_size
        )
def main(pdf_dir="D:/Downloads/"):
    """Print the largest-font text of each PDF found directly in *pdf_dir*.

    One line is printed per PDF: the file name, a tab, then the text
    returned by ``get_max_font_text``. Sub-directories are not searched.

    Args:
        pdf_dir: Directory to scan. Defaults to the originally hard-coded
            download folder.
    """
    for pdf_file in os.listdir(pdf_dir):
        pdf_full_path = os.path.join(pdf_dir, pdf_file)
        if not os.path.isfile(pdf_full_path):
            continue
        # Only hand PDF files to the parser; other files in the folder
        # would otherwise abort the whole run.
        if not pdf_file.lower().endswith(".pdf"):
            continue
        max_text = get_max_font_text(pdf_full_path)
        print(pdf_file + "\t" + max_text)
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Known Issue:
For certain PDF files, copying the text content and pasting it into Notepad yields only garbled characters.
For those PDF files, the script above will not work: it only outputs unreadable tokens of the form (cid:xx).
Other solutions: