Created
January 13, 2025 05:35
-
-
Save goldengrape/af25ea74f6df9561d41f00aa9bdd5c87 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import os
import re
from urllib.parse import quote

import fitz  # PyMuPDF
import pdfplumber
import streamlit as st
# ------------------------------------------------------------------------
# Utility functions
# ------------------------------------------------------------------------
def unify_line_breaks(text: str) -> str:
    r"""Merge lines that a PDF extractor split in the middle of a sentence.

    Example: "这是一个很长的句子,包\n含换行" -> "这是一个很长的句子,包含换行".

    Rule (deliberately simple, adjust to taste): a line is treated as the
    start of a new sentence/paragraph only when its first non-blank character
    is an ASCII upper-case letter or a digit. Every other line is glued,
    without a separator, onto the end of the previous output line.
    Blank / whitespace-only lines are dropped.

    Args:
        text: Raw text, typically the extracted text of one PDF page.

    Returns:
        The text with continuation lines merged; remaining lines are joined
        with "\n". Returns "" for empty or whitespace-only input.
    """
    sentence_start = re.compile(r'[A-Z0-9]')
    merged_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no content and are never kept.
            continue
        if merged_lines and not sentence_start.match(stripped):
            # Continuation: append to the running line so that runs of any
            # length collapse into one line, not just the first pair.
            merged_lines[-1] += line
        else:
            merged_lines.append(line)
    return "\n".join(merged_lines)
def read_pdf_text(pdf_file_path: str) -> str:
    """Extract the text of every page with pdfplumber and merge broken lines.

    Each page's text is passed through ``unify_line_breaks`` and stripped of
    surrounding whitespace; a page with no extractable text contributes an
    empty string.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The per-page texts joined with "\\n", in page order.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        page_texts = [
            unify_line_breaks(page.extract_text() or "").strip()
            for page in pdf.pages
        ]
    return "\n".join(page_texts)
def add_annotation_to_pdf(pdf_file_path: str, annotation_text: str, output_path: str = None):
    """Add a text ("sticky note") annotation to page 1 and save the PDF.

    Demo only: the note is always placed on the first page at point
    (50, 50). PyMuPDF page coordinates start at the top-left corner, so this
    is near the top-left of the page. A document with no pages is saved
    without an annotation.

    Args:
        pdf_file_path: Path of the PDF to annotate.
        annotation_text: Content of the note.
        output_path: Destination path. ``None`` (or the same path as
            ``pdf_file_path``) saves in place.

    Notes:
        PyMuPDF refuses a plain ``save()`` onto the file that is currently
        open; in-place saving therefore uses an incremental save when the
        document allows it, and otherwise writes a sibling temporary file
        that replaces the original after the document is closed.
    """
    in_place = (
        output_path is None
        or os.path.abspath(output_path) == os.path.abspath(pdf_file_path)
    )
    replacement_path = None

    doc = fitz.open(pdf_file_path)
    try:
        if doc.page_count > 0:
            page = doc[0]  # demo: annotate the first page only
            # Other annotation kinds (e.g. add_highlight_annot) work similarly.
            page.add_text_annot(fitz.Point(50, 50), annotation_text)

        if not in_place:
            doc.save(output_path)
        elif doc.can_save_incrementally():
            doc.saveIncr()
        else:
            # Incremental save unavailable (e.g. the file was repaired on
            # open): write a full copy and swap it in once the doc is closed.
            replacement_path = pdf_file_path + ".tmp"
            doc.save(replacement_path)
    finally:
        doc.close()

    if replacement_path is not None:
        os.replace(replacement_path, pdf_file_path)
# JavaScript injection for front-end screenshots via html2canvas (example)
def screenshot_js():
    """Return the ``<script>`` snippet that defines ``takeScreenshot()``.

    The JavaScript looks up the element with id ``pdf-container``, renders it
    with html2canvas, and posts the resulting PNG data URL to the parent
    window as a message of type ``STREAMLIT:BASE64_SCREENSHOT``.

    html2canvas itself is not included here; the page must load it separately
    (e.g. from https://html2canvas.hertzen.com/dist/html2canvas.min.js, or
    from a local copy when that host is not reachable).

    Returns:
        The HTML/JavaScript snippet as a string.
    """
    return """
    <script>
    function takeScreenshot() {
        var container = document.getElementById("pdf-container");
        if(!container){
            alert("未找到 PDF 容器,请确认元素ID。");
            return;
        }
        html2canvas(container).then(function(canvas) {
            var dataURL = canvas.toDataURL("image/png");
            // 将截图的 base64 通过 Streamlit 的 setComponentValue 回传
            window.parent.postMessage({type: 'STREAMLIT:BASE64_SCREENSHOT', data: dataURL}, "*");
        });
    }
    </script>
    """
# ------------------------------------------------------------------------
# Streamlit main logic
# ------------------------------------------------------------------------
def main():
    """Run the Streamlit demo page: pdf.js viewer, search, annotation, screenshot.

    Flow:
      1. Let the user upload a PDF and store it as ``temp_uploaded.pdf`` so
         that pdfplumber / PyMuPDF can open it by path.
      2. Show it in an iframe that loads a local pdf.js ``viewer.html`` with
         the PDF passed as a URL-encoded ``data:`` URL.
      3. Offer a keyword search over the line-merged text.
      4. Offer adding a text annotation that is saved into the temp PDF.
      5. Inject html2canvas-based screenshot scripts and show a text area
         that is meant to receive the screenshot as a base64 data URL.
    """
    st.title("PDF阅读器示例(pdf.js + 批注 + 搜索 + 截图)")

    # Pick the PDF file.
    uploaded_file = st.file_uploader("上传PDF文件", type=["pdf"])
    if not uploaded_file:
        st.info("请上传 PDF 文件。")
        return

    # getvalue() returns the full content irrespective of the stream's
    # current read position, unlike read().
    pdf_data = uploaded_file.getvalue()

    # Persist the upload so pdfplumber / PyMuPDF can open it by path.
    # NOTE(review): the fixed file name is shared by every session and is
    # rewritten on every rerun, so an in-place annotation is overwritten by
    # the next rerun — consider a per-session temp file.
    temp_pdf_path = "temp_uploaded.pdf"
    with open(temp_pdf_path, "wb") as f:
        f.write(pdf_data)

    # --------------------------------------------------------------------
    # Front-end PDF display (iframe + pdf.js)
    # --------------------------------------------------------------------
    # Assumes pdf.js has been unpacked into a local "pdfjs" directory. On
    # Streamlit Cloud the pdf.js static files must be hosted somewhere
    # reachable (e.g. GitHub Pages or a CDN).
    pdf_base64 = base64.b64encode(pdf_data).decode()
    pdfjs_path = "pdfjs/web/viewer.html"  # relative path of the local viewer.html

    # The value after "?file=" must be URL-encoded: base64 output contains
    # '+', '/' and '=' which would otherwise be misread in a query string.
    # NOTE(review): very large PDFs yield very long URLs — verify browser limits.
    file_url = f"data:application/pdf;base64,{pdf_base64}"
    viewer_url = f"{pdfjs_path}?file={quote(file_url, safe='')}"

    # Load pdf.js's viewer.html in an iframe.
    st.markdown(
        f"""
        <iframe
            id="pdf-container"
            src="{viewer_url}"
            width="100%"
            height="800px"
            style="border:none;">
        </iframe>
        """,
        unsafe_allow_html=True
    )

    # --------------------------------------------------------------------
    # Text search demo
    # --------------------------------------------------------------------
    st.subheader("全文搜索示例(后端拆行合并后再搜索)")
    search_query = st.text_input("输入搜索关键词(支持中文)", "")
    if st.button("搜索"):
        if search_query.strip():
            # Extract the full text, then count plain substring matches.
            text_all = read_pdf_text(temp_pdf_path)
            count = text_all.count(search_query)
            st.write(f"在 PDF 文本中发现 **{search_query}** 共计 **{count}** 处匹配。")
            # For locating/highlighting hits, record (page_no, text) during
            # extraction and highlight from there.
        else:
            st.write("请输入搜索词。")

    # --------------------------------------------------------------------
    # Annotation demo
    # --------------------------------------------------------------------
    st.subheader("PDF 批注示例(保存到 PDF)")
    annotation_text = st.text_input("输入批注内容", "")
    if st.button("添加批注并保存PDF"):
        if annotation_text.strip():
            add_annotation_to_pdf(temp_pdf_path, annotation_text.strip(), output_path=None)
            st.success("批注已添加并原位保存!重新载入可查看效果。")
        else:
            st.write("请先输入批注内容。")

    # --------------------------------------------------------------------
    # Front-end screenshot demo
    # --------------------------------------------------------------------
    # NOTE(review): this section relies on <script> tags passed to
    # st.markdown being executed and on the text area being findable via a
    # `key` attribute in the DOM — confirm both against the Streamlit
    # version in use; a custom component may be required instead.
    st.subheader("截图功能演示")
    st.markdown(
        """
        > 下方按钮点击后,将调用 JavaScript (html2canvas) 来对 PDF iframe 做截图,并回传到 Streamlit。
        """,
        unsafe_allow_html=True
    )

    # Inject html2canvas.
    st.markdown("""
    <script src="https://html2canvas.hertzen.com/dist/html2canvas.min.js"></script>
    """, unsafe_allow_html=True)

    # Inject the screenshot helper script.
    st.markdown(screenshot_js(), unsafe_allow_html=True)

    # Screenshot button: emits a script that calls takeScreenshot().
    if st.button("截图"):
        st.markdown("<script>takeScreenshot();</script>", unsafe_allow_html=True)

    # Text area intended to receive the screenshot as a base64 data URL.
    screenshot_data = st.text_area("截图Base64", "", height=100, key="screenshot_data_area")
    if screenshot_data:
        try:
            # A data URL looks like "data:image/png;base64,<payload>".
            _, data = screenshot_data.split(',', 1)
            screenshot_bytes = base64.b64decode(data)
            st.image(screenshot_bytes, caption="前端PDF区域截图")
        except Exception as e:
            # UI boundary: show the failure instead of crashing the page.
            st.write("解析截图失败:", e)

    # JS listener that forwards the posted screenshot into the text area.
    st.markdown("""
    <script>
    window.addEventListener("message", (event) => {
        if(event.data.type === 'STREAMLIT:BASE64_SCREENSHOT') {
            let data = event.data.data;
            // 将截图写入到页面中的 TextArea
            let textArea = window.parent.document.querySelector("textarea[key='screenshot_data_area']");
            if(textArea) {
                textArea.value = data;
                textArea.dispatchEvent(new Event('change', { bubbles: true }));
            }
        }
    }, false);
    </script>
    """, unsafe_allow_html=True)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment