satish860 · December 11, 2023 11:29
diff --git a/spilter.py b/spilter.py
 from llmsherpa.readers import LayoutPDFReader

 # Endpoint URL handed to LayoutPDFReader inside read_pdf().
 llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
 # Sample input path. read_pdf() has a parameter of the same name that shadows
 # this value, so a caller has to pass it in explicitly.
 pdf_url = "1.pdf"  # Replace with your actual PDF file path

 def read_pdf(pdf_url):
    """Parse the PDF at ``pdf_url`` and return the reader's result.

    A ``LayoutPDFReader`` is built from the module-level ``llmsherpa_api_url``
    and its ``read_pdf`` method is called with ``pdf_url``.

    Returns:
        Whatever ``LayoutPDFReader.read_pdf`` returns, or ``None`` when any
        ``Exception`` is raised while building the reader or reading the file
        (the error is printed, not re-raised).
    """
    document = None
    try:
        document = LayoutPDFReader(llmsherpa_api_url).read_pdf(pdf_url)
    except Exception as e:
        # Best-effort: report the failure and fall through with document=None.
        print(f"Error reading PDF: {e}")
    return document

 def ends_with_sentence_terminator(line):
    """Return True when ``line``, ignoring surrounding whitespace, ends in '.', '?', '!' or ':'."""
    terminators = ('.', '?', '!', ":")
    # Slicing instead of indexing keeps an empty or all-whitespace line safe:
    # it yields '' which is not one of the terminators.
    return line.strip()[-1:] in terminators

 def process_pdf_chunks(doc):
    """Merge the chunks of ``doc`` into a list of heading-prefixed text blocks.

    For every item yielded by ``doc.chunks()`` the text from
    ``to_context_text()`` is split on newlines. The first line is treated as a
    '>'-separated heading path; only the part after the last '>' is kept, and
    it is emitted only when it differs from the heading of the previous chunk.
    Each remaining line is followed by a space when it is not the last line and
    does not end with a sentence terminator, otherwise by a newline.

    A block whose stripped text does not end with '.' is held back and becomes
    the prefix of the next chunk's block; a block still held back after the
    last chunk is appended as-is.

    Args:
        doc: object exposing ``chunks()``; presumably the document returned by
            ``read_pdf`` -- confirm against the caller.

    Returns:
        list of str, one entry per completed block.
    """
    blocks = []
    carry = ""  # text of a block that has not reached a closing '.' yet
    last_heading = None

    for chunk in doc.chunks():
        # str.split always yields at least one element, so the unpacking is safe.
        heading_path, *body_lines = chunk.to_context_text().split("\n")
        heading = heading_path.split('>')[-1].strip()

        pieces = [carry]
        carry = ""
        if heading != last_heading:
            pieces.append(heading + '\n')
            last_heading = heading

        final_index = len(body_lines) - 1
        for index, line in enumerate(body_lines):
            # The last line always closes with a newline, whatever it ends in.
            joins_next = index < final_index and not ends_with_sentence_terminator(line)
            pieces.append(line + (' ' if joins_next else '\n'))

        block = "".join(pieces)
        # NOTE(review): only '.' marks a block as complete here, although the
        # line-level helper also accepts '?', '!' and ':' -- confirm intent.
        if block.strip().endswith('.'):
            blocks.append(block)
        else:
            carry = block

    if carry:
        blocks.append(carry)

    return blocks
	from llmsherpa.readers import LayoutPDFReader

	# Endpoint URL handed to LayoutPDFReader inside read_pdf().
	llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
	# Sample input path. read_pdf() has a parameter of the same name that shadows
	# this value, so a caller has to pass it in explicitly.
	pdf_url = "1.pdf" # Replace with your actual PDF file path

	def read_pdf(pdf_url):
	    """Parse the PDF at ``pdf_url`` and return the reader's result.

	    A ``LayoutPDFReader`` is built from the module-level ``llmsherpa_api_url``
	    and its ``read_pdf`` method is called with ``pdf_url``.

	    Returns:
	        Whatever ``LayoutPDFReader.read_pdf`` returns, or ``None`` when any
	        ``Exception`` is raised while building the reader or reading the
	        file (the error is printed, not re-raised).
	    """
	    try:
	        pdf_reader = LayoutPDFReader(llmsherpa_api_url)
	        return pdf_reader.read_pdf(pdf_url)
	    except Exception as e:
	        # Best-effort: report the failure and signal it with None.
	        print(f"Error reading PDF: {e}")
	        return None

	def ends_with_sentence_terminator(line):
	    """Return True when ``line``, ignoring surrounding whitespace, ends in '.', '?', '!' or ':'."""
	    return line.strip().endswith(('.', '?', '!', ":"))

	def process_pdf_chunks(doc):
	    """Merge the chunks of ``doc`` into a list of heading-prefixed text blocks.

	    For every item yielded by ``doc.chunks()`` the text from
	    ``to_context_text()`` is split on newlines. The first line is treated as
	    a '>'-separated heading path; only the part after the last '>' is kept,
	    and it is emitted only when it differs from the previous chunk's heading.
	    Each remaining line is followed by a space when it is not the last line
	    and does not end with a sentence terminator, otherwise by a newline.

	    A block whose stripped text does not end with '.' is held back and
	    becomes the prefix of the next chunk's block; a block still held back
	    after the last chunk is appended as-is.

	    Args:
	        doc: object exposing ``chunks()``; presumably the document returned
	            by ``read_pdf`` -- confirm against the caller.

	    Returns:
	        list of str, one entry per completed block.
	    """
	    previous_heading = None
	    incomplete_chunk = ""
	    processed_text = []

	    for chunk in doc.chunks():
	        # Start from whatever the previous chunk left unfinished.
	        fullchunk = incomplete_chunk
	        incomplete_chunk = ""

	        text = chunk.to_context_text()
	        lines = text.split("\n")

	        # str.split always yields at least one element, so this guard always passes.
	        if lines:
	            current_heading = lines[0].split('>')[-1].strip()

	            # Emit the heading only when it changes between chunks.
	            if current_heading != previous_heading:
	                fullchunk += current_heading + '\n'
	                previous_heading = current_heading

	            for i, line in enumerate(lines[1:], 1):
	                # The last line always closes with a newline, whatever it ends in.
	                if i < len(lines) - 1 and not ends_with_sentence_terminator(line):
	                    fullchunk += line + ' '
	                else:
	                    fullchunk += line + '\n'

	        # NOTE(review): only '.' marks a block as complete here, although the
	        # line-level helper also accepts '?', '!' and ':' -- confirm intent.
	        if not fullchunk.strip().endswith('.'):
	            incomplete_chunk = fullchunk
	            continue

	        processed_text.append(fullchunk)

	    # Flush a trailing block that never reached a closing '.'.
	    if incomplete_chunk:
	        processed_text.append(incomplete_chunk)

	    return processed_text