cdrini · July 14, 2023 16:28
diff --git a/djvu_to_single_line.py b/djvu_to_single_line.py
 """
 Examples:
 # Get first 20 pages
 python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20
 # Get first 20 pages and last 5 pages
 python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20 -5:
 """
 import xml.etree.ElementTree as ET
 from typing import List

 def page_to_line(page_xml_tree):
    r"""
    Flatten the OCR text of one DjVu XML page into a single line.

    Convert e.g.

    <OBJECT data="file://localhost/var/tmp/autoclean/derive/isbn_9781531610494/isbn_9781531610494.djvu" type="image/x.djvu" usemap="isbn_9781531610494_0003.djvu" width="2103" height="3347">
        <PARAM name="PAGE" value="isbn_9781531610494_0003.djvu" />
        <PARAM name="DPI" value="360" />
        <HIDDENTEXT>
            <PAGECOLUMN>
                <REGION>
                    <PARAGRAPH>
                        <LINE>
                            <WORD coords="467,1604,779,1525" x-confidence="96">Digitized</WORD>
                            <WORD coords="811,1604,893,1525" x-confidence="96">by</WORD>
                            <WORD coords="919,1588,1031,1525" x-confidence="96">the</WORD>
                            <WORD coords="1066,1588,1340,1525" x-confidence="94">Internet</WORD>
                            <WORD coords="1366,1588,1639,1525" x-confidence="96">Archive</WORD>
                        </LINE>
                        <LINE>
                            <WORD coords="586,1695,641,1634" x-confidence="84">in</WORD>
                            <WORD coords="673,1697,854,1635" x-confidence="94">2022</WORD>
                            <WORD coords="882,1697,1025,1634" x-confidence="96">with</WORD>
                            <WORD coords="1055,1713,1325,1634" x-confidence="96">funding</WORD>
                            <WORD coords="1355,1697,1516,1634" x-confidence="96">from</WORD>
                        </LINE>
                        <LINE>
                            <WORD coords="601,1806,1059,1743" x-confidence="91">Kahle/Austin</WORD>
                            <WORD coords="1095,1806,1502,1743" x-confidence="96">Foundation</WORD>
                        </LINE>
                    </PARAGRAPH>
                </REGION>
            </PAGECOLUMN>
            <PAGECOLUMN>
                <REGION>
                    <PARAGRAPH>
                        <LINE>
                            <WORD coords="190,3188,1502,3109" x-confidence="67">https://archive.org/details/ison_</WORD>
                            <WORD coords="1517,3172,1916,3110" x-confidence="96">9781531610494</WORD>
                        </LINE>
                    </PARAGRAPH>
                </REGION>
            </PAGECOLUMN>
        </HIDDENTEXT>
    </OBJECT>
    to (each \n below is a literal backslash followed by "n", not a newline):
    0003: Digitized by the Internet Archive in 2022 with funding from Kahle/Austin Foundation \n\nhttps://archive.org/details/ison_ 9781531610494 \n\n

    Words of a LINE are joined with single spaces and every LINE is followed
    by one space. Each PARAGRAPH is followed by a literal \n, and each REGION
    by one more literal \n.

    :param page_xml_tree: The OBJECT element of a single page
    :return: "<page number>: <flattened text>", with no real newline characters
        added by this function
    :raises AttributeError: If the page has no PARAM element named "PAGE"
    """
    # e.g. "isbn_9781531610494_0003.djvu" -> "0003"
    page_param = page_xml_tree.find('PARAM[@name="PAGE"]')
    page_number = page_param.attrib['value'].split('_')[-1].split('.')[0]

    hidden_text = page_xml_tree.find('HIDDENTEXT')
    if hidden_text is None:
        # A page without an OCR layer still yields its (empty) line.
        return f'{page_number}: '

    pieces = []
    for column in hidden_text.findall('PAGECOLUMN'):
        for region in column.findall('REGION'):
            for paragraph in region.findall('PARAGRAPH'):
                for line in paragraph.findall('LINE'):
                    # An empty <WORD/> has text None, which str.join rejects.
                    words = [word.text for word in line.findall('WORD') if word.text]
                    pieces.append(' '.join(words) + ' ')
                # Escaped on purpose: the output must stay on one physical line.
                pieces.append('\\n')
            pieces.append('\\n')
    text = ''.join(pieces)
    return f'{page_number}: {text}'

 def extract_pages(xml_file: str, ranges: List[str], truncate=500) -> str:
    """
    Return single-line representations of the pages in the given ranges as a string.

    Pages are the OBJECT elements of the file, in document order. Ranges are
    zero-based and follow Python slice rules (the end index is exclusive).

    Ranges can be of the form:
        - :10 (first 10 pages)
        - 10: (page index 10 and up)
        - 10:20 (page indexes 10 up to, but not including, 20)
        - 10 (just page index 10)
        - -10: (last 10 pages)
        - -1 (just the last page)

    Ranges are processed in the order given; a page covered by several ranges
    is emitted once per range.

    :param xml_file: DjVu XML file to read; passed straight to ET.parse
    :param ranges: Range specifications as described above
    :param truncate: Truncate lines to this length. Longer lines keep their
        first and last halves joined by '[...]', so a truncated line is
        5 characters longer than this value. Negative values are treated as 0.
    :return: One line per selected page, each terminated by a newline
    :raises ValueError: If a range contains a non-integer bound or more than
        one ':'
    """
    pages = ET.parse(xml_file).getroot().findall('.//OBJECT')

    # Split the budget so that head + tail == limit, also for odd limits.
    limit = max(truncate, 0)
    head = limit // 2
    tail = limit - head

    output_lines = []
    for page_range in ranges:
        if ':' in page_range:
            start, end = page_range.split(':')
            start = int(start) if start else None
            end = int(end) if end else None
        else:
            start = int(page_range)
            # For -1, start + 1 is 0 and pages[-1:0] would be empty;
            # None makes the slice run to the end instead.
            end = start + 1 or None

        for page in pages[start:end]:
            line = page_to_line(page)
            if len(line) > limit:
                # Slice from an explicit offset: line[-0:] would be the whole line.
                line = line[:head] + '[...]' + line[len(line) - tail:]
            output_lines.append(line + "\n")

    return "".join(output_lines)

 if __name__ == '__main__':
    # Script entry point: read the command line, then print the selected pages.
    import argparse

    cli = argparse.ArgumentParser()
    # Positionals: the DjVu XML file, followed by one or more page ranges.
    cli.add_argument('xml_file')
    cli.add_argument('ranges', nargs='+')
    # Optional line-length limit, forwarded to extract_pages.
    cli.add_argument('--truncate', type=int, default=500)
    options = cli.parse_args()

    selected = extract_pages(options.xml_file, options.ranges, options.truncate)
    print(selected)
	"""
	Examples:
	# Get first 20 pages
	python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20
	# Get first 20 pages and last 5 pages
	python djvu_to_single_line.py ~/Downloads/isbn_9781531610494_djvu.xml -- :20 -5:
	"""
	import xml.etree.ElementTree as ET
	from typing import List

	def page_to_line(page_xml_tree):
	    r"""
	    Flatten the OCR text of one DjVu XML page into a single line.

	    Convert e.g.

	    <OBJECT data="file://localhost/var/tmp/autoclean/derive/isbn_9781531610494/isbn_9781531610494.djvu" type="image/x.djvu" usemap="isbn_9781531610494_0003.djvu" width="2103" height="3347">
	        <PARAM name="PAGE" value="isbn_9781531610494_0003.djvu" />
	        <PARAM name="DPI" value="360" />
	        <HIDDENTEXT>
	            <PAGECOLUMN>
	                <REGION>
	                    <PARAGRAPH>
	                        <LINE>
	                            <WORD coords="467,1604,779,1525" x-confidence="96">Digitized</WORD>
	                            <WORD coords="811,1604,893,1525" x-confidence="96">by</WORD>
	                            <WORD coords="919,1588,1031,1525" x-confidence="96">the</WORD>
	                            <WORD coords="1066,1588,1340,1525" x-confidence="94">Internet</WORD>
	                            <WORD coords="1366,1588,1639,1525" x-confidence="96">Archive</WORD>
	                        </LINE>
	                        <LINE>
	                            <WORD coords="586,1695,641,1634" x-confidence="84">in</WORD>
	                            <WORD coords="673,1697,854,1635" x-confidence="94">2022</WORD>
	                            <WORD coords="882,1697,1025,1634" x-confidence="96">with</WORD>
	                            <WORD coords="1055,1713,1325,1634" x-confidence="96">funding</WORD>
	                            <WORD coords="1355,1697,1516,1634" x-confidence="96">from</WORD>
	                        </LINE>
	                        <LINE>
	                            <WORD coords="601,1806,1059,1743" x-confidence="91">Kahle/Austin</WORD>
	                            <WORD coords="1095,1806,1502,1743" x-confidence="96">Foundation</WORD>
	                        </LINE>
	                    </PARAGRAPH>
	                </REGION>
	            </PAGECOLUMN>
	            <PAGECOLUMN>
	                <REGION>
	                    <PARAGRAPH>
	                        <LINE>
	                            <WORD coords="190,3188,1502,3109" x-confidence="67">https://archive.org/details/ison_</WORD>
	                            <WORD coords="1517,3172,1916,3110" x-confidence="96">9781531610494</WORD>
	                        </LINE>
	                    </PARAGRAPH>
	                </REGION>
	            </PAGECOLUMN>
	        </HIDDENTEXT>
	    </OBJECT>
	    to (each \n below is a literal backslash followed by "n", not a newline):
	    0003: Digitized by the Internet Archive in 2022 with funding from Kahle/Austin Foundation \n\nhttps://archive.org/details/ison_ 9781531610494 \n\n

	    Words of a LINE are joined with single spaces and every LINE is followed
	    by one space. Each PARAGRAPH is followed by a literal \n, and each REGION
	    by one more literal \n.

	    :param page_xml_tree: The OBJECT element of a single page
	    :return: "<page number>: <flattened text>", with no real newline characters
	        added by this function
	    :raises AttributeError: If the page has no PARAM element named "PAGE"
	    """
	    # e.g. "isbn_9781531610494_0003.djvu" -> "0003"
	    page_param = page_xml_tree.find('PARAM[@name="PAGE"]')
	    page_number = page_param.attrib['value'].split('_')[-1].split('.')[0]

	    hidden_text = page_xml_tree.find('HIDDENTEXT')
	    if hidden_text is None:
	        # A page without an OCR layer still yields its (empty) line.
	        return f'{page_number}: '

	    pieces = []
	    for column in hidden_text.findall('PAGECOLUMN'):
	        for region in column.findall('REGION'):
	            for paragraph in region.findall('PARAGRAPH'):
	                for line in paragraph.findall('LINE'):
	                    # An empty <WORD/> has text None, which str.join rejects.
	                    words = [word.text for word in line.findall('WORD') if word.text]
	                    pieces.append(' '.join(words) + ' ')
	                # Escaped on purpose: the output must stay on one physical line.
	                pieces.append('\\n')
	            pieces.append('\\n')
	    text = ''.join(pieces)
	    return f'{page_number}: {text}'

	def extract_pages(xml_file: str, ranges: List[str], truncate=500) -> str:
	    """
	    Return single-line representations of the pages in the given ranges as a string.

	    Pages are the OBJECT elements of the file, in document order. Ranges are
	    zero-based and follow Python slice rules (the end index is exclusive).

	    Ranges can be of the form:
	        - :10 (first 10 pages)
	        - 10: (page index 10 and up)
	        - 10:20 (page indexes 10 up to, but not including, 20)
	        - 10 (just page index 10)
	        - -10: (last 10 pages)
	        - -1 (just the last page)

	    Ranges are processed in the order given; a page covered by several ranges
	    is emitted once per range.

	    :param xml_file: DjVu XML file to read; passed straight to ET.parse
	    :param ranges: Range specifications as described above
	    :param truncate: Truncate lines to this length. Longer lines keep their
	        first and last halves joined by '[...]', so a truncated line is
	        5 characters longer than this value. Negative values are treated as 0.
	    :return: One line per selected page, each terminated by a newline
	    :raises ValueError: If a range contains a non-integer bound or more than
	        one ':'
	    """
	    pages = ET.parse(xml_file).getroot().findall('.//OBJECT')

	    # Split the budget so that head + tail == limit, also for odd limits.
	    limit = max(truncate, 0)
	    head = limit // 2
	    tail = limit - head

	    output_lines = []
	    for page_range in ranges:
	        if ':' in page_range:
	            start, end = page_range.split(':')
	            start = int(start) if start else None
	            end = int(end) if end else None
	        else:
	            start = int(page_range)
	            # For -1, start + 1 is 0 and pages[-1:0] would be empty;
	            # None makes the slice run to the end instead.
	            end = start + 1 or None

	        for page in pages[start:end]:
	            line = page_to_line(page)
	            if len(line) > limit:
	                # Slice from an explicit offset: line[-0:] would be the whole line.
	                line = line[:head] + '[...]' + line[len(line) - tail:]
	            output_lines.append(line + "\n")

	    return "".join(output_lines)

	if __name__ == '__main__':
	    # Script entry point: read the command line, then print the selected pages.
	    import argparse

	    parser = argparse.ArgumentParser()
	    # Positionals: the DjVu XML file, followed by one or more page ranges.
	    parser.add_argument('xml_file')
	    parser.add_argument('ranges', nargs='+')
	    # Optional line-length limit, forwarded to extract_pages.
	    parser.add_argument('--truncate', type=int, default=500)
	    args = parser.parse_args()
	    print(extract_pages(args.xml_file, args.ranges, args.truncate))