thibaudcolas · July 2, 2024 16:45
diff --git a/README.md b/README.md
diff --git a/content_import_parsers.py b/content_import_parsers.py
 import bs4
 import mammoth
 from django.core import exceptions, validators
 from wagtail.embeds import embeds
 from wagtail_content_import.parsers import base as base_parser


 class DocxHTMLParser(base_parser.DocumentParser):
    def __init__(self, document):
        self.document = document

    def close_paragraph(self, block, stream_data):
        if block:
            stream_data.append({"type": "html", "value": "".join(block)})
        block.clear()
        return

    def parse(self):
        html = mammoth.convert_to_html(self.document).value

        soup = bs4.BeautifulSoup(html, "html5lib")

        stream_data = []

        # Run through contents and populate stream
        current_paragraph_block = []

        for tag in soup.body.recursiveChildGenerator():
            # Remove all inline styles and classes
            if hasattr(tag, "attrs"):
                for attr in ["class", "style"]:
                    tag.attrs.pop(attr, None)

        title = ""

        for tag in soup.body.contents:
            if isinstance(tag, bs4.NavigableString):
                stream_data.append({"type": "html", "value": str(tag)})
            else:
                if tag.name == "h1":
                    if not title:
                        title = tag.text
                    else:
                        self.close_paragraph(current_paragraph_block, stream_data)
                        stream_data.append({"type": "heading", "value": tag.text})
                elif tag.name == "h2":
                    self.close_paragraph(current_paragraph_block, stream_data)
                    stream_data.append({"type": "heading", "value": tag.text})
                elif tag.name in ["h3", "h4", "h5", "h6"]:
                    self.close_paragraph(current_paragraph_block, stream_data)
                    stream_data.append({"type": "subheading", "value": tag.text})
                elif tag.name == "img":
                    # Break the paragraph and add an image
                    self.close_paragraph(current_paragraph_block, stream_data)
                    stream_data.append(
                        {
                            "type": "image",
                            "value": tag.get("src"),
                            "title": tag.get("alt", ""),
                        }
                    )
                elif tag.text:
                    if tag.text.startswith("http:") or tag.text.startswith("https:"):
                        validate = validators.URLValidator()
                        url = tag.text.strip()
                        try:
                            validate(url)

                            if embed := embeds.get_embed(url):
                                self.close_paragraph(
                                    current_paragraph_block, stream_data
                                )
                                stream_data.append({"type": "embed", "value": embed})
                        except exceptions.ValidationError:
                            current_paragraph_block.append(str(tag))
                    else:
                        current_paragraph_block.append(str(tag))

                if tag.find_all("img"):
                    # Break the paragraph and add images
                    self.close_paragraph(current_paragraph_block, stream_data)
                    for img in tag.find_all("img"):
                        stream_data.append(
                            {
                                "type": "image",
                                "value": img.get("src"),
                                "title": img.get("alt", ""),
                            }
                        )

            self.close_paragraph(current_paragraph_block, stream_data)

        return {"title": title, "elements": stream_data}
	import bs4
	import mammoth
	from django.core import exceptions, validators
	from wagtail.embeds import embeds
	from wagtail_content_import.parsers import base as base_parser


	class DocxHTMLParser(base_parser.DocumentParser):
	def __init__(self, document):
	self.document = document

	def close_paragraph(self, block, stream_data):
	if block:
	stream_data.append({"type": "html", "value": "".join(block)})
	block.clear()
	return

	def parse(self):
	html = mammoth.convert_to_html(self.document).value

	soup = bs4.BeautifulSoup(html, "html5lib")

	stream_data = []

	# Run through contents and populate stream
	current_paragraph_block = []

	for tag in soup.body.recursiveChildGenerator():
	# Remove all inline styles and classes
	if hasattr(tag, "attrs"):
	for attr in ["class", "style"]:
	tag.attrs.pop(attr, None)

	title = ""

	for tag in soup.body.contents:
	if isinstance(tag, bs4.NavigableString):
	stream_data.append({"type": "html", "value": str(tag)})
	else:
	if tag.name == "h1":
	if not title:
	title = tag.text
	else:
	self.close_paragraph(current_paragraph_block, stream_data)
	stream_data.append({"type": "heading", "value": tag.text})
	elif tag.name == "h2":
	self.close_paragraph(current_paragraph_block, stream_data)
	stream_data.append({"type": "heading", "value": tag.text})
	elif tag.name in ["h3", "h4", "h5", "h6"]:
	self.close_paragraph(current_paragraph_block, stream_data)
	stream_data.append({"type": "subheading", "value": tag.text})
	elif tag.name == "img":
	# Break the paragraph and add an image
	self.close_paragraph(current_paragraph_block, stream_data)
	stream_data.append(
	{
	"type": "image",
	"value": tag.get("src"),
	"title": tag.get("alt", ""),
	}
	)
	elif tag.text:
	if tag.text.startswith("http:") or tag.text.startswith("https:"):
	validate = validators.URLValidator()
	url = tag.text.strip()
	try:
	validate(url)

	if embed := embeds.get_embed(url):
	self.close_paragraph(
	current_paragraph_block, stream_data
	)
	stream_data.append({"type": "embed", "value": embed})
	except exceptions.ValidationError:
	current_paragraph_block.append(str(tag))
	else:
	current_paragraph_block.append(str(tag))

	if tag.find_all("img"):
	# Break the paragraph and add images
	self.close_paragraph(current_paragraph_block, stream_data)
	for img in tag.find_all("img"):
	stream_data.append(
	{
	"type": "image",
	"value": img.get("src"),
	"title": img.get("alt", ""),
	}
	)

	self.close_paragraph(current_paragraph_block, stream_data)

	return {"title": title, "elements": stream_data}