Forked from harshildarji/docx_to_conll.py
Python script to extract comments from .docx files and convert them into CoNLL format.
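# Dependencies (inferred from the imports below): python-docx, lxml, nltk, tqdm.
# A typical install, assuming pip:
#   pip install python-docx lxml nltk tqdm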
import json
import os
import string
import zipfile

import docx
from lxml import etree
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
# PART 1 - Extract comments and commented text from DOCX, and put them into JSON format.
# Annotation labels mapping
annotation_labels = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}
annotation_keys = list(annotation_labels.keys())
xmlns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
comments_dict = {}
# Get a list of all docx files in the directory
directory = "docx"
files = os.listdir(directory)
files_list = [file for file in files if file.endswith(".docx")]
for docx_file in tqdm(files_list, desc="Total"):
    cmts = {}
    name = docx_file.split(".")[0].strip()
    comments_dict[name] = []

    # Open the docx file as a zip archive
    docx_zip = zipfile.ZipFile(f"docx/{docx_file}")
    comments_xml = docx_zip.read("word/comments.xml")
    document_xml = docx_zip.read("word/document.xml")
    docx_zip.close()

    # Parse comments.xml to extract the comments
    et = etree.XML(comments_xml)
    comments = et.xpath("//w:comment", namespaces=xmlns)
    for c in comments:
        comment = c.xpath("string(.)", namespaces=xmlns)
        comment_id = c.xpath("@w:id", namespaces=xmlns)[0]
        cmts[comment_id] = comment

    # Parse document.xml to find the text each comment is anchored to
    root = etree.fromstring(document_xml)
    for k, v in tqdm(cmts.items(), desc=f"{name}"):
        annotation = v
        if v in annotation_keys:
            annotations_full = annotation_labels[annotation]

            # XPath: all <w:t> elements between this comment's range start and
            # range end, excluding text already claimed by the next comment range
            xpath_expr = f'//w:commentRangeStart[@w:id="{k}"]/following::w:t[following::w:commentRangeEnd[@w:id="{k}"] and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"])]'
            comment_range_start_elements = root.xpath(xpath_expr, namespaces=root.nsmap)

            # Concatenate text elements within the comment range
            text = ""
            for element in comment_range_start_elements:
                if text and element.text:
                    # Insert a space between runs unless the previous run ends in
                    # whitespace or the next run starts with whitespace/punctuation
                    if (
                        text[-1] not in string.whitespace
                        and element.text[0] not in string.whitespace + string.punctuation
                    ):
                        text += " "
                    text += element.text.replace("\u00A0", "")
                elif element.text:
                    text += element.text.replace("\u00A0", "")
            # Undo stray spaces after opening brackets
            text = text.replace("( ", "(").replace("[ ", "[")

            if text:
                comments_dict[name].append((int(k), text, annotation, annotations_full))

    # Sort extracted annotations by comment id
    comments_dict[name] = sorted(comments_dict[name], key=lambda x: x[0])

json_file = "comments.json"
with open(json_file, "w") as file:
    json.dump(comments_dict, file, indent=4)
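# For illustration, the resulting comments.json maps each file name to a sorted
# list of [comment_id, commented_text, annotation_code, annotation_label] entries
# (tuples serialize as JSON arrays). A hypothetical entry for a file
# "policy.docx" containing one comment "DC" might look like:
#
# {
#     "policy": [
#         [0, "Acme Ltd.", "DC", "Data Controller"]
#     ]
# }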
# The XPath expression selects the <w:t> elements that appear after the
# <w:commentRangeStart> with a matching w:id and before the corresponding
# <w:commentRangeEnd>, while skipping any text that already belongs to the next
# comment range. {k} and {int(k)+1} are f-string placeholders filled in at runtime.
#
# //w:commentRangeStart: selects every <w:commentRangeStart> element in the document.
# [@w:id="{k}"]: keeps only the start marker whose w:id attribute equals k.
# /following::w:t: selects all <w:t> (text) elements that appear after that marker.
# [following::w:commentRangeEnd[@w:id="{k}"] ...]: keeps only <w:t> elements that are
#   still followed by the matching end marker, i.e. those inside the comment range.
# and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"]): drops <w:t> elements
#   already preceded by the start of the next comment range (id k+1), so adjacent
#   ranges do not bleed into each other.
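# A minimal sketch of the markup this targets (hypothetical fragment):
#
#   <w:commentRangeStart w:id="3"/>
#   <w:r><w:t>the data</w:t></w:r>
#   <w:r><w:t> controller</w:t></w:r>
#   <w:commentRangeEnd w:id="3"/>
#
# For k="3", the expression selects both <w:t> elements ("the data" and
# " controller"), and would stop collecting at any <w:commentRangeStart w:id="4"/>
# encountered before the range end.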
# PART 2 - Convert DOCX into CoNLL format with IOB tagging, with text as tokens and comments as labels.
# Function to convert a docx file to text
def convert_docx_to_text(docx_file):
    doc = docx.Document(docx_file)
    xml_content = doc._element.xml
    namespaces = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }
    tree = etree.fromstring(xml_content.encode("utf-8"))

    # Extract text from <w:t> elements and join them, adding a space after each <w:p>
    text = ""
    for paragraph in tree.findall(".//w:p", namespaces=namespaces):
        text += "".join(
            t.text.replace("\xa0", " ")
            for t in paragraph.findall(".//w:t", namespaces=namespaces)
            if t.text
        )
        text += " "
    return text.strip()
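# Usage sketch (hypothetical file name):
#   convert_docx_to_text("docx/policy.docx")
# returns the document text as one string with paragraphs joined by spaces. One
# reason to parse the raw XML here (rather than, say, doc.paragraphs) may be that
# .//w:p also matches paragraphs nested inside tables and other containers.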
# Function to split a string at the first match of a delimiter
def split_from_first_match(text, delimiter):
    parts = text.split(delimiter, 1)
    if len(parts) > 1:
        return parts[0], delimiter, parts[1]
    return False
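# Example: split_from_first_match("the data controller is Acme", "data controller")
# returns ("the ", "data controller", " is Acme"); it returns False when the
# delimiter does not occur, which is why callers test the result before unpacking.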
# Function to split a string with punctuation into tokens
def split_string_with_punctuation(text):
    tokenizer = RegexpTokenizer(r"\w+|\S+")
    return tokenizer.tokenize(text)
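# Example: split_string_with_punctuation("Hello, world!") returns
# ["Hello", ",", "world", "!"]; the \w+|\S+ pattern emits words and adjacent
# punctuation as separate tokens.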
with open("comments.json", "r") as file:
data = json.load(file)
conll = open(f"comments.conll", "w")
for docx_file in tqdm(files_list, desc="Total"):
name = docx_file.split(".")[0].strip()
text = convert_docx_to_text(f"docx/{docx_file}")
conll = open(f"comments.conll", "a+")
for _, delimiter, label, _ in data[name]:
split_text = split_from_first_match(text, delimiter)
if split_text:
out_text, tag_text, text = split_text[0], split_text[1], split_text[2]
split_out_text = split_string_with_punctuation(out_text)
split_tag_text = split_string_with_punctuation(tag_text)
for txt in split_out_text:
conll.write(f"{txt}\tO\n")
conll.write(f"{split_tag_text[0]}\tB-{label}\n")
for txt in split_tag_text[1:]:
conll.write(f"{txt}\tI-{label}\n")
else:
continue
conll.write("\n")
conll.close()
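# The resulting comments.conll holds one token and tag per line, tab-separated,
# with a blank line between documents. A hypothetical excerpt for a "DC" span:
#
# The	O
# controller	O
# is	O
# Acme	B-DC
# Ltd	I-DC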