Python script to extract comments from .docx files and convert them into CoNLL format.
import json
import os
import string
import zipfile

import docx
from lxml import etree
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
# PART 1 - Extract comments and commented text from DOCX, and put them into JSON format.

# Annotation labels mapping
annotation_labels = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}
annotation_keys = list(annotation_labels.keys())
xmlns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
comments_dict = {}

# Get a list of all docx files in the directory
directory = "docx"
files_list = [file for file in os.listdir(directory) if file.endswith(".docx")]
for docx_file in tqdm(files_list, desc="Total"):
    cmts = {}
    name = os.path.splitext(docx_file)[0].strip()
    comments_dict[name] = []
    # Open the docx file as a zip and read the comment and document parts
    with zipfile.ZipFile(os.path.join(directory, docx_file)) as docx_zip:
        comments_xml = docx_zip.read("word/comments.xml")
        document_xml = docx_zip.read("word/document.xml")
    # Parse comments.xml to extract the comments
    et = etree.XML(comments_xml)
    comments = et.xpath("//w:comment", namespaces=xmlns)
    for c in comments:
        comment = c.xpath("string(.)", namespaces=xmlns)
        comment_id = c.xpath("@w:id", namespaces=xmlns)[0]
        cmts[comment_id] = comment
    # Parse document.xml to find text with associated comments
    root = etree.fromstring(document_xml)
    for k, v in tqdm(cmts.items(), desc=name):
        annotation = v.strip()
        # Skip comments that are not known annotation labels, so the full
        # label below is always defined
        if annotation not in annotation_keys:
            continue
        annotations_full = annotation_labels[annotation]
        # Construct XPath expression to find text associated with the comment
        xpath_expr = (
            f'//w:commentRangeStart[@w:id="{k}"]/following::w:t'
            f'[following::w:commentRangeEnd[@w:id="{k}"]'
            f' and not(preceding::w:commentRangeStart[@w:id="{int(k) + 1}"])]'
        )
        comment_range_start_elements = root.xpath(xpath_expr, namespaces=xmlns)
        text = ""
        # Concatenate text elements within the comment range
        for element in comment_range_start_elements:
            if not element.text:
                continue
            # Append a space between runs unless whitespace or punctuation already separates them
            if (
                text
                and text[-1] not in string.whitespace
                and element.text[0] not in string.whitespace + string.punctuation
            ):
                text += " "
            # Drop non-breaking spaces
            text += element.text.replace("\u00A0", "")
        text = text.replace("( ", "(").replace("[ ", "[")
        if text:
            comments_dict[name].append((int(k), text, annotation, annotations_full))
    # Sort each document's entries by comment id
    comments_dict[name] = sorted(comments_dict[name], key=lambda x: x[0])

json_file = "comments.json"
with open(json_file, "w") as file:
    json.dump(comments_dict, file, indent=4)
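# For orientation, the resulting comments.json has this shape (file name and
# values here are hypothetical, not taken from a real run):
# {
#     "policy_1": [
#         [3, "the data controller", "DC", "Data Controller"],
#         [7, "marketing purposes", "NRP", "Not-Required Purpose"]
#     ]
# }
# i.e. one list per document of [comment_id, commented_text, label, full_label],
# sorted by comment id.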
# The XPath expression selects <w:t> elements that appear after the <w:commentRangeStart>
# element whose w:id matches the comment id, and before the corresponding <w:commentRangeEnd>
# element, while excluding <w:t> elements that already fall inside the next comment range.
# {k} and {int(k)+1} are f-string placeholders that are substituted with actual comment ids at runtime.
# //w:commentRangeStart: selects all <w:commentRangeStart> elements in the XML document, regardless of their location.
# [@w:id="{k}"]: keeps only the <w:commentRangeStart> element whose w:id attribute equals {k}.
# /following::w:t: selects all <w:t> elements that appear after the filtered <w:commentRangeStart> element.
# [following::w:commentRangeEnd[@w:id="{k}"]: keeps only those <w:t> elements that are followed by the matching <w:commentRangeEnd>, i.e. that lie inside the comment range.
# and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"])]: drops <w:t> elements that are already preceded by the start of the next comment range, so overlapping ranges are not merged.
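# Illustrative sketch of the same selection logic on a minimal, made-up
# WordprocessingML fragment (ids and text are hypothetical):
_demo_root = etree.fromstring(
    b'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
    b"<w:body><w:p>"
    b'<w:commentRangeStart w:id="0"/>'
    b"<w:r><w:t>data</w:t></w:r><w:r><w:t>controller</w:t></w:r>"
    b'<w:commentRangeEnd w:id="0"/>'
    b"<w:r><w:t>untagged</w:t></w:r>"
    b"</w:p></w:body></w:document>"
)
_demo_ts = _demo_root.xpath(
    '//w:commentRangeStart[@w:id="0"]/following::w:t'
    '[following::w:commentRangeEnd[@w:id="0"]]',
    namespaces=xmlns,
)
# "untagged" lies outside the comment range and is excluded
assert [t.text for t in _demo_ts] == ["data", "controller"]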
# PART 2 - Convert DOCX into CoNLL format with IOB tagging, with text as tokens and comments as labels.

# Function to convert a docx file to text
def convert_docx_to_text(docx_file):
    doc = docx.Document(docx_file)
    xml_content = doc._element.xml
    namespaces = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    tree = etree.fromstring(xml_content.encode("utf-8"))
    # Extract text from <w:t> elements and join them, adding a space after each <w:p>
    text = ""
    for paragraph in tree.findall(".//w:p", namespaces=namespaces):
        text += "".join(
            t.text.replace("\xa0", " ")
            for t in paragraph.findall(".//w:t", namespaces=namespaces)
            if t.text
        )
        text += " "
    return text.strip()
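# Usage sketch (the file name is hypothetical):
#   convert_docx_to_text("docx/policy_1.docx")
# returns the document body as a single string, with paragraphs separated by
# single spaces and non-breaking spaces normalised to regular spaces.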
# Function to split a string at the first occurrence of a delimiter
def split_from_first_match(text, delimiter):
    parts = text.split(delimiter, 1)
    if len(parts) > 1:
        return parts[0], delimiter, parts[1]
    return False
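# Example:
#   split_from_first_match("Hello world", "lo") == ("Hel", "lo", " world")
#   split_from_first_match("Hello world", "xyz") is False  (delimiter absent)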
# Function to split a string with punctuation into tokens
def split_string_with_punctuation(text):
    # \w+ matches word tokens; \S+ catches the remaining punctuation runs
    tokenizer = RegexpTokenizer(r"\w+|\S+")
    return tokenizer.tokenize(text)
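# Example:
#   split_string_with_punctuation("Hello, world!") == ["Hello", ",", "world", "!"]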
with open("comments.json", "r") as file: | |
data = json.load(file) | |
conll = open(f"comments.conll", "w") | |
for docx_file in tqdm(files_list, desc="Total"): | |
name = docx_file.split(".")[0].strip() | |
text = convert_docx_to_text(f"docx/{docx_file}") | |
conll = open(f"comments.conll", "a+") | |
for _, delimiter, label, _ in data[name]: | |
split_text = split_from_first_match(text, delimiter) | |
if split_text: | |
out_text, tag_text, text = split_text[0], split_text[1], split_text[2] | |
split_out_text = split_string_with_punctuation(out_text) | |
split_tag_text = split_string_with_punctuation(tag_text) | |
for txt in split_out_text: | |
conll.write(f"{txt}\tO\n") | |
conll.write(f"{split_tag_text[0]}\tB-{label}\n") | |
for txt in split_tag_text[1:]: | |
conll.write(f"{txt}\tI-{label}\n") | |
else: | |
continue | |
conll.write("\n") | |
conll.close() |
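# The resulting comments.conll is tab-separated, one token per line, with a blank
# line between documents. Hypothetical excerpt for a span labelled DC:
#   We	O
#   are	O
#   the	O
#   data	B-DC
#   controller	I-DC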