Last active
January 23, 2021 21:18
-
-
Save arccoder/c74a2a010974b87b6031cfd73a748169 to your computer and use it in GitHub Desktop.
Redact Word documents using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> python redact.py -h | |
usage: redact.py [-h] -i INPUT -o OUTPUT -p PATTERNS [-r REPLACE] [-c COLOR] | |
optional arguments: | |
-h, --help show this help message and exit | |
-i INPUT, --input INPUT | |
Path to the document to be redacted | |
-o OUTPUT, --output OUTPUT | |
Path to save the redacted document | |
-p PATTERNS, --patterns PATTERNS | |
List of pattern to redact. Comma separated. | |
-r REPLACE, --replace REPLACE | |
String to replace all patterns | |
-c COLOR, --color COLOR | |
Color to redact. Black is default. Options: white, yellow | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def process_matches(match_pairs: list, run_text: str) -> tuple:
    """
    Convert regex match spans into split positions and highlight flags.

    Args:
        match_pairs: List of match (start, end) pairs from a regex finditer,
            in ascending, non-overlapping order.
        run_text: The string the matches were found in.

    Returns: Tuple of (boolean list of highlights, number list of positions).
        The positions start at 0 and, for spans taken from run_text, end at
        len(run_text); highlights[i] tells whether the segment
        run_text[positions[i]:positions[i + 1]] is a match. There is always
        one highlight fewer than there are positions. Both lists are empty
        when there is nothing to highlight.
    """
    # Zero-width matches (e.g. produced by a pattern such as 'a*' or by an
    # empty pattern) cover no text. Keeping them would create empty segments
    # flagged for highlighting and duplicate split positions.
    spans = [(start, end) for start, end in match_pairs if end > start]

    # List to hold whether each segment contains a pattern
    highlights = []
    # List to hold the indices to split at
    matches = []
    if not spans:
        return highlights, matches

    # The first segment always starts at the beginning of the text
    matches.append(0)
    prev = 0
    for start, end in spans:
        # A gap between the previous boundary and this match is plain text
        # that must not be highlighted
        if start != prev:
            highlights.append(False)
            matches.append(start)
        # The match itself is highlighted and ends the current segment
        highlights.append(True)
        matches.append(end)
        prev = end

    # Text left after the last match is not highlighted
    if prev != len(run_text):
        highlights.append(False)
        matches.append(len(run_text))
    return highlights, matches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def redact_colors(color: str = None):
    """
    Returns text and text-background colors for redaction using argument {color}.

    Args:
        color: Name of the redaction color, 'white' or 'yellow'. The name is
            matched ignoring case and surrounding whitespace. Any other
            value, including None, selects black.

    Returns: Tuple with redaction colors for (text, background)
    """
    # Normalise the name so that e.g. 'White' or ' yellow ' is not silently
    # treated as an unknown color and redacted in black
    key = color.strip().lower() if isinstance(color, str) else color
    switcher = {
        'white': (RGBColor(255, 255, 255), WD_COLOR_INDEX.WHITE),
        'yellow': (RGBColor(255, 255, 0), WD_COLOR_INDEX.YELLOW),
    }
    # Black is the fallback for None and for unrecognised names
    return switcher.get(key, (RGBColor(0, 0, 0), WD_COLOR_INDEX.BLACK))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def redact_document(input_path: str, output_path: str, pattern: list, color: str = None):
    """
    Redacts {pattern} in the {input_path} document and saves it as the {output_path} document.

    Matching text is hidden by giving it the same text color and highlight
    color. The text itself is not removed from the document.

    Args:
        input_path: Path to the document to be redacted
        output_path: Path to save the redacted document
        pattern: List of regex patterns to redact
        color: Color to redact. Will be used for both the text and background.

    Raises:
        ValueError: If the computed split positions and highlight flags are
            inconsistent with each other.
    """
    # Get the text color and text-background color for redaction
    txt_color, background_color = redact_colors(color)
    # Open the input document
    doc = Document(input_path)
    # All patterns are combined into one alternation; compile it once
    # instead of once per run
    regex = re.compile('|'.join(pattern))
    # Loop through paragraphs
    for para in doc.paragraphs:
        # Loop through the runs in the paragraph in the reverse order.
        # The run is looked up by index on every pass because splitting a
        # run changes the paragraph's run list.
        run_index = len(para.runs) - 1
        while run_index > -1:
            run = para.runs[run_index]
            # Find the start and end indices of the patterns in the run-text.
            # Zero-width matches cover no text, so there is nothing to redact.
            match_pairs = [(match.start(), match.end())
                           for match in regex.finditer(run.text)
                           if match.end() > match.start()]
            # Get the locations in the format required for `split_run_by` function
            highlights, matches = process_matches(match_pairs, run.text)
            # Go to redact only if patterns are found in the text
            if highlights and matches:
                # Every pair of neighbouring positions needs exactly one flag
                if len(highlights) != len(matches) - 1:
                    raise ValueError('Calculation error within matches and highlights')
                if len(matches) == 2:  # When a pattern is the only text in the run
                    # Highlight the background color
                    run.font.highlight_color = background_color
                    # Match the text color to the background color
                    run.font.color.rgb = txt_color
                else:
                    # Split the run at the inner positions; the first and
                    # last positions are the ends of the run itself
                    new_runs = split_run_by(para, run, matches[1:-1])
                    # Highlight each new run that matches a pattern
                    for highlight, new_run in zip(highlights, new_runs):
                        if highlight:
                            # Highlight the background color
                            new_run.font.highlight_color = background_color
                            # Match the text color to the background color
                            new_run.font.color.rgb = txt_color
            # Decrement the index to process the previous run
            run_index -= 1
    # Save the redacted document to the output path
    doc.save(output_path)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def redact_document_with_replace(input_path: str, output_path: str, pattern: list,
                                 replace_with: str = '##########', color: str = None):
    """
    Redacts {pattern} in the {input_path} document and saves it as the {output_path} document.

    Matching text is overwritten with {replace_with} and then given the same
    text color and highlight color.

    Args:
        input_path (str): Path to the document to be redacted
        output_path (str): Path to save the redacted document
        pattern (list): List of regex patterns to redact
        replace_with (str): String to replace all patterns
        color (str): Color to redact. Will be used for both the text and background.

    Raises:
        ValueError: If the computed split positions and highlight flags are
            inconsistent with each other.
    """
    # Get the text color and text-background color for redaction
    txt_color, background_color = redact_colors(color)
    # Open the input document
    doc = Document(input_path)
    # All patterns are combined into one alternation; compile it once
    # instead of once per run
    regex = re.compile('|'.join(pattern))
    # Loop through paragraphs
    for para in doc.paragraphs:
        # Loop through the runs in the paragraph in the reverse order.
        # The run is looked up by index on every pass because splitting a
        # run changes the paragraph's run list.
        run_index = len(para.runs) - 1
        while run_index > -1:
            run = para.runs[run_index]
            # Find the start and end indices of the patterns in the run-text.
            # Zero-width matches cover no text, so there is nothing to redact.
            match_pairs = [(match.start(), match.end())
                           for match in regex.finditer(run.text)
                           if match.end() > match.start()]
            # Get the locations in the format required for `split_run_by` function
            highlights, matches = process_matches(match_pairs, run.text)
            # Go to redact only if patterns are found in the text
            if highlights and matches:
                # Every pair of neighbouring positions needs exactly one flag
                if len(highlights) != len(matches) - 1:
                    raise ValueError('Calculation error within matches and highlights')
                if len(matches) == 2:  # When a pattern is the only text in the run
                    # Replace the matching pattern
                    run.text = replace_with
                    # Highlight the background color
                    run.font.highlight_color = background_color
                    # Match the text color to the background color
                    run.font.color.rgb = txt_color
                else:
                    # Split the run at the inner positions; the first and
                    # last positions are the ends of the run itself
                    new_runs = split_run_by(para, run, matches[1:-1])
                    # Replace and highlight each new run that matches a pattern
                    for highlight, new_run in zip(highlights, new_runs):
                        if highlight:
                            # Replace the matching pattern
                            new_run.text = replace_with
                            # Highlight the background color
                            new_run.font.highlight_color = background_color
                            # Match the text color to the background color
                            new_run.font.color.rgb = txt_color
            # Decrement the index to process the previous run
            run_index -= 1
    # Save the redacted document to the output path
    doc.save(output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment