Skip to content

Instantly share code, notes, and snippets.

@notjulian
Created August 23, 2025 15:29
Show Gist options
  • Save notjulian/0f930e037053e957b421a58d88c4ca86 to your computer and use it in GitHub Desktop.
Save notjulian/0f930e037053e957b421a58d88c4ca86 to your computer and use it in GitHub Desktop.
Remove text from all PDF files in a folder
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using System.Text;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas;
using iText.IO.Source;
namespace PDFTextRemover
{
/// <summary>
/// Command-line entry point: validates the arguments and hands the directory
/// and the text to remove over to <see cref="PDFTextRemover"/>.
/// </summary>
class Program
{
    /// <summary>
    /// Expects two arguments: the directory to scan and the text to remove.
    /// Prints usage or an error message and returns without doing any work
    /// when the arguments are missing or invalid.
    /// </summary>
    /// <param name="args">Command-line arguments: <c>[0]</c> directory path, <c>[1]</c> text to remove.</param>
    static void Main(string[] args)
    {
        if (args.Length < 2)
        {
            Console.WriteLine("Usage: PDFTextRemover.exe <directory_path> <text_to_remove>");
            Console.WriteLine("Example: PDFTextRemover.exe \"C:\\MyPDFs\" \"foo\"");
            return;
        }

        string directoryPath = args[0];
        string textToRemove = args[1];

        // An empty search string is "contained" in every string, so it would
        // match (and delete) every text block of every PDF. Refuse it up front.
        if (string.IsNullOrEmpty(textToRemove))
        {
            Console.WriteLine("Text to remove must not be empty.");
            return;
        }

        if (!Directory.Exists(directoryPath))
        {
            Console.WriteLine($"Directory not found: {directoryPath}");
            return;
        }

        var pdfRemover = new PDFTextRemover();
        pdfRemover.ProcessDirectory(directoryPath, textToRemove);
    }
}
/// <summary>
/// Walks a directory tree and removes every text block containing a given
/// string from each PDF found, keeping a backup copy next to each file.
/// </summary>
public class PDFTextRemover
{
    // Suffixes appended to the file name (before the extension) for the
    // backup copy and for the intermediate output file.
    private const string BackupSuffix = "_backup";
    private const string TempSuffix = "_temp";

    /// <summary>
    /// Processes every <c>*.pdf</c> file under <paramref name="directoryPath"/>
    /// (including subdirectories). Errors are reported on the console and do
    /// not propagate to the caller.
    /// </summary>
    /// <param name="directoryPath">Root directory to scan.</param>
    /// <param name="textToRemove">Text whose containing text blocks are removed; must not be empty.</param>
    public void ProcessDirectory(string directoryPath, string textToRemove)
    {
        try
        {
            // An empty string matches everything and would strip all text.
            if (string.IsNullOrEmpty(textToRemove))
            {
                Console.WriteLine("Text to remove must not be empty.");
                return;
            }

            // Find all PDF files in the directory
            string[] pdfFiles = Directory.GetFiles(directoryPath, "*.pdf", SearchOption.AllDirectories);
            Console.WriteLine($"Found {pdfFiles.Length} PDF files in directory: {directoryPath}");

            foreach (string pdfPath in pdfFiles)
            {
                // Do not treat our own artifacts from an earlier run as input;
                // otherwise every run would alter the backups and create
                // "_backup_backup" copies of them.
                if (IsGeneratedFile(pdfPath))
                {
                    Console.WriteLine($"Skipping backup/temp file: {Path.GetFileName(pdfPath)}");
                    continue;
                }

                Console.WriteLine($"Processing: {Path.GetFileName(pdfPath)}");
                ProcessPDFFile(pdfPath, textToRemove);
            }

            Console.WriteLine("Processing completed!");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing directory: {ex.Message}");
        }
    }

    /// <summary>
    /// Backs up one PDF, writes a cleaned copy to a temp file and then moves
    /// the temp file over the original. Failures are logged, and the temp
    /// file is removed if it was not moved into place.
    /// </summary>
    private void ProcessPDFFile(string pdfPath, string textToRemove)
    {
        string backupPath = BuildSiblingPath(pdfPath, BackupSuffix);
        string tempPath = BuildSiblingPath(pdfPath, TempSuffix);

        try
        {
            // Create backup of original file. An existing backup is kept:
            // on a re-run the current file is already modified, and copying
            // it over the backup would destroy the only pristine copy.
            if (File.Exists(backupPath))
            {
                Console.WriteLine($"Backup already exists, keeping: {Path.GetFileName(backupPath)}");
            }
            else
            {
                File.Copy(pdfPath, backupPath);
                Console.WriteLine($"Backup created: {Path.GetFileName(backupPath)}");
            }

            // Process the PDF
            using (PdfReader reader = new PdfReader(pdfPath))
            using (PdfWriter writer = new PdfWriter(tempPath))
            using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
            {
                int totalPages = pdfDoc.GetNumberOfPages();
                int totalOccurrences = 0;

                for (int pageNum = 1; pageNum <= totalPages; pageNum++)
                {
                    PdfPage page = pdfDoc.GetPage(pageNum);
                    int occurrencesOnPage = RemoveTextFromPageContent(page, textToRemove);
                    totalOccurrences += occurrencesOnPage;

                    if (occurrencesOnPage > 0)
                    {
                        Console.WriteLine($" Page {pageNum}: Removed {occurrencesOnPage} occurrence(s)");
                    }
                }

                Console.WriteLine($" Total occurrences removed: {totalOccurrences}");
            }

            // Replace original file with processed version. A single
            // overwriting move leaves the original in place if the move
            // fails, unlike a delete followed by a move.
            if (File.Exists(tempPath))
            {
                File.Move(tempPath, pdfPath, true);
                Console.WriteLine($" File updated: {Path.GetFileName(pdfPath)}");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($" Error processing {Path.GetFileName(pdfPath)}: {ex.Message}");
        }
        finally
        {
            // After a successful move the temp file no longer exists; after a
            // failure this removes the partially written output.
            TryDeleteFile(tempPath);
        }
    }

    /// <summary>
    /// Removes text blocks containing <paramref name="textToRemove"/> from one page.
    /// </summary>
    /// <returns>Number of text blocks removed; 0 when nothing matched or an error occurred.</returns>
    private int RemoveTextFromPageContent(PdfPage page, string textToRemove)
    {
        try
        {
            // First check if the text exists on this page
            string pageText = PdfTextExtractor.GetTextFromPage(page);
            if (!pageText.Contains(textToRemove))
            {
                return 0; // No occurrences found
            }

            // Edit the page's content stream directly
            var editor = new TextRemovalContentEditor(textToRemove);
            return editor.EditPage(page);
        }
        catch (Exception ex)
        {
            Console.WriteLine($" Error processing page content: {ex.Message}");
            return 0;
        }
    }

    /// <summary>
    /// Builds "&lt;dir&gt;/&lt;name&gt;&lt;suffix&gt;&lt;ext&gt;" for a file path. Only the file name
    /// is touched, and the extension is preserved whatever its casing, so
    /// "FILE.PDF" or a directory called "my.pdf.files" cannot yield a path
    /// equal to the source or inside a different directory.
    /// </summary>
    private static string BuildSiblingPath(string pdfPath, string suffix)
    {
        string directory = Path.GetDirectoryName(pdfPath) ?? string.Empty;
        string baseName = Path.GetFileNameWithoutExtension(pdfPath);
        string extension = Path.GetExtension(pdfPath);
        return Path.Combine(directory, baseName + suffix + extension);
    }

    /// <summary>True when the file name carries the backup or temp suffix this tool generates.</summary>
    private static bool IsGeneratedFile(string pdfPath)
    {
        string baseName = Path.GetFileNameWithoutExtension(pdfPath);
        return baseName.EndsWith(BackupSuffix, StringComparison.OrdinalIgnoreCase)
            || baseName.EndsWith(TempSuffix, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Best-effort delete used for cleanup; a failure is reported but never thrown.</summary>
    private static void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (IOException ex)
        {
            Console.WriteLine($" Could not delete temporary file {Path.GetFileName(path)}: {ex.Message}");
        }
        catch (UnauthorizedAccessException ex)
        {
            Console.WriteLine($" Could not delete temporary file {Path.GetFileName(path)}: {ex.Message}");
        }
    }
}
/// <summary>
/// Line-oriented content stream editor: drops every text object (BT ... ET)
/// in which a text-showing operator (Tj, TJ, ', ") carries the target text.
/// </summary>
/// <remarks>
/// This is a heuristic, not a full content stream parser. A text object is
/// only recognised when its BT and ET operators each stand on their own line,
/// and string operands are decoded one byte per character, so text drawn with
/// multi-byte (CID) fonts or custom font encodings will not match.
/// </remarks>
public class TextRemovalContentEditor
{
    private readonly string targetText;
    private int removedCount;

    /// <param name="targetText">Text to look for inside text-showing operators.</param>
    public TextRemovalContentEditor(string targetText)
    {
        this.targetText = targetText;
        this.removedCount = 0;
    }

    /// <summary>
    /// Rewrites the page's content so that text objects containing the target
    /// text are gone. The page is left untouched when nothing matches.
    /// </summary>
    /// <param name="page">Page whose content is edited in place.</param>
    /// <returns>Number of text objects removed; 0 when an error occurred.</returns>
    public int EditPage(PdfPage page)
    {
        removedCount = 0;
        try
        {
            // Content streams are byte data, not UTF-8 text. Latin-1 maps
            // every byte to exactly one char and back, so untouched parts of
            // the stream survive the round trip byte-for-byte; a UTF-8 round
            // trip would replace every invalid byte sequence with U+FFFD.
            Encoding byteEncoding = Encoding.Latin1;

            // Get the current content
            byte[] contentBytes = page.GetContentBytes();
            string content = byteEncoding.GetString(contentBytes);

            // Process the content to remove text operations containing our target
            string cleanedContent = RemoveTextOperations(content);

            // Update the page content if changes were made
            if (cleanedContent != content)
            {
                byte[] newContentBytes = byteEncoding.GetBytes(cleanedContent);
                var replacement = new PdfStream(newContentBytes);
                // The PDF specification requires stream objects to be indirect,
                // so register the new stream with the owning document before
                // referencing it from the page dictionary.
                replacement.MakeIndirect(page.GetDocument());
                page.Put(PdfName.Contents, replacement);
            }

            return removedCount;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error editing page: {ex.Message}");
            return 0;
        }
    }

    /// <summary>
    /// Returns <paramref name="content"/> without the BT ... ET blocks that
    /// contain the target text, incrementing <c>removedCount</c> per block.
    /// </summary>
    private string RemoveTextOperations(string content)
    {
        string[] lines = content.Split('\n');
        var kept = new List<string>(lines.Length);
        var currentTextBlock = new List<string>();
        bool inTextObject = false;

        foreach (string rawLine in lines)
        {
            string line = rawLine.Trim();

            if (line == "BT") // Begin text object
            {
                // A BT while a block is still open means the previous block
                // was never closed on a line of its own; keep its lines
                // rather than silently discarding them.
                if (inTextObject)
                {
                    kept.AddRange(currentTextBlock);
                }

                inTextObject = true;
                currentTextBlock.Clear();
                currentTextBlock.Add(rawLine); // Keep original formatting
            }
            else if (line == "ET") // End text object
            {
                currentTextBlock.Add(rawLine);

                if (ContainsTargetText(currentTextBlock))
                {
                    removedCount++;
                    // Skip adding this text block (effectively removing it)
                    Console.WriteLine($" Removing text block containing: {targetText}");
                }
                else
                {
                    // Keep this text block
                    kept.AddRange(currentTextBlock);
                }

                inTextObject = false;
                currentTextBlock.Clear();
            }
            else if (inTextObject)
            {
                currentTextBlock.Add(rawLine);
            }
            else
            {
                // Non-text content, always keep
                kept.Add(rawLine);
            }
        }

        // The stream ended inside an unterminated text object: keep the
        // buffered lines so no content is lost.
        if (inTextObject)
        {
            kept.AddRange(currentTextBlock);
        }

        return string.Join("\n", kept);
    }

    /// <summary>
    /// True when any text-showing line of the block carries the target text.
    /// An empty target never matches (it would otherwise match every line).
    /// </summary>
    private bool ContainsTargetText(IEnumerable<string> textBlockLines)
    {
        if (string.IsNullOrEmpty(targetText))
        {
            return false;
        }

        foreach (string rawLine in textBlockLines)
        {
            string trimmedLine = rawLine.Trim();

            // Check for text showing operators
            bool showsText =
                trimmedLine.EndsWith("Tj", StringComparison.Ordinal) ||
                trimmedLine.EndsWith("TJ", StringComparison.Ordinal) ||
                trimmedLine.EndsWith("'", StringComparison.Ordinal) ||
                trimmedLine.EndsWith("\"", StringComparison.Ordinal);

            if (showsText &&
                ExtractTextFromLine(trimmedLine).Contains(targetText, StringComparison.Ordinal))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Concatenates every string operand on the line, both literal "(...)"
    /// and hexadecimal "&lt;...&gt;", in order of appearance. For a TJ array such
    /// as <c>[(Hel) -20 (lo)] TJ</c> this yields "Hello", so text split by
    /// kerning adjustments is still found.
    /// </summary>
    private string ExtractTextFromLine(string line)
    {
        var text = new StringBuilder();
        int pos = 0;

        while (pos < line.Length)
        {
            char c = line[pos];

            if (c == '(')
            {
                pos = ReadLiteralString(line, pos, text);
            }
            else if (c == '<')
            {
                // "<<" opens a dictionary, not a hex string.
                if (pos + 1 < line.Length && line[pos + 1] == '<')
                {
                    pos += 2;
                    continue;
                }

                int close = line.IndexOf('>', pos + 1);
                if (close < 0)
                {
                    break; // unterminated hex string: nothing more to read
                }

                text.Append(ConvertHexToText(line.Substring(pos + 1, close - pos - 1)));
                pos = close + 1;
            }
            else
            {
                pos++;
            }
        }

        return text.ToString();
    }

    /// <summary>
    /// Decodes the literal string starting at <paramref name="openIndex"/>
    /// (which must point at '(') into <paramref name="output"/>, honouring
    /// balanced nested parentheses and backslash escapes.
    /// </summary>
    /// <returns>Index just past the closing ')', or the line length if unterminated.</returns>
    private static int ReadLiteralString(string line, int openIndex, StringBuilder output)
    {
        int depth = 1;
        int pos = openIndex + 1;

        while (pos < line.Length)
        {
            char c = line[pos];

            if (c == '\\')
            {
                pos++;
                if (pos >= line.Length)
                {
                    break; // backslash at end of line is a line continuation
                }

                char escaped = line[pos];
                if (escaped >= '0' && escaped <= '7')
                {
                    // Octal character code: one to three digits.
                    int value = 0;
                    int digits = 0;
                    while (digits < 3 && pos < line.Length && line[pos] >= '0' && line[pos] <= '7')
                    {
                        value = (value * 8) + (line[pos] - '0');
                        pos++;
                        digits++;
                    }

                    output.Append((char)(value & 0xFF));
                    continue;
                }

                switch (escaped)
                {
                    case 'n': output.Append('\n'); break;
                    case 'r': output.Append('\r'); break;
                    case 't': output.Append('\t'); break;
                    case 'b': output.Append('\b'); break;
                    case 'f': output.Append('\f'); break;
                    default: output.Append(escaped); break; // \( \) \\ and unknown escapes
                }

                pos++;
            }
            else if (c == '(')
            {
                depth++;
                output.Append(c);
                pos++;
            }
            else if (c == ')')
            {
                depth--;
                if (depth == 0)
                {
                    return pos + 1;
                }

                output.Append(c);
                pos++;
            }
            else
            {
                output.Append(c);
                pos++;
            }
        }

        return line.Length;
    }

    /// <summary>
    /// Decodes a hex string body (the text between '&lt;' and '&gt;') as one
    /// character per byte. Whitespace is ignored and an odd number of digits
    /// is padded with a trailing 0, as the PDF specification prescribes.
    /// Returns an empty string when a non-hex character is present.
    /// </summary>
    private string ConvertHexToText(string hexString)
    {
        var digits = new StringBuilder(hexString.Length + 1);
        foreach (char c in hexString)
        {
            if (char.IsWhiteSpace(c))
            {
                continue;
            }

            if (HexDigitValue(c) < 0)
            {
                return "";
            }

            digits.Append(c);
        }

        if (digits.Length % 2 != 0)
        {
            digits.Append('0');
        }

        var result = new StringBuilder(digits.Length / 2);
        for (int i = 0; i < digits.Length; i += 2)
        {
            int value = (HexDigitValue(digits[i]) << 4) | HexDigitValue(digits[i + 1]);
            result.Append((char)value);
        }

        return result.ToString();
    }

    /// <summary>Numeric value of a hexadecimal digit, or -1 when <paramref name="c"/> is not one.</summary>
    private static int HexDigitValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }

        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }

        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }

        return -1;
    }
}
}
/*
.Net 8 Console App
REQUIRED NUGET PACKAGES:
To use this program, you need to install the following NuGet packages:
1. itext (main package)
Install-Package itext
2. itext.bouncy-castle-adapter (for encryption support)
Install-Package itext.bouncy-castle-adapter
USAGE INSTRUCTIONS:
1. Create a new Console Application in Visual Studio
2. Install the required NuGet packages
3. Replace the default Program.cs with this code
4. Build the project
5. Run from command line:
PDFTextRemover.exe "C:\Path\To\PDFs" "hide01.ir"
FEATURES:
- Completely removes text from PDF content streams (not just covering)
- Searches all PDF files in the specified directory (including subdirectories)
- Creates backup copies of original files
- Removes entire text blocks that contain the target string
- Provides detailed console output showing progress
- Handles errors gracefully
- Maintains PDF structure and formatting
TECHNICAL APPROACH:
- Parses PDF content streams at the operator level
- Identifies text showing operators (Tj, TJ, ', ")
- Removes entire text objects that contain the target text
- Rebuilds content streams without the unwanted text
- Uses low-level PDF manipulation for complete text removal
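NOTES:
- Matching text is removed from the page content streams themselves, not just covered or hidden
- Original files are backed up next to the source file with a "_backup" suffix (e.g. report.pdf -> report_backup.pdf)
- The program removes entire text objects (BT ... ET) containing the target string, so any other text in the same object is removed as well
- Only page content streams are edited; text stored elsewhere in the file (for example in form XObjects, annotations or metadata) is not touched, so check the output before relying on it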
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment