Created
August 23, 2025 15:29
-
-
Save notjulian/0f930e037053e957b421a58d88c4ca86 to your computer and use it in GitHub Desktop.
Remove text from all PDFs in a folder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System; | |
| using System.IO; | |
| using System.Linq; | |
| using System.Collections.Generic; | |
| using System.Text; | |
| using iText.Kernel.Pdf; | |
| using iText.Kernel.Pdf.Canvas.Parser; | |
| using iText.Kernel.Pdf.Canvas.Parser.Listener; | |
| using iText.Kernel.Pdf.Canvas.Parser.Data; | |
| using iText.Kernel.Pdf.Canvas; | |
| using iText.IO.Source; | |
| namespace PDFTextRemover | |
| { | |
/// <summary>
/// Console entry point for the PDF text remover.
/// </summary>
class Program
{
    /// <summary>
    /// Validates the command-line arguments and, when they are usable, processes the directory.
    /// </summary>
    /// <param name="args">
    /// <c>args[0]</c> is the directory to scan; <c>args[1]</c> is the text to remove.
    /// </param>
    static void Main(string[] args)
    {
        if (args.Length < 2)
        {
            Console.WriteLine("Usage: PDFTextRemover.exe <directory_path> <text_to_remove>");
            Console.WriteLine("Example: PDFTextRemover.exe \"C:\\MyPDFs\" \"foo\"");
            return;
        }

        string directoryPath = args[0];
        string textToRemove = args[1];

        // An empty search string is contained in every string, so it would match
        // (and therefore remove) every text block of every PDF. Refuse it up front.
        if (string.IsNullOrEmpty(textToRemove))
        {
            Console.WriteLine("Text to remove must not be empty.");
            return;
        }

        if (!Directory.Exists(directoryPath))
        {
            Console.WriteLine($"Directory not found: {directoryPath}");
            return;
        }

        var pdfRemover = new PDFTextRemover();
        pdfRemover.ProcessDirectory(directoryPath, textToRemove);
    }
}
/// <summary>
/// Finds the PDF files below a directory and removes a given text from each of them,
/// writing a "_backup.pdf" copy of every file next to it before the file is replaced.
/// </summary>
public class PDFTextRemover
{
    // Suffixes of the files this tool writes next to each processed PDF.
    private const string BackupSuffix = "_backup.pdf";
    private const string TempSuffix = "_temp.pdf";

    /// <summary>
    /// Processes every PDF found in <paramref name="directoryPath"/> and its subdirectories.
    /// Errors are reported on the console; they are not rethrown.
    /// </summary>
    /// <param name="directoryPath">Root directory to search.</param>
    /// <param name="textToRemove">Text whose containing text blocks are removed.</param>
    public void ProcessDirectory(string directoryPath, string textToRemove)
    {
        try
        {
            // Skip the backup/temp files written by a previous run; otherwise a second
            // run would process the backups and create backups of backups.
            string[] pdfFiles = Directory
                .GetFiles(directoryPath, "*.pdf", SearchOption.AllDirectories)
                .Where(path => !IsOwnArtifact(path))
                .ToArray();

            Console.WriteLine($"Found {pdfFiles.Length} PDF files in directory: {directoryPath}");

            foreach (string pdfPath in pdfFiles)
            {
                Console.WriteLine($"Processing: {Path.GetFileName(pdfPath)}");
                ProcessPDFFile(pdfPath, textToRemove);
            }

            Console.WriteLine("Processing completed!");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing directory: {ex.Message}");
        }
    }

    /// <summary>
    /// Backs up one PDF, writes a processed copy to a temporary file and then
    /// replaces the original with it. Errors are reported on the console.
    /// </summary>
    private void ProcessPDFFile(string pdfPath, string textToRemove)
    {
        string tempPath = WithSuffix(pdfPath, TempSuffix);
        bool tempStarted = false;

        try
        {
            // Create backup of original file
            string backupPath = WithSuffix(pdfPath, BackupSuffix);
            File.Copy(pdfPath, backupPath, true);
            Console.WriteLine($"Backup created: {Path.GetFileName(backupPath)}");

            // From here on the temp file belongs to this run and must be cleaned up on failure.
            tempStarted = true;

            using (PdfReader reader = new PdfReader(pdfPath))
            using (PdfWriter writer = new PdfWriter(tempPath))
            using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
            {
                int totalPages = pdfDoc.GetNumberOfPages();
                int totalOccurrences = 0;

                for (int pageNum = 1; pageNum <= totalPages; pageNum++)
                {
                    PdfPage page = pdfDoc.GetPage(pageNum);
                    int occurrencesOnPage = RemoveTextFromPageContent(page, textToRemove);
                    totalOccurrences += occurrencesOnPage;

                    if (occurrencesOnPage > 0)
                    {
                        Console.WriteLine($" Page {pageNum}: Removed {occurrencesOnPage} occurrence(s)");
                    }
                }

                Console.WriteLine($" Total occurrences removed: {totalOccurrences}");
            }

            // Replace original file with processed version. The overwriting move avoids
            // the window in which the original is already deleted but not yet replaced.
            if (File.Exists(tempPath))
            {
                File.Move(tempPath, pdfPath, true);
                Console.WriteLine($" File updated: {Path.GetFileName(pdfPath)}");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($" Error processing {Path.GetFileName(pdfPath)}: {ex.Message}");

            if (tempStarted)
            {
                TryDelete(tempPath);
            }
        }
    }

    /// <summary>
    /// Removes the text blocks containing <paramref name="textToRemove"/> from one page.
    /// </summary>
    /// <returns>
    /// The number of removed text blocks; 0 when the extracted page text does not contain
    /// the target or when an error occurs (the error is reported on the console).
    /// </returns>
    private int RemoveTextFromPageContent(PdfPage page, string textToRemove)
    {
        try
        {
            // Cheap pre-check: leave the page untouched when the text is not on it.
            string pageText = PdfTextExtractor.GetTextFromPage(page);
            if (!pageText.Contains(textToRemove))
            {
                return 0; // No occurrences found
            }

            var editor = new TextRemovalContentEditor(textToRemove);
            return editor.EditPage(page);
        }
        catch (Exception ex)
        {
            Console.WriteLine($" Error processing page content: {ex.Message}");
            return 0;
        }
    }

    /// <summary>
    /// Builds "&lt;directory&gt;/&lt;file name without extension&gt;&lt;suffix&gt;" for a PDF path.
    /// Only the file's own extension is replaced, whatever its casing, so ".pdf"
    /// appearing elsewhere in the path is left alone.
    /// </summary>
    private static string WithSuffix(string pdfPath, string suffix)
    {
        string directory = Path.GetDirectoryName(pdfPath) ?? string.Empty;
        string baseName = Path.GetFileNameWithoutExtension(pdfPath);
        return Path.Combine(directory, baseName + suffix);
    }

    /// <summary>Returns true for backup/temp files written by this tool.</summary>
    private static bool IsOwnArtifact(string path)
    {
        return path.EndsWith(BackupSuffix, StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(TempSuffix, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Best-effort removal of a leftover temp file; failures are only reported.</summary>
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
        {
            Console.WriteLine($" Could not delete temporary file {Path.GetFileName(path)}: {ex.Message}");
        }
    }
}
/// <summary>
/// Line-based content stream editor: removes every text object (a <c>BT</c> line up to
/// the next <c>ET</c> line) in which a text-showing operator line contains the target text.
/// Only <c>BT</c>/<c>ET</c> operators that stand alone on their line are recognised.
/// </summary>
public class TextRemovalContentEditor
{
    private readonly string targetText;
    private int removedCount;

    /// <param name="targetText">Text to look for inside text-showing operators.</param>
    public TextRemovalContentEditor(string targetText)
    {
        this.targetText = targetText;
        this.removedCount = 0;
    }

    /// <summary>
    /// Rewrites the content of <paramref name="page"/> without the matching text objects.
    /// </summary>
    /// <returns>
    /// The number of removed text objects, or 0 when an error occurs
    /// (the error is reported on the console).
    /// </returns>
    public int EditPage(PdfPage page)
    {
        removedCount = 0;
        try
        {
            byte[] contentBytes = page.GetContentBytes();

            // Content streams are bytes, not UTF-8 text. Latin-1 maps every byte to one
            // char and back, so untouched parts of the stream survive the round trip
            // unchanged; a UTF-8 round trip would replace invalid byte sequences.
            string content = Encoding.Latin1.GetString(contentBytes);

            string cleanedContent = RemoveTextOperations(content);

            // Update the page content only if something was actually removed.
            if (cleanedContent != content)
            {
                byte[] newContentBytes = Encoding.Latin1.GetBytes(cleanedContent);
                page.Put(PdfName.Contents, new PdfStream(newContentBytes));
            }

            return removedCount;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error editing page: {ex.Message}");
            return 0;
        }
    }

    /// <summary>
    /// Returns <paramref name="content"/> without the text objects that contain the target
    /// text, and counts each removed object in <c>removedCount</c>.
    /// </summary>
    private string RemoveTextOperations(string content)
    {
        string[] lines = content.Split('\n');
        var kept = new List<string>(lines.Length);
        var currentTextBlock = new List<string>();
        bool inTextObject = false;

        foreach (string rawLine in lines)
        {
            string line = rawLine.Trim();

            if (line == "BT") // Begin text object
            {
                // A previous block that never saw its own "ET" line is kept, not dropped.
                kept.AddRange(currentTextBlock);
                currentTextBlock.Clear();

                inTextObject = true;
                currentTextBlock.Add(rawLine); // Keep original formatting
            }
            else if (line == "ET") // End text object
            {
                currentTextBlock.Add(rawLine);

                if (ContainsTargetText(string.Join("\n", currentTextBlock)))
                {
                    // Skip adding this text block (effectively removing it)
                    removedCount++;
                    Console.WriteLine($" Removing text block containing: {targetText}");
                }
                else
                {
                    kept.AddRange(currentTextBlock);
                }

                inTextObject = false;
                currentTextBlock.Clear();
            }
            else if (inTextObject)
            {
                currentTextBlock.Add(rawLine);
            }
            else
            {
                // Non-text content, always keep
                kept.Add(rawLine);
            }
        }

        // The stream ended inside a text object (e.g. its "ET" shares a line with another
        // operator). Those lines were never examined, so they must be preserved.
        kept.AddRange(currentTextBlock);

        return string.Join("\n", kept);
    }

    /// <summary>
    /// Returns true when any line of <paramref name="textBlock"/> that ends with a
    /// text-showing operator (Tj, TJ, ', ") carries text containing the target.
    /// </summary>
    private bool ContainsTargetText(string textBlock)
    {
        foreach (string line in textBlock.Split('\n'))
        {
            string trimmedLine = line.Trim();
            if (EndsWithTextShowingOperator(trimmedLine)
                && ExtractTextFromLine(trimmedLine).Contains(targetText))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>Ordinal check for the operators Tj, TJ, ' and " at the end of a line.</summary>
    private static bool EndsWithTextShowingOperator(string line)
    {
        return line.EndsWith("Tj", StringComparison.Ordinal)
            || line.EndsWith("TJ", StringComparison.Ordinal)
            || line.EndsWith("'", StringComparison.Ordinal)
            || line.EndsWith("\"", StringComparison.Ordinal);
    }

    /// <summary>
    /// Concatenates, in order, the contents of every literal string <c>(...)</c> and hex
    /// string <c>&lt;...&gt;</c> on the line. This covers single-string operands as well as
    /// TJ arrays such as <c>[(Hel) -10 (lo)] TJ</c>, whose fragments are joined to "Hello".
    /// </summary>
    private string ExtractTextFromLine(string line)
    {
        var text = new StringBuilder();
        int i = 0;

        while (i < line.Length)
        {
            char c = line[i];

            if (c == '(')
            {
                i = ReadLiteralString(line, i + 1, text);
            }
            else if (c == '<' && i + 1 < line.Length && line[i + 1] == '<')
            {
                i += 2; // "<<" opens a dictionary, not a hex string
            }
            else if (c == '<')
            {
                int end = line.IndexOf('>', i + 1);
                if (end < 0)
                {
                    break; // unterminated hex string: nothing more to read on this line
                }

                text.Append(ConvertHexToText(line.Substring(i + 1, end - i - 1)));
                i = end + 1;
            }
            else
            {
                i++;
            }
        }

        return text.ToString();
    }

    /// <summary>
    /// Reads a literal string whose opening parenthesis precedes <paramref name="start"/>,
    /// honouring balanced nested parentheses and backslash escapes.
    /// </summary>
    /// <returns>The index just past the closing parenthesis, or the line length if unterminated.</returns>
    private static int ReadLiteralString(string line, int start, StringBuilder output)
    {
        int depth = 1;
        int i = start;

        while (i < line.Length)
        {
            char c = line[i];

            if (c == '\\' && i + 1 < line.Length)
            {
                i = AppendEscape(line, i + 1, output);
                continue;
            }

            if (c == '(')
            {
                depth++;
            }
            else if (c == ')')
            {
                depth--;
                if (depth == 0)
                {
                    return i + 1;
                }
            }

            output.Append(c);
            i++;
        }

        return i;
    }

    /// <summary>
    /// Decodes the escape whose first character (after the backslash) is at
    /// <paramref name="i"/> and returns the index of the next unread character.
    /// </summary>
    private static int AppendEscape(string line, int i, StringBuilder output)
    {
        char e = line[i];

        switch (e)
        {
            case 'n': output.Append('\n'); return i + 1;
            case 'r': output.Append('\r'); return i + 1;
            case 't': output.Append('\t'); return i + 1;
            case 'b': output.Append('\b'); return i + 1;
            case 'f': output.Append('\f'); return i + 1;
        }

        if (e >= '0' && e <= '7')
        {
            // Octal character code: one to three octal digits.
            int value = 0;
            int digits = 0;
            while (digits < 3 && i < line.Length && line[i] >= '0' && line[i] <= '7')
            {
                value = (value * 8) + (line[i] - '0');
                i++;
                digits++;
            }

            output.Append((char)(value & 0xFF));
            return i;
        }

        // "\(", "\)", "\\" and unknown escapes yield the escaped character itself.
        output.Append(e);
        return i + 1;
    }

    /// <summary>
    /// Decodes a hex string body into one char per byte (no font encoding is applied).
    /// Whitespace is ignored and a missing final digit counts as 0.
    /// </summary>
    /// <returns>The decoded chars, or an empty string when a non-hex character is found.</returns>
    private string ConvertHexToText(string hexString)
    {
        var result = new StringBuilder();
        int pending = -1; // high nibble waiting for its low nibble, or -1

        foreach (char c in hexString)
        {
            if (char.IsWhiteSpace(c))
            {
                continue;
            }

            int nibble = HexValue(c);
            if (nibble < 0)
            {
                return "";
            }

            if (pending < 0)
            {
                pending = nibble;
            }
            else
            {
                result.Append((char)((pending << 4) | nibble));
                pending = -1;
            }
        }

        if (pending >= 0)
        {
            result.Append((char)(pending << 4));
        }

        return result.ToString();
    }

    /// <summary>Returns the value of a hex digit, or -1 for any other character.</summary>
    private static int HexValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }

        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }

        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }

        return -1;
    }
}
| } | |
| /* | |
| .Net 8 Console App | |
| REQUIRED NUGET PACKAGES: | |
| To use this program, you need to install the following NuGet packages: | |
| 1. itext (main package) | |
| Install-Package itext | |
| 2. itext.bouncy-castle-adapter (for encryption support) | |
| Install-Package itext.bouncy-castle-adapter | |
| USAGE INSTRUCTIONS: | |
| 1. Create a new Console Application in Visual Studio | |
| 2. Install the required NuGet packages | |
| 3. Replace the default Program.cs with this code | |
| 4. Build the project | |
| 5. Run from command line: | |
| PDFTextRemover.exe "C:\Path\To\PDFs" "hide01.ir" | |
| FEATURES: | |
| - Completely removes text from PDF content streams (not just covering) | |
| - Searches all PDF files in the specified directory (including subdirectories) | |
| - Creates backup copies of original files | |
| - Removes entire text blocks that contain the target string | |
| - Provides detailed console output showing progress | |
| - Handles errors gracefully | |
| - Maintains PDF structure and formatting | |
| TECHNICAL APPROACH: | |
| - Parses PDF content streams at the operator level | |
| - Identifies text showing operators (Tj, TJ, ', ") | |
| - Removes entire text objects that contain the target text | |
| - Rebuilds content streams without the unwanted text | |
| - Uses low-level PDF manipulation for complete text removal | |
| NOTES: | |
| - Text is completely removed from the PDF, not just hidden | |
| - Original files are backed up with "_backup" suffix | |
| - The program removes entire text blocks containing the target string | |
| - Only each page's own content stream is edited, so text stored elsewhere in the PDF (for example form XObjects, annotations or metadata) is not removed | |
| */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment