notjulian · August 23, 2025 15:29
diff --git a/gistfile1.txt b/gistfile1.txt
 using System;
 using System.IO;
 using System.Linq;
 using System.Collections.Generic;
 using System.Text;
 using iText.Kernel.Pdf;
 using iText.Kernel.Pdf.Canvas.Parser;
 using iText.Kernel.Pdf.Canvas.Parser.Listener;
 using iText.Kernel.Pdf.Canvas.Parser.Data;
 using iText.Kernel.Pdf.Canvas;
 using iText.IO.Source;

 namespace PDFTextRemover
 {
    /// <summary>
    /// Console entry point: checks the command-line arguments and hands the
    /// directory and the search text over to <c>PDFTextRemover</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Expects two arguments: the directory to scan and the text to remove.
        /// Prints usage help when fewer are supplied, and an error message when the
        /// directory does not exist.
        /// </summary>
        /// <param name="args">Command-line arguments: <c>[0]</c> directory path, <c>[1]</c> text to remove.</param>
        static void Main(string[] args)
        {
            bool hasBothArguments = args.Length >= 2;
            if (!hasBothArguments)
            {
                ShowUsage();
                return;
            }

            string targetDirectory = args[0];
            string unwantedText = args[1];

            if (Directory.Exists(targetDirectory))
            {
                new PDFTextRemover().ProcessDirectory(targetDirectory, unwantedText);
            }
            else
            {
                Console.WriteLine($"Directory not found: {targetDirectory}");
            }
        }

        /// <summary>Writes the usage line and an example invocation to the console.</summary>
        static void ShowUsage()
        {
            Console.WriteLine("Usage: PDFTextRemover.exe <directory_path> <text_to_remove>");
            Console.WriteLine("Example: PDFTextRemover.exe \"C:\\MyPDFs\" \"foo\"");
        }
    }

    /// <summary>
    /// Removes a piece of text from every PDF found below a directory. A file is
    /// only rewritten when something was actually removed from it, and in that case
    /// a "_backup" copy of the original is written next to it first.
    /// </summary>
    public class PDFTextRemover
    {
        // Suffixes inserted between file name and extension for the files this tool creates.
        private const string BackupSuffix = "_backup";
        private const string TempSuffix = "_temp";

        /// <summary>
        /// Processes every "*.pdf" file in <paramref name="directoryPath"/> and its
        /// subdirectories. Errors are reported on the console rather than thrown.
        /// </summary>
        /// <param name="directoryPath">Root directory to search.</param>
        /// <param name="textToRemove">Text whose containing text blocks are removed; must not be empty.</param>
        public void ProcessDirectory(string directoryPath, string textToRemove)
        {
            // An empty search string is "contained" in every string, so it would make
            // every text block match and be deleted.
            if (string.IsNullOrEmpty(textToRemove))
            {
                Console.WriteLine("Error processing directory: the text to remove must not be empty.");
                return;
            }

            try
            {
                // Leave out this tool's own "_backup"/"_temp" output so that running it
                // again does not treat earlier backups as input.
                string[] pdfFiles = Directory
                    .GetFiles(directoryPath, "*.pdf", SearchOption.AllDirectories)
                    .Where(path => !IsGeneratedFile(path))
                    .ToArray();

                Console.WriteLine($"Found {pdfFiles.Length} PDF files in directory: {directoryPath}");

                foreach (string pdfPath in pdfFiles)
                {
                    Console.WriteLine($"Processing: {Path.GetFileName(pdfPath)}");
                    ProcessPDFFile(pdfPath, textToRemove);
                }

                Console.WriteLine("Processing completed!");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error processing directory: {ex.Message}");
            }
        }

        /// <summary>
        /// Writes a cleaned copy of one PDF to a temporary sibling file and, if any
        /// text was removed, backs up the original and replaces it with the cleaned copy.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF to process.</param>
        /// <param name="textToRemove">Text to look for.</param>
        private void ProcessPDFFile(string pdfPath, string textToRemove)
        {
            string backupPath = BuildSiblingPath(pdfPath, BackupSuffix);
            string tempPath = BuildSiblingPath(pdfPath, TempSuffix);

            try
            {
                int totalOccurrences = 0;

                using (PdfReader reader = new PdfReader(pdfPath))
                using (PdfWriter writer = new PdfWriter(tempPath))
                using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
                {
                    int totalPages = pdfDoc.GetNumberOfPages();

                    for (int pageNum = 1; pageNum <= totalPages; pageNum++)
                    {
                        PdfPage page = pdfDoc.GetPage(pageNum);
                        int occurrencesOnPage = RemoveTextFromPageContent(page, textToRemove);
                        totalOccurrences += occurrencesOnPage;

                        if (occurrencesOnPage > 0)
                        {
                            Console.WriteLine($"  Page {pageNum}: Removed {occurrencesOnPage} occurrence(s)");
                        }
                    }

                    Console.WriteLine($"  Total occurrences removed: {totalOccurrences}");
                }

                if (totalOccurrences == 0)
                {
                    // Nothing was removed: keep the original untouched and do not overwrite
                    // a backup from an earlier run. The temp file is deleted in finally.
                    Console.WriteLine($"  No changes needed: {Path.GetFileName(pdfPath)}");
                    return;
                }

                // Back up the original only now that we know it is about to be replaced.
                File.Copy(pdfPath, backupPath, true);
                Console.WriteLine($"Backup created: {Path.GetFileName(backupPath)}");

                // Single move-with-overwrite instead of Delete + Move, so there is no
                // moment at which the original is gone and the replacement is not yet in place.
                File.Move(tempPath, pdfPath, true);
                Console.WriteLine($"  File updated: {Path.GetFileName(pdfPath)}");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"  Error processing {Path.GetFileName(pdfPath)}: {ex.Message}");
            }
            finally
            {
                DeleteIfPresent(tempPath);
            }
        }

        /// <summary>
        /// Removes the text blocks containing <paramref name="textToRemove"/> from one page.
        /// </summary>
        /// <param name="page">Page to edit.</param>
        /// <param name="textToRemove">Text to look for.</param>
        /// <returns>The count reported by the content editor, or 0 when the page text does not
        /// contain the search text or an error occurred.</returns>
        private int RemoveTextFromPageContent(PdfPage page, string textToRemove)
        {
            try
            {
                // Cheap pre-check on the extracted page text before touching the content stream.
                string pageText = PdfTextExtractor.GetTextFromPage(page);
                if (!pageText.Contains(textToRemove))
                {
                    return 0;
                }

                var editor = new TextRemovalContentEditor(textToRemove);
                return editor.EditPage(page);
            }
            catch (Exception ex)
            {
                Console.WriteLine($"    Error processing page content: {ex.Message}");
                return 0;
            }
        }

        /// <summary>
        /// Builds "&lt;dir&gt;/&lt;name&gt;&lt;suffix&gt;&lt;ext&gt;" for a file. Only the file name is
        /// touched, and the extension is kept as-is, so "A.PDF" and directories whose
        /// names contain ".pdf" are handled correctly.
        /// </summary>
        private static string BuildSiblingPath(string pdfPath, string suffix)
        {
            string directory = Path.GetDirectoryName(pdfPath) ?? string.Empty;
            string fileName = Path.GetFileNameWithoutExtension(pdfPath) + suffix + Path.GetExtension(pdfPath);
            return Path.Combine(directory, fileName);
        }

        /// <summary>True when the file name (without extension) ends in one of this tool's suffixes.</summary>
        private static bool IsGeneratedFile(string path)
        {
            string name = Path.GetFileNameWithoutExtension(path);
            return name.EndsWith(BackupSuffix, StringComparison.OrdinalIgnoreCase)
                || name.EndsWith(TempSuffix, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>Best-effort removal of a leftover temporary file; failures are only reported.</summary>
        private static void DeleteIfPresent(string path)
        {
            try
            {
                if (File.Exists(path))
                {
                    File.Delete(path);
                }
            }
            catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
            {
                Console.WriteLine($"  Could not delete temporary file {Path.GetFileName(path)}: {ex.Message}");
            }
        }
    }

    /// <summary>
    /// Line-based editor for a page's content stream: every <c>BT</c> … <c>ET</c> text
    /// object that shows a string containing the target text is dropped as a whole.
    /// </summary>
    /// <remarks>
    /// Known limits of this approach: <c>BT</c>/<c>ET</c> are only recognised when they
    /// stand alone on a line, and strings are compared as raw single-byte codes, so
    /// text drawn with multi-byte (CID) font encodings will not match.
    /// NOTE(review): operators that set text state inside a removed block are removed
    /// with it — confirm this does not affect the text objects that follow.
    /// </remarks>
    public class TextRemovalContentEditor
    {
        // Content streams are binary data. Latin-1 maps every byte to exactly one char
        // and back, so decoding and re-encoding cannot alter bytes (UTF-8 would replace
        // byte sequences that are not valid UTF-8).
        private static readonly Encoding StreamEncoding = Encoding.Latin1;

        private readonly string targetText;
        private int removedCount;

        /// <param name="targetText">Text whose containing text objects are removed.</param>
        public TextRemovalContentEditor(string targetText)
        {
            this.targetText = targetText;
            this.removedCount = 0;
        }

        /// <summary>
        /// Rewrites the page content without the text objects that contain the target text.
        /// </summary>
        /// <param name="page">Page whose content is edited in place.</param>
        /// <returns>Number of text objects removed; 0 when nothing matched, the target
        /// text is empty, or an error occurred.</returns>
        public int EditPage(PdfPage page)
        {
            removedCount = 0;

            // An empty target is contained in every string and would delete all text.
            if (string.IsNullOrEmpty(targetText))
            {
                return 0;
            }

            try
            {
                byte[] contentBytes = page.GetContentBytes();
                string content = StreamEncoding.GetString(contentBytes);

                string cleanedContent = RemoveTextOperations(content);

                // Only replace the page content when a block was actually dropped.
                if (removedCount > 0)
                {
                    byte[] newContentBytes = StreamEncoding.GetBytes(cleanedContent);
                    page.Put(PdfName.Contents, new PdfStream(newContentBytes));
                }

                return removedCount;
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error editing page: {ex.Message}");
                return 0;
            }
        }

        /// <summary>
        /// Copies the content line by line, buffering each BT … ET block and leaving it
        /// out when it contains the target text. Increments <c>removedCount</c> per dropped block.
        /// </summary>
        private string RemoveTextOperations(string content)
        {
            string[] lines = content.Split('\n');
            var result = new List<string>(lines.Length);
            var currentTextBlock = new List<string>();
            bool inTextObject = false;

            foreach (string rawLine in lines)
            {
                string line = rawLine.Trim();

                if (line == "BT")
                {
                    // A BT while a block is still open is malformed input; emit what was
                    // buffered so far unchanged instead of discarding it.
                    result.AddRange(currentTextBlock);
                    currentTextBlock.Clear();

                    inTextObject = true;
                    currentTextBlock.Add(rawLine); // keep original formatting
                }
                else if (line == "ET")
                {
                    currentTextBlock.Add(rawLine);

                    if (ContainsTargetText(currentTextBlock))
                    {
                        removedCount++;
                        Console.WriteLine($"    Removing text block containing: {targetText}");
                    }
                    else
                    {
                        result.AddRange(currentTextBlock);
                    }

                    inTextObject = false;
                    currentTextBlock.Clear();
                }
                else if (inTextObject)
                {
                    currentTextBlock.Add(rawLine);
                }
                else
                {
                    // Non-text content is always kept.
                    result.Add(rawLine);
                }
            }

            // A BT that never reached its ET: keep those lines rather than losing them.
            result.AddRange(currentTextBlock);

            return string.Join("\n", result);
        }

        /// <summary>
        /// True when any line of the block ends with a text-showing operator
        /// (Tj, TJ, ' or ") and the strings on that line contain the target text.
        /// </summary>
        private bool ContainsTargetText(IEnumerable<string> blockLines)
        {
            foreach (string blockLine in blockLines)
            {
                string trimmedLine = blockLine.Trim();

                bool showsText =
                    trimmedLine.EndsWith("Tj", StringComparison.Ordinal) ||
                    trimmedLine.EndsWith("TJ", StringComparison.Ordinal) ||
                    trimmedLine.EndsWith("'", StringComparison.Ordinal) ||
                    trimmedLine.EndsWith("\"", StringComparison.Ordinal);

                if (showsText && ExtractTextFromLine(trimmedLine).Contains(targetText))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Concatenates every string operand on the line — literal "(…)" and hex "&lt;…&gt;",
        /// whether standalone or inside a TJ array — so that text split into several
        /// fragments (e.g. "[(Hel) -20 (lo)] TJ") is matched as "Hello".
        /// </summary>
        private static string ExtractTextFromLine(string line)
        {
            var text = new StringBuilder();
            int i = 0;

            while (i < line.Length)
            {
                char c = line[i];

                if (c == '(')
                {
                    i = ReadLiteralString(line, i, text);
                }
                else if (c == '<' && i + 1 < line.Length && line[i + 1] == '<')
                {
                    i += 2; // "<<" opens a dictionary, not a hex string
                }
                else if (c == '<')
                {
                    int close = line.IndexOf('>', i + 1);
                    if (close < 0)
                    {
                        break; // unterminated hex string: nothing more to read
                    }

                    text.Append(ConvertHexToText(line.Substring(i + 1, close - i - 1)));
                    i = close + 1;
                }
                else
                {
                    i++; // numbers, operators, array brackets, whitespace
                }
            }

            return text.ToString();
        }

        /// <summary>
        /// Reads one literal string starting at the "(" at <paramref name="openIndex"/>,
        /// honouring balanced nested parentheses and backslash escapes, and appends the
        /// decoded characters to <paramref name="output"/>.
        /// </summary>
        /// <returns>Index of the first character after the string.</returns>
        private static int ReadLiteralString(string line, int openIndex, StringBuilder output)
        {
            int depth = 1;
            int i = openIndex + 1;

            while (i < line.Length && depth > 0)
            {
                char c = line[i++];

                if (c == '\\')
                {
                    if (i >= line.Length)
                    {
                        break; // trailing backslash: nothing left on this line to escape
                    }

                    char escaped = line[i++];
                    if (escaped >= '0' && escaped <= '7')
                    {
                        // Octal escape: one to three digits, reduced to a single byte value.
                        int value = escaped - '0';
                        for (int digits = 1; digits < 3 && i < line.Length && line[i] >= '0' && line[i] <= '7'; digits++)
                        {
                            value = (value * 8) + (line[i++] - '0');
                        }

                        output.Append((char)(value & 0xFF));
                    }
                    else
                    {
                        output.Append(Unescape(escaped));
                    }
                }
                else if (c == '(')
                {
                    depth++;
                    output.Append(c);
                }
                else if (c == ')')
                {
                    depth--;
                    if (depth > 0)
                    {
                        output.Append(c);
                    }
                }
                else
                {
                    output.Append(c);
                }
            }

            return i;
        }

        /// <summary>Maps the character after a backslash to the character it stands for.</summary>
        private static char Unescape(char escaped)
        {
            switch (escaped)
            {
                case 'n': return '\n';
                case 'r': return '\r';
                case 't': return '\t';
                case 'b': return '\b';
                case 'f': return '\f';
                default: return escaped; // "\(", "\)", "\\"; for anything else the backslash is dropped
            }
        }

        /// <summary>
        /// Decodes the inside of a hex string to one char per byte. Whitespace between
        /// digits is ignored and a missing final digit is treated as 0.
        /// </summary>
        /// <returns>The decoded text, or "" when a non-hex character is present.</returns>
        private static string ConvertHexToText(string hexString)
        {
            var digits = new StringBuilder(hexString.Length + 1);
            foreach (char c in hexString)
            {
                if (char.IsWhiteSpace(c))
                {
                    continue;
                }

                if (!IsHexDigit(c))
                {
                    return "";
                }

                digits.Append(c);
            }

            if (digits.Length % 2 != 0)
            {
                digits.Append('0');
            }

            var result = new StringBuilder(digits.Length / 2);
            for (int i = 0; i < digits.Length; i += 2)
            {
                result.Append((char)Convert.ToInt32(digits.ToString(i, 2), 16));
            }

            return result.ToString();
        }

        private static bool IsHexDigit(char c)
        {
            return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
        }
    }
 }

 /* 
 .Net 8 Console App

 REQUIRED NUGET PACKAGES:
 To use this program, you need to install the following NuGet packages:

 1. itext (main package)
   Install-Package itext

 2. itext.bouncy-castle-adapter (for encryption support)
   Install-Package itext.bouncy-castle-adapter

 USAGE INSTRUCTIONS:

 1. Create a new Console Application in Visual Studio
 2. Install the required NuGet packages
 3. Replace the default Program.cs with this code
 4. Build the project
 5. Run from command line:
   PDFTextRemover.exe "C:\Path\To\PDFs" "hide01.ir"

 FEATURES:

 - Completely removes text from PDF content streams (not just covering)
 - Searches all PDF files in the specified directory (including subdirectories)
 - Creates backup copies of original files
 - Removes entire text blocks that contain the target string
 - Provides detailed console output showing progress
 - Handles errors gracefully
 - Maintains PDF structure and formatting

 TECHNICAL APPROACH:

 - Parses PDF content streams at the operator level
 - Identifies text showing operators (Tj, TJ, ', ")
 - Removes entire text objects that contain the target text
 - Rebuilds content streams without the unwanted text
 - Uses low-level PDF manipulation for complete text removal

 NOTES:

 - Text is completely removed from the PDF, not just hidden
 - Original files are backed up with "_backup" suffix
 - The program removes entire text blocks containing the target string
 - This approach ensures no traces of the text remain in the PDF
 */
	using System;
	using System.IO;
	using System.Linq;
	using System.Collections.Generic;
	using System.Text;
	using iText.Kernel.Pdf;
	using iText.Kernel.Pdf.Canvas.Parser;
	using iText.Kernel.Pdf.Canvas.Parser.Listener;
	using iText.Kernel.Pdf.Canvas.Parser.Data;
	using iText.Kernel.Pdf.Canvas;
	using iText.IO.Source;

	namespace PDFTextRemover
	{
	/// <summary>
	/// Console entry point: checks the command-line arguments and hands the
	/// directory and the search text over to <c>PDFTextRemover</c>.
	/// </summary>
	class Program
	{
	    /// <summary>
	    /// Expects two arguments: the directory to scan and the text to remove.
	    /// Prints usage help when fewer are supplied, and an error message when the
	    /// directory does not exist.
	    /// </summary>
	    /// <param name="args">Command-line arguments: <c>[0]</c> directory path, <c>[1]</c> text to remove.</param>
	    static void Main(string[] args)
	    {
	        bool hasBothArguments = args.Length >= 2;
	        if (!hasBothArguments)
	        {
	            ShowUsage();
	            return;
	        }

	        string targetDirectory = args[0];
	        string unwantedText = args[1];

	        if (Directory.Exists(targetDirectory))
	        {
	            new PDFTextRemover().ProcessDirectory(targetDirectory, unwantedText);
	        }
	        else
	        {
	            Console.WriteLine($"Directory not found: {targetDirectory}");
	        }
	    }

	    /// <summary>Writes the usage line and an example invocation to the console.</summary>
	    static void ShowUsage()
	    {
	        Console.WriteLine("Usage: PDFTextRemover.exe <directory_path> <text_to_remove>");
	        Console.WriteLine("Example: PDFTextRemover.exe \"C:\\MyPDFs\" \"foo\"");
	    }
	}

	/// <summary>
	/// Removes a piece of text from every PDF found below a directory. A file is
	/// only rewritten when something was actually removed from it, and in that case
	/// a "_backup" copy of the original is written next to it first.
	/// </summary>
	public class PDFTextRemover
	{
	    // Suffixes inserted between file name and extension for the files this tool creates.
	    private const string BackupSuffix = "_backup";
	    private const string TempSuffix = "_temp";

	    /// <summary>
	    /// Processes every "*.pdf" file in <paramref name="directoryPath"/> and its
	    /// subdirectories. Errors are reported on the console rather than thrown.
	    /// </summary>
	    /// <param name="directoryPath">Root directory to search.</param>
	    /// <param name="textToRemove">Text whose containing text blocks are removed; must not be empty.</param>
	    public void ProcessDirectory(string directoryPath, string textToRemove)
	    {
	        // An empty search string is "contained" in every string, so it would make
	        // every text block match and be deleted.
	        if (string.IsNullOrEmpty(textToRemove))
	        {
	            Console.WriteLine("Error processing directory: the text to remove must not be empty.");
	            return;
	        }

	        try
	        {
	            // Leave out this tool's own "_backup"/"_temp" output so that running it
	            // again does not treat earlier backups as input.
	            string[] pdfFiles = Directory
	                .GetFiles(directoryPath, "*.pdf", SearchOption.AllDirectories)
	                .Where(path => !IsGeneratedFile(path))
	                .ToArray();

	            Console.WriteLine($"Found {pdfFiles.Length} PDF files in directory: {directoryPath}");

	            foreach (string pdfPath in pdfFiles)
	            {
	                Console.WriteLine($"Processing: {Path.GetFileName(pdfPath)}");
	                ProcessPDFFile(pdfPath, textToRemove);
	            }

	            Console.WriteLine("Processing completed!");
	        }
	        catch (Exception ex)
	        {
	            Console.WriteLine($"Error processing directory: {ex.Message}");
	        }
	    }

	    /// <summary>
	    /// Writes a cleaned copy of one PDF to a temporary sibling file and, if any
	    /// text was removed, backs up the original and replaces it with the cleaned copy.
	    /// </summary>
	    /// <param name="pdfPath">Path of the PDF to process.</param>
	    /// <param name="textToRemove">Text to look for.</param>
	    private void ProcessPDFFile(string pdfPath, string textToRemove)
	    {
	        string backupPath = BuildSiblingPath(pdfPath, BackupSuffix);
	        string tempPath = BuildSiblingPath(pdfPath, TempSuffix);

	        try
	        {
	            int totalOccurrences = 0;

	            using (PdfReader reader = new PdfReader(pdfPath))
	            using (PdfWriter writer = new PdfWriter(tempPath))
	            using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
	            {
	                int totalPages = pdfDoc.GetNumberOfPages();

	                for (int pageNum = 1; pageNum <= totalPages; pageNum++)
	                {
	                    PdfPage page = pdfDoc.GetPage(pageNum);
	                    int occurrencesOnPage = RemoveTextFromPageContent(page, textToRemove);
	                    totalOccurrences += occurrencesOnPage;

	                    if (occurrencesOnPage > 0)
	                    {
	                        Console.WriteLine($" Page {pageNum}: Removed {occurrencesOnPage} occurrence(s)");
	                    }
	                }

	                Console.WriteLine($" Total occurrences removed: {totalOccurrences}");
	            }

	            if (totalOccurrences == 0)
	            {
	                // Nothing was removed: keep the original untouched and do not overwrite
	                // a backup from an earlier run. The temp file is deleted in finally.
	                Console.WriteLine($" No changes needed: {Path.GetFileName(pdfPath)}");
	                return;
	            }

	            // Back up the original only now that we know it is about to be replaced.
	            File.Copy(pdfPath, backupPath, true);
	            Console.WriteLine($"Backup created: {Path.GetFileName(backupPath)}");

	            // Single move-with-overwrite instead of Delete + Move, so there is no
	            // moment at which the original is gone and the replacement is not yet in place.
	            File.Move(tempPath, pdfPath, true);
	            Console.WriteLine($" File updated: {Path.GetFileName(pdfPath)}");
	        }
	        catch (Exception ex)
	        {
	            Console.WriteLine($" Error processing {Path.GetFileName(pdfPath)}: {ex.Message}");
	        }
	        finally
	        {
	            DeleteIfPresent(tempPath);
	        }
	    }

	    /// <summary>
	    /// Removes the text blocks containing <paramref name="textToRemove"/> from one page.
	    /// </summary>
	    /// <param name="page">Page to edit.</param>
	    /// <param name="textToRemove">Text to look for.</param>
	    /// <returns>The count reported by the content editor, or 0 when the page text does not
	    /// contain the search text or an error occurred.</returns>
	    private int RemoveTextFromPageContent(PdfPage page, string textToRemove)
	    {
	        try
	        {
	            // Cheap pre-check on the extracted page text before touching the content stream.
	            string pageText = PdfTextExtractor.GetTextFromPage(page);
	            if (!pageText.Contains(textToRemove))
	            {
	                return 0;
	            }

	            var editor = new TextRemovalContentEditor(textToRemove);
	            return editor.EditPage(page);
	        }
	        catch (Exception ex)
	        {
	            Console.WriteLine($" Error processing page content: {ex.Message}");
	            return 0;
	        }
	    }

	    /// <summary>
	    /// Builds "&lt;dir&gt;/&lt;name&gt;&lt;suffix&gt;&lt;ext&gt;" for a file. Only the file name is
	    /// touched, and the extension is kept as-is, so "A.PDF" and directories whose
	    /// names contain ".pdf" are handled correctly.
	    /// </summary>
	    private static string BuildSiblingPath(string pdfPath, string suffix)
	    {
	        string directory = Path.GetDirectoryName(pdfPath) ?? string.Empty;
	        string fileName = Path.GetFileNameWithoutExtension(pdfPath) + suffix + Path.GetExtension(pdfPath);
	        return Path.Combine(directory, fileName);
	    }

	    /// <summary>True when the file name (without extension) ends in one of this tool's suffixes.</summary>
	    private static bool IsGeneratedFile(string path)
	    {
	        string name = Path.GetFileNameWithoutExtension(path);
	        return name.EndsWith(BackupSuffix, StringComparison.OrdinalIgnoreCase)
	            || name.EndsWith(TempSuffix, StringComparison.OrdinalIgnoreCase);
	    }

	    /// <summary>Best-effort removal of a leftover temporary file; failures are only reported.</summary>
	    private static void DeleteIfPresent(string path)
	    {
	        try
	        {
	            if (File.Exists(path))
	            {
	                File.Delete(path);
	            }
	        }
	        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
	        {
	            Console.WriteLine($" Could not delete temporary file {Path.GetFileName(path)}: {ex.Message}");
	        }
	    }
	}

	/// <summary>
	/// Line-based editor for a page's content stream: every <c>BT</c> … <c>ET</c> text
	/// object that shows a string containing the target text is dropped as a whole.
	/// </summary>
	/// <remarks>
	/// Known limits of this approach: <c>BT</c>/<c>ET</c> are only recognised when they
	/// stand alone on a line, and strings are compared as raw single-byte codes, so
	/// text drawn with multi-byte (CID) font encodings will not match.
	/// NOTE(review): operators that set text state inside a removed block are removed
	/// with it — confirm this does not affect the text objects that follow.
	/// </remarks>
	public class TextRemovalContentEditor
	{
	    // Content streams are binary data. Latin-1 maps every byte to exactly one char
	    // and back, so decoding and re-encoding cannot alter bytes (UTF-8 would replace
	    // byte sequences that are not valid UTF-8).
	    private static readonly Encoding StreamEncoding = Encoding.Latin1;

	    private readonly string targetText;
	    private int removedCount;

	    /// <param name="targetText">Text whose containing text objects are removed.</param>
	    public TextRemovalContentEditor(string targetText)
	    {
	        this.targetText = targetText;
	        this.removedCount = 0;
	    }

	    /// <summary>
	    /// Rewrites the page content without the text objects that contain the target text.
	    /// </summary>
	    /// <param name="page">Page whose content is edited in place.</param>
	    /// <returns>Number of text objects removed; 0 when nothing matched, the target
	    /// text is empty, or an error occurred.</returns>
	    public int EditPage(PdfPage page)
	    {
	        removedCount = 0;

	        // An empty target is contained in every string and would delete all text.
	        if (string.IsNullOrEmpty(targetText))
	        {
	            return 0;
	        }

	        try
	        {
	            byte[] contentBytes = page.GetContentBytes();
	            string content = StreamEncoding.GetString(contentBytes);

	            string cleanedContent = RemoveTextOperations(content);

	            // Only replace the page content when a block was actually dropped.
	            if (removedCount > 0)
	            {
	                byte[] newContentBytes = StreamEncoding.GetBytes(cleanedContent);
	                page.Put(PdfName.Contents, new PdfStream(newContentBytes));
	            }

	            return removedCount;
	        }
	        catch (Exception ex)
	        {
	            Console.WriteLine($"Error editing page: {ex.Message}");
	            return 0;
	        }
	    }

	    /// <summary>
	    /// Copies the content line by line, buffering each BT … ET block and leaving it
	    /// out when it contains the target text. Increments <c>removedCount</c> per dropped block.
	    /// </summary>
	    private string RemoveTextOperations(string content)
	    {
	        string[] lines = content.Split('\n');
	        var result = new List<string>(lines.Length);
	        var currentTextBlock = new List<string>();
	        bool inTextObject = false;

	        foreach (string rawLine in lines)
	        {
	            string line = rawLine.Trim();

	            if (line == "BT")
	            {
	                // A BT while a block is still open is malformed input; emit what was
	                // buffered so far unchanged instead of discarding it.
	                result.AddRange(currentTextBlock);
	                currentTextBlock.Clear();

	                inTextObject = true;
	                currentTextBlock.Add(rawLine); // keep original formatting
	            }
	            else if (line == "ET")
	            {
	                currentTextBlock.Add(rawLine);

	                if (ContainsTargetText(currentTextBlock))
	                {
	                    removedCount++;
	                    Console.WriteLine($" Removing text block containing: {targetText}");
	                }
	                else
	                {
	                    result.AddRange(currentTextBlock);
	                }

	                inTextObject = false;
	                currentTextBlock.Clear();
	            }
	            else if (inTextObject)
	            {
	                currentTextBlock.Add(rawLine);
	            }
	            else
	            {
	                // Non-text content is always kept.
	                result.Add(rawLine);
	            }
	        }

	        // A BT that never reached its ET: keep those lines rather than losing them.
	        result.AddRange(currentTextBlock);

	        return string.Join("\n", result);
	    }

	    /// <summary>
	    /// True when any line of the block ends with a text-showing operator
	    /// (Tj, TJ, ' or ") and the strings on that line contain the target text.
	    /// </summary>
	    private bool ContainsTargetText(IEnumerable<string> blockLines)
	    {
	        foreach (string blockLine in blockLines)
	        {
	            string trimmedLine = blockLine.Trim();

	            bool showsText =
	                trimmedLine.EndsWith("Tj", StringComparison.Ordinal) ||
	                trimmedLine.EndsWith("TJ", StringComparison.Ordinal) ||
	                trimmedLine.EndsWith("'", StringComparison.Ordinal) ||
	                trimmedLine.EndsWith("\"", StringComparison.Ordinal);

	            if (showsText && ExtractTextFromLine(trimmedLine).Contains(targetText))
	            {
	                return true;
	            }
	        }

	        return false;
	    }

	    /// <summary>
	    /// Concatenates every string operand on the line — literal "(…)" and hex "&lt;…&gt;",
	    /// whether standalone or inside a TJ array — so that text split into several
	    /// fragments (e.g. "[(Hel) -20 (lo)] TJ") is matched as "Hello".
	    /// </summary>
	    private static string ExtractTextFromLine(string line)
	    {
	        var text = new StringBuilder();
	        int i = 0;

	        while (i < line.Length)
	        {
	            char c = line[i];

	            if (c == '(')
	            {
	                i = ReadLiteralString(line, i, text);
	            }
	            else if (c == '<' && i + 1 < line.Length && line[i + 1] == '<')
	            {
	                i += 2; // "<<" opens a dictionary, not a hex string
	            }
	            else if (c == '<')
	            {
	                int close = line.IndexOf('>', i + 1);
	                if (close < 0)
	                {
	                    break; // unterminated hex string: nothing more to read
	                }

	                text.Append(ConvertHexToText(line.Substring(i + 1, close - i - 1)));
	                i = close + 1;
	            }
	            else
	            {
	                i++; // numbers, operators, array brackets, whitespace
	            }
	        }

	        return text.ToString();
	    }

	    /// <summary>
	    /// Reads one literal string starting at the "(" at <paramref name="openIndex"/>,
	    /// honouring balanced nested parentheses and backslash escapes, and appends the
	    /// decoded characters to <paramref name="output"/>.
	    /// </summary>
	    /// <returns>Index of the first character after the string.</returns>
	    private static int ReadLiteralString(string line, int openIndex, StringBuilder output)
	    {
	        int depth = 1;
	        int i = openIndex + 1;

	        while (i < line.Length && depth > 0)
	        {
	            char c = line[i++];

	            if (c == '\\')
	            {
	                if (i >= line.Length)
	                {
	                    break; // trailing backslash: nothing left on this line to escape
	                }

	                char escaped = line[i++];
	                if (escaped >= '0' && escaped <= '7')
	                {
	                    // Octal escape: one to three digits, reduced to a single byte value.
	                    int value = escaped - '0';
	                    for (int digits = 1; digits < 3 && i < line.Length && line[i] >= '0' && line[i] <= '7'; digits++)
	                    {
	                        value = (value * 8) + (line[i++] - '0');
	                    }

	                    output.Append((char)(value & 0xFF));
	                }
	                else
	                {
	                    output.Append(Unescape(escaped));
	                }
	            }
	            else if (c == '(')
	            {
	                depth++;
	                output.Append(c);
	            }
	            else if (c == ')')
	            {
	                depth--;
	                if (depth > 0)
	                {
	                    output.Append(c);
	                }
	            }
	            else
	            {
	                output.Append(c);
	            }
	        }

	        return i;
	    }

	    /// <summary>Maps the character after a backslash to the character it stands for.</summary>
	    private static char Unescape(char escaped)
	    {
	        switch (escaped)
	        {
	            case 'n': return '\n';
	            case 'r': return '\r';
	            case 't': return '\t';
	            case 'b': return '\b';
	            case 'f': return '\f';
	            default: return escaped; // "\(", "\)", "\\"; for anything else the backslash is dropped
	        }
	    }

	    /// <summary>
	    /// Decodes the inside of a hex string to one char per byte. Whitespace between
	    /// digits is ignored and a missing final digit is treated as 0.
	    /// </summary>
	    /// <returns>The decoded text, or "" when a non-hex character is present.</returns>
	    private static string ConvertHexToText(string hexString)
	    {
	        var digits = new StringBuilder(hexString.Length + 1);
	        foreach (char c in hexString)
	        {
	            if (char.IsWhiteSpace(c))
	            {
	                continue;
	            }

	            if (!IsHexDigit(c))
	            {
	                return "";
	            }

	            digits.Append(c);
	        }

	        if (digits.Length % 2 != 0)
	        {
	            digits.Append('0');
	        }

	        var result = new StringBuilder(digits.Length / 2);
	        for (int i = 0; i < digits.Length; i += 2)
	        {
	            result.Append((char)Convert.ToInt32(digits.ToString(i, 2), 16));
	        }

	        return result.ToString();
	    }

	    private static bool IsHexDigit(char c)
	    {
	        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
	    }
	}
	}

	/*
	.Net 8 Console App

	REQUIRED NUGET PACKAGES:
	To use this program, you need to install the following NuGet packages:

	1. itext (main package)
	Install-Package itext

	2. itext.bouncy-castle-adapter (for encryption support)
	Install-Package itext.bouncy-castle-adapter

	USAGE INSTRUCTIONS:

	1. Create a new Console Application in Visual Studio
	2. Install the required NuGet packages
	3. Replace the default Program.cs with this code
	4. Build the project
	5. Run from command line:
	PDFTextRemover.exe "C:\Path\To\PDFs" "hide01.ir"

	FEATURES:

	- Completely removes text from PDF content streams (not just covering)
	- Searches all PDF files in the specified directory (including subdirectories)
	- Creates backup copies of original files
	- Removes entire text blocks that contain the target string
	- Provides detailed console output showing progress
	- Handles errors gracefully
	- Maintains PDF structure and formatting

	TECHNICAL APPROACH:

	- Parses PDF content streams at the operator level
	- Identifies text showing operators (Tj, TJ, ', ")
	- Removes entire text objects that contain the target text
	- Rebuilds content streams without the unwanted text
	- Uses low-level PDF manipulation for complete text removal

	NOTES:

	- Text is completely removed from the PDF, not just hidden
	- Original files are backed up with "_backup" suffix
	- The program removes entire text blocks containing the target string
	- This approach ensures no traces of the text remain in the PDF
	*/