|
import React, { useState, useEffect } from 'react'; |
|
import { Clipboard, Info, BarChart } from 'lucide-react'; |
|
import _ from 'lodash'; |
|
|
|
// Main component for the Code Sanitizer Tool |
|
const CodeSanitizer = () => { |
|
// --- Component state -------------------------------------------------------
// Raw text typed/pasted by the user (left text area).
const [inputCode, setInputCode] = useState('');
// Sanitized text produced by processCode (right, read-only text area).
const [cleanedCode, setCleanedCode] = useState('');
// Set to true while processCode runs and back to false when it finishes.
const [processing, setProcessing] = useState(false);
// Drives the temporary "Copied!" label on the copy button.
const [copySuccess, setCopySuccess] = useState(false);
// Whether the "Detected Issues" panel is expanded.
const [showDetails, setShowDetails] = useState(false);
// Human-readable messages, one per issue category found in the input.
const [detectedIssues, setDetectedIssues] = useState([]);
// Statistics object built by processCode, or null when there is no input.
const [codeStats, setCodeStats] = useState(null);
// Whether the "Code Analysis Summary" panel is expanded.
const [showCodeAnalysis, setShowCodeAnalysis] = useState(false);
|
|
|
// Re-run the sanitizer whenever the input changes; an empty input resets
// every derived piece of state instead of being processed.
useEffect(() => {
  if (!inputCode) {
    setCleanedCode('');
    setDetectedIssues([]);
    setCodeStats(null);
    return;
  }

  processCode(inputCode);
}, [inputCode]);
|
|
|
/**
 * Sanitize `code` and publish the result through the component's state setters.
 *
 * Steps, in order:
 *   1. remove zero-width characters,
 *   2. turn non-standard whitespace into plain spaces,
 *   3. replace homoglyphs (full-width ASCII forms plus a few quote/bracket
 *      look-alikes) with their ASCII equivalents,
 *   4. straighten smart quotes,
 *   5. normalize CRLF / lone CR line endings to LF,
 *   6. remove control characters (TAB, LF and CR are not in the removed set).
 *
 * Calls setCleanedCode, setDetectedIssues and setCodeStats with the outcome and
 * brackets the work with setProcessing(true) / setProcessing(false).
 *
 * @param {string} code - Raw text to sanitize.
 * @returns {void}
 */
const processCode = (code) => {
  setProcessing(true);

  try {
    // One human-readable message per category, plus per-category details.
    const issues = [];
    const issueDetails = {};

    // Step 1: Detect and remove zero-width characters
    const zeroWidthPattern = /[\u200B-\u200D\uFEFF\u2060]/g;
    const zeroWidthMatches = code.match(zeroWidthPattern) || [];
    if (zeroWidthMatches.length > 0) {
      issues.push('Zero-width characters detected and removed');
      issueDetails['zeroWidth'] = {
        count: zeroWidthMatches.length,
        description: 'Invisible characters that can hide malicious code'
      };
    }
    let processed = code.replace(zeroWidthPattern, '');

    // Step 2: Replace non-standard whitespace with a regular space
    const nonStandardWhitespace = /[\u00A0\u2000-\u200A\u202F\u205F\u3000]/g;
    const whitespaceMatches = processed.match(nonStandardWhitespace) || [];
    if (whitespaceMatches.length > 0) {
      issues.push('Non-standard whitespace characters normalized');
      issueDetails['whitespace'] = {
        count: whitespaceMatches.length,
        description: 'Unusual whitespace characters that look like spaces but have different code points'
      };
    }
    processed = processed.replace(nonStandardWhitespace, ' ');

    // Step 3: Replace homoglyphs (characters that look similar but are different).
    // Every key is written as a \u escape on purpose: if the look-alike
    // characters themselves appear in the source, a Unicode-normalizing tool
    // can silently collapse them into their ASCII targets and turn the table
    // into an ASCII -> ASCII identity map.
    const punctuationHomoglyphs = {
      '\u300E': '"', // left white corner bracket
      '\u300F': '"', // right white corner bracket
      '\u00AB': '"', // left-pointing double angle quotation mark
      '\u00BB': '"', // right-pointing double angle quotation mark
      '\u2032': "'", // prime
      '\u2033': '"', // double prime
      '\u2039': '<', // single left-pointing angle quotation mark
      '\u203A': '>' // single right-pointing angle quotation mark
    };
    // Full-width forms U+FF01..U+FF5E mirror ASCII U+0021..U+007E at a fixed
    // distance, so a single subtraction covers letters, digits and punctuation.
    const fullWidthOffset = 0xFEE0;
    const homoglyphPattern = /[\uFF01-\uFF5E\u300E\u300F\u00AB\u00BB\u2032\u2033\u2039\u203A]/g;

    let homoglyphCount = 0;
    const homoglyphInstances = {};

    // Single pass with a character class: nothing is interpolated into a
    // RegExp, so characters that are regex metacharacters cannot break it.
    processed = processed.replace(homoglyphPattern, (char) => {
      const replacement = Object.prototype.hasOwnProperty.call(punctuationHomoglyphs, char)
        ? punctuationHomoglyphs[char]
        : String.fromCharCode(char.charCodeAt(0) - fullWidthOffset);

      homoglyphCount += 1;
      if (!homoglyphInstances[char]) {
        homoglyphInstances[char] = { replacement: replacement, count: 0 };
      }
      homoglyphInstances[char].count += 1;

      return replacement;
    });

    if (homoglyphCount > 0) {
      issues.push('Homoglyph characters replaced with standard ASCII equivalents');
      issueDetails['homoglyphs'] = {
        count: homoglyphCount,
        instances: homoglyphInstances,
        description: 'Characters that visually resemble standard ASCII but use different Unicode code points'
      };
    }

    // Step 4: Normalize quotation marks
    // (U+2032 / U+2033 were already replaced in step 3, so they never match here.)
    const smartQuotes = /[\u2018\u2019\u201C\u201D\u2032\u2033\u2035\u2036]/g;
    const smartQuotesMatches = processed.match(smartQuotes) || [];
    if (smartQuotesMatches.length > 0) {
      issues.push('Smart quotes normalized to standard quotes');
      issueDetails['smartQuotes'] = {
        count: smartQuotesMatches.length,
        description: 'Curly or smart quotes that can cause parsing errors in code'
      };
    }
    processed = processed
      .replace(/[\u2018\u2019\u2032\u2035]/g, "'")
      .replace(/[\u201C\u201D\u2033\u2036]/g, '"');

    // Step 5: Normalize line endings to LF
    const crlfMatches = processed.match(/\r\n/g) || [];
    const crMatches = processed.match(/\r(?!\n)/g) || [];
    const totalLineEndingIssues = crlfMatches.length + crMatches.length;
    if (totalLineEndingIssues > 0) {
      issues.push('Mixed line endings normalized to LF (\\n)');
      issueDetails['lineEndings'] = {
        count: totalLineEndingIssues,
        crlfCount: crlfMatches.length,
        crCount: crMatches.length,
        description: 'Inconsistent line endings that can cause issues in version control and parsing'
      };
    }
    processed = processed.replace(/\r\n?/g, '\n');

    // Step 6: Remove control characters
    const controlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g;
    const controlMatches = processed.match(controlChars) || [];
    if (controlMatches.length > 0) {
      issues.push('Control characters detected and removed');
      issueDetails['controlChars'] = {
        count: controlMatches.length,
        description: 'Non-printable control characters that can affect execution or be used for obfuscation'
      };
      processed = processed.replace(controlChars, '');
    }

    // analyzeCode returns null for an empty string (e.g. the input consisted
    // only of removed characters). Fall back to zeroed statistics so the
    // analysis panel can still read fields such as `indentation.tabs`.
    const emptyStats = {
      lineCount: 0,
      charactersTotal: 0,
      charactersAlpha: 0,
      charactersDigit: 0,
      charactersSpace: 0,
      charactersSymbol: 0,
      likelyLanguage: 'Unknown',
      indentation: { tabs: 0, spaces: 0, mostCommonIndentSize: 0 }
    };
    const stats = analyzeCode(processed) || emptyStats;

    // Update state with processed code and detected issues
    setCleanedCode(processed);
    setDetectedIssues(issues);
    setCodeStats({
      ...stats,
      issueDetails: issueDetails,
      totalIssuesFixed: Object.values(issueDetails).reduce((sum, detail) => sum + detail.count, 0),
      issueCategories: issues.length,
      originalLength: code.length,
      cleanedLength: processed.length,
      charactersRemoved: code.length - processed.length
    });
  } finally {
    // Always clear the flag, even if a step above throws.
    setProcessing(false);
  }
};
|
|
|
/**
 * Gather simple statistics about already-sanitized code.
 *
 * @param {string} code - Text to analyze.
 * @returns {null | {
 *   lineCount: number,
 *   charactersTotal: number,
 *   charactersAlpha: number,
 *   charactersDigit: number,
 *   charactersSpace: number,
 *   charactersSymbol: number,
 *   likelyLanguage: string,
 *   indentation: { tabs: number, spaces: number, mostCommonIndentSize: number }
 * }} `null` when `code` is empty, otherwise the statistics object.
 */
const analyzeCode = (code) => {
  // Skip if no code
  if (!code) return null;

  // Line count
  const lineCount = code.split('\n').length;

  // Character counts; "symbols" are whatever is not a letter, digit or whitespace
  const countMatches = (pattern) => (code.match(pattern) || []).length;
  const alphaCount = countMatches(/[a-zA-Z]/g);
  const digitCount = countMatches(/\d/g);
  const spaceCount = countMatches(/\s/g);
  const symbolCount = code.length - alphaCount - digitCount - spaceCount;

  // Keyword/pattern heuristics per language. Order matters: on a tie the
  // earlier entry wins because only a strictly higher count replaces it.
  const languagePatterns = [
    {
      name: 'JavaScript',
      regex: /function\s|\bconst\b|\blet\b|\bvar\b|=>|\bimport\b|\brequire\b|\bexport\b/g
    },
    {
      name: 'Python',
      // `m` so that `:\s*$` matches a colon at the end of any line, not only
      // at the very end of the input.
      regex: /\bdef\b|\bimport\b|\bif\s+__name__\s*==\s*('|")__main__\1:|\bclass\b\s+\w+\s*:|:\s*$/gm
    },
    {
      name: 'HTML',
      // Count each tag separately; a greedy, non-global pattern would report
      // at most one match for the whole document.
      regex: /<\/?[a-z][^<>]*>/gi
    },
    {
      name: 'CSS',
      regex: /[.#][\w-]+\s*\{|@media|@import|[\w-]+\s*:\s*[\w-]+/g
    },
    {
      name: 'SQL',
      regex: /\bSELECT\b|\bFROM\b|\bWHERE\b|\bJOIN\b|\bGROUP BY\b|\bORDER BY\b/gi
    }
  ];

  // Pick the language with the most pattern hits
  let likelyLanguage = 'Unknown';
  let maxCount = 0;
  languagePatterns.forEach(({ name, regex }) => {
    const hits = countMatches(regex);
    if (hits > maxCount) {
      maxCount = hits;
      likelyLanguage = name;
    }
  });

  // Leading indentation of each non-blank line. Only spaces and tabs count,
  // and the lookahead requires real content after them, so empty or
  // whitespace-only lines are not mistaken for indentation.
  const indents = code.match(/^[ \t]+(?=\S)/gm) || [];
  const tabCount = indents.filter((indent) => indent.includes('\t')).length;
  const spaceIndentCount = indents.length - tabCount;

  // Most frequent leading-space width among space-only indents. This is the
  // most common raw width, not a derived indent unit.
  const indentSizes = {};
  let mostCommonIndentSize = 0;
  let mostCommonIndentCount = 0;
  indents.forEach((indent) => {
    if (indent.includes('\t')) return;

    const size = indent.length;
    indentSizes[size] = (indentSizes[size] || 0) + 1;
    if (indentSizes[size] > mostCommonIndentCount) {
      mostCommonIndentCount = indentSizes[size];
      mostCommonIndentSize = size;
    }
  });

  return {
    lineCount,
    charactersTotal: code.length,
    charactersAlpha: alphaCount,
    charactersDigit: digitCount,
    charactersSpace: spaceCount,
    charactersSymbol: symbolCount,
    likelyLanguage,
    indentation: {
      tabs: tabCount,
      spaces: spaceIndentCount,
      mostCommonIndentSize
    }
  };
};
|
|
|
/**
 * Copy the sanitized code to the system clipboard and flash the "Copied!"
 * label for two seconds.
 *
 * The Clipboard API can be missing (for example outside a secure context) and
 * writeText can reject (for example when permission is denied); both cases are
 * logged instead of throwing or leaving an unhandled rejection.
 *
 * @returns {void}
 */
const handleCopy = () => {
  const clipboard = typeof navigator !== 'undefined' ? navigator.clipboard : undefined;

  if (!clipboard || typeof clipboard.writeText !== 'function') {
    console.error('Clipboard API is not available in this context');
    return;
  }

  clipboard
    .writeText(cleanedCode)
    .then(() => {
      setCopySuccess(true);
      setTimeout(() => setCopySuccess(false), 2000);
    })
    .catch((err) => {
      console.error('Failed to copy sanitized code to clipboard:', err);
    });
};
|
|
|
/**
 * Expand or collapse the "Detected Issues" panel.
 *
 * Uses a functional state update so the toggle is computed from the latest
 * state rather than from the value captured when this render happened.
 *
 * @returns {void}
 */
const toggleDetails = () => {
  setShowDetails((isShown) => !isShown);
};
|
|
|
/**
 * Expand or collapse the "Code Analysis Summary" panel.
 *
 * Uses a functional state update so the toggle is computed from the latest
 * state rather than from the value captured when this render happened.
 *
 * @returns {void}
 */
const toggleCodeAnalysis = () => {
  setShowCodeAnalysis((isShown) => !isShown);
};
|
|
|
return ( |
|
<div className="flex flex-col w-full max-w-6xl mx-auto p-4 space-y-6"> |
|
<div className="text-center"> |
|
<h1 className="text-2xl font-bold mb-2">Code Sanitizer Tool</h1> |
|
<p className="text-gray-600">Detects and removes invisible characters, homoglyphs, and other potential issues in code</p> |
|
</div> |
|
|
|
<div className="grid grid-cols-1 md:grid-cols-2 gap-4"> |
|
<div className="flex flex-col"> |
|
<label className="font-medium mb-2">Paste Code with Potential Issues:</label> |
|
<textarea |
|
className="w-full h-64 p-2 border border-gray-300 rounded font-mono text-sm resize-none" |
|
value={inputCode} |
|
onChange={(e) => setInputCode(e.target.value)} |
|
placeholder="Paste your code here..." |
|
/> |
|
</div> |
|
|
|
<div className="flex flex-col"> |
|
<div className="flex justify-between items-center mb-2"> |
|
<label className="font-medium">Sanitized Code:</label> |
|
<button |
|
onClick={handleCopy} |
|
disabled={!cleanedCode} |
|
className="flex items-center space-x-1 px-3 py-1 bg-blue-600 text-white rounded hover:bg-blue-700 disabled:bg-gray-400" |
|
> |
|
<Clipboard size={16} /> |
|
<span>{copySuccess ? 'Copied!' : 'Copy'}</span> |
|
</button> |
|
</div> |
|
<textarea |
|
className="w-full h-64 p-2 border border-gray-300 rounded font-mono text-sm resize-none bg-gray-50" |
|
value={cleanedCode} |
|
readOnly |
|
placeholder="Sanitized code will appear here..." |
|
/> |
|
</div> |
|
</div> |
|
|
|
{detectedIssues.length > 0 && ( |
|
<div className="mt-4"> |
|
<button |
|
onClick={toggleDetails} |
|
className="flex items-center text-blue-600 hover:text-blue-800 font-medium" |
|
> |
|
{showDetails ? 'Hide' : 'Show'} Details |
|
({detectedIssues.length} issue{detectedIssues.length !== 1 ? 's' : ''} detected) |
|
</button> |
|
|
|
{showDetails && ( |
|
<div className="mt-2 p-3 bg-yellow-50 border border-yellow-200 rounded"> |
|
<h3 className="font-medium mb-2">Detected Issues:</h3> |
|
<ul className="list-disc pl-5 space-y-1"> |
|
{detectedIssues.map((issue, index) => ( |
|
<li key={index} className="text-sm">{issue}</li> |
|
))} |
|
</ul> |
|
<div className="mt-3 text-sm text-gray-600"> |
|
<p>This tool has removed or normalized potentially problematic characters to create clean, consistent code.</p> |
|
</div> |
|
</div> |
|
)} |
|
</div> |
|
)} |
|
|
|
{codeStats && inputCode && ( |
|
<div className="mt-4"> |
|
<button |
|
onClick={toggleCodeAnalysis} |
|
className="flex items-center text-blue-600 hover:text-blue-800 font-medium" |
|
> |
|
<BarChart size={16} className="mr-1" /> |
|
{showCodeAnalysis ? 'Hide' : 'Show'} Code Analysis Summary |
|
</button> |
|
|
|
{showCodeAnalysis && ( |
|
<div className="mt-2 p-4 bg-blue-50 border border-blue-200 rounded"> |
|
<h3 className="font-medium text-lg mb-3">Code Analysis Summary</h3> |
|
|
|
<div className="grid grid-cols-1 md:grid-cols-2 gap-4"> |
|
<div> |
|
<h4 className="font-medium mb-2">Sanitization Results</h4> |
|
<ul className="text-sm space-y-1"> |
|
<li> |
|
<span className="font-medium">Issues Fixed:</span> {codeStats.totalIssuesFixed} problematic characters in {codeStats.issueCategories} categories |
|
</li> |
|
<li> |
|
<span className="font-medium">Characters Removed:</span> {codeStats.charactersRemoved} ({((codeStats.charactersRemoved / codeStats.originalLength) * 100).toFixed(2)}% of original) |
|
</li> |
|
{Object.entries(codeStats.issueDetails).map(([key, detail]) => ( |
|
<li key={key} className="pl-2 text-gray-600"> |
|
• {detail.count} {detail.description} |
|
</li> |
|
))} |
|
</ul> |
|
</div> |
|
|
|
<div> |
|
<h4 className="font-medium mb-2">Code Statistics</h4> |
|
<ul className="text-sm space-y-1"> |
|
<li> |
|
<span className="font-medium">Detected Language:</span> {codeStats.likelyLanguage} |
|
</li> |
|
<li> |
|
<span className="font-medium">Line Count:</span> {codeStats.lineCount} lines |
|
</li> |
|
<li> |
|
<span className="font-medium">Character Composition:</span> |
|
<ul className="pl-4 pt-1"> |
|
<li> |
|
<div className="flex items-center"> |
|
<div className="w-24">Alphabetic:</div> |
|
<div className="w-16">{codeStats.charactersAlpha}</div> |
|
<div className="w-20 bg-gray-200 h-3 rounded overflow-hidden"> |
|
<div |
|
className="bg-blue-600 h-full" |
|
style={{width: `${(codeStats.charactersAlpha / codeStats.charactersTotal) * 100}%`}} |
|
></div> |
|
</div> |
|
<div className="ml-2 text-xs"> |
|
{((codeStats.charactersAlpha / codeStats.charactersTotal) * 100).toFixed(1)}% |
|
</div> |
|
</div> |
|
</li> |
|
<li> |
|
<div className="flex items-center"> |
|
<div className="w-24">Numeric:</div> |
|
<div className="w-16">{codeStats.charactersDigit}</div> |
|
<div className="w-20 bg-gray-200 h-3 rounded overflow-hidden"> |
|
<div |
|
className="bg-green-600 h-full" |
|
style={{width: `${(codeStats.charactersDigit / codeStats.charactersTotal) * 100}%`}} |
|
></div> |
|
</div> |
|
<div className="ml-2 text-xs"> |
|
{((codeStats.charactersDigit / codeStats.charactersTotal) * 100).toFixed(1)}% |
|
</div> |
|
</div> |
|
</li> |
|
<li> |
|
<div className="flex items-center"> |
|
<div className="w-24">Whitespace:</div> |
|
<div className="w-16">{codeStats.charactersSpace}</div> |
|
<div className="w-20 bg-gray-200 h-3 rounded overflow-hidden"> |
|
<div |
|
className="bg-yellow-500 h-full" |
|
style={{width: `${(codeStats.charactersSpace / codeStats.charactersTotal) * 100}%`}} |
|
></div> |
|
</div> |
|
<div className="ml-2 text-xs"> |
|
{((codeStats.charactersSpace / codeStats.charactersTotal) * 100).toFixed(1)}% |
|
</div> |
|
</div> |
|
</li> |
|
<li> |
|
<div className="flex items-center"> |
|
<div className="w-24">Symbols:</div> |
|
<div className="w-16">{codeStats.charactersSymbol}</div> |
|
<div className="w-20 bg-gray-200 h-3 rounded overflow-hidden"> |
|
<div |
|
className="bg-purple-600 h-full" |
|
style={{width: `${(codeStats.charactersSymbol / codeStats.charactersTotal) * 100}%`}} |
|
></div> |
|
</div> |
|
<div className="ml-2 text-xs"> |
|
{((codeStats.charactersSymbol / codeStats.charactersTotal) * 100).toFixed(1)}% |
|
</div> |
|
</div> |
|
</li> |
|
</ul> |
|
</li> |
|
<li> |
|
<span className="font-medium">Indentation Style:</span> {codeStats.indentation.tabs > codeStats.indentation.spaces ? 'Tabs' : 'Spaces'} |
|
{codeStats.indentation.spaces > codeStats.indentation.tabs && codeStats.indentation.mostCommonIndentSize > 0 && ( |
|
<span> ({codeStats.indentation.mostCommonIndentSize} spaces)</span> |
|
)} |
|
</li> |
|
</ul> |
|
</div> |
|
</div> |
|
</div> |
|
)} |
|
</div> |
|
)} |
|
|
|
<div className="mt-4 p-4 bg-blue-50 border border-blue-200 rounded"> |
|
<h2 className="font-medium mb-2">How to Use:</h2> |
|
<ol className="list-decimal pl-5 space-y-1 text-sm"> |
|
<li>Copy code that might contain hidden or problematic characters</li> |
|
<li>Paste it into the left text area</li> |
|
<li>The tool will automatically detect and clean the code</li> |
|
<li>Review any detected issues in the details section</li> |
|
<li>View the code analysis summary for additional insights</li> |
|
<li>Copy the sanitized code from the right text area</li> |
|
</ol> |
|
</div> |
|
</div> |
|
); |
|
}; |
|
|
|
export default CodeSanitizer; |