q25
/**
 * @param {string[]} robotLines The lines of the robots.txt file (ASCII characters
 *     only; trailing newline characters have already been removed).
 * @return {string[]} All the disallowed url patterns that apply to your search engine.
 */
function parseRobotFile(robotLines) {
  const doeBotRules = [];
  const wildcardRules = [];
  // This variable tracks the agent type for the current section being processed.
  // It can be 'DoeBot', '*', or 'other' (if the section is for a different bot).
  let currentSectionAgentType = null;
  const USER_AGENT_PREFIX = "User-agent:";
  const DISALLOW_PREFIX = "Disallow:";
  for (const rawLine of robotLines) {
    // "There can be spaces at the beginning and end of a line... They must be ignored."
    const line = rawLine.trim();
    if (line.startsWith(USER_AGENT_PREFIX)) {
      // "All keywords, search engine names ... are case-sensitive."
      const agentName = line.substring(USER_AGENT_PREFIX.length).trim();
      if (agentName === "DoeBot") {
        currentSectionAgentType = "DoeBot";
      } else if (agentName === "*") {
        currentSectionAgentType = "*";
      } else {
        currentSectionAgentType = "other"; // Section for another bot
      }
    } else if (line.startsWith(DISALLOW_PREFIX)) {
      // "There is always at least one non-space character after the text Disallow:"
      // "A uri pattern can have any type of characters (letters, digits, etc.), except spaces."
      // Trimming here handles spaces between "Disallow:" and the actual pattern.
      const path = line.substring(DISALLOW_PREFIX.length).trim();
      // Add the rule only if it belongs to an applicable agent section and the
      // path is non-empty (the problem statement guarantees it will not be).
      if (path) {
        if (currentSectionAgentType === "DoeBot") {
          doeBotRules.push(path);
        } else if (currentSectionAgentType === "*") {
          wildcardRules.push(path);
        }
      }
    }
    // "All lines not starting with Disallow: or User-agent: must be ignored."
    // This is implicitly handled: other lines match neither condition above.
  }
  let finalRules;
  // DoeBot-specific rules take precedence.
  // If there are any rules specifically for "DoeBot", use them.
  // Otherwise, use the rules for "*".
  if (doeBotRules.length > 0) {
    finalRules = doeBotRules;
  } else {
    finalRules = wildcardRules;
  }
  // "Before returning the disallowed url patterns, remove the duplicates and sort them."
  // A Set removes the duplicates.
  const uniqueRules = [...new Set(finalRules)];
  // Sorting alphabetically (the default string sort is case-sensitive, as required).
  uniqueRules.sort();
  return uniqueRules;
}

// Example test from the problem statement:
// const testLines = ["User-agent: *", "Crawl-delay: 10", "Disallow: /administrator/", "User-agent: DoeBot", "Disallow: /includes/", "Disallow: /"];
// console.log(parseRobotFile(testLines)); // Expected: ["/", "/includes/"]
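
// A couple of extra checks (hypothetical inputs, not part of the original
// problem statement) exercising the fallback and the dedup/sort behaviour:
//
// No DoeBot section present, so the wildcard rules apply, with the duplicate
// "/tmp/" removed:
// console.log(parseRobotFile(["User-agent: *", "Disallow: /tmp/", "Disallow: /tmp/"]));
// => ["/tmp/"]
//
// A section for an unrelated bot contributes nothing:
// console.log(parseRobotFile(["User-agent: OtherBot", "Disallow: /secret/"]));
// => []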