q25
/**
 * @param {string[]} robotLines The lines of the robots.txt file (ASCII characters
 *     only; trailing newline characters have already been removed).
 * @return {string[]} All disallowed URL patterns that apply to your search engine.
 */
function parseRobotFile(robotLines) {
  const doeBotRules = [];
  const wildcardRules = [];
  // Tracks the agent type of the section currently being processed:
  // 'DoeBot', '*', or 'other' (a section for a different bot).
  let currentSectionAgentType = null;
  const USER_AGENT_PREFIX = "User-agent:";
  const DISALLOW_PREFIX = "Disallow:";

  for (const rawLine of robotLines) {
    // "There can be spaces at the beginning and end of a line... They must be ignored."
    const line = rawLine.trim();
    if (line.startsWith(USER_AGENT_PREFIX)) {
      // "All keywords, search engine names ... are case-sensitive."
      const agentName = line.substring(USER_AGENT_PREFIX.length).trim();
      if (agentName === "DoeBot") {
        currentSectionAgentType = "DoeBot";
      } else if (agentName === "*") {
        currentSectionAgentType = "*";
      } else {
        currentSectionAgentType = "other"; // Section for another bot
      }
    } else if (line.startsWith(DISALLOW_PREFIX)) {
      // "There is always at least one non-space character after the text Disallow:"
      // "A uri pattern can have any type of characters (letters, digits, etc.), except spaces."
      // Trimming handles spaces between "Disallow:" and the actual pattern.
      const path = line.substring(DISALLOW_PREFIX.length).trim();
      // Record the rule only if the current section applies to us. The problem
      // guarantees a non-empty pattern, but the check costs nothing.
      if (path) {
        if (currentSectionAgentType === "DoeBot") {
          doeBotRules.push(path);
        } else if (currentSectionAgentType === "*") {
          wildcardRules.push(path);
        }
      }
    }
    // "All lines not starting with Disallow: or User-agent: must be ignored."
    // Handled implicitly: such lines match neither branch above.
  }

  // DoeBot-specific rules take precedence: if any exist, use them;
  // otherwise fall back to the rules for "*".
  const finalRules = doeBotRules.length > 0 ? doeBotRules : wildcardRules;

  // "Before returning the disallowed url patterns, remove the duplicates and sort them."
  // A Set removes duplicates; the default string sort is case-sensitive, as required.
  const uniqueRules = [...new Set(finalRules)];
  uniqueRules.sort();
  return uniqueRules;
}
// Example test from the problem statement:
const testLines = [
  "User-agent: *",
  "Crawl-delay: 10",
  "Disallow: /administrator/",
  "User-agent: DoeBot",
  "Disallow: /includes/",
  "Disallow: /",
];
console.log(parseRobotFile(testLines)); // Expected: ["/", "/includes/"]
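
// As an extra sanity check (hypothetical input, not from the problem statement):
// when no "User-agent: DoeBot" section is present, the rules for "*" apply,
// and sections for other bots are ignored entirely.
const fallbackLines = [
  "User-agent: *",
  "Disallow: /private/",
  "User-agent: OtherBot",
  "Disallow: /other/",
];
console.log(parseRobotFile(fallbackLines)); // ["/private/"]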
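
// Another hypothetical input (made up for illustration) demonstrating the
// final step: "/b/" appears twice, duplicates are removed, and the result is sorted.
const dedupLines = [
  "User-agent: DoeBot",
  "Disallow: /b/",
  "Disallow: /a/",
  "Disallow: /b/",
];
console.log(parseRobotFile(dedupLines)); // ["/a/", "/b/"]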