Created
August 21, 2019 06:33
-
-
Save bmwcmw/23679aa93ad2ff1aa44605caec235ca2 to your computer and use it in GitHub Desktop.
Find relative tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is the principal idea for the question and some thoughts that I didn't have time to explain. | |
// The simple version - O(CNM) where C number of queries, N number of log lines, M avg length of log line ==> O(NM) ? | |
static Set<String> findRelativeTags(Iterator<String> logStream, String[] queries) { | |
if (queries == null || queries.length == 0) { | |
return Collections.emptySet(); | |
} | |
Set<String> relativeTags = new HashSet<>(); | |
while (logStream.hasNext()) { | |
Set<String> logLineTagSet = splitTagSet(logStream.next()); | |
int matchingCount = 0; | |
for (String query: queries) { | |
if (logLineTagSet.contains(query)) { | |
matchingCount++; | |
} | |
} | |
// If current's log line's tags matched all tags in query | |
if (matchingCount == queries.length) { | |
relativeTags.addAll(logLineTagSet); | |
} | |
} | |
// Remove query tags one time at the end to avoid repetition | |
for (String query: queries) { | |
relativeTags.remove(query); | |
} | |
return relativeTags; | |
} | |
/** | |
* Parse log line's tag section into a set | |
* O(M length of logLine) | |
*/ | |
static Set<String> splitTagSet(String logLine) { | |
if (logLine == null || logLine.isEmpty()) { | |
return Collections.emptySet(); | |
} | |
String[] split = logLine.split("\\|"); // can be optimized using char array index | |
String tagStr = split[split.length - 1]; | |
String[] tags = tagStr.split(","); | |
return new HashSet<>(Arrays.asList(tags)); | |
} | |
// I felt strange to see the API findRelativeTags(log, queries), why the logs are read every time when we query ? | |
// Then I thought it would be more logical to have preprocessing & indexing phrase for tags. | |
// I began talking about some kind of Map-based preprocessing & indexing and we ran out of time. But it seems to be out of scope ? | |
// In the real life, one way could be that we can construct indexation while the system receives every inserted log line, | |
// for example build multiple indexes on all potential searching terms, | |
// then process queries using that pre-built indexation (and maybe update the indexation by some given logic later). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment