Created
April 3, 2020 04:29
-
-
Save vishnuvyas/98ddb55dfa5e6fd3e8ac8209ec1ff5b3 to your computer and use it in GitHub Desktop.
Way to extract the context around a particular hit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.*; | |
import java.util.stream.*; | |
/**
 * Extracts a window of whitespace-delimited tokens around each tagged term ("hit") in a text.
 *
 * <p>The text is split into non-blank lines. For every hit found on a line, up to {@code left}
 * tokens preceding it and up to {@code right} tokens following it are collected. When the hit's
 * own line does not supply enough tokens, the window is extended into the neighbouring non-blank
 * lines, one line at a time, until it is full or the text runs out.
 */
class ContextExtractor {

    /** A tagged term together with its character offsets in the line it was found on. */
    public static class TagResult {
        /** The term that matched (as spelled in the term list, i.e. lower case). */
        public String token;
        /** Offset of the first matched character (inclusive). */
        public int start;
        /** Offset just past the last matched character (exclusive). */
        public int end;

        /**
         * @param t the matched term
         * @param s start offset, inclusive
         * @param e end offset, exclusive
         */
        public TagResult(String t, int s, int e) {
            this.token = t;
            this.start = s;
            this.end = e;
        }

        /** Renders as {@code token : {start,end}}. */
        @Override
        public String toString() {
            StringBuilder b = new StringBuilder();
            b.append(token);
            b.append(" : ");
            b.append("{");
            b.append(start);
            b.append(",");
            b.append(end);
            b.append("}");
            return b.toString();
        }
    }

    /** Sample note used by {@link #main(String[])}. */
    public static String text =
            "Patient has diabetes mellitus and shows no signs of stopping candy\n" + "\n"
            + "Family History\n"
            + "CVA and CHF"
            + "\n"
            + "Signed by Dr Strange";

    /** Terms recognised by {@link #fakeTagger(String)}; all lower case. */
    private static final List<String> TERMS = List.of("diabetes mellitus", "cva");

    /**
     * Stand-in tagger: finds every non-overlapping, case-insensitive occurrence of a known term.
     *
     * <p>Hits are reported left to right. At each step the term occurring earliest at or after
     * the end of the previous hit wins, so the result does not depend on the order of the term
     * list. Offsets always refer to {@code line} itself.
     *
     * @param line the line to scan; {@code null} is treated as empty
     * @return the hits in order of appearance, possibly empty
     */
    public List<TagResult> fakeTagger(String line) {
        List<TagResult> taggedItems = new ArrayList<>();
        if (line == null) {
            return taggedItems;
        }
        int searchFrom = 0;
        while (searchFrom < line.length()) {
            String bestTerm = null;
            int bestStart = -1;
            for (String term : TERMS) {
                int idx = indexOfIgnoreCase(line, term, searchFrom);
                if (idx >= 0 && (bestStart < 0 || idx < bestStart)) {
                    bestTerm = term;
                    bestStart = idx;
                }
            }
            if (bestTerm == null) {
                // No term occurs in the rest of the line.
                break;
            }
            int end = bestStart + bestTerm.length();
            taggedItems.add(new TagResult(bestTerm, bestStart, end));
            searchFrom = end;
        }
        return taggedItems;
    }

    /**
     * Case-insensitive {@code indexOf} that matches directly on the original string, so the
     * returned offset is always valid for {@code haystack} (lower-casing a copy can change its
     * length for some characters and would also depend on the default locale).
     */
    private static int indexOfIgnoreCase(String haystack, String needle, int from) {
        int lastCandidate = haystack.length() - needle.length();
        for (int i = Math.max(from, 0); i <= lastCandidate; i++) {
            if (haystack.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** The tokens before and after a hit, plus the hit itself. */
    public static class Context {
        /** Tokens preceding the hit, in reading order. */
        public List<String> pre;
        /** Tokens following the hit, in reading order. */
        public List<String> post;
        /** The hit. */
        public String token;

        /**
         * @param pr tokens preceding the hit
         * @param po tokens following the hit
         * @param t the hit
         */
        public Context(List<String> pr, List<String> po, String t) {
            this.pre = pr;
            this.post = po;
            this.token = t;
        }

        /** Renders as {@code [ a, b, ] -- [Tok: hit ] -- [ c, d, ]}. */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("[ ");
            for (String preTok : pre) {
                builder.append(preTok);
                builder.append(", ");
            }
            builder.append("] -- ");
            builder.append("[Tok: " + token + " ] -- ");
            builder.append("[ ");
            for (String tok : post) {
                builder.append(tok);
                builder.append(", ");
            }
            builder.append("]");
            return builder.toString();
        }
    }

    /**
     * Splits a line on runs of whitespace.
     *
     * @param line the text to split; {@code null} is treated as empty
     * @return a new mutable list of the non-empty tokens; empty when the input is blank
     */
    public List<String> tokenize(String line) {
        List<String> tokens = new ArrayList<>();
        if (line == null) {
            return tokens;
        }
        // String.split keeps a leading empty string when the input starts with a delimiter,
        // and returns [""] for an empty input; trimming first and guarding avoids both.
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return tokens;
        }
        tokens.addAll(Arrays.asList(trimmed.split("\\s+")));
        return tokens;
    }

    /**
     * Builds one side of a context window.
     *
     * <p>Starts from the tokens of {@code currentContext} (the part of the hit's line before or
     * after the hit) and, if fewer than {@code size} tokens are available, keeps pulling tokens
     * from successive neighbouring lines in the direction given by {@code step}. The tokens
     * closest to the hit are always the ones kept, and the result is in reading order.
     *
     * @param lines all non-blank lines of the text
     * @param currentContext the text on the hit's own line on the relevant side of the hit
     * @param lineNum index in {@code lines} of the hit's line
     * @param size maximum number of tokens to return; values {@code <= 0} yield an empty list
     * @param step direction to extend in: negative walks towards the start of the text
     *     (pre-context), positive towards the end (post-context). Only the sign is used; with
     *     {@code 0} no neighbouring line is consulted.
     * @return a new mutable list of at most {@code size} tokens
     */
    public List<String> buildContext(List<String> lines,
                                     String currentContext,
                                     int lineNum,
                                     int size,
                                     int step) {
        if (size <= 0) {
            return new ArrayList<>();
        }
        boolean backward = step < 0;
        List<String> contextTokens = tokenize(currentContext);

        // The hit's own line already fills (or overfills) the window: keep the tokens nearest
        // to the hit, i.e. the tail for a pre-context and the head for a post-context.
        if (contextTokens.size() >= size) {
            int total = contextTokens.size();
            List<String> nearest = backward
                    ? contextTokens.subList(total - size, total)
                    : contextTokens.subList(0, size);
            return new ArrayList<>(nearest);
        }

        int direction = Integer.signum(step);
        if (direction == 0) {
            return contextTokens;
        }

        int contextSizeRemaining = size - contextTokens.size();
        for (int current = lineNum + direction;
                contextSizeRemaining > 0 && current >= 0 && current < lines.size();
                current += direction) {
            List<String> extraTokens = tokenize(lines.get(current));
            int nToks = Math.min(extraTokens.size(), contextSizeRemaining);
            if (backward) {
                // Take the end of the previous line and put it in front of what we have.
                contextTokens.addAll(0,
                        extraTokens.subList(extraTokens.size() - nToks, extraTokens.size()));
            } else {
                // Take the start of the next line and append it.
                contextTokens.addAll(extraTokens.subList(0, nToks));
            }
            contextSizeRemaining -= nToks;
        }
        return contextTokens;
    }

    /**
     * Finds every hit in {@code text} and returns its surrounding context.
     *
     * @param text the full text, lines separated by {@code \n}; {@code null} is treated as empty
     * @param left maximum number of tokens to collect before each hit
     * @param right maximum number of tokens to collect after each hit
     * @return one {@link Context} per hit, in order of appearance
     */
    public List<Context> getContexts(String text, int left, int right) {
        List<Context> contexts = new ArrayList<>();
        if (text == null) {
            return contexts;
        }
        // Blank lines carry no tokens, so drop them up front; line indices below refer to
        // this filtered list.
        List<String> lines = new ArrayList<>();
        for (String line : text.split("\n")) {
            if (line.trim().length() > 0) {
                lines.add(line);
            }
        }
        for (int lineNum = 0; lineNum < lines.size(); ++lineNum) {
            String currentLine = lines.get(lineNum);
            for (TagResult tagResult : fakeTagger(currentLine)) {
                List<String> preContext = buildContext(lines,
                        currentLine.substring(0, tagResult.start), lineNum, left, -1);
                List<String> postContext = buildContext(lines,
                        currentLine.substring(tagResult.end), lineNum, right, +1);
                contexts.add(new Context(preContext, postContext, tagResult.token));
            }
        }
        return contexts;
    }

    /** Prints the contexts (5 tokens each side) of every hit in the sample {@link #text}. */
    public static void main(String[] args) {
        ContextExtractor m = new ContextExtractor();
        m.getContexts(text, 5, 5).forEach(System.out::println);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment