Created
October 27, 2016 09:40
-
-
Save lukaszkalnik/d60471385877e7684c402ec5400dd94c to your computer and use it in GitHub Desktop.
Markdown - linearizing nested asterisks (e.g. **Bold *bold italic*** => **Bold** ***bold italic***)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String preprocessNestedAsterisks(String markdownText) { | |
// Here we treat double/single asterisks (emphasis) | |
// because nested emphasis (**for example *like* this**) | |
// is not interpreted correctly by the Bypass lib | |
String[] doubleAsteriskSplitMarkdown = markdownText.split("\\*\\*", -1); | |
int numberOfDoubleAsteriskedTokens = doubleAsteriskSplitMarkdown.length; | |
// strings surrounded by double asterisks are always on the odd array positions | |
for (int i = 1; i < numberOfDoubleAsteriskedTokens; i += 2) { | |
// Because of our simplified approach using markdownText.split("\\*\\*") above, | |
// in case of trailing triple asterisks (...bold italic text***), | |
// the first two asterisks get matched by the split() method, and the last asterisk | |
// is moved inappropriately to the next token in the process (whereas it still belongs | |
// to the current token, as a closing asterisk of the inner ...italic text*). | |
// | |
// So we have to restore any trailing asterisks which got moved to the next tokens | |
// inadvertently. | |
// This is actually simple, because only odd tokens can be the *italic text* tokens. | |
// So no even token can start with an asterisk. | |
if (i < numberOfDoubleAsteriskedTokens - 1 && doubleAsteriskSplitMarkdown[i + 1].startsWith("*")) { | |
doubleAsteriskSplitMarkdown[i] += "*"; | |
doubleAsteriskSplitMarkdown[i + 1] = doubleAsteriskSplitMarkdown[i + 1].substring(1); | |
} | |
// now looking inside the **external string** (surrounded by double asterisks) | |
// for possible *nested string* (surrounded by single asterisks) | |
String[] singleAsteriskSplitContents = doubleAsteriskSplitMarkdown[i].split("\\*", -1); | |
int numberOfSingleAsteriskedTokens = singleAsteriskSplitContents.length; | |
if (numberOfSingleAsteriskedTokens > 2) { | |
// at least one *nested string* found => | |
// | |
// exchange all the occurrences to ** ***nested string*** ** | |
// (adding two asterisks to close each part of the **external string**, a space in between | |
// and also adding two asterisks on each side of the nested string to make a triple emphasized ***nested string***). | |
// | |
// So it will look like this: | |
// | |
// Input string before conversion: | |
// **Begin the external string *here the nested string* and here the external string continues** | |
// | |
// After conversion, correctly interpreted by Bypass: | |
// **Begin the external string** ***here the nested string*** **and here the external string continues** | |
StringBuilder builder = new StringBuilder(); | |
String currentToken; | |
for (int j = 0; j < numberOfSingleAsteriskedTokens - 1; j++) { | |
currentToken = singleAsteriskSplitContents[j]; | |
if (j % 2 == 0) { | |
// we are in the double-asterisked-only original Markdown part, e.g.: | |
// **This is the first string part *... | |
// There are 3 special cases though: | |
// First two, if the string had triple asterisks in the beginning or in the end | |
// meaning either the start or the end was bold italic, like this: | |
// ***bold italic text at the start* only bold text here** | |
// **bold text only here *bold italic in the end*** | |
// | |
// Or the whole text could be bold italic (triple asterisks on both sides): | |
// ***bold italic all the way*** | |
// | |
// In any of these cases, we get a token consisting of an empty string at the beginning, | |
// at the end, or both (depending where the triple asterisks were). | |
// Then we leave the triple asterisks as they were - no need to modify anything. | |
if (j == 0 && currentToken.isEmpty()) { | |
// This is the special case where the original Markdown began with | |
// triple asterisks: | |
// ***bold italic at the beginning*... | |
// So we just restore the parsed out delimiter | |
builder.append(currentToken + "*"); | |
} else { | |
// the double-asterisked-only original Markdown part usually ends with a space: | |
// **begin external string *nested string*... | |
// We have to remove the trailing space because it will be re-added in between | |
// the additional asterisks: ** *** | |
currentToken = removeTrailingSpace(currentToken); | |
// if we are in any part other than the very first one (**begin external string ), | |
// it also begins with a space: | |
// *nested string* here the second part... | |
// (subsequent parts are also possible). | |
// | |
// We need to remove this leading space character as well because we have already added it behind | |
// the previous *nested string* (between the five asterisks, like this: *** **). | |
// See also the "else" clause below. | |
if (j != 0) { | |
currentToken = removeLeadingSpace(currentToken); | |
} | |
// now add trailing double asterisks for this string, re-add the trailing space | |
// behind them and add triple leading asterisks for the coming *nested string* | |
builder.append(currentToken + "** ***"); | |
} | |
} else { | |
// This is the text inside the single-asterisked *nested string*. | |
if (j == numberOfSingleAsteriskedTokens - 2 && singleAsteriskSplitContents[j + 1].isEmpty()) { | |
// We are at the token surrounded by single asterisks before the last token | |
// and the following (last) token is empty => | |
// this is another special case, where the original Markdown string | |
// ended with triple asterisks, like this: | |
// ...*bold italic at the end*** | |
// | |
// So we just restore the parsed out delimiter | |
builder.append(currentToken + "*"); | |
} else { | |
// Add trailing triple asterisks for the nested string, a space | |
// and then leading double asterisks for the next part of the external | |
// double-asterisked string | |
builder.append(currentToken + "*** **"); | |
} | |
} | |
} | |
// we add just the last token and no asterisks after it, | |
// first removing the leading space here as well | |
currentToken = singleAsteriskSplitContents[numberOfSingleAsteriskedTokens - 1]; | |
currentToken = removeLeadingSpace(currentToken); | |
builder.append(currentToken); | |
// and now replace the original **external double-asterisked string with *nested* strings** | |
// with our modified one | |
doubleAsteriskSplitMarkdown[i] = builder.toString(); | |
} | |
} | |
// put the split on double asterisks Markdown contents together again | |
StringBuilder builder = new StringBuilder(); | |
for (int i = 0; i < numberOfDoubleAsteriskedTokens - 1; i++) { | |
builder.append(doubleAsteriskSplitMarkdown[i] + "**"); | |
} | |
// last token simply ends with the end of string, so we don't add double asterisks | |
builder.append(doubleAsteriskSplitMarkdown[numberOfDoubleAsteriskedTokens - 1]); | |
return builder.toString(); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment