Skip to content

Instantly share code, notes, and snippets.

@deton
Last active March 26, 2022 03:57
Show Gist options
  • Save deton/c373c6897f3622989cde9b2c54a23374 to your computer and use it in GitHub Desktop.
Save deton/c373c6897f3622989cde9b2c54a23374 to your computer and use it in GitHub Desktop.
Lynx patch: do not insert a space when joining lines that end in Japanese (CJ) text
diff --git a/src/GridText.c b/src/GridText.c
index 04e9a4a..d9a1665 100644
--- a/src/GridText.c
+++ b/src/GridText.c
@@ -453,7 +453,11 @@ struct _HText {
HTList *hidden_links; /* Content-less links ... */
int hiddenlinkflag; /* ... and how to treat them */
BOOL no_cache; /* Always refresh? */
+#ifdef EXP_JAPANESE_SPACES
+ char LastChars[7]; /* utf-8 buffer */
+#else
char LastChar; /* For absorbing white space */
+#endif
/* For Internal use: */
HTStyle *style; /* Current style */
@@ -1134,7 +1138,11 @@ HText *HText_new(HTParentAnchor *anchor)
anchor->post_data)
? YES
: NO);
+#ifdef EXP_JAPANESE_SPACES
+ memset(self->LastChars, 0, sizeof(self->LastChars));
+#else
self->LastChar = '\0';
+#endif
#ifndef USE_PRETTYSRC
if (HTOutputFormat == WWW_SOURCE)
@@ -2867,7 +2875,7 @@ static void split_line(HText *text, unsigned split)
#ifdef EXP_WCWIDTH_SUPPORT
utfxtracells_on_this_line = 0;
#endif
- text->LastChar = ' ';
+ HText_setLastChar(text, ' ');
#ifdef DEBUG_APPCH
CTRACE((tfp, "GridText: split_line(%p,%d) called\n", text, split));
@@ -4648,7 +4656,20 @@ void HText_setLastChar(HText *text, int ch)
if (!text)
return;
+#ifdef EXP_JAPANESE_SPACES
+ if (IS_UTF_EXTRA(ch) && IS_UTF_FIRST(text->LastChars[0])) { /* ch continues the buffered multibyte sequence */
+ int i; /* slot for ch: first free one, capped so the NUL written below stays inside LastChars */
+ for (i = 1; i < (int) sizeof(text->LastChars) - 2 && text->LastChars[i] != '\0'; i++)
+ ;
+ text->LastChars[i] = (char) ch; /* surplus continuation bytes (malformed input) overwrite the last slot */
+ text->LastChars[i + 1] = '\0'; /* i + 1 <= sizeof(LastChars) - 1, never past the end */
+ return;
+ }
+ memset(text->LastChars, 0, sizeof(text->LastChars)); /* any other byte starts a fresh buffer */
+ text->LastChars[0] = (char) ch;
+#else
text->LastChar = (char) ch;
+#endif
}
/* Get LastChar element in the text object.
@@ -4659,8 +4680,37 @@ char HText_getLastChar(HText *text)
if (!text)
return ('\0');
+#ifdef EXP_JAPANESE_SPACES
+ if (IS_UTF_FIRST(text->LastChars[0])) { /* buffer holds a multibyte sequence: report its last stored byte */
+ int i; /* index of the terminating NUL, or sizeof(LastChars) if the buffer is full */
+ for (i = 1; i < (int) sizeof(text->LastChars) && text->LastChars[i] != '\0'; i++) /* bound tested before the read */
+ ;
+ return ((char) text->LastChars[i - 1]);
+ }
+ return ((char) text->LastChars[0]); /* single-byte case */
+#else
return ((char) text->LastChar);
+#endif
+}
+
+#ifdef EXP_JAPANESE_SPACES
+BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text) /* YES if a ' ' should be appended when a newline joins two source lines */
+{
+ if (!text)
+ return YES; /* no text object: report that a space is needed */
+
+ if (IS_UTF_FIRST(text->LastChars[0]) && isUTF8CJChar(text->LastChars))
+ return NO; /* buffered UTF-8 sequence decodes to a CJ character */
+ if ((HTCJK == CHINESE || HTCJK == JAPANESE) && is8bits(text->LastChars[0])) {
+ /* TODO: support 2nd byte of some SJIS kanji (!is8bits && IS_SJIS_LO) */
+ return NO; /* 8-bit byte while HTCJK is CHINESE or JAPANESE */
+ }
+ if (text->LastChars[0] != ' ')
+ return YES;
+ return NO; /* last character is already a space */
}
+#endif
+
/* Simple table handling - private
* -------------------------------
@@ -5204,7 +5254,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
&& (text->source ? !psrcview_no_anchor_numbering : 1)
#endif
&& links_are_numbered()) {
- char saved_lastchar = text->LastChar;
+ char saved_lastchar = HText_getLastChar(text);
int saved_linenum = text->Lines;
HTAnchor *link_dest;
char *link_text;
@@ -5222,7 +5272,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
HText_appendText(text, marker);
}
if (saved_linenum && text->Lines && saved_lastchar != ' ')
- text->LastChar = ']'; /* if marker not after space caused split */
+ HText_setLastChar(text, ']'); /* if marker not after space caused split */
if (save_position) {
a->line_num = text->Lines;
a->line_pos = (short) text->last_line->size;
@@ -14973,6 +15023,14 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short
{
/* Can split after almost any CJ char (Korean uses space) */
/* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */
+ if (isUTF8CJChar(s))
+ text->permissible_split = pos;
+}
+#endif /* EXP_WCWIDTH_SUPPORT */
+
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES)
+BOOL isUTF8CJChar(const char *s)
+{
UCode_t u = UCGetUniFromUtf8String(&s);
if (u >= 0x4e00 && u <= 0x9fff || /* CJK Unified Ideographs */
u >= 0x3000 && u <= 0x30ff || /* CJK Symbols and Punctuation, Hiragana, Katakana */
@@ -14981,6 +15039,7 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short
u >= 0x3400 && u <= 0x4dbf || /* CJK Unified Ideographs Extension A */
u >= 0xf900 && u <= 0xfaff || /* CJK Compatibility Ideographs */
u >= 0x20000 && u <= 0x3ffff) /* {Supplementary,Tertiary} Ideographic Plane */
- text->permissible_split = pos;
+ return YES;
+ return NO;
}
-#endif
+#endif /* EXP_WCWIDTH_SUPPORT || EXP_JAPANESE_SPACES */
diff --git a/src/GridText.h b/src/GridText.h
index 911de26..40b17b1 100644
--- a/src/GridText.h
+++ b/src/GridText.h
@@ -93,6 +93,9 @@ US-ASCII control characters <32 which are not defined in Unicode standard
extern void HText_setLastChar(HText *text, int ch);
extern char HText_getLastChar(HText *text);
+#ifdef EXP_JAPANESE_SPACES
+ extern BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text);
+#endif
extern int HText_sourceAnchors(HText *text);
extern void HText_setStale(HText *text);
@@ -289,6 +292,10 @@ US-ASCII control characters <32 which are not defined in Unicode standard
extern HTkcode HText_getSpecifiedKcode(HText *text);
extern void HText_updateSpecifiedKcode(HText *text, HTkcode kcode);
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES)
+ extern BOOL isUTF8CJChar(const char *s);
+#endif
+
#ifdef __cplusplus
}
#endif
diff --git a/src/HTML.c b/src/HTML.c
index a012466..cf2e18b 100644
--- a/src/HTML.c
+++ b/src/HTML.c
@@ -275,18 +275,6 @@ void LYShowBadHTML(const char *message)
* A C T I O N R O U T I N E S
*/
-/* FIXME: this should be amended to do the substitution only when not in a
- * multibyte stream.
- */
-#ifdef EXP_JAPANESE_SPACES
-#define FIX_JAPANESE_SPACES \
- (HTCJK == CHINESE || HTCJK == JAPANESE || HTCJK == TAIPEI)
- /* don't replace '\n' with ' ' if Chinese or Japanese - HN
- */
-#else
-#define FIX_JAPANESE_SPACES 0
-#endif
-
/* Character handling
* ------------------
*/
@@ -333,12 +321,25 @@ void HTML_put_character(HTStructured * me, int c)
return;
if (c != '\n' && c != '\t' && c != '\r') {
HTChunkPutc(&me->title, uc);
- } else if (FIX_JAPANESE_SPACES) {
- if (c == '\t') {
- HTChunkPutc(&me->title, ' ');
- } else {
+#ifdef EXP_JAPANESE_SPACES
+ } else if (c == '\t') {
+ HTChunkPutc(&me->title, ' ');
+ /* don't replace '\n' with ' ' if Chinese or Japanese - HN
+ */
+ } else if (me->title.size > 0 && is8bits(me->title.data[me->title.size - 1])) {
+ if (HTCJK == CHINESE || HTCJK == JAPANESE) {
+ /* TODO: support 2nd byte of SJIS (!is8bits && IS_SJIS_LO) */
return;
+ } else if (IS_UTF8_TTY) {
+ /* find start position of UTF-8 sequence */
+ int i = me->title.size - 1;
+ while (i > 0 && (me->title.data[i] & 0xc0) == 0x80) /* UTF_EXTRA */
+ i--;
+ if (isUTF8CJChar(&(me->title.data[i])))
+ return;
}
+ HTChunkPutc(&me->title, ' ');
+#endif
} else {
HTChunkPutc(&me->title, ' ');
}
@@ -453,15 +454,17 @@ void HTML_put_character(HTStructured * me, int c)
UPDATE_STYLE;
}
if (c == '\n') {
- if (!FIX_JAPANESE_SPACES) {
- if (me->in_word) {
- if (HText_getLastChar(me->text) != ' ') {
- me->inP = TRUE;
- me->inLABEL = FALSE;
- HText_appendCharacter(me->text, ' ');
- }
- me->in_word = NO;
+ if (me->in_word) {
+#ifdef EXP_JAPANESE_SPACES
+ if (HText_checkLastChar_needSpaceOnJoinLines(me->text)) {
+#else
+ if (HText_getLastChar(me->text) != ' ') {
+#endif
+ me->inP = TRUE;
+ me->inLABEL = FALSE;
+ HText_appendCharacter(me->text, ' ');
}
+ me->in_word = NO;
}
} else if (c == ' ' || c == '\t') {
@@ -607,12 +610,14 @@ void HTML_put_string(HTStructured * me, const char *s)
UPDATE_STYLE;
}
if (c == '\n') {
- if (!FIX_JAPANESE_SPACES) {
- if (me->in_word) {
- if (HText_getLastChar(me->text) != ' ')
- HText_appendCharacter(me->text, ' ');
- me->in_word = NO;
- }
+ if (me->in_word) {
+#ifdef EXP_JAPANESE_SPACES
+ if (HText_checkLastChar_needSpaceOnJoinLines(me->text))
+#else
+ if (HText_getLastChar(me->text) != ' ')
+#endif
+ HText_appendCharacter(me->text, ' ');
+ me->in_word = NO;
}
} else if (c == ' ' || c == '\t') {
<html>
<head>
<meta charset="EUC-JP">
<title>
lorem
ipsum
dolor
sit
</title>
</head>
<body>
<p>
lorem
ipsum
dolor
sit
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「dolor sit 空行
</p>
<h2>span</h2>
<p>
<span>lorem</span>
<span>ipsum</span>
<span>漢</span>
<span>あ</span>
<span>カ</span>
<span>!</span>
<span>「</span>
<span>dolor</span>
<span></span>
<span>sit</span>
<span></span>
<span>空</span>
<span></span>
<span>行</span>
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「dolor sit 空行
</p>
</body>
</html>
<html>
<head>
<meta charset="iso-2022-jp">
<title>
lorem
ipsum
$B4A(B
$B$"(B
$B%+(B
$B!*(B
$B!V(B
dolor
sit
$B6u(B
$B9T(B
</title>
</head>
<body>
<p>
lorem
ipsum
$B4A(B
$B$"(B
$B%+(B
$B!*(B
$B!V(B
dolor
sit
$B6u(B
$B9T(B
</p>
<p>
Expected result:<br>
lorem ipsum $B4A$"%+!*!V(Bdolor sit $B6u9T(B
</p>
<h2>span</h2>
<p>
<span>lorem</span>
<span>ipsum</span>
<span>$B4A(B</span>
<span>$B$"(B</span>
<span>$B%+(B</span>
<span>$B!*(B</span>
<span>$B!V(B</span>
<span>dolor</span>
<span></span>
<span>sit</span>
<span></span>
<span>$B6u(B</span>
<span></span>
<span>$B9T(B</span>
</p>
<p>
Expected result:<br>
lorem ipsum $B4A$"%+!*!V(Bdolor sit $B6u9T(B
</p>
</body>
</html>
<html>
<head>
<meta charset="Shift_JIS">
<title>
lorem
ipsum
dolor
sit
</title>
</head>
<body>
<p>
lorem
ipsum
dolor
sit
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「キdolor sit 空行
</p>
<h2>span</h2>
<p>
<span>lorem</span>
<span>ipsum</span>
<span>漢</span>
<span>あ</span>
<span>カ</span>
<span>!</span>
<span>「</span>
<span>キ</span>
<span>dolor</span>
<span></span>
<span>sit</span>
<span></span>
<span>空</span>
<span></span>
<span>行</span>
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「キdolor sit 空行
</p>
</body>
</html>
<html>
<head>
<meta charset="utf-8">
<title>
lorem
ipsum
𠀋
dolor
weiß
sit
</title>
</head>
<body>
<p>
lorem
ipsum
𠀋
dolor
weiß
sit
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「㓅﨑𠀋한 空キdolor weiß sit
</p>
<h2>span</h2>
<p>
<span>lorem</span>
<span>ipsum</span>
<span>漢</span>
<span>あ</span>
<span>カ</span>
<span>!</span>
<span>「</span>
<span>㓅</span>
<span>﨑</span>
<span>𠀋</span>
<span>한</span>
<span>空</span>
<span></span>
<span>キ</span>
<span>dolor</span>
<span></span>
<span>weiß</span>
<span>sit</span>
</p>
<p>
Expected result:<br>
lorem ipsum 漢あカ!「㓅﨑𠀋한 空キdolor weiß sit
</p>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment