Last active
March 26, 2022 03:57
-
-
Save deton/c373c6897f3622989cde9b2c54a23374 to your computer and use it in GitHub Desktop.
Lynx patch: do not insert a space when joining lines of Japanese text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/GridText.c b/src/GridText.c | |
index 04e9a4a..d9a1665 100644 | |
--- a/src/GridText.c | |
+++ b/src/GridText.c | |
@@ -453,7 +453,11 @@ struct _HText { | |
HTList *hidden_links; /* Content-less links ... */ | |
int hiddenlinkflag; /* ... and how to treat them */ | |
BOOL no_cache; /* Always refresh? */ | |
+#ifdef EXP_JAPANESE_SPACES | |
+ char LastChars[7]; /* utf-8 buffer */ | |
+#else | |
char LastChar; /* For absorbing white space */ | |
+#endif | |
/* For Internal use: */ | |
HTStyle *style; /* Current style */ | |
@@ -1134,7 +1138,11 @@ HText *HText_new(HTParentAnchor *anchor) | |
anchor->post_data) | |
? YES | |
: NO); | |
+#ifdef EXP_JAPANESE_SPACES | |
+ memset(self->LastChars, 0, sizeof(self->LastChars)); | |
+#else | |
self->LastChar = '\0'; | |
+#endif | |
#ifndef USE_PRETTYSRC | |
if (HTOutputFormat == WWW_SOURCE) | |
@@ -2867,7 +2875,7 @@ static void split_line(HText *text, unsigned split) | |
#ifdef EXP_WCWIDTH_SUPPORT | |
utfxtracells_on_this_line = 0; | |
#endif | |
- text->LastChar = ' '; | |
+ HText_setLastChar(text, ' '); | |
#ifdef DEBUG_APPCH | |
CTRACE((tfp, "GridText: split_line(%p,%d) called\n", text, split)); | |
@@ -4648,7 +4656,20 @@ void HText_setLastChar(HText *text, int ch) | |
if (!text) | |
return; | |
+#ifdef EXP_JAPANESE_SPACES | |
+ if (IS_UTF_EXTRA(ch) && IS_UTF_FIRST(text->LastChars[0])) { /* continuation byte of the buffered UTF-8 sequence */ | |
+ int i; /* first free slot, capped so that LastChars[i + 1] stays inside the buffer */ | |
+ for (i = 1; i < (int) sizeof(text->LastChars) - 2 && text->LastChars[i] != '\0'; i++) | |
+ ; | |
+ text->LastChars[i] = (char) ch; /* surplus bytes of a malformed sequence overwrite the last data slot */ | |
+ text->LastChars[i + 1] = '\0'; | |
+ return; | |
+ } | |
+ memset(text->LastChars, 0, sizeof(text->LastChars)); /* any other byte starts a fresh buffer */ | |
+ text->LastChars[0] = (char) ch; | |
+#else | |
text->LastChar = (char) ch; | |
+#endif | |
} | |
/* Get LastChar element in the text object. | |
@@ -4659,8 +4680,37 @@ char HText_getLastChar(HText *text) | |
if (!text) | |
return ('\0'); | |
+#ifdef EXP_JAPANESE_SPACES | |
+ if (IS_UTF_FIRST(text->LastChars[0])) { /* multibyte sequence buffered: report its final byte */ | |
+ int i; /* index of the terminating NUL, never beyond the end of the buffer */ | |
+ for (i = 1; i < (int) sizeof(text->LastChars) && text->LastChars[i] != '\0'; i++) | |
+ ; | |
+ return ((char) text->LastChars[i - 1]); | |
+ } | |
+ return ((char) text->LastChars[0]); | |
+#else | |
return ((char) text->LastChar); | |
+#endif | |
+} | |
+ | |
+#ifdef EXP_JAPANESE_SPACES | |
+BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text) /* YES: caller should append ' ' for a newline */ | |
+{ | |
+ if (!text) | |
+ return YES; | |
+ /* NO after a UTF-8 CJ character, after an 8-bit byte in CHINESE/JAPANESE mode, or after a space */ | |
+ if (IS_UTF_FIRST(text->LastChars[0]) && isUTF8CJChar(text->LastChars)) | |
+ return NO; | |
+ if ((HTCJK == CHINESE || HTCJK == JAPANESE) && is8bits(text->LastChars[0])) { | |
+ /* TODO: support 2nd byte of some SJIS kanji (!is8bits && IS_SJIS_LO) */ | |
+ return NO; | |
+ } | |
+ if (text->LastChars[0] != ' ') | |
+ return YES; | |
+ return NO; | |
} | |
+#endif | |
+ | |
/* Simple table handling - private | |
* ------------------------------- | |
@@ -5204,7 +5254,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position) | |
&& (text->source ? !psrcview_no_anchor_numbering : 1) | |
#endif | |
&& links_are_numbered()) { | |
- char saved_lastchar = text->LastChar; | |
+ char saved_lastchar = HText_getLastChar(text); | |
int saved_linenum = text->Lines; | |
HTAnchor *link_dest; | |
char *link_text; | |
@@ -5222,7 +5272,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position) | |
HText_appendText(text, marker); | |
} | |
if (saved_linenum && text->Lines && saved_lastchar != ' ') | |
- text->LastChar = ']'; /* if marker not after space caused split */ | |
+ HText_setLastChar(text, ']'); /* if marker not after space caused split */ | |
if (save_position) { | |
a->line_num = text->Lines; | |
a->line_pos = (short) text->last_line->size; | |
@@ -14973,6 +15023,14 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short | |
{ | |
/* Can split after almost any CJ char (Korean uses space) */ | |
/* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */ | |
+ if (isUTF8CJChar(s)) | |
+ text->permissible_split = pos; | |
+} | |
+#endif /* EXP_WCWIDTH_SUPPORT */ | |
+ | |
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES) | |
+BOOL isUTF8CJChar(const char *s) | |
+{ | |
UCode_t u = UCGetUniFromUtf8String(&s); | |
if (u >= 0x4e00 && u <= 0x9fff || /* CJK Unified Ideographs */ | |
u >= 0x3000 && u <= 0x30ff || /* CJK Symbols and Punctuation, Hiragana, Katakana */ | |
@@ -14981,6 +15039,7 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short | |
u >= 0x3400 && u <= 0x4dbf || /* CJK Unified Ideographs Extension A */ | |
u >= 0xf900 && u <= 0xfaff || /* CJK Compatibility Ideographs */ | |
u >= 0x20000 && u <= 0x3ffff) /* {Supplementary,Tertiary} Ideographic Plane */ | |
- text->permissible_split = pos; | |
+ return YES; | |
+ return NO; | |
} | |
-#endif | |
+#endif /* EXP_WCWIDTH_SUPPORT || EXP_JAPANESE_SPACES */ | |
diff --git a/src/GridText.h b/src/GridText.h | |
index 911de26..40b17b1 100644 | |
--- a/src/GridText.h | |
+++ b/src/GridText.h | |
@@ -93,6 +93,9 @@ US-ASCII control characters <32 which are not defined in Unicode standard | |
extern void HText_setLastChar(HText *text, int ch); | |
extern char HText_getLastChar(HText *text); | |
+#ifdef EXP_JAPANESE_SPACES | |
+ extern BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text); | |
+#endif | |
extern int HText_sourceAnchors(HText *text); | |
extern void HText_setStale(HText *text); | |
@@ -289,6 +292,10 @@ US-ASCII control characters <32 which are not defined in Unicode standard | |
extern HTkcode HText_getSpecifiedKcode(HText *text); | |
extern void HText_updateSpecifiedKcode(HText *text, HTkcode kcode); | |
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES) | |
+ extern BOOL isUTF8CJChar(const char *s); | |
+#endif | |
+ | |
#ifdef __cplusplus | |
} | |
#endif | |
diff --git a/src/HTML.c b/src/HTML.c | |
index a012466..cf2e18b 100644 | |
--- a/src/HTML.c | |
+++ b/src/HTML.c | |
@@ -275,18 +275,6 @@ void LYShowBadHTML(const char *message) | |
* A C T I O N R O U T I N E S | |
*/ | |
-/* FIXME: this should be amended to do the substitution only when not in a | |
- * multibyte stream. | |
- */ | |
-#ifdef EXP_JAPANESE_SPACES | |
-#define FIX_JAPANESE_SPACES \ | |
- (HTCJK == CHINESE || HTCJK == JAPANESE || HTCJK == TAIPEI) | |
- /* don't replace '\n' with ' ' if Chinese or Japanese - HN | |
- */ | |
-#else | |
-#define FIX_JAPANESE_SPACES 0 | |
-#endif | |
- | |
/* Character handling | |
* ------------------ | |
*/ | |
@@ -333,12 +321,25 @@ void HTML_put_character(HTStructured * me, int c) | |
return; | |
if (c != '\n' && c != '\t' && c != '\r') { | |
HTChunkPutc(&me->title, uc); | |
- } else if (FIX_JAPANESE_SPACES) { | |
- if (c == '\t') { | |
- HTChunkPutc(&me->title, ' '); | |
- } else { | |
+#ifdef EXP_JAPANESE_SPACES | |
+ } else if (c == '\t') { | |
+ HTChunkPutc(&me->title, ' '); | |
+ /* don't replace '\n' with ' ' if Chinese or Japanese - HN | |
+ */ | |
+ } else if (me->title.size > 0 && is8bits(me->title.data[me->title.size - 1])) { | |
+ if (HTCJK == CHINESE || HTCJK == JAPANESE) { | |
+ /* TODO: support 2nd byte of SJIS (!is8bits && IS_SJIS_LO) */ | |
return; | |
+ } else if (IS_UTF8_TTY) { | |
+ /* find start position of UTF-8 sequence */ | |
+ int i = me->title.size - 1; | |
+ while (i > 0 && (me->title.data[i] & 0xc0) == 0x80) /* UTF_EXTRA */ | |
+ i--; | |
+ if (isUTF8CJChar(&(me->title.data[i]))) | |
+ return; | |
} | |
+ HTChunkPutc(&me->title, ' '); | |
+#endif | |
} else { | |
HTChunkPutc(&me->title, ' '); | |
} | |
@@ -453,15 +454,17 @@ void HTML_put_character(HTStructured * me, int c) | |
UPDATE_STYLE; | |
} | |
if (c == '\n') { | |
- if (!FIX_JAPANESE_SPACES) { | |
- if (me->in_word) { | |
- if (HText_getLastChar(me->text) != ' ') { | |
- me->inP = TRUE; | |
- me->inLABEL = FALSE; | |
- HText_appendCharacter(me->text, ' '); | |
- } | |
- me->in_word = NO; | |
+ if (me->in_word) { | |
+#ifdef EXP_JAPANESE_SPACES | |
+ if (HText_checkLastChar_needSpaceOnJoinLines(me->text)) { | |
+#else | |
+ if (HText_getLastChar(me->text) != ' ') { | |
+#endif | |
+ me->inP = TRUE; | |
+ me->inLABEL = FALSE; | |
+ HText_appendCharacter(me->text, ' '); | |
} | |
+ me->in_word = NO; | |
} | |
} else if (c == ' ' || c == '\t') { | |
@@ -607,12 +610,14 @@ void HTML_put_string(HTStructured * me, const char *s) | |
UPDATE_STYLE; | |
} | |
if (c == '\n') { | |
- if (!FIX_JAPANESE_SPACES) { | |
- if (me->in_word) { | |
- if (HText_getLastChar(me->text) != ' ') | |
- HText_appendCharacter(me->text, ' '); | |
- me->in_word = NO; | |
- } | |
+ if (me->in_word) { | |
+#ifdef EXP_JAPANESE_SPACES | |
+ if (HText_checkLastChar_needSpaceOnJoinLines(me->text)) | |
+#else | |
+ if (HText_getLastChar(me->text) != ' ') | |
+#endif | |
+ HText_appendCharacter(me->text, ' '); | |
+ me->in_word = NO; | |
} | |
} else if (c == ' ' || c == '\t') { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="EUC-JP"> | |
<title> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
dolor | |
sit | |
空 | |
行 | |
</title> | |
</head> | |
<body> | |
<p> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
dolor | |
sit | |
空 | |
行 | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「dolor sit 空行 | |
</p> | |
<h2>span</h2> | |
<p> | |
<span>lorem</span> | |
<span>ipsum</span> | |
<span>漢</span> | |
<span>あ</span> | |
<span>カ</span> | |
<span>!</span> | |
<span>「</span> | |
<span>dolor</span> | |
<span></span> | |
<span>sit</span> | |
<span></span> | |
<span>空</span> | |
<span></span> | |
<span>行</span> | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「dolor sit 空行 | |
</p> | |
</body> | |
</html> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="iso-2022-jp"> | |
<title> | |
lorem | |
ipsum | |
$B4A(B | |
$B$"(B | |
$B%+(B | |
$B!*(B | |
$B!V(B | |
dolor | |
sit | |
$B6u(B | |
$B9T(B | |
</title> | |
</head> | |
<body> | |
<p> | |
lorem | |
ipsum | |
$B4A(B | |
$B$"(B | |
$B%+(B | |
$B!*(B | |
$B!V(B | |
dolor | |
sit | |
$B6u(B | |
$B9T(B | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum $B4A$"%+!*!V(Bdolor sit $B6u9T(B | |
</p> | |
<h2>span</h2> | |
<p> | |
<span>lorem</span> | |
<span>ipsum</span> | |
<span>$B4A(B</span> | |
<span>$B$"(B</span> | |
<span>$B%+(B</span> | |
<span>$B!*(B</span> | |
<span>$B!V(B</span> | |
<span>dolor</span> | |
<span></span> | |
<span>sit</span> | |
<span></span> | |
<span>$B6u(B</span> | |
<span></span> | |
<span>$B9T(B</span> | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum $B4A$"%+!*!V(Bdolor sit $B6u9T(B | |
</p> | |
</body> | |
</html> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="Shift_JIS"> | |
<title> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
キ | |
dolor | |
sit | |
空 | |
行 | |
</title> | |
</head> | |
<body> | |
<p> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
キ | |
dolor | |
sit | |
空 | |
行 | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「キdolor sit 空行 | |
</p> | |
<h2>span</h2> | |
<p> | |
<span>lorem</span> | |
<span>ipsum</span> | |
<span>漢</span> | |
<span>あ</span> | |
<span>カ</span> | |
<span>!</span> | |
<span>「</span> | |
<span>キ</span> | |
<span>dolor</span> | |
<span></span> | |
<span>sit</span> | |
<span></span> | |
<span>空</span> | |
<span></span> | |
<span>行</span> | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「キdolor sit 空行 | |
</p> | |
</body> | |
</html> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="utf-8"> | |
<title> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
㓅 | |
﨑 | |
𠀋 | |
한 | |
空 | |
キ | |
dolor | |
weiß | |
sit | |
</title> | |
</head> | |
<body> | |
<p> | |
lorem | |
ipsum | |
漢 | |
あ | |
カ | |
! | |
「 | |
㓅 | |
﨑 | |
𠀋 | |
한 | |
空 | |
キ | |
dolor | |
weiß | |
sit | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「㓅﨑𠀋한 空キdolor weiß sit | |
</p> | |
<h2>span</h2> | |
<p> | |
<span>lorem</span> | |
<span>ipsum</span> | |
<span>漢</span> | |
<span>あ</span> | |
<span>カ</span> | |
<span>!</span> | |
<span>「</span> | |
<span>㓅</span> | |
<span>﨑</span> | |
<span>𠀋</span> | |
<span>한</span> | |
<span>空</span> | |
<span></span> | |
<span>キ</span> | |
<span>dolor</span> | |
<span></span> | |
<span>weiß</span> | |
<span>sit</span> | |
</p> | |
<p> | |
Expected result:<br> | |
lorem ipsum 漢あカ!「㓅﨑𠀋한 空キdolor weiß sit | |
</p> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment