From 892a34e66de5de8f315788ac77e1c2734a33a0cd Mon Sep 17 00:00:00 2001 From: Marius Gerbershagen Date: Sat, 13 Jun 2020 10:35:43 +0200 Subject: [PATCH] fix encoding issues on windows console streams Always use the byte-sized input/output operations ReadConsoleA/WriteConsoleA and do all the conversion to Unicode ourselves. Moreover, expand the set of known encodings for Windows codepages and print a warning if we encounter an unsupported codepage. Fixes #582. --- src/c/file.d | 102 +++++++++++++++++++++++++++++++----------- src/c/main.d | 38 +++------------- src/c/symbols_list.h | 14 +++--- src/c/symbols_list2.h | 14 +++--- src/h/external.h | 3 ++ 5 files changed, 103 insertions(+), 68 deletions(-) diff --git a/src/c/file.d b/src/c/file.d index ebc3b272..13ad5748 100755 --- a/src/c/file.d +++ b/src/c/file.d @@ -4042,34 +4042,17 @@ wcon_stream_read_byte8(cl_object strm, unsigned char *c, cl_index n) unlikely_if (strm->stream.byte_stack != ECL_NIL) { return consume_byte_stack(strm, c, n); } else { - cl_index len = 0; cl_env_ptr the_env = ecl_process_env(); HANDLE h = (HANDLE)IO_FILE_DESCRIPTOR(strm); DWORD nchars; - unsigned char aux[4]; - WCHAR waux[1]; - for (len = 0; len < n; ) { - int i, ok; - ecl_disable_interrupts_env(the_env); - ok = ReadConsoleW(h, waux, 1, &nchars, NULL); - if (ok) { - nchars = WideCharToMultiByte(GetConsoleCP(), 0, waux, 1, aux, 4, NULL, NULL); - } - ecl_enable_interrupts_env(the_env); - unlikely_if (!ok) { - FEwin32_error("Cannot read from console", 0); - } - for (i = 0; i < nchars; i++) { - if (len < n) { - c[len++] = aux[i]; - } else { - strm->stream.byte_stack = - ecl_nconc(strm->stream.byte_stack, - ecl_list1(ecl_make_fixnum(aux[i]))); - } - } + int ok; + ecl_disable_interrupts_env(the_env); + ok = ReadConsoleA(h, c, n, &nchars, NULL); + ecl_enable_interrupts_env(the_env); + unlikely_if (!ok) { + FEwin32_error("Cannot read from console", 0); } - return (len > 0) ? len : EOF; + return (nchars > 0) ? 
nchars : EOF; } } @@ -4078,7 +4061,7 @@ wcon_stream_write_byte8(cl_object strm, unsigned char *c, cl_index n) { HANDLE h = (HANDLE)IO_FILE_DESCRIPTOR(strm); DWORD nchars; - unlikely_if(!WriteConsole(h, c, n, &nchars, NULL)) { + unlikely_if(!WriteConsoleA(h, c, n, &nchars, NULL)) { FEwin32_error("Cannot write to console.", 0); } return nchars; @@ -4203,6 +4186,67 @@ maybe_make_windows_console_fd(cl_object fname, int desc, enum ecl_smmode smm, } return output; } + +cl_object +si_windows_codepage_encoding() +{ + /* Mapping from windows codepages to encoding names used by ECL */ + DWORD cp = GetConsoleCP(); + cl_object encoding; + switch (cp) { +#ifdef ECL_UNICODE + case 437: return ecl_make_keyword("DOS-CP437"); + case 708: return ecl_make_keyword("ISO-8859-6"); + case 850: return ecl_make_keyword("DOS-CP850"); + case 852: return ecl_make_keyword("DOS-CP852"); + case 855: return ecl_make_keyword("DOS-CP855"); + case 857: return ecl_make_keyword("DOS-CP857"); + case 858: return ecl_make_keyword("DOS-CP858"); + case 860: return ecl_make_keyword("DOS-CP860"); + case 861: return ecl_make_keyword("DOS-CP861"); + case 862: return ecl_make_keyword("DOS-CP862"); + case 863: return ecl_make_keyword("DOS-CP863"); + case 864: return ecl_make_keyword("DOS-CP864"); + case 865: return ecl_make_keyword("DOS-CP865"); + case 866: return ecl_make_keyword("DOS-CP866"); + case 869: return ecl_make_keyword("DOS-CP869"); + case 932: return ecl_make_keyword("WINDOWS-CP932"); + case 936: return ecl_make_keyword("WINDOWS-CP936"); + case 949: return ecl_make_keyword("WINDOWS-CP949"); + case 950: return ecl_make_keyword("WINDOWS-CP950"); + case 1200: return ecl_make_keyword("UCS-2LE"); + case 1201: return ecl_make_keyword("UCS-2BE"); + case 1250: return ecl_make_keyword("WINDOWS-CP1250"); + case 1251: return ecl_make_keyword("WINDOWS-CP1251"); + case 1252: return ecl_make_keyword("WINDOWS-CP1252"); + case 1253: return ecl_make_keyword("WINDOWS-CP1253"); + case 1254: return 
ecl_make_keyword("WINDOWS-CP1254"); + case 1255: return ecl_make_keyword("WINDOWS-CP1255"); + case 1256: return ecl_make_keyword("WINDOWS-CP1256"); + case 1257: return ecl_make_keyword("WINDOWS-CP1257"); + case 1258: return ecl_make_keyword("WINDOWS-CP1258"); + case 12000: return ecl_make_keyword("UCS-4LE"); + case 12001: return ecl_make_keyword("UCS-4BE"); + case 20932: return ecl_make_keyword("JISX0212"); + case 21866: return ecl_make_keyword("KOI8-U"); + case 28591: return ecl_make_keyword("ISO-8859-1"); + case 28592: return ecl_make_keyword("ISO-8859-2"); + case 28593: return ecl_make_keyword("ISO-8859-3"); + case 28594: return ecl_make_keyword("ISO-8859-4"); + case 28595: return ecl_make_keyword("ISO-8859-5"); + case 28596: return ecl_make_keyword("ISO-8859-6"); + case 28597: return ecl_make_keyword("ISO-8859-7"); + case 28598: return ecl_make_keyword("ISO-8859-8"); + case 28599: return ecl_make_keyword("ISO-8859-9"); + case 28603: return ecl_make_keyword("ISO-8859-13"); + case 28605: return ecl_make_keyword("ISO-8859-15"); + case 50220: return ecl_make_keyword("ISO-2022-JP"); + case 65001: return ecl_make_keyword("UTF-8"); +#endif + /* Nothing we can do here, try our best with :pass-through */ + default: return @':pass-through'; + } +} #else #define maybe_make_windows_console_FILE ecl_make_stream_from_FILE #define maybe_make_windows_console_fd ecl_make_file_stream_from_fd @@ -5788,11 +5832,15 @@ init_file(void) cl_object null_stream; cl_object external_format = ECL_NIL; #if defined(ECL_MS_WINDOWS_HOST) + /* We start out with :pass-through external format for standard + * input/output for bootstrap reasons (some of the external format + * support is implemented in lisp and not available on start of + * ECL). The correct format is later on set using the encoding + * specified by the current codepage. 
*/ + external_format = cl_list(2, @':pass-through', @':crlf'); # ifdef ECL_UNICODE - external_format = cl_list(2, @':latin-1', @':crlf'); flags = 0; # else - external_format = cl_list(2, @':crlf', @':pass-through'); flags = ECL_STREAM_DEFAULT_FORMAT; # endif #else diff --git a/src/c/main.d b/src/c/main.d index a868328c..e5343edc 100755 --- a/src/c/main.d +++ b/src/c/main.d @@ -454,40 +454,16 @@ struct cl_core_struct cl_core = { static void maybe_fix_console_stream(cl_object stream) { - DWORD cp = GetConsoleCP(); - const char *encoding; cl_object external_format; - int i; - static const struct { - int code; - const char *name; - } known_cp[] = { - {874, "WINDOWS-CP874"}, - {932, "WINDOWS-CP932"}, - {936, "WINDOWS-CP936"}, - {949, "WINDOWS-CP949"}, - {950, "WINDOWS-CP950"}, - {1200, "WINDOWS-CP1200"}, - {1201, "WINDOWS-CP1201"}, - {1250, "WINDOWS-CP1250"}, - {1251, "WINDOWS-CP1251"}, - {1252, "WINDOWS-CP1252"}, - {1253, "WINDOWS-CP1253"}, - {1254, "WINDOWS-CP1254"}, - {1255, "WINDOWS-CP1255"}, - {1256, "WINDOWS-CP1256"}, - {1257, "WINDOWS-CP1257"}, - {1258, "WINDOWS-CP1258"}, - {65001, "UTF8"}, - {0,"LATIN-1"} - }; if (stream->stream.mode != ecl_smm_io_wcon) return; - for (i = 0; known_cp[i].code && known_cp[i].code != cp; i++) - {} - external_format = cl_list(2, ecl_make_keyword(known_cp[i].name), - @':crlf'); - si_stream_external_format_set(stream, external_format); + external_format = si_windows_codepage_encoding(); + if (external_format == @':pass-through') + fprintf(stderr, + "Unsupported codepage %d, input/output encoding may be wrong.\n" + "Use the chcp command to change codepages, e.g. 
'chcp 65001' to change to utf-8.\n", + GetConsoleCP()); + si_stream_external_format_set(stream, cl_list(2, external_format, @':crlf')); stream->stream.eof_char = 26; } #endif diff --git a/src/c/symbols_list.h b/src/c/symbols_list.h index 16147c01..6870da71 100755 --- a/src/c/symbols_list.h +++ b/src/c/symbols_list.h @@ -79,6 +79,11 @@ typedef struct { #else # define IF_COMPLEX_FLOAT(x) NULL #endif +#ifdef ECL_MS_WINDOWS_HOST +# define IF_WINDOWS(x) x +#else +# define IF_WINDOWS(x) NULL +#endif /* XXX When the symbol has the associated function its name must follow the naming convention, otherwise si:mangle-name will @@ -1811,6 +1816,8 @@ cl_symbols[] = { {EXT_ "*ACTION-ON-UNDEFINED-VARIABLE*", EXT_SPECIAL, NULL, -1, ECL_NIL}, +{SYS_ "WINDOWS-CODEPAGE-ENCODING", SI_ORDINARY, IF_WINDOWS(si_windows_codepage_encoding), 0, OBJNULL}, + {EXT_ "SET-BUFFERING-MODE", EXT_ORDINARY, si_set_buffering_mode, 2, OBJNULL}, {KEY_ "NONE", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "LINE-BUFFERED", KEYWORD, NULL, -1, OBJNULL}, @@ -1923,6 +1930,7 @@ cl_symbols[] = { {KEY_ "CR", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "LF", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "CRLF", KEYWORD, NULL, -1, OBJNULL}, + {KEY_ "UCS-2BE", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "UCS-4BE", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "UCS-2LE", KEYWORD, NULL, -1, OBJNULL}, @@ -2049,11 +2057,7 @@ cl_symbols[] = { #endif {SYS_ "RUN-PROGRAM-INNER", SI_ORDINARY, si_run_program_inner, 4, OBJNULL}, {SYS_ "SPAWN-SUBPROCESS", SI_ORDINARY, si_spawn_subprocess, 6, OBJNULL}, -#if defined(ECL_MS_WINDOWS_HOST) -{SYS_ "CLOSE-WINDOWS-HANDLE", SI_ORDINARY, si_close_windows_handle, 1, OBJNULL}, -#else -{SYS_ "CLOSE-WINDOWS-HANDLE", SI_ORDINARY, NULL, -1, OBJNULL}, -#endif +{SYS_ "CLOSE-WINDOWS-HANDLE", SI_ORDINARY, IF_WINDOWS(si_close_windows_handle), 1, OBJNULL}, /* ~ */ {EXT_ "*INVOKE-DEBUGGER-HOOK*", EXT_SPECIAL, NULL, -1, ECL_NIL}, diff --git a/src/c/symbols_list2.h b/src/c/symbols_list2.h index f2614b93..8980f4b9 100644 --- a/src/c/symbols_list2.h +++ 
b/src/c/symbols_list2.h @@ -79,6 +79,11 @@ typedef struct { #else # define IF_COMPLEX_FLOAT(x) NULL #endif +#ifdef ECL_MS_WINDOWS_HOST +# define IF_WINDOWS(x) x +#else +# define IF_WINDOWS(x) NULL +#endif /* XXX When the symbol has the associated function its name must follow the naming convention, otherwise si:mangle-name will @@ -1811,6 +1816,8 @@ cl_symbols[] = { {EXT_ "*ACTION-ON-UNDEFINED-VARIABLE*",NULL,-1}, +{SYS_ "WINDOWS-CODEPAGE-ENCODING",IF_WINDOWS("si_windows_codepage_encoding"),0}, + {EXT_ "SET-BUFFERING-MODE","si_set_buffering_mode",2}, {KEY_ "NONE",NULL,-1}, {KEY_ "LINE-BUFFERED",NULL,-1}, @@ -1923,6 +1930,7 @@ cl_symbols[] = { {KEY_ "CR",NULL,-1}, {KEY_ "LF",NULL,-1}, {KEY_ "CRLF",NULL,-1}, + {KEY_ "UCS-2BE",NULL,-1}, {KEY_ "UCS-4BE",NULL,-1}, {KEY_ "UCS-2LE",NULL,-1}, @@ -2049,11 +2057,7 @@ cl_symbols[] = { #endif {SYS_ "RUN-PROGRAM-INNER","si_run_program_inner",4}, {SYS_ "SPAWN-SUBPROCESS","si_spawn_subprocess",6}, -#if defined(ECL_MS_WINDOWS_HOST) -{SYS_ "CLOSE-WINDOWS-HANDLE","si_close_windows_handle",1}, -#else -{SYS_ "CLOSE-WINDOWS-HANDLE",NULL,-1}, -#endif +{SYS_ "CLOSE-WINDOWS-HANDLE",IF_WINDOWS("si_close_windows_handle"),1}, /* ~ */ {EXT_ "*INVOKE-DEBUGGER-HOOK*",NULL,-1}, diff --git a/src/h/external.h b/src/h/external.h index 5f5fdcfa..b490bd86 100755 --- a/src/h/external.h +++ b/src/h/external.h @@ -718,6 +718,9 @@ extern ECL_API cl_object si_do_write_sequence(cl_object string, cl_object stream extern ECL_API cl_object si_do_read_sequence(cl_object string, cl_object stream, cl_object start, cl_object end); extern ECL_API cl_object si_file_column(cl_object strm); extern ECL_API cl_object cl_interactive_stream_p(cl_object strm); +#if defined(ECL_MS_WINDOWS_HOST) +extern ECL_API cl_object si_windows_codepage_encoding(); +#endif extern ECL_API cl_object si_set_buffering_mode(cl_object strm, cl_object mode); extern ECL_API cl_object si_stream_external_format_set(cl_object strm, cl_object format);