/* local: some stuff for localisation, safe terminal printout This is about determining if we got UTF-8 locale and checking output terminal properties, along with subsequent string transformations for safe printout. copyright 2008-2021 by the mpg123 project - free software under the terms of the LGPL 2.1 see COPYING and AUTHORS files in distribution or http://mpg123.org initially written by Thomas Orgis, based on a patch by Thorsten Glaser. */ // wchar stuff #define _XOPEN_SOURCE 600 #define _POSIX_C_SOURCE 200112L #include "config.h" #ifdef HAVE_LOCALE_H #include #endif #ifdef HAVE_LANGINFO_H #include #endif #include "compat/compat.h" #include "local.h" #ifdef HAVE_WCHAR_H #include #endif #ifdef HAVE_WCTYPE_H #include #endif #ifdef WIN32 #define WIN32_LEAN_AND_MEAN 1 #include #include #endif #include "common/debug.h" int utf8force = 0; // enforce UTF-8 workings int utf8env = 0; // produce UTF-8 text output int utf8loc = 0; // have actual UTF-8 locale (so that mbstowcs() works) //static int term_is_fun = -1; static const char joker_symbol = '?'; static const char *uni_repl = "\xef\xbf\xbd"; static const int uni_repl_len = 3; /* Check some language variable for UTF-8-ness. */ static int is_utf8(const char *lang); void check_locale(void) { if(utf8force) utf8env = 1; else { const char *cp; /* Check for env vars in proper oder. */ if((cp = getenv("LC_ALL")) == NULL && (cp = getenv("LC_CTYPE")) == NULL) cp = getenv("LANG"); if(is_utf8(cp)) utf8env = 1; } #if defined(HAVE_SETLOCALE) && defined(LC_CTYPE) /* To query, we need to set from environment... */ if( is_utf8(setlocale(LC_CTYPE, "")) // If enforced, try to set an UTF-8 locale that hopefully exists. || (utf8force && is_utf8(setlocale(LC_CTYPE, "C.UTF-8"))) || (utf8force && is_utf8(setlocale(LC_CTYPE, "en_US.UTF-8"))) ) { utf8env = 1; utf8loc = 1; } #endif #if defined(HAVE_NL_LANGINFO) && defined(CODESET) /* ...langinfo works after we set a locale, eh? So it makes sense after setlocale, if only. */ if(is_utf8(nl_langinfo(CODESET))) { utf8env = 1; utf8loc = 1; } #endif debug2("UTF-8 env %i: locale: %i", utf8env, utf8loc); } static int is_utf8(const char *lang) { if(lang == NULL) return 0; /* Now, if the variable mentions UTF-8 anywhere, in some variation, the locale is UTF-8. */ if( strstr(lang, "UTF-8") || strstr(lang, "utf-8") || strstr(lang, "UTF8") || strstr(lang, "utf8") ) return 1; else return 0; } // Moved encoding stuff over from metaprint.c and removed references to libmpg123, // meaning no mpg123_string for you! int unknown2utf8(char **dest, const char *source, int len) { if(!dest) return -1; if(!source) { *dest = INT123_safer_realloc(*dest, 0); return -1; } size_t count = len < 0 ? strlen(source) : (size_t)len; // Make a somewhat proper UTF-8 string out of this. Testing for valid // UTF-8 is futile. It will be some unspecified legacy 8-bit encoding. // I am keeping C0 chars, but replace everything above 7 bits with // the Unicode replacement character as most custom 8-bit encodings // placed some symbols into the C1 range, we just don't know which. size_t ulen = 1; // trailing zero for(size_t i=0; i= 0x80 ? uni_repl_len : 1; } if(NULL == (*dest = INT123_safer_realloc(*dest, ulen))) return -1; unsigned char *p = (unsigned char*)*dest; for(size_t i=0; i= 0x80) { for(int r=0; r= 0x80 && (c) <= 0x9f) \ { \ c2lead = 0; \ continue; \ } \ else \ { \ append; \ } \ } \ c2lead = ((c) == 0xc2); \ if(c2lead) \ continue; // return: strlen+1 of result, 0 on error // If ret is 0, *dest will be freed and NULL. static size_t utf8_ascii_work(char **dest_, const char *source , int keep_nonprint) { if(!dest_) return 0; if(!source) { *dest_ = INT123_safer_realloc(*dest_, 0); return 0; } char *dest = *dest_; size_t source_fill = strlen(source)+1; size_t spos = 0; size_t dlen = 1; // At least a zero. unsigned char *p; // Find length of ASCII string (count non-continuation bytes). // Do _not_ change this to mpg123_strlen()! // It needs to match the loop below. // No UTF-8 continuation byte 0x10??????, nor control char. #define ASCII_PRINT_SOMETHING(c) \ (((c) & 0xc0) != 0x80 && (keep_nonprint || ((c) != 0x7f && (c) >= 0x20))) int c2lead = 0; int wasspace = 0; for(spos=0; spos < source_fill; ++spos) { unsigned char c = ((unsigned char*)source)[spos]; if(!keep_nonprint) ascii_space(&c, &wasspace); ASCII_C1(c, ++dlen); if(ASCII_PRINT_SOMETHING(c)) ++dlen; } // Do nothing with nothing or if allocation fails. Neatly catches overflow // of ++dlen. if(!dlen || !(dest=INT123_safer_realloc(dest, dlen))) goto utf8_ascii_bad; p = (unsigned char*)dest; c2lead = 0; wasspace = 0; for(spos=0; spos < source_fill; ++spos) { unsigned char c = ((unsigned char*)source)[spos]; if(!keep_nonprint) ascii_space(&c, &wasspace); ASCII_C1(c, *p++ = joker_symbol) if(!ASCII_PRINT_SOMETHING(c)) continue; else if(c & 0x80) // UTF-8 lead byte 0x11?????? c = joker_symbol; *p++ = c; } #undef ASCII_PRINT_SOMETHING // Always close the string. if(dlen) dest[dlen-1] = 0; goto utf8_ascii_end; utf8_ascii_bad: dest = INT123_safer_realloc(dest, 0); utf8_ascii_end: *dest_ = dest; return dest ? strlen(dest)+1 : 0; } // Reduce UTF-8 data to 7-bit ASCII, dropping non-printable characters. // Non-printable ASCII == everything below 0x20 (space), including // line breaks. // Also: 0x7f (DEL) and the C1 chars. The C0 and C1 chars should just be // dropped, not rendered. Or should they? static size_t utf8_ascii_print(char **dest, const char *source) { return utf8_ascii_work(dest, source, 0); } // Same as above, but keeping non-printable and control chars in the // 7 bit realm. static size_t utf8_ascii(char **dest, const char *source) { return utf8_ascii_work(dest, source, 1); } size_t utf8outstr(char **dest_, const char *source, int to_terminal) { if(!dest_) return 0; if(!source) { *dest_ = INT123_safer_realloc(*dest_, 0); return 0; } char *dest = *dest_; size_t width = 0; size_t source_fill = strlen(source)+1; if(utf8env) { #if defined(HAVE_MBSTOWCS) && defined(HAVE_WCSWIDTH) && \ defined(HAVE_ISWPRINT) && defined(HAVE_WCSTOMBS) if(utf8loc && to_terminal) { // Best case scenario: Convert to wide string, filter, // compute printing width. size_t wcharlen = mbstowcs(NULL, source, 0); if(wcharlen == (size_t)-1) goto utf8outstr_bad; if(wcharlen+1 > SIZE_MAX/sizeof(wchar_t)) goto utf8outstr_bad; wchar_t *pre = malloc(sizeof(wchar_t)*(wcharlen+1)); wchar_t *flt = malloc(sizeof(wchar_t)*(wcharlen+1)); if(!pre || !flt) { free(flt); free(pre); goto utf8outstr_bad; } if(mbstowcs(pre, source, wcharlen+1) == wcharlen) { size_t nwl = 0; int wasspace = 0; for(size_t i=0; i= 0 && bytelen != (size_t)-1 && (dest=INT123_safer_realloc(dest, bytelen+1)) && wcstombs(dest, flt, bytelen+1) == bytelen ){ width = columns; } else dest=INT123_safer_realloc(dest, 0); } free(flt); free(pre); } else #endif if(to_terminal) { // Only filter C0 and C1 control characters. // That is, 0x01 to 0x19 (keeping 0x20, space) and 0x7f (DEL) to 0x9f. // Since the input and output is UTF-8, we'll keep that intact. // C1 is mapped to 0xc280 till 0xc29f. dest = INT123_safer_realloc(dest, source_fill); if(!dest) goto utf8outstr_bad; size_t dest_fill = 0; int c2lead = 0; int wasspace = 0; unsigned char *p = (unsigned char*)dest; for(size_t i=0; i