looper/subprojects/mpg123/src/local.c
2024-09-28 10:31:18 -07:00

485 lines
11 KiB
C

/*
local: some stuff for localisation, safe terminal printout
This is about determining if we got UTF-8 locale and
checking output terminal properties, along with subsequent string
transformations for safe printout.
copyright 2008-2021 by the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by Thomas Orgis, based on a patch by Thorsten Glaser.
*/
// wchar stuff
#define _XOPEN_SOURCE 600
#define _POSIX_C_SOURCE 200112L
#include "config.h"
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#include "compat/compat.h"
#include "local.h"
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN 1
#include <windows.h>
#include <wincon.h>
#endif
#include "common/debug.h"
// Global UTF-8 related switches. check_locale() reads utf8force and sets
// utf8env and utf8loc; the output functions below consult utf8env and utf8loc.
int utf8force = 0; // enforce UTF-8 workings
int utf8env = 0; // produce UTF-8 text output
int utf8loc = 0; // have actual UTF-8 locale (so that mbstowcs() works)
//static int term_is_fun = -1;
// Stand-in written for anything that cannot be represented in 7-bit ASCII output.
static const char joker_symbol = '?';
// The three bytes 0xef 0xbf 0xbd, used by unknown2utf8() as the Unicode
// replacement character for bytes above 7 bits.
static const char *uni_repl = "\xef\xbf\xbd";
// Number of bytes in uni_repl, not counting the terminating zero.
static const int uni_repl_len = 3;
/* Check some language variable for UTF-8-ness. */
static int is_utf8(const char *lang);
/*
	Figure out if text output should be UTF-8 and if the C library locale
	is an UTF-8 one. Results are stored in the globals utf8env and utf8loc.
	With utf8force set, utf8env is enabled regardless of the environment and
	fallback UTF-8 locales are tried if the one from the environment is not
	UTF-8.
*/
void check_locale(void)
{
	if(utf8force)
		utf8env = 1;
	else
	{
		/* Check for env vars in proper order. POSIX treats a variable that is
		   set to the empty string like an unset one, so such a value must not
		   hide the variables that follow in precedence. */
		static const char *const vars[] = { "LC_ALL", "LC_CTYPE", "LANG" };
		const char *cp = NULL;
		for(size_t i=0; i<sizeof(vars)/sizeof(vars[0]); ++i)
		{
			cp = getenv(vars[i]);
			if(cp && *cp)
				break;
		}
		/* NULL or empty cp simply is no UTF-8. */
		if(is_utf8(cp))
			utf8env = 1;
	}
#if defined(HAVE_SETLOCALE) && defined(LC_CTYPE)
	/* To query, we need to set from environment... */
	if(
		is_utf8(setlocale(LC_CTYPE, ""))
		// If enforced, try to set an UTF-8 locale that hopefully exists.
		|| (utf8force && is_utf8(setlocale(LC_CTYPE, "C.UTF-8")))
		|| (utf8force && is_utf8(setlocale(LC_CTYPE, "en_US.UTF-8")))
	)
	{
		utf8env = 1;
		utf8loc = 1;
	}
#endif
#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	/* ...langinfo works after we set a locale, eh? So it makes sense after setlocale, if only. */
	if(is_utf8(nl_langinfo(CODESET)))
	{
		utf8env = 1;
		utf8loc = 1;
	}
#endif
	debug2("UTF-8 env %i: locale: %i", utf8env, utf8loc);
}
/* Tell if a locale/language string names UTF-8 as encoding.
   Returns 1 if one of the known spellings occurs anywhere in lang,
   0 otherwise (also for NULL). The match is case-sensitive per spelling. */
static int is_utf8(const char *lang)
{
	static const char *const spellings[] = { "UTF-8", "utf-8", "UTF8", "utf8" };
	if(!lang)
		return 0;
	for(size_t i=0; i<sizeof(spellings)/sizeof(spellings[0]); ++i)
	{
		if(strstr(lang, spellings[i]) != NULL)
			return 1;
	}
	return 0;
}
// Moved encoding stuff over from metaprint.c and removed references to libmpg123,
// meaning no mpg123_string for you!

// Turn text of unknown 8-bit encoding into a zero-terminated UTF-8 string.
// dest: address of the output buffer pointer, (re)allocated here
// source: input bytes
// len: number of input bytes to consider at most, negative for strlen(source)
// Input processing stops at the first zero byte or after len bytes.
// Returns 0 on success, -1 on error (NULL arguments, allocation failure).
int unknown2utf8(char **dest, const char *source, int len)
{
	if(!dest)
		return -1;
	if(!source)
	{
		*dest = INT123_safer_realloc(*dest, 0);
		return -1;
	}
	const unsigned char *in = (const unsigned char*)source;
	const size_t limit = len < 0 ? strlen(source) : (size_t)len;
	// There is no point in testing for valid UTF-8, this is some unspecified
	// legacy 8-bit encoding. The 7-bit range is kept, including C0 control
	// chars. Everything above is replaced with the Unicode replacement
	// character, as custom 8-bit encodings put differing symbols there.
	// First pass: Find the end of input and the needed output size.
	size_t used = 0;
	size_t need = 1; // trailing zero
	while(used < limit && in[used])
	{
		need += in[used] >= 0x80 ? uni_repl_len : 1;
		++used;
	}
	*dest = INT123_safer_realloc(*dest, need);
	if(*dest == NULL)
		return -1;
	// Second pass: Write the output.
	unsigned char *out = (unsigned char*)*dest;
	for(size_t i=0; i<used; ++i)
	{
		if(in[i] < 0x80)
		{
			*out++ = in[i];
			continue;
		}
		for(int r=0; r<uni_repl_len; ++r)
			*out++ = uni_repl[r];
	}
	*out = 0;
	return 0;
}
// Collapse ASCII white space control characters (form feed, carriage
// return, line feed, horizontal and vertical tab).
// The first one of a run is turned into a plain space, following ones are
// left as they are. *wasspace carries the state between calls: It is set
// to 1 after such a character and reset to 0 by anything else, including
// the plain space itself.
static void ascii_space(unsigned char *c, int *wasspace)
{
	const unsigned char ch = *c;
	const int funky = ch == '\f' || ch == '\r' || ch == '\n'
		|| ch == '\t' || ch == '\v';
	if(!funky)
	{
		*wasspace = 0;
		return;
	}
	// A character left unchanged here is expected to be dropped by the
	// caller's check for control characters below 0x20.
	if(*wasspace == 0)
		*c = ' ';
	*wasspace = 1;
}
// Filter C1 control chars, using c2lead state.
// To be used as a statement inside a loop over the bytes of a string, with
// an int variable c2lead (zero before the first byte) in scope. The macro
// contains continue statements that act on that enclosing loop.
// c: the current byte
// append: statement executed when a held-back 0xc2 has to be accounted for
// Operation:
// - A byte 0xc2 is held back: c2lead is set and the loop continues.
// - If the byte after that is in 0x80 to 0x9f, both bytes are dropped
//   (c2lead cleared, loop continues).
// - If it is anything else, append is executed for the held-back 0xc2 and
//   the current byte is handled normally, which includes holding it back
//   in turn if it is 0xc2 again.
// Code following the macro in the loop body only sees bytes that are neither
// 0xc2 nor the second byte of such a dropped pair.
#define ASCII_C1(c, append) \
if(c2lead) \
{ \
if((c) >= 0x80 && (c) <= 0x9f) \
{ \
c2lead = 0; \
continue; \
} \
else \
{ \
append; \
} \
} \
c2lead = ((c) == 0xc2); \
if(c2lead) \
continue;
// Reduce UTF-8 text to 7-bit ASCII.
// dest_: address of the output buffer pointer, (re)allocated here
// source: zero-terminated input
// keep_nonprint: if zero, white space control characters are collapsed to a
//   space and other control characters (below 0x20, DEL) are dropped
// Each UTF-8 lead byte is replaced by the joker symbol, continuation bytes
// are dropped, C1 control characters (0xc2 0x80 to 0xc2 0x9f) are dropped.
// return: strlen+1 of result, 0 on error
// If ret is 0, *dest will be freed and NULL.
static size_t utf8_ascii_work(char **dest_, const char *source
,	int keep_nonprint)
{
	if(!dest_)
		return 0;
	if(!source)
	{
		*dest_ = INT123_safer_realloc(*dest_, 0);
		return 0;
	}
	// True for a byte that results in one byte of output: No UTF-8
	// continuation byte (binary 10xxxxxx), nor control char unless those
	// are to be kept.
#define ASCII_PRINT_SOMETHING(c) \
	(((c) & 0xc0) != 0x80 && (keep_nonprint || ((c) != 0x7f && (c) >= 0x20)))
	const unsigned char *in = (const unsigned char*)source;
	const size_t in_fill = strlen(source)+1; // including the zero
	char *out = *dest_;
	// First pass: Compute the size of the output.
	// Do _not_ change this to mpg123_strlen()!
	// It needs to match the writing loop below.
	size_t need = 1; // At least a zero.
	int c2lead = 0;
	int wasspace = 0;
	for(size_t i=0; i<in_fill; ++i)
	{
		unsigned char c = in[i];
		if(!keep_nonprint)
			ascii_space(&c, &wasspace);
		ASCII_C1(c, ++need)
		if(ASCII_PRINT_SOMETHING(c))
			++need;
	}
	// A zero here means that the counter overflowed. Do nothing then, or if
	// allocation fails.
	if(need)
		out = INT123_safer_realloc(out, need);
	if(!need || !out)
	{
		out = INT123_safer_realloc(out, 0);
		*dest_ = out;
		return out ? strlen(out)+1 : 0;
	}
	// Second pass: Write the output, with fresh filter state.
	unsigned char *w = (unsigned char*)out;
	c2lead = 0;
	wasspace = 0;
	for(size_t i=0; i<in_fill; ++i)
	{
		unsigned char c = in[i];
		if(!keep_nonprint)
			ascii_space(&c, &wasspace);
		ASCII_C1(c, *w++ = joker_symbol)
		if(ASCII_PRINT_SOMETHING(c))
			*w++ = (c & 0x80) ? joker_symbol : c; // joker for UTF-8 lead byte 11xxxxxx
	}
#undef ASCII_PRINT_SOMETHING
	// Always close the string.
	out[need-1] = 0;
	*dest_ = out;
	return strlen(out)+1;
}
// Reduce UTF-8 data to printable 7-bit ASCII for safe terminal output.
// Non-printable ASCII is everything below 0x20 (space), including line
// breaks, plus 0x7f (DEL) and the C1 chars. White space control characters
// are collapsed to a space, the other C0 and C1 chars are dropped, not
// rendered. Or should they be?
// Return value and handling of *dest are those of utf8_ascii_work().
static size_t utf8_ascii_print(char **dest, const char *source)
{
	const int keep_nonprint = 0;
	return utf8_ascii_work(dest, source, keep_nonprint);
}
// Reduce UTF-8 data to 7-bit ASCII, but keeping non-printable and control
// chars in the 7 bit realm.
// Return value and handling of *dest are those of utf8_ascii_work().
static size_t utf8_ascii(char **dest, const char *source)
{
	const int keep_nonprint = 1;
	return utf8_ascii_work(dest, source, keep_nonprint);
}
// Prepare UTF-8 text for output.
// dest_: address of the output buffer pointer, (re)allocated here
// source: zero-terminated UTF-8 input
// to_terminal: if non-zero, filter control characters for safe printout
// Depending on the globals utf8env and utf8loc, the result is
// - UTF-8 filtered and measured using the wide character functions of the
//   C library (utf8env, utf8loc, terminal, functions available),
// - UTF-8 with C0/C1 control characters filtered by hand (utf8env, terminal),
// - an unfiltered copy (utf8env, no terminal),
// - or a reduction to 7-bit ASCII (no utf8env).
// Returns the width of the result: printing columns in the first case,
// an estimate of one per character otherwise. On error, zero is returned
// and *dest_ is what INT123_safer_realloc(dest, 0) returns (the freed/NULL
// state that outstr() checks for).
size_t utf8outstr(char **dest_, const char *source, int to_terminal)
{
	if(!dest_)
		return 0;
	if(!source)
	{
		*dest_ = INT123_safer_realloc(*dest_, 0);
		return 0;
	}
	char *dest = *dest_;
	size_t width = 0;
	size_t source_fill = strlen(source)+1;
	if(utf8env)
	{
#if defined(HAVE_MBSTOWCS) && defined(HAVE_WCSWIDTH) && \
	defined(HAVE_ISWPRINT) && defined(HAVE_WCSTOMBS)
		if(utf8loc && to_terminal)
		{
			// Best case scenario: Convert to wide string, filter,
			// compute printing width.
			size_t wcharlen = mbstowcs(NULL, source, 0);
			if(wcharlen == (size_t)-1)
				goto utf8outstr_bad;
			if(wcharlen+1 > SIZE_MAX/sizeof(wchar_t))
				goto utf8outstr_bad;
			wchar_t *pre = malloc(sizeof(wchar_t)*(wcharlen+1));
			wchar_t *flt = malloc(sizeof(wchar_t)*(wcharlen+1));
			if(!pre || !flt)
			{
				free(flt);
				free(pre);
				goto utf8outstr_bad;
			}
			// Only set when dest really got filled with the new text.
			int converted = 0;
			if(mbstowcs(pre, source, wcharlen+1) == wcharlen)
			{
				size_t nwl = 0;
				int wasspace = 0;
				for(size_t i=0; i<wcharlen; ++i)
				{
					// Turn any funky space sequence (including line breaks) into
					// one normal space.
					if(iswspace(pre[i]) && pre[i] != ' ')
					{
						if(!wasspace)
							flt[nwl++] = ' ';
						wasspace = 1;
					} else // Anything non-printing is skipped.
					{
						if(iswprint(pre[i]))
							flt[nwl++] = pre[i];
						wasspace = 0;
					}
				}
				flt[nwl] = 0;
				int columns = wcswidth(flt, nwl);
				size_t bytelen = wcstombs(NULL, flt, 0);
				if(
					columns >= 0 && bytelen != (size_t)-1
					&& (dest=INT123_safer_realloc(dest, bytelen+1))
					&& wcstombs(dest, flt, bytelen+1) == bytelen
				){
					width = columns;
					converted = 1;
				}
			}
			free(flt);
			free(pre);
			// Any failure, including one of the second mbstowcs(), shall end
			// in the error state instead of leaving stale buffer contents.
			if(!converted)
				goto utf8outstr_bad;
		}
		else
#endif
		if(to_terminal)
		{
			// Only filter C0 and C1 control characters.
			// That is, 0x01 to 0x1f (white space among those collapsed into
			// 0x20, space) and 0x7f (DEL) to 0x9f.
			// Since the input and output is UTF-8, we'll keep that intact.
			// C1 is mapped to 0xc280 till 0xc29f.
			// The output never is longer than the input.
			dest = INT123_safer_realloc(dest, source_fill);
			if(!dest)
				goto utf8outstr_bad;
			size_t dest_fill = 0;
			int c2lead = 0;
			int wasspace = 0;
			unsigned char *p = (unsigned char*)dest;
			for(size_t i=0; i<source_fill; ++i)
			{
				unsigned char c = ((unsigned char*)source)[i];
				ascii_space(&c, &wasspace);
				// A held-back 0xc2 that did not start a C1 char is written
				// late. It is the start of a sequence and needs to be counted
				// here, as it never reaches the counting below.
				ASCII_C1(c, (*p++ = 0xc2, ++width))
				if(c && c < 0x20)
					continue; // no C0 control chars, except space
				if(c == 0x7f)
					continue; // also no DEL
				*p++ = c;
				if(!c)
					break; // Up to zero is enough.
				// Assume each 7 bit char and each sequence start make one character.
				// So only continuation bytes need to be ignored.
				if((c & 0xc0) != 0x80)
					++width;
			}
			// Make damn sure that it ends.
			dest_fill = (char*)p - dest;
			dest[dest_fill-1] = 0;
		} else
		{
			// No terminal: Plain copy, just estimating the width.
			dest = INT123_safer_realloc(dest, source_fill);
			if(!dest)
				goto utf8outstr_bad;
			size_t dest_fill = 0;
			unsigned char *p = (unsigned char*)dest;
			for(size_t i=0; i<source_fill; ++i)
			{
				unsigned char c = ((unsigned char*)source)[i];
				*p++ = c;
				if(!c)
					break; // Up to zero is enough.
				// Actual width should not matter that much for non-terminal,
				// as we should use less formatting in that case, but anyway.
				if((c & 0xc0) != 0x80)
					++width;
			}
			dest_fill = (char*)p - dest;
			dest[dest_fill-1] = 0;
		}
	} else
	{
		// Last resort: just 7-bit ASCII, for terminal and non-terminal
		// output alike. Only the handling of control characters differs.
		width = to_terminal
		?	utf8_ascii_print(&dest, source)
		:	utf8_ascii(&dest, source);
		if(!width)
			goto utf8outstr_bad;
		--width; // Returned value is strlen+1.
	}
	goto utf8outstr_end;
utf8outstr_bad:
	dest = INT123_safer_realloc(dest, 0);
	width = 0;
utf8outstr_end:
	*dest_ = dest;
	return width;
}
#undef ASCII_C1
// I tried saving some malloc using provided work buffers, but
// realized that the path of Unicode transformations is so full
// of them regardless.
// Can this include all the necessary logic?
// - If UTF-8 input: Use utf8outstr(), which includes terminal switch.
// - If not:
// -- If terminal: construct safe UTF-8, pass on to utf8outstr().
// -- If not: assume env encoding, unprocessed string that came
//    from the environment.

// Produce the string to print for given text.
// dest: address of the output buffer pointer; an existing buffer is reused
//   or released, the result is to be released using free()
// str: zero-terminated input text
// is_utf8: non-zero if str is known to be UTF-8
// is_term: non-zero if the output goes to a terminal
// Returns 0 on success, -1 on error. After an error in one of the
// conversions, *dest is NULL.
int outstr(char **dest, const char *str, int is_utf8, int is_term)
{
	if(!dest)
		return -1;
	if(!str)
	{
		*dest = INT123_safer_realloc(*dest, 0);
		return -1;
	}
	int ret = 0;
	if(is_utf8 || utf8env)
	{
		// The returned width is of no interest, and zero is valid for
		// empty output. Failure shows in the missing buffer only.
		utf8outstr(dest, str, is_term);
		if(!*dest)
			ret = -1;
	} else if(is_term)
	{
		char *usrc = NULL;
		ret = unknown2utf8(&usrc, str, -1);
		if(!ret)
		{
			utf8outstr(dest, usrc, is_term);
			if(!*dest)
				ret = -1;
		}
		free(usrc);
	} else
	{
		// Duplicate before releasing the old buffer, so that nothing leaks
		// and *dest ends up NULL on failure like in the other branches.
		char *copy = INT123_compat_strdup(str);
		free(*dest);
		*dest = copy;
		if(!*dest)
			ret = -1;
	}
	return ret;
}
// Convert text using outstr() and print the result to the given stream.
// out: stream to print to
// str: zero-terminated input text
// is_utf8, is_term: passed on to outstr()
// Returns the return value of fprintf() if there was something to print,
// otherwise the return value of outstr(), or -1 if str is NULL.
int print_outstr(FILE *out, const char *str, int is_utf8, int is_term)
{
	if(!str)
		return -1;
	char *text = NULL;
	int status = outstr(&text, str, is_utf8, is_term);
	if(!text)
		return status;
	status = fprintf(out, "%s", text);
	free(text);
	return status;
}