looper/subprojects/mpg123/src/local.c

	/*
	local: some stuff for localisation, safe terminal printout

	This is about determining if we got UTF-8 locale and
	checking output terminal properties, along with subsequent string
	transformations for safe printout.

	copyright 2008-2021 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Thomas Orgis, based on a patch by Thorsten Glaser.
*/

// wchar stuff
#define _XOPEN_SOURCE 600
#define _POSIX_C_SOURCE 200112L

#include "config.h"

#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#include "compat/compat.h"

#include "local.h"

#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif

#ifdef WIN32
#define WIN32_LEAN_AND_MEAN 1
#include <windows.h>
#include <wincon.h>
#endif

#include "common/debug.h"

// Global switches describing the text environment. check_locale() fills in
// utf8env and utf8loc; utf8force is only read there, so it has to be set
// before that call to have any effect.

// When non-zero, check_locale() sets utf8env without looking at the
// environment variables and additionally tries fallback UTF-8 locale names.
int utf8force = 0; // enforce UTF-8 workings
// Consulted by utf8outstr() and outstr() to decide between UTF-8 and
// 7-bit ASCII output.
int utf8env = 0; // produce UTF-8 text output
// Gates the wide character code path in utf8outstr().
int utf8loc = 0; // have actual UTF-8 locale (so that mbstowcs() works)

//static int term_is_fun = -1;

// Stand-in for anything that cannot be represented in 7-bit ASCII output.
static const char joker_symbol = '?';
// The bytes EF BF BD: UTF-8 encoding of U+FFFD (replacement character),
// written by unknown2utf8() for every input byte with the high bit set.
static const char *uni_repl = "\xef\xbf\xbd";
// Number of bytes in uni_repl, not counting the terminating zero.
static const int uni_repl_len = 3;

/* Check some language variable for UTF-8-ness. */
// Returns 1 if the string mentions UTF-8 in one of the known spellings,
// 0 otherwise (including for NULL).
static int is_utf8(const char *lang);

// Work out the text environment and store the findings in the globals
// utf8env (UTF-8 output wanted) and utf8loc (a UTF-8 locale is active).
// With utf8force set, utf8env is switched on unconditionally and fallback
// UTF-8 locales are tried if the one from the environment is not UTF-8.
// Neither flag is ever cleared here.
void check_locale(void)
{
	if(!utf8force)
	{
		// Check the env vars in proper order: the first one that is set
		// decides, even if it does not mention UTF-8.
		const char *lang = getenv("LC_ALL");
		if(lang == NULL)
			lang = getenv("LC_CTYPE");
		if(lang == NULL)
			lang = getenv("LANG");

		if(is_utf8(lang))
			utf8env = 1;
	}
	else
		utf8env = 1;

#if defined(HAVE_SETLOCALE) && defined(LC_CTYPE)
	{
		// To query, we need to set the locale from the environment...
		int got_utf8 = is_utf8(setlocale(LC_CTYPE, ""));
		// If enforced, try to set an UTF-8 locale that hopefully exists.
		if(!got_utf8 && utf8force)
			got_utf8 = is_utf8(setlocale(LC_CTYPE, "C.UTF-8"));
		if(!got_utf8 && utf8force)
			got_utf8 = is_utf8(setlocale(LC_CTYPE, "en_US.UTF-8"));

		if(got_utf8)
		{
			utf8env = 1;
			utf8loc = 1;
		}
	}
#endif
#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	// ...and langinfo reflects the locale that has been set, so asking it
	// makes sense only after the setlocale() above.
	if(is_utf8(nl_langinfo(CODESET)))
	{
		utf8env = 1;
		utf8loc = 1;
	}
#endif

	debug2("UTF-8 env %i: locale: %i", utf8env, utf8loc);
}

// Tell if a locale/language specification names UTF-8.
// lang: string to examine, may be NULL
// return: 1 if one of the spellings "UTF-8", "utf-8", "UTF8" or "utf8"
//   occurs anywhere in the string, 0 otherwise (also for NULL). Mixed-case
//   variants are not recognized.
static int is_utf8(const char *lang)
{
	static const char *const spellings[] = { "UTF-8", "utf-8", "UTF8", "utf8" };

	if(!lang)
		return 0;

	for(size_t i=0; i<sizeof(spellings)/sizeof(spellings[0]); ++i)
	{
		if(strstr(lang, spellings[i]) != NULL)
			return 1;
	}
	return 0;
}

// Moved encoding stuff over from metaprint.c and removed references to libmpg123,
// meaning no mpg123_string for you!

// Turn text in some unknown 8-bit encoding into valid UTF-8.
// dest:   address of the output buffer pointer; the buffer is resized with
//         INT123_safer_realloc() and receives a zero-terminated string
// source: input bytes; NULL makes the call fail after handing *dest to
//         INT123_safer_realloc() with size 0
// len:    number of input bytes to look at at most, negative to use
//         strlen(source); processing always stops at the first zero byte
// return: 0 on success, -1 on error (NULL argument or failed allocation)
int unknown2utf8(char **dest, const char *source, int len)
{
	if(!dest)
		return -1;
	if(!source)
	{
		*dest = INT123_safer_realloc(*dest, 0);
		return -1;
	}

	const unsigned char *in = (const unsigned char*)source;
	const size_t limit = len < 0 ? strlen(source) : (size_t)len;

	// Testing for valid UTF-8 is futile, the input is some unspecified
	// legacy 8-bit encoding. All 7-bit values (including C0 controls) are
	// kept, everything above is replaced by the Unicode replacement
	// character: Custom 8-bit encodings liked to put symbols into the C1
	// range and we just don't know which.
	// First pass: find the end of input and the needed output size.
	size_t used = 0;
	size_t need = 1; // trailing zero
	while(used < limit && in[used])
	{
		need += in[used] >= 0x80 ? uni_repl_len : 1;
		++used;
	}

	*dest = INT123_safer_realloc(*dest, need);
	if(*dest == NULL)
		return -1;

	// Second pass: write the converted bytes.
	unsigned char *out = (unsigned char*)*dest;
	for(size_t i=0; i<used; ++i)
	{
		if(in[i] < 0x80)
		{
			*out++ = in[i];
			continue;
		}
		for(int r=0; r<uni_repl_len; ++r)
			*out++ = uni_repl[r];
	}
	*out = 0;
	return 0;
}

// Collapse runs of ASCII control whitespace into a single space.
// c:        current byte; a form feed, carriage return, line feed, tab or
//           vertical tab that starts a run is rewritten to ' ', later ones
//           in the same run are left alone (callers drop them as control
//           characters below 0x20)
// wasspace: run state, set to 1 after such a byte and to 0 after any other
//           byte, including a plain space
static void ascii_space(unsigned char *c, int *wasspace)
{
	const unsigned char ch = *c;
	const int ctl_space =
		ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' || ch == '\r';

	if(!ctl_space)
	{
		*wasspace = 0;
		return;
	}
	if(!*wasspace)
		*c = ' '; // Will be dropped by < 0x20 check otherwise.
	*wasspace = 1;
}

// Filter C1 control chars, using c2lead state.
// In UTF-8, the C1 controls U+0080..U+009F are the two-byte sequences
// 0xc2 0x80 .. 0xc2 0x9f. This macro therefore withholds each 0xc2 byte for
// one iteration and decides with the following byte:
// - following byte in 0x80..0x9f: the pair is a C1 control, both bytes are
//   skipped;
// - anything else: the statement given as 'append' is executed to account
//   for the withheld 0xc2, then the current byte is handled normally (it
//   may itself be a 0xc2 that gets withheld in turn).
// Requirements on the caller: an int variable named c2lead has to be in
// scope (initialized to 0 before the loop) and the macro has to be expanded
// directly inside a loop body, as it uses 'continue' on that loop. For this
// reason it cannot be wrapped into do { } while(0).
#define ASCII_C1(c, append) \
	if(c2lead) \
	{ \
		if((c) >= 0x80 && (c) <= 0x9f) \
		{ \
			c2lead = 0; \
			continue; \
		} \
		else \
		{ \
			append; \
		} \
	} \
	c2lead = ((c) == 0xc2); \
	if(c2lead) \
		continue;

// Reduce UTF-8 text to 7-bit ASCII: each multi-byte character becomes one
// joker symbol, C1 control characters vanish.
// dest_:         address of the output buffer pointer, (re)allocated here
// source:        zero-terminated UTF-8 input
// keep_nonprint: if zero, control characters below 0x20 and DEL are dropped
//                after whitespace runs got folded by ascii_space(); if
//                non-zero, all 7-bit values pass through unchanged
// return: strlen+1 of result, 0 on error
// If the return value is 0, *dest_ has been handed to
// INT123_safer_realloc() with size 0 (freed and NULL).
static size_t utf8_ascii_work(char **dest_, const char *source
,	int keep_nonprint)
{
	if(!dest_)
		return 0;
	if(!source)
	{
		*dest_ = INT123_safer_realloc(*dest_, 0);
		return 0;
	}

	const unsigned char *in = (const unsigned char*)source;
	const size_t in_fill = strlen(source)+1; // Including the closing zero.
	char *out = *dest_;
	size_t need = 1; // At least a zero.
	int c2lead; // State for ASCII_C1.
	int wasspace; // State for ascii_space().

	// A byte produces output if it is no UTF-8 continuation byte (binary
	// 10xxxxxx) and, unless non-printables are kept, no control character.
#define ASCII_PRINT_SOMETHING(c) \
	(((c) & 0xc0) != 0x80 && (keep_nonprint || ((c) != 0x7f && (c) >= 0x20)))

	// First pass: Determine the size of the ASCII string.
	// This is no plain character count, it needs to match the second pass
	// below step by step.
	c2lead = 0;
	wasspace = 0;
	for(size_t i=0; i<in_fill; ++i)
	{
		unsigned char c = in[i];
		if(!keep_nonprint)
			ascii_space(&c, &wasspace);
		ASCII_C1(c, ++need)
		if(ASCII_PRINT_SOMETHING(c))
			++need;
	}

	// Do nothing with nothing or if allocation fails. The check for zero
	// also catches an overflow of the counter.
	if(!need || !(out = INT123_safer_realloc(out, need)))
		out = INT123_safer_realloc(out, 0);
	else
	{
		// Second pass: Write the output, same decisions as above.
		unsigned char *w = (unsigned char*)out;
		c2lead = 0;
		wasspace = 0;
		for(size_t i=0; i<in_fill; ++i)
		{
			unsigned char c = in[i];
			if(!keep_nonprint)
				ascii_space(&c, &wasspace);
			ASCII_C1(c, *w++ = joker_symbol)
			if(ASCII_PRINT_SOMETHING(c))
			{
				// What is left with the high bit set is an UTF-8 lead
				// byte (binary 11xxxxxx), standing for a whole character.
				*w++ = (c & 0x80) ? (unsigned char)joker_symbol : c;
			}
		}
		// Always close the string.
		out[need-1] = 0;
	}
#undef ASCII_PRINT_SOMETHING

	*dest_ = out;
	return out ? strlen(out)+1 : 0;
}

// Printable 7-bit ASCII rendition of UTF-8 data.
// This is utf8_ascii_work() with non-printable characters dropped: all
// below 0x20 (space), which includes line breaks, as well as 0x7f (DEL)
// and the C1 chars. Those are dropped, not rendered. Or should they be?
// dest, source and return value: as for utf8_ascii_work()
static size_t utf8_ascii_print(char **dest, const char *source)
{
	const int keep_nonprint = 0;
	return utf8_ascii_work(dest, source, keep_nonprint);
}

// 7-bit ASCII rendition of UTF-8 data that keeps the non-printable and
// control chars of the 7 bit realm (line breaks and friends).
// This is utf8_ascii_work() with keep_nonprint enabled.
// dest, source and return value: as for utf8_ascii_work()
static size_t utf8_ascii(char **dest, const char *source)
{
	const int keep_nonprint = 1;
	return utf8_ascii_work(dest, source, keep_nonprint);
}

// Prepare UTF-8 text for output, depending on environment and target.
// dest_:       address of the output buffer pointer, (re)allocated here;
//              it has to hold NULL or memory that may be passed to realloc
// source:      zero-terminated UTF-8 input
// to_terminal: non-zero if the text is going to a terminal and hence has
//              to be filtered for control characters
// return: (estimated) width of the result in terminal columns
// On failure, 0 is returned and *dest_ is handed to INT123_safer_realloc()
// with size 0 (freed and NULL). An empty result also has width 0, so check
// *dest_ to tell success from failure.
//
// The variants, chosen by the globals utf8env and utf8loc:
// - UTF-8 locale and terminal: filter via wide characters, proper width
// - UTF-8 output and terminal: byte-wise filter of C0/C1 controls and DEL
// - UTF-8 output, no terminal: plain copy
// - no UTF-8 output: reduction to 7-bit ASCII, printable only for terminal
size_t utf8outstr(char **dest_, const char *source, int to_terminal)
{
	if(!dest_)
		return 0;
	if(!source)
	{
		*dest_ = INT123_safer_realloc(*dest_, 0);
		return 0;
	}
	char *dest = *dest_;
	size_t width = 0;
	size_t source_fill = strlen(source)+1;

	if(utf8env)
	{
#if defined(HAVE_MBSTOWCS) && defined(HAVE_WCSWIDTH) && \
    defined(HAVE_ISWPRINT) && defined(HAVE_WCSTOMBS)
		if(utf8loc && to_terminal)
		{
			// Best case scenario: Convert to wide string, filter,
			// compute printing width.
			size_t wcharlen = mbstowcs(NULL, source, 0);
			if(wcharlen == (size_t)-1)
				goto utf8outstr_bad;
			if(wcharlen+1 > SIZE_MAX/sizeof(wchar_t))
				goto utf8outstr_bad;
			wchar_t *pre = malloc(sizeof(wchar_t)*(wcharlen+1));
			wchar_t *flt = malloc(sizeof(wchar_t)*(wcharlen+1));
			if(!pre || !flt)
			{
				free(flt);
				free(pre);
				goto utf8outstr_bad;
			}
			if(mbstowcs(pre, source, wcharlen+1) == wcharlen)
			{
				size_t nwl = 0;
				int wasspace = 0;
				for(size_t i=0;  i<wcharlen; ++i)
				{
					// Turn any funky space sequence (including line breaks) into
					// one normal space.
					if(iswspace(pre[i]) && pre[i] != ' ')
					{
						if(!wasspace)
							flt[nwl++] = ' ';
						wasspace = 1;
					} else // Anything non-printing is skipped.
					{
						if(iswprint(pre[i]))
							flt[nwl++] = pre[i];
						wasspace = 0;
					}
				}
				flt[nwl] = 0;
				int columns = wcswidth(flt, nwl);
				size_t bytelen = wcstombs(NULL, flt, 0);
				if(
					columns >= 0 && bytelen != (size_t)-1
					&& (dest=INT123_safer_realloc(dest, bytelen+1))
					&& wcstombs(dest, flt, bytelen+1) == bytelen
				){
					width = columns;
				}
				else
					dest=INT123_safer_realloc(dest, 0);
			}
			else // Conversion failed after all: Do not hand back stale contents.
				dest=INT123_safer_realloc(dest, 0);
			free(flt);
			free(pre);
		}
		else
#endif
		if(to_terminal)
		{
			// Only filter C0 and C1 control characters.
			// That is, 0x01 to 0x1f (keeping 0x20, space) and 0x7f (DEL) to 0x9f.
			// Since the input and output is UTF-8, we'll keep that intact.
			// C1 is mapped to 0xc280 till 0xc29f.
			// The output cannot get longer than the input, bytes are only
			// dropped or replaced one by one.
			dest = INT123_safer_realloc(dest, source_fill);
			if(!dest)
				goto utf8outstr_bad;
			size_t dest_fill = 0;
			int c2lead = 0;
			int wasspace = 0;
			unsigned char *p = (unsigned char*)dest;
			for(size_t i=0; i<source_fill; ++i)
			{
				unsigned char c = ((unsigned char*)source)[i];
				ascii_space(&c, &wasspace);
				// A withheld 0xc2 that did not begin a C1 control is put back.
				// It is the lead byte of a character (U+00A0 to U+00BF) and
				// as such needs to be counted for the width, too.
				ASCII_C1(c, (*p++ = 0xc2, ++width))
				if(c && c < 0x20)
					continue; // no C0 control chars, except space
				if(c == 0x7f)
					continue; // also no DEL
				*p++ = c;
				if(!c)
					break; // Up to zero is enough.
				// Assume each 7 bit char and each sequence start make one character.
				// So only continuation bytes need to be ignored.
				if((c & 0xc0) != 0x80)
					++width;
			}
			// Make damn sure that it ends.
			dest_fill = (char*)p - dest;
			dest[dest_fill-1] = 0;
		} else
		{
			dest = INT123_safer_realloc(dest, source_fill);
			if(!dest)
				goto utf8outstr_bad;
			size_t dest_fill = 0;
			unsigned char *p = (unsigned char*)dest;
			for(size_t i=0; i<source_fill; ++i)
			{
				unsigned char c = ((unsigned char*)source)[i];
				*p++ = c;
				if(!c)
					break; // Up to zero is enough.
				// Actual width should not matter that much for non-terminal,
				// as we should use less formatting in that case, but anyway.
				if((c & 0xc0) != 0x80)
					++width;
			}
			dest_fill = (char*)p - dest;
			dest[dest_fill-1] = 0;
		}
	} else
	{
		// Last resort: just 7-bit ASCII. This has to cover the non-terminal
		// case, too, otherwise there would be no output at all for it.
		width = to_terminal
		?	utf8_ascii_print(&dest, source)
		:	utf8_ascii(&dest, source);
		if(!width)
			goto utf8outstr_bad;
		--width; // The helpers return strlen+1.
	}

	goto utf8outstr_end;
utf8outstr_bad:
	dest = INT123_safer_realloc(dest, 0);
	width = 0;
utf8outstr_end:
	*dest_ = dest;
	return width;
}

#undef ASCII_C1

// I tried saving some malloc using provided work buffers, but
// realized that the path of Unicode transformations is so full
// of them regardless.
// This function bundles the necessary logic:
// - If UTF-8 input (or UTF-8 environment): Use utf8outstr(), which
//   includes the terminal switch.
// - If not:
// -- If terminal: construct safe UTF-8 using unknown2utf8(), pass that
//    on to utf8outstr().
// -- If not: assume env encoding, unprocessed string that came
//    from the environment. It is just duplicated.
//
// dest:    address of the output buffer pointer; it has to hold NULL or
//          memory that may be passed to realloc/free, and gets replaced
// str:     zero-terminated input text
// is_utf8: non-zero if str is known to be UTF-8
// is_term: non-zero if the output is going to a terminal
// return: 0 on success (*dest holds the result), -1 on error (NULL
//   argument, failed conversion or allocation)

int outstr(char **dest, const char *str, int is_utf8, int is_term)
{
	if(!dest)
		return -1;
	if(!str)
	{
		*dest = INT123_safer_realloc(*dest, 0);
		return -1;
	}
	int ret = 0;
	if(is_utf8 || utf8env)
	{
		// The returned width is no error indicator (empty strings have
		// zero width, too), but a missing result buffer is.
		utf8outstr(dest, str, is_term);
		if(!*dest)
			ret = -1;
	} else if(is_term)
	{
		char *usrc = NULL;
		ret = unknown2utf8(&usrc, str, -1);
		if(!ret)
		{
			utf8outstr(dest, usrc, is_term);
			if(!*dest)
				ret = -1;
		}
		free(usrc);
	} else
	{
		// Duplicate first, then release the previous buffer, so that a
		// reused *dest is not leaked. The other paths hand it to realloc.
		char *copy = INT123_compat_strdup(str);
		free(*dest);
		*dest = copy;
		if(!*dest)
			ret = -1;
	}
	return ret;
}

// Convert a string using outstr() and print the result.
// out:     stream to print to
// str:     zero-terminated input text, NULL is an error
// is_utf8: non-zero if str is known to be UTF-8
// is_term: non-zero if out is a terminal
// return: the return value of fprintf() if there was something to print,
//   otherwise what outstr() returned, or -1 for NULL input
int print_outstr(FILE *out, const char *str, int is_utf8, int is_term)
{
	if(!str)
		return -1;

	char *outbuf = NULL;
	int ret = outstr(&outbuf, str, is_utf8, is_term);
	if(!outbuf)
		return ret;

	ret = fprintf(out, "%s", outbuf);
	free(outbuf);
	return ret;
}