VirtualBox

Ignore:
Timestamp:
Jul 13, 2022 6:23:41 PM (3 years ago)
Author:
vboxsync
svn:sync-xref-src-repo-rev:
152263
Message:

IPRT/RTProcCreateEx/posix: Always utilise newlocale(3C) +
nl_langinfo_l(3C) when determining the codeset of the child's locale in
rtProcPosixConvertArgv(). Trimming the locale name of its
'language_territory' prefix and possible '@modifier' suffix isn't
sufficient for all locales as some "exotic" locales have a different
codeset than the one named in the locale name. Also handle composite
locales if found and deal with a buggy nl_langinfo(3C) on macOS which
can't parse locale names consisting of only 'language_territory' (e.g.
el_GR). bugref:10153

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/r3/posix/process-creation-posix.cpp

    r95320 r95623  
    6868#if defined(RT_OS_LINUX) || defined(RT_OS_SOLARIS)
    6969# include <shadow.h>
     70#endif
     71#if defined(RT_OS_DARWIN)
     72# include <xlocale.h> /* for newlocale() */
    7073#endif
    7174
     
    14641467    else
    14651468    {
    1466         /* LC_ALL overrides everything else.*/
    1467         /** @todo I don't recall now if this can do LC_XXX= inside it's value, like
    1468          *        what setlocale returns on some systems.  It's been 15-16 years
    1469          *        since I last worked on an setlocale implementation... */
     1469        /*
     1470         * LC_ALL overrides everything else.  The LC_* environment variables are often set
     1471         * to the empty string so move on to the next variable if that is the case.
     1472         */
    14701473        const char *pszVar;
    14711474        int rc = RTEnvGetEx(hEnvToUse, pszVar = "LC_ALL", szEncoding, sizeof(szEncoding), NULL);
    1472         if (rc == VERR_ENV_VAR_NOT_FOUND)
     1475        if (rc == VERR_ENV_VAR_NOT_FOUND || (RT_SUCCESS(rc) && !*szEncoding))
    14731476            rc = RTEnvGetEx(hEnvToUse, pszVar = "LC_CTYPE", szEncoding, sizeof(szEncoding), NULL);
    1474         if (rc == VERR_ENV_VAR_NOT_FOUND)
     1477        if (rc == VERR_ENV_VAR_NOT_FOUND || (RT_SUCCESS(rc) && !*szEncoding))
    14751478            rc = RTEnvGetEx(hEnvToUse, pszVar = "LANG", szEncoding, sizeof(szEncoding), NULL);
    1476         if (RT_SUCCESS(rc))
    1477         {
    1478             const char *pszDot = strchr(szEncoding, '.');
    1479             if (pszDot)
    1480                 pszDot = RTStrStripL(pszDot + 1);
    1481             if (pszDot && *pszDot)
    1482             {
    1483                 pszEncoding = pszDot;
    1484                 Log2Func(("%s=%s -> %s (simple)\n", pszVar, szEncoding, pszEncoding));
    1485             }
    1486             else
    1487             {
    1488                  /* No charset is given, so the default of the locale should be
    1489                     used.  To get at that we have to use newlocale and nl_langinfo_l,
    1490                     which is there since ancient days on linux but not necessarily else
    1491                     where. */
     1479        if (RT_SUCCESS(rc) && *szEncoding)
     1480        {
     1481            /*
     1482             * LC_ALL can contain a composite locale consisting of the locales of each of the
     1483             * categories in two different formats depending on the OS. On Solaris, macOS, and
     1484             * *BSD composite locale names use slash ('/') as the separator and the following
     1485             * order for the categories:
     1486             *   LC_CTYPE/LC_NUMERIC/LC_TIME/LC_COLLATE/LC_MONETARY/LC_MESSAGES
     1487             * e.g.:
     1488             *   en_US.UTF-8/POSIX/el_GR.UTF-8/el_CY.UTF-8/en_GB.UTF-8/es_ES.UTF-8
     1489             * N.B. On Solaris there is also a leading slash.
     1490             * On Linux the composite locale format is made up of key-value pairs of category
     1491             * names and locales of the form 'name=value' with each element separated by a
     1492             * semicolon in the same order as above with following additional categories
     1493             * included as well:
     1494             *   LC_PAPER/LC_NAME/LC_ADDRESS/LC_TELEPHONE/LC_MEASUREMENT/LC_IDENTIFICATION
     1495             * e.g.
     1496             *   LC_CTYPE=fr_BE;LC_NUMERIC=fr_BE@euro;LC_TIME=fr_BE.utf8;LC_COLLATE=fr_CA;\
     1497             *   LC_MONETARY=fr_CA.utf8;LC_MESSAGES=fr_CH;LC_PAPER=fr_CH.utf8;LC_NAME=fr_FR;\
     1498             *   LC_ADDRESS=fr_FR.utf8;LC_TELEPHONE=fr_LU;LC_MEASUREMENT=fr_LU@euro;\
     1499             *   LC_IDENTIFICATION=fr_LU.utf8
     1500             */
     1501#if !defined(RT_OS_LINUX)
     1502# if defined(RT_OS_SOLARIS)
     1503            if (RTPATH_IS_SLASH(*szEncoding))
     1504                (void) memmove(szEncoding, szEncoding + 1, strlen(szEncoding));
     1505# endif
     1506            char *pszSlash = strchr(szEncoding, '/');
     1507            if (pszSlash)
     1508                *pszSlash = '\0';
     1509#else
     1510            char *pszSemicolon = strchr(szEncoding, ';');
     1511            if (pszSemicolon)
     1512            {
     1513                *pszSemicolon = '\0';
     1514                size_t cchPrefix = strlen("LC_CTYPE=");
     1515                if (!RTStrNCmp(szEncoding, "LC_CTYPE=", cchPrefix))
     1516                    (void) memmove(szEncoding, szEncoding + cchPrefix, strlen(szEncoding));
     1517            }
     1518#endif
     1519            /*
     1520             * Use newlocale and nl_langinfo_l to determine the default codeset for the locale
     1521             * specified in the child's environment.  These routines have been around since
     1522             * ancient days on Linux and for quite a long time on macOS, Solaris, and *BSD but
     1523             * to ensure their availability check that LC_CTYPE_MASK is defined.
     1524             */
    14921525#ifdef LC_CTYPE_MASK
    1493                 locale_t hLocale = newlocale(LC_CTYPE_MASK, szEncoding, (locale_t)0);
    1494                 if (hLocale != (locale_t)0)
    1495                 {
    1496                     const char *pszCodeset = nl_langinfo_l(CODESET, hLocale);
    1497                     Log2Func(("nl_langinfo_l(CODESET, %s=%s) -> %s\n", pszVar, szEncoding, pszCodeset));
    1498                     Assert(pszCodeset && *pszCodeset != '\0');
    1499 
    1500                     rc = RTStrCopy(szEncoding, sizeof(szEncoding), pszCodeset);
    1501                     AssertRC(rc); /* cannot possibly overflow */
    1502 
    1503                     freelocale(hLocale);
    1504                     pszEncoding = szEncoding;
    1505                 }
    1506                 else
    1507 #endif
    1508                 {
    1509                     /* This is mostly wrong, but I cannot think of anything better now: */
    1510                     pszEncoding = rtStrGetLocaleCodeset();
    1511                     LogFunc(("No newlocale or it failed (on '%s=%s', errno=%d), falling back on %s that we're using...\n",
    1512                              pszVar, szEncoding, errno, pszEncoding));
    1513                 }
    1514             }
    1515             RT_NOREF_PV(pszVar);
     1526            locale_t hLocale = newlocale(LC_CTYPE_MASK, szEncoding, (locale_t)0);
     1527            if (hLocale != (locale_t)0)
     1528            {
     1529                const char *pszCodeset = nl_langinfo_l(CODESET, hLocale);
     1530# ifdef RT_OS_DARWIN
     1531                /*
     1532                 * The macOS nl_langinfo(3)/nl_langinfo_l(3) routines return a pointer to an
     1533                 * empty string for "short" locale names like en_NZ, it_IT, el_GR, etc. so
     1534                 * fall back to UTF-8 in those cases, which is the default for short name locales
     1535                 * on macOS anyhow.
     1536                 */
     1537                if (pszCodeset && !*pszCodeset)
     1538                    pszCodeset = "UTF-8";
     1539# endif
     1540                Log2Func(("nl_langinfo_l(CODESET, %s=%s) -> %s\n", pszVar, szEncoding, pszCodeset));
     1541                Assert(pszCodeset && *pszCodeset != '\0');
     1542
     1543                rc = RTStrCopy(szEncoding, sizeof(szEncoding), pszCodeset);
     1544                AssertRC(rc); /* cannot possibly overflow */
     1545
     1546                freelocale(hLocale);
     1547                pszEncoding = szEncoding;
     1548             }
     1549             else
     1550#endif
     1551             {
     1552                 /* This is mostly wrong, but I cannot think of anything better now: */
     1553                 pszEncoding = rtStrGetLocaleCodeset();
     1554                 LogFunc(("No newlocale or it failed (on '%s=%s', errno=%d), falling back on %s that we're using...\n",
     1555                          pszVar, szEncoding, errno, pszEncoding));
     1556             }
     1557             RT_NOREF_PV(pszVar);
    15161558        }
    15171559        else
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette