• Re: Unicode...

    From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c on Wed Nov 26 19:42:09 2025
    From Newsgroup: comp.lang.c

    I've developed a UTF-8 width function with AVX-512 that can validate
    for a proper number of extension bytes after the header bytes. The
    validation is done with bit-masks delivered from AVX-intrinsics,
    i.e. without loops.
    The code accepts a basic_string_view with a character width of one
    byte (all three char-types and char8_t). It's about 20 times faster
    than a pure validation based on non-vectorized code.
    I'll make an AVX (without 512) version so that you can test the code.

    template<bool Validate, typename Char, typename Traits>
        requires is_integral_v<Char> && (sizeof(Char) == 1)
    size_t utf8Width512( basic_string_view<Char, Traits> str )
    {
        if( str.empty() )
            return 0;
        constexpr uint64_t ALL_ONES = -1;
        __m512i const
            oneMask = _mm512_set1_epi8( (char)0x80 ),
            oneHead = _mm512_setzero_si512();
        uintptr_t
            uBegin = (uintptr_t)to_address( str.begin() ),
            uEnd = (uintptr_t)to_address( str.end() );
        using span_t = span<__m512i>;
        span<__m512i> range64( (__m512i *)(uBegin & -64), (__m512i *)(uEnd
    + 63 & -64) );
        size_t
            head = uBegin & 63,
            tail = uEnd & 63;
        size_t n = 0;
        uint64_t mask;
        if constexpr( Validate )
        {
            __m512i const
                extendMask = _mm512_set1_epi8( (char)0xC0 ),
                extendHead = _mm512_set1_epi8( (char)0x80 ),
                twoMask = _mm512_set1_epi8( (char)0xE0 ),
                twoHead = _mm512_set1_epi8( (char)0xC0 ),
                threeMask = _mm512_set1_epi8( (char)0xF0 ),
                threeHead = _mm512_set1_epi8( (char)0xE0 ),
                fourMask = _mm512_set1_epi8( (char)0xF8 ),
                fourHead = _mm512_set1_epi8( (char)0xF0 ),
                invalid = fourMask;
            uint64_t one = 0, extend = 0, extendPrev = 0, two = 0, three =
    0, four = 0;
            auto doChunk = [&]( span_t::iterator it64 ) L_FORCEINLINE
            {
                (void)(it64 + 1);
                __m512i chunk = _mm512_load_si512( to_address( it64 ) );
                if( _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, invalid ), invalid ) & mask ) [[unlikely]]
                    return false;
                one = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, oneMask ), oneHead ) & mask;
                extend = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, extendMask ), extendHead ) & mask;
                two = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, twoMask ), twoHead ) & mask;
                three = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, threeMask ), threeHead ) & mask;
                four = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk, fourMask ), fourHead ) & mask;
                auto shrd = []( uint64_t left, uint64_t right, unsigned n ) L_FORCEINLINE { return  left << 64 - n | right >> n; };
                uint64_t
                    extend2 = shrd( extendPrev, extend, 1 ),
                    extend3 = shrd( extendPrev, extend, 2 ) & extend2,
                    extend4 = shrd( extendPrev, extend, 3 ) & extend3,
                    beyond = shrd( extendPrev, extend, 4 ) & extend4,
                    err;
                err = (two & extend2) ^ two;
                err |= (three & extend3) ^ three;
                err |= (four & extend4) ^ four;
                err |= one & extend2;
                err |= two & extend3;
                err |= three & extend4;
                err |= four & beyond;
                if( err ) [[unlikely]]
                    return false;
                n += popcount( one | two | three | four );
                extendPrev = extend;
                return true;
            };
            span_t::iterator it64 = range64.end();
            mask = tail ? ~(ALL_ONES << tail) : ALL_ONES;
            while( it64 > range64.begin() + (size_t)(bool)head )
                if( doChunk( --it64 ) ) [[likely]]
                    mask = ALL_ONES;
                else
                    return -1;
            if( head ) [[likely]]
            {
                mask &= ALL_ONES << head;
                doChunk( it64 );
            }
            if( countr_zero( extendPrev ) < countr_zero( one | two | three
    | four ) ) [[unlikely]]
                return -1;
            return n;
        }
        else
        {
            __m512i const
                mask24 = _mm512_set1_epi8( (char)0xC0 ),
                head24 = mask24;
            auto doChunk = [&]( span_t::iterator it64 ) L_FORCEINLINE
            {
                (void)(it64 + 1);
                __m512i chunk = _mm512_load_si512( to_address( it64 ) );
                uint64_t
                    one = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,
    oneMask ), oneHead ) & mask,
                    twoAndMore = _mm512_cmpeq_epi8_mask( _mm512_and_si512(
    chunk, mask24 ), head24 ) & mask;
                n += popcount( one | twoAndMore );
            };
            span_t::iterator it64 = range64.begin();
            mask = ALL_ONES << head;;
            for( ; it64 != range64.end() - (bool)tail; ++it64 )
            {
                doChunk( it64);
                mask = -1;
            }
            if( !tail )
                return n;
            mask &= ~(ALL_ONES << tail);
            doChunk( it64 );
            return n;
        }
    }

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c on Wed Dec 3 06:24:23 2025
    From Newsgroup: comp.lang.c

    Am 18.11.2025 um 21:17 schrieb Michael Sanders:
    Hi James, umm 'guarantees'? No no... It does NOT verify:

    - whether the environment actually supports UTF8 fully
    - whether multibyte functions are enabled
    - whether the terminal supports UTF8
    - whether the C library supports UTF8 normalization
    (combining characters, etc. but it seems to work well here)

    To be sure: It's not a UTF-8 capability test. It's only a
    locale-string check. So it likely misses many valid UTF8
    locale variants...

    Here I'm running any mixture of: Windows/BSD/Linux Mint LMDE.
    Windows has the ...W() APIs along with codepage-based APIs with
    the ...A() Suffix. The W()-APIs support UTF-16, so no need for
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Michael Sanders@porkchop@invalid.foo to comp.lang.c on Wed Dec 3 18:33:05 2025
    From Newsgroup: comp.lang.c

    On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:

    Here I'm running any mixture of: Windows/BSD/Linix Mint LMDE.

    Windows has the ...W() APIs along with codepage-based APIs with
    the ...A() Suffix. The W()-APIs support UTF-16, so no need for

    Hi Bonita.

    Yes that's correct, but...

    - that assumes we know in advance what the character is

    - it would only work under Windows

    We want portability across diverse OSs. In my case, the program
    does NOT care what the character is, it simply needs to be able
    to find it when searching data & displaying it in an ordered way.

    The code below works perfectly:

    #include <stdio.h>
    #include <string.h>

    // Estimates how many fixed-width terminal cells the NUL-terminated
    // UTF-8 string `s` occupies.
    //
    // Each decoded code point counts as 0 cells (combining marks
    // U+0300..U+036F), 2 cells (the wide ranges listed below) or 1 cell
    // (everything else). A byte that does not start a complete, well-formed
    // sequence is treated as a single 1-cell character, so the function
    // never reads beyond the terminating NUL.
    //
    // Returns the total cell count.
    int utf8_display_width(const char *s) {
        int w = 0;

        while (*s) {
            const unsigned char *p = (const unsigned char *)s;
            unsigned char b = p[0];
            unsigned cp = b;  // 1-byte ASCII, or an invalid byte taken as-is
            int n = 1;        // bytes consumed by this character
            int want = 0;     // continuation bytes announced by the lead byte

            // UTF-8 decoder: classify the lead byte
            if (b >= 0xC0 && b <= 0xDF) {        // 2-byte
                cp = b & 0x1F;
                want = 1;
            } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte
                cp = b & 0x0F;
                want = 2;
            } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte
                cp = b & 0x07;
                want = 3;
            }

            if (want > 0) {
                int k = 1;

                // p[k] is only inspected after p[k-1] proved to be non-NUL,
                // so a truncated sequence cannot run past the terminator
                while (k <= want && (p[k] & 0xC0) == 0x80) {
                    cp = (cp << 6) | (unsigned)(p[k] & 0x3F);
                    k++;
                }

                if (k > want) {
                    n = want + 1;
                } else { // truncated or malformed: invalid, treat as 1-byte
                    cp = b;
                    n = 1;
                }
            }

            // display width
            if (cp >= 0x0300 && cp <= 0x036F) {
                // combining marks like é (zero width)
            } else if ( // double-width characters...
                (cp >= 0x1100 && cp <= 0x115F) ||  // hangul jamo
                (cp >= 0x2E80 && cp <= 0xA4CF) ||  // cjk radicals & unified ideographs
                (cp >= 0xAC00 && cp <= 0xD7A3) ||  // hangul syllables
                (cp >= 0xF900 && cp <= 0xFAFF) ||  // cjk compatibility ideographs
                (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols
            ) {
                w += 2;
            } else if (cp == 0x2329 || cp == 0x232A) {
                // exceptional wide characters (angle brackets)
                w += 2;
            } else {
                w += 1; // normal width for everything else
            }

            s += n;
        }

        return w;
    }

    // Demo driver: prints every sample string, pads it with spaces to a
    // common column (widest display width + 3), then prints the strlen()
    // result next to the computed display width.
    int main(void) {
        const char *tests[] = {
            "hello",
            "Café",
            "漢字",
            "✓",
            "🙂",
            NULL
        };
        const char **row;
        int widest = 0;

        // pass 1: largest display width among the samples
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            widest = (cols > widest) ? cols : widest;
        }

        // pass 2: print each sample followed by its padding and statistics
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            int bytes = (int)strlen(*row);
            int fill;

            printf("%s", *row);
            for (fill = widest + 3 - cols; fill > 0; --fill)
                putchar(' ');
            printf("strlen: %d utf8 display width: %d\n", bytes, cols);
        }

        return 0;
    }

    // eof
    --
    :wq
    Mike Sanders
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From James Kuyper@jameskuyper@alumni.caltech.edu to comp.lang.c on Wed Dec 3 14:01:38 2025
    From Newsgroup: comp.lang.c

    On 2025-12-03 13:33, Michael Sanders wrote:
    ...
    We want portability across diverse OSs. In my case, the program
    does NOT care what the character is, it simply needs to be able
    to find it when searching data & displaying it in an ordered way.

    The code below works perfectly:

    #include <stdio.h>
    #include <string.h>

    // Estimates how many fixed-width terminal cells the NUL-terminated
    // UTF-8 string `s` occupies.
    //
    // Each decoded code point counts as 0 cells (combining marks
    // U+0300..U+036F), 2 cells (the wide ranges listed below) or 1 cell
    // (everything else). A byte that does not start a complete, well-formed
    // sequence is treated as a single 1-cell character, so the function
    // never reads beyond the terminating NUL.
    //
    // Returns the total cell count.
    int utf8_display_width(const char *s) {
        int w = 0;

        while (*s) {
            const unsigned char *p = (const unsigned char *)s;
            unsigned char b = p[0];
            unsigned cp = b;  // 1-byte ASCII, or an invalid byte taken as-is
            int n = 1;        // bytes consumed by this character
            int want = 0;     // continuation bytes announced by the lead byte

            // UTF-8 decoder: classify the lead byte
            if (b >= 0xC0 && b <= 0xDF) {        // 2-byte
                cp = b & 0x1F;
                want = 1;
            } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte
                cp = b & 0x0F;
                want = 2;
            } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte
                cp = b & 0x07;
                want = 3;
            }

            if (want > 0) {
                int k = 1;

                // p[k] is only inspected after p[k-1] proved to be non-NUL,
                // so a truncated sequence cannot run past the terminator
                while (k <= want && (p[k] & 0xC0) == 0x80) {
                    cp = (cp << 6) | (unsigned)(p[k] & 0x3F);
                    k++;
                }

                if (k > want) {
                    n = want + 1;
                } else { // truncated or malformed: invalid, treat as 1-byte
                    cp = b;
                    n = 1;
                }
            }

            // display width
            if (cp >= 0x0300 && cp <= 0x036F) {
                // combining marks like é (zero width)
            } else if ( // double-width characters...
                (cp >= 0x1100 && cp <= 0x115F) ||  // hangul jamo
                (cp >= 0x2E80 && cp <= 0xA4CF) ||  // cjk radicals & unified ideographs
                (cp >= 0xAC00 && cp <= 0xD7A3) ||  // hangul syllables
                (cp >= 0xF900 && cp <= 0xFAFF) ||  // cjk compatibility ideographs
                (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols
            ) {
                w += 2;
            } else if (cp == 0x2329 || cp == 0x232A) {
                // exceptional wide characters (angle brackets)
                w += 2;
            } else {
                w += 1; // normal width for everything else
            }

            s += n;
        }

        return w;
    }

    // Demo driver: prints every sample string, pads it with spaces to a
    // common column (widest display width + 3), then prints the strlen()
    // result next to the computed display width.
    int main(void) {
        const char *tests[] = {
            "hello",
            "Café",
            "漢字",
            "✓",
            "🙂",
            NULL
        };
        const char **row;
        int widest = 0;

        // pass 1: largest display width among the samples
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            widest = (cols > widest) ? cols : widest;
        }

        // pass 2: print each sample followed by its padding and statistics
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            int bytes = (int)strlen(*row);
            int fill;

            printf("%s", *row);
            for (fill = widest + 3 - cols; fill > 0; --fill)
                putchar(' ');
            printf("strlen: %d utf8 display width: %d\n", bytes, cols);
        }

        return 0;
    }

    // eof


    I find it confusing that this is supposed to "work perfectly" "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is monospaced or proportional. Those fonts can be different for display on
    screen or on a printer. I don't see any query to determine even what the current font is, much less what its characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?


    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From bart@bc@freeuk.com to comp.lang.c on Wed Dec 3 20:15:02 2025
    From Newsgroup: comp.lang.c

    On 03/12/2025 19:01, James Kuyper wrote:
    On 2025-12-03 13:33, Michael Sanders wrote:
    ...
    We want portability across diverse OSs. In my case, the program
    does NOT care what the character is, it simply needs to be able
    to find it when searching data & displaying it in an ordered way.

    The code below works perfectly:

    #include <stdio.h>
    #include <string.h>

    // Estimates how many fixed-width terminal cells the NUL-terminated
    // UTF-8 string `s` occupies.
    //
    // Each decoded code point counts as 0 cells (combining marks
    // U+0300..U+036F), 2 cells (the wide ranges listed below) or 1 cell
    // (everything else). A byte that does not start a complete, well-formed
    // sequence is treated as a single 1-cell character, so the function
    // never reads beyond the terminating NUL.
    //
    // Returns the total cell count.
    int utf8_display_width(const char *s) {
        int w = 0;

        while (*s) {
            const unsigned char *p = (const unsigned char *)s;
            unsigned char b = p[0];
            unsigned cp = b;  // 1-byte ASCII, or an invalid byte taken as-is
            int n = 1;        // bytes consumed by this character
            int want = 0;     // continuation bytes announced by the lead byte

            // UTF-8 decoder: classify the lead byte
            if (b >= 0xC0 && b <= 0xDF) {        // 2-byte
                cp = b & 0x1F;
                want = 1;
            } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte
                cp = b & 0x0F;
                want = 2;
            } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte
                cp = b & 0x07;
                want = 3;
            }

            if (want > 0) {
                int k = 1;

                // p[k] is only inspected after p[k-1] proved to be non-NUL,
                // so a truncated sequence cannot run past the terminator
                while (k <= want && (p[k] & 0xC0) == 0x80) {
                    cp = (cp << 6) | (unsigned)(p[k] & 0x3F);
                    k++;
                }

                if (k > want) {
                    n = want + 1;
                } else { // truncated or malformed: invalid, treat as 1-byte
                    cp = b;
                    n = 1;
                }
            }

            // display width
            if (cp >= 0x0300 && cp <= 0x036F) {
                // combining marks like é (zero width)
            } else if ( // double-width characters...
                (cp >= 0x1100 && cp <= 0x115F) ||  // hangul jamo
                (cp >= 0x2E80 && cp <= 0xA4CF) ||  // cjk radicals & unified ideographs
                (cp >= 0xAC00 && cp <= 0xD7A3) ||  // hangul syllables
                (cp >= 0xF900 && cp <= 0xFAFF) ||  // cjk compatibility ideographs
                (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols
            ) {
                w += 2;
            } else if (cp == 0x2329 || cp == 0x232A) {
                // exceptional wide characters (angle brackets)
                w += 2;
            } else {
                w += 1; // normal width for everything else
            }

            s += n;
        }

        return w;
    }

    // Demo driver: prints every sample string, pads it with spaces to a
    // common column (widest display width + 3), then prints the strlen()
    // result next to the computed display width.
    int main(void) {
        const char *tests[] = {
            "hello",
            "Café",
            "漢字",
            "✓",
            "🙂",
            NULL
        };
        const char **row;
        int widest = 0;

        // pass 1: largest display width among the samples
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            widest = (cols > widest) ? cols : widest;
        }

        // pass 2: print each sample followed by its padding and statistics
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            int bytes = (int)strlen(*row);
            int fill;

            printf("%s", *row);
            for (fill = widest + 3 - cols; fill > 0; --fill)
                putchar(' ');
            printf("strlen: %d utf8 display width: %d\n", bytes, cols);
        }

        return 0;
    }

    // eof


    I find it confusing that this is supposed to "work perfectly" "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is monospaced or proportional. Those fonts can be different for display on screen or on a printer. I don't see any query to determine even what the current font is, much less what it's characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?


    This looks like a solution for a fixed-pitch font. I get this output for
    a Windows console display (with - used for space):

    hello---strlen: 5 utf8 display width: 5
    Café----strlen: 5 utf8 display width: 4
    漢字----strlen: 6 utf8 display width: 4
    ✓-------strlen: 3 utf8 display width: 1
    🙂------strlen: 4 utf8 display width: 2

    I was hoping this would be lined up, but already, in a Thunderbird edit Window, the last lines aren't lined up properly.

    Same problem with Notepad (fixed pitch) and LibreOffice (fixed pitch).

    It only looks alright in Windows and WSL consoles/terminals. But maybe
    that's all that's needed.



    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c on Wed Dec 3 22:43:05 2025
    From Newsgroup: comp.lang.c

    On Wed, 3 Dec 2025 20:15:02 +0000
    bart <bc@freeuk.com> wrote:


    This looks like a solution for a fixed-pitch font. I get this output
    for a Windows console display (with - used for space):

    hello---strlen: 5 utf8 display width: 5
    Café----strlen: 5 utf8 display width: 4
    That sounds like luck. The é in your text just happened to be encoded as
    U+00E9. What if it were encoded as U+0065,U+00B4? (Hopefully I got the
    correct code; I can't really distinguish between similar diacritics.)
    漢字----strlen: 6 utf8 display width: 4
    ✓-------strlen: 3 utf8 display width: 1
    🙂------strlen: 4 utf8 display width: 2

    I was hoping this would be lined up, but already, in a Thunderbird
    edit Window, the last lines aren't lined up properly.

    Same problem with Notepad (fixed pitch) and LibreOffice (fixed pitch).

    It only looks alright in Windows and WSL consoles/terminals. But
    maybe that's all that's needed.



    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Keith Thompson@Keith.S.Thompson+u@gmail.com to comp.lang.c on Wed Dec 3 12:49:23 2025
    From Newsgroup: comp.lang.c

    bart <bc@freeuk.com> writes:
    On 03/12/2025 19:01, James Kuyper wrote:
    [...]
    I find it confusing that this is supposed to "work perfectly"
    "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is
    monospaced or proportional. Those fonts can be different for display on
    screen or on a printer. I don't see any query to determine even what the
    current font is, much less what it's characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?

    This looks like a solution for a fixed-pitch font. I get this output
    for a Windows console display (with - used for space):
    [...]

    I think bart is right that this is specific to fixed-width fonts.
    For a variable width font, 'W' is going to be wider than '|'.

    See also the POSIX `int wcwidth(wchar_t wc)` function, which returns
    the "number of column positions of a wide-character code". It does
    depend on the current locale.

    The assumption seems to be that fixed-width fonts are expected to be
    consistent about the widths of characters.
    --
    Keith Thompson (The_Other_Keith) Keith.S.Thompson+u@gmail.com
    void Void(void) { Void(); } /* The recursive call of the void */
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Michael Sanders@porkchop@invalid.foo to comp.lang.c on Wed Dec 3 23:23:30 2025
    From Newsgroup: comp.lang.c

    On Wed, 3 Dec 2025 14:01:38 -0500, James Kuyper wrote:

    I find it confusing that this is supposed to "work perfectly" "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is monospaced or proportional. Those fonts can be different for display on screen or on a printer. I don't see any query to determine even what the current font is, much less what it's characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?

    Just for use in the terminal & yes it works as advertised.

    In my case I simply need to match the character the user passed
    to the program when searching for a record. I don't want or need
    to know what font is used. If the terminal can display it, then
    I want to use it.

    Example, user invokes: tinybase -s=漢字 data/*.tbf

    Output is...

    FILE: data/history.tbf
    LINE: 170
    BLOCK: 4
    CRC-8: 0x30
    QUERY: 漢字
    MATCH: 漢字

    TAGS: China, History, <漢字>, [wrap:66]

    Ancient China...

    1. Geography and Early Beginnings: Ancient China, a cradle of
    civilization, evolved along the Yellow River's fertile plains.
    Protected by the Himalayas to the south, the Gobi Desert to the
    north, and vast seas to the east, this geographic isolation
    allowed for a unique and continuous cultural development spanning
    millennia.

    ...

    James, earnestly intending no offense - add something to the
    conversation rather than complaining - I want to learn & solve
    problems; that's where I'm seeking help. Just modify the code,
    make it get closer to your ideal. We'll all benefit.
    --
    :wq
    Mike Sanders
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Keith Thompson@Keith.S.Thompson+u@gmail.com to comp.lang.c on Wed Dec 3 18:15:38 2025
    From Newsgroup: comp.lang.c

    Keith Thompson <Keith.S.Thompson+u@gmail.com> writes:
    bart <bc@freeuk.com> writes:
    On 03/12/2025 19:01, James Kuyper wrote:
    [...]
    I find it confusing that this is supposed to "work perfectly"
    "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is
    monospaced or proportional. Those fonts can be different for display on
    screen or on a printer. I don't see any query to determine even what the
    current font is, much less what its characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?

    This looks like a solution for a fixed-pitch font. I get this output
    for a Windows console display (with - used for space):
    [...]

    I think bart is right that this is specific to fixed-width fonts.
    For a variable width font, 'W' is going to be wider than '|'.

    See also the POSIX `int wcwidth(wchar_t wc)` function, which returns
    the "number of column positions of a wide-character code". It does
    depend on the current locale.

    The assumption seems to be that fixed-width fonts are expected to be consistent about the widths of characters.

    And in fact Unicode specifies how many cell positions each printable
    character occupies, or at least for some of them.

    The following is quoted from wcwidth.c in the xterm sources. The text
    was originally written by Markus Kuhn.

    * For some graphical characters, the Unicode standard explicitly
    * defines a character-cell width via the definition of the East Asian
    * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
    * In all these cases, there is no ambiguity about which width a
    * terminal shall use. For characters in the East Asian Ambiguous (A)
    * class, the width choice depends purely on a preference of backward
    * compatibility with either historic CJK or Western practice.
    * Choosing single-width for these characters is easy to justify as
    * the appropriate long-term solution, as the CJK practice of
    * displaying these characters as double-width comes from historic
    * implementation simplicity (8-bit encoded characters were displayed
    * single-width and 16-bit ones double-width, even for Greek,
    * Cyrillic, etc.) and not any typographic considerations.
    --
    Keith Thompson (The_Other_Keith) Keith.S.Thompson+u@gmail.com
    void Void(void) { Void(); } /* The recursive call of the void */
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Michael Sanders@porkchop@invalid.foo to comp.lang.c on Thu Dec 4 04:11:35 2025
    From Newsgroup: comp.lang.c

    Ever worked with binary search trees Bonita?

    I've been playing around with them, or was awhile back at least...

    My criteria was to build nodes alphabetically:

    - Left subtree contains keys less than the node

    - Right subtree contains keys greater than the node

    INSTRUMENTATION

    I
    / \
    E N
    / / \
    A M S
    / / \
    I R T
    / \
    N U
    \ /
    O T
    / \
    N T
    --
    :wq
    Mike Sanders
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c on Thu Dec 4 14:03:54 2025
    From Newsgroup: comp.lang.c

    Am 03.12.2025 um 19:33 schrieb Michael Sanders:
    On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:

    Here I'm running any mixture of: Windows/BSD/Linix Mint LMDE.
    Windows has the ...W() APIs along with codepage-based APIs with
    the ...A() Suffix. The W()-APIs support UTF-16, so no need for
    Hi Bonita.

    Yes that's correct, but...

    - that assumes we know in advance what the character is

    - it would only work under Windows

    We want portability across diverse OSs. In my case, the program
    does NOT care what the character is, it simply needs to be able
    to find it when searching data & displaying it in an ordered way.
    VC++ supports C- and C++ locale if you like to have it portable.
    Especially the locale-support in C++ with its facets is very nice
    to handle: https://en.cppreference.com/w/cpp/locale.html


    The code below works perfectly:

    #include <stdio.h>
    #include <string.h>

    // Estimates how many fixed-width terminal cells the NUL-terminated
    // UTF-8 string `s` occupies.
    //
    // Each decoded code point counts as 0 cells (combining marks
    // U+0300..U+036F), 2 cells (the wide ranges listed below) or 1 cell
    // (everything else). A byte that does not start a complete, well-formed
    // sequence is treated as a single 1-cell character, so the function
    // never reads beyond the terminating NUL.
    //
    // Returns the total cell count.
    int utf8_display_width(const char *s) {
        int w = 0;

        while (*s) {
            const unsigned char *p = (const unsigned char *)s;
            unsigned char b = p[0];
            unsigned cp = b;  // 1-byte ASCII, or an invalid byte taken as-is
            int n = 1;        // bytes consumed by this character
            int want = 0;     // continuation bytes announced by the lead byte

            // UTF-8 decoder: classify the lead byte
            if (b >= 0xC0 && b <= 0xDF) {        // 2-byte
                cp = b & 0x1F;
                want = 1;
            } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte
                cp = b & 0x0F;
                want = 2;
            } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte
                cp = b & 0x07;
                want = 3;
            }

            if (want > 0) {
                int k = 1;

                // p[k] is only inspected after p[k-1] proved to be non-NUL,
                // so a truncated sequence cannot run past the terminator
                while (k <= want && (p[k] & 0xC0) == 0x80) {
                    cp = (cp << 6) | (unsigned)(p[k] & 0x3F);
                    k++;
                }

                if (k > want) {
                    n = want + 1;
                } else { // truncated or malformed: invalid, treat as 1-byte
                    cp = b;
                    n = 1;
                }
            }

            // display width
            if (cp >= 0x0300 && cp <= 0x036F) {
                // combining marks like é (zero width)
            } else if ( // double-width characters...
                (cp >= 0x1100 && cp <= 0x115F) ||  // hangul jamo
                (cp >= 0x2E80 && cp <= 0xA4CF) ||  // cjk radicals & unified ideographs
                (cp >= 0xAC00 && cp <= 0xD7A3) ||  // hangul syllables
                (cp >= 0xF900 && cp <= 0xFAFF) ||  // cjk compatibility ideographs
                (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols
            ) {
                w += 2;
            } else if (cp == 0x2329 || cp == 0x232A) {
                // exceptional wide characters (angle brackets)
                w += 2;
            } else {
                w += 1; // normal width for everything else
            }

            s += n;
        }

        return w;
    }

    // Demo driver: prints every sample string, pads it with spaces to a
    // common column (widest display width + 3), then prints the strlen()
    // result next to the computed display width.
    int main(void) {
        const char *tests[] = {
            "hello",
            "Café",
            "漢字",
            "✓",
            "🙂",
            NULL
        };
        const char **row;
        int widest = 0;

        // pass 1: largest display width among the samples
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            widest = (cols > widest) ? cols : widest;
        }

        // pass 2: print each sample followed by its padding and statistics
        for (row = tests; *row != NULL; ++row) {
            int cols = utf8_display_width(*row);
            int bytes = (int)strlen(*row);
            int fill;

            printf("%s", *row);
            for (fill = widest + 3 - cols; fill > 0; --fill)
                putchar(' ');
            printf("strlen: %d utf8 display width: %d\n", bytes, cols);
        }

        return 0;
    }

    // eof


    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c on Thu Dec 4 14:15:35 2025
    From Newsgroup: comp.lang.c

    Am 03.12.2025 um 20:01 schrieb James Kuyper:
    I find it confusing that this is supposed to "work perfectly" "across
    diverse OSs". The amount of space that a character takes up varies
    depending upon the installed fonts, especially on whether the font is monospaced or proportional. Those fonts can be different for display on screen or on a printer. I don't see any query to determine even what the current font is, much less what it's characteristics are. I don't know
    of any OS-independent way of collecting such information. Does this
    solution "work perfectly" only for your own particular favorite font?
    Can C handle that with the means given by the standard itself?
    And is this really necessary to consider? Consoles are almost always
    fixed space. I guess the standard output for a laser printer in line
    printed mode is also fixed space.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Lawrence D’Oliveiro@ldo@nz.invalid to comp.lang.c on Wed Dec 24 06:17:44 2025
    From Newsgroup: comp.lang.c

    On Tue, 18 Nov 2025 14:27:53 -0500, James Kuyper wrote:

    Could you identify which document guarantees that every Unicode locale contains "UTF-8"?

    How else would it work? Bytes have to be 8-bit.
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Keith Thompson@Keith.S.Thompson+u@gmail.com to comp.lang.c on Tue Dec 23 22:22:46 2025
    From Newsgroup: comp.lang.c

    Lawrence D’Oliveiro <ldo@nz.invalid> writes:
    On Tue, 18 Nov 2025 14:27:53 -0500, James Kuyper wrote:
    Could you identify which document guarantees that every Unicode locale
    contains "UTF-8"?

    How else would it work? Bytes have to be 8-bit.

    I can't figure out what point you're trying to make.

    Obviously bytes in C have to be *at least* 8 bits, but I don't see
    the relevance.

    Take a look at the article to which you replied. How does your
    followup have anything to do with it?

    One of several points that you snipped is that locale names can
    contain the string "utf8", not "UTF-8".
    --
    Keith Thompson (The_Other_Keith) Keith.S.Thompson+u@gmail.com
    void Void(void) { Void(); } /* The recursive call of the void */
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Lynn McGuire@lynnmcguire5@gmail.com to comp.lang.c on Wed Dec 24 01:41:30 2025
    From Newsgroup: comp.lang.c

    On 12/24/2025 12:22 AM, Keith Thompson wrote:
    Lawrence D’Oliveiro <ldo@nz.invalid> writes:
    On Tue, 18 Nov 2025 14:27:53 -0500, James Kuyper wrote:
    Could you identify which document guarantees that every Unicode locale
    contains "UTF-8"?

    How else would it work? Bytes have to be 8-bit.

    I can't figure out what point you're trying to make.

    Obviously bytes in C have to be *at least* 8 bits, but I don't see
    the relevance.

    Take a look at the article to which you replied. How does your
    followup have anything to do with it?

    One of several points that you snipped is that locale names can
    contain the string "utf8", not "UTF-8".

    Did C never work on the 6 bit machines such as the Univac 1108 (36 bit)
    or the CDC 7600 (60 bit) ?

    Lynn

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c on Wed Dec 24 11:24:04 2025
    From Newsgroup: comp.lang.c

    On Wed, 24 Dec 2025 01:41:30 -0600
    Lynn McGuire <lynnmcguire5@gmail.com> wrote:
    On 12/24/2025 12:22 AM, Keith Thompson wrote:
    Lawrence D’Oliveiro <ldo@nz.invalid> writes:
    On Tue, 18 Nov 2025 14:27:53 -0500, James Kuyper wrote:
    Could you identify which document guarantees that every Unicode
    locale contains "UTF-8"?

    How else would it work? Bytes have to be 8-bit.

    I can't figure out what point you're trying to make.

    Obviously bytes in C have to be *at least* 8 bits, but I don't see
    the relevance.

    Take a look at the article to which you replied. How does your
    followup have anything to do with it?

    One of several points that you snipped is that locale names can
    contain the string "utf8", not "UTF-8".

    Did C never work on the 6 bit machines such as the Univac 1108 (36
    bit) or the CDC 7600 (60 bit) ?

    Lynn

    It depends on definition of the word C.
    The requirement for CHAR_BIT > 7 was not present in K&R C. IIRC, it
    first came in C90.
    Also, what prevents C90 compiler from using 36-bit char on Univac 1108
    and 60-bit bytes on CDC 7600? Methinks, it would be very reasonable.
    By chance, that* was a choice made both by TI and by Analog for C
    compilers of their word-addressable DSPs.
    * - not specifically 36 or 60 bits, but CHAR_BIT = native word width.
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From scott@scott@slp53.sl.home (Scott Lurndal) to comp.lang.c on Wed Dec 24 17:11:43 2025
    From Newsgroup: comp.lang.c

    Lynn McGuire <lynnmcguire5@gmail.com> writes:
    On 12/24/2025 12:22 AM, Keith Thompson wrote:
    Lawrence D’Oliveiro <ldo@nz.invalid> writes:
    On Tue, 18 Nov 2025 14:27:53 -0500, James Kuyper wrote:
    Could you identify which document guarantees that every Unicode locale
    contains "UTF-8"?

    How else would it work? Bytes have to be 8-bit.

    I can't figure out what point you're trying to make.

    Obviously bytes in C have to be *at least* 8 bits, but I don't see
    the relevance.

    Take a look at the article to which you replied. How does your
    followup have anything to do with it?

    One of several points that you snipped is that locale names can
    contain the string "utf8", not "UTF-8".

    Did C never work on the 6 bit machines such as the Univac 1108 (36 bit)

    Yes, there is a C compiler for the Univac machines. The byte size is
    9 bits.

    --- Synchronet 3.21a-Linux NewsLink 1.2