• valid_identifiers

    From Lawrence D’Oliveiro@ldo@nz.invalid to comp.lang.python on Wed Dec 17 02:11:27 2025
    From Newsgroup: comp.lang.python

    #!/usr/bin/python3
    #+
    # Which characters are valid in identifiers?
    # See details at <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
    #-

    import sys
    import unicodedata as ud

    class SeqRuns :
        """Collapse an ascending sequence of integers into runs of consecutive values.

        Each run is stored as a tuple ``(start, end)`` with both bounds inclusive,
        so a value with no neighbours becomes ``(v, v)``. The runs are kept, in
        input order, in the ``runs`` attribute. Iterating over the instance yields
        the run tuples, and ``len()`` gives the number of runs (not of elements).
        """

        def __init__(self, seq) :
            """Build the run list from *seq*, any iterable of integers.

            A new run is started whenever an element is not exactly one greater
            than the element before it.
            """
            runs = []
            start = end = None # bounds of the run currently being accumulated
            for elt in seq :
                if start is not None and elt != end + 1 :
                    # elt does not extend the current run: close it off
                    runs.append((start, end))
                    start = None
                #end if
                if start is None :
                    start = elt
                #end if
                end = elt
            #end for
            if start is not None :
                # flush the run still open when the input ran out
                runs.append((start, end))
            #end if
            self.runs = runs
        #end __init__

        def nrelts(self) :
            """Return the total number of elements covered by all the runs."""
            # bounds are inclusive, hence the +1 per run
            return sum(end - start + 1 for start, end in self.runs)
        #end nrelts

        def __len__(self) :
            """Return the number of runs."""
            return len(self.runs)
        #end __len__

        def __iter__(self) :
            """Iterate over the ``(start, end)`` run tuples."""
            return iter(self.runs)
        #end __iter__

    #end SeqRuns

    # Every code point this Python build can represent, 0 .. sys.maxunicode inclusive.
    UNICODE_RANGE = range(sys.maxunicode + 1)

    # special cases from <https://www.unicode.org/Public/13.0.0/ucd/PropList.txt>

    # Other_ID_Start: code points accepted as an identifier start in addition
    # to those selected by general category.
    OTHER_ID_START = {0x1885, 0x1886, 0x2118, 0x212E, 0x309B, 0x309C}

    # Other_ID_Continue: code points accepted as an identifier continuation in
    # addition to those selected by general category.
    OTHER_ID_CONTINUE = \
        (
            {0x00B7, 0x0387}
        |
            set(range(0x1369, 0x1371 + 1)) # inclusive range 0x1369 .. 0x1371
        |
            {0x19DA}
        )

    # The underscore is also treated as an identifier-start character.
    ID_START_EXTRA = {ord("_")}

    # Built once up front: the union never changes, so there is no point
    # recomputing it for every code point tested below.
    ID_START_OTHERS = OTHER_ID_START | ID_START_EXTRA

    # Runs of code points whose general category is a letter category or Nl,
    # or which are listed in OTHER_ID_START / ID_START_EXTRA.
    ID_START = SeqRuns \
      (
        c for c in UNICODE_RANGE
        if
                ud.category(chr(c)) in {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl"}
            or
                c in ID_START_OTHERS
      )

    # Runs of code points whose general category is Mn, Mc, Nd or Pc, or which
    # are listed in OTHER_ID_CONTINUE. The ID_START categories are not repeated
    # in this filter.
    ID_CONTINUE = SeqRuns \
      (
        c for c in UNICODE_RANGE
        if
                ud.category(chr(c)) in {"Mn", "Mc", "Nd", "Pc"}
            or
                c in OTHER_ID_CONTINUE
      )
    # identifiers are compared according to NFKC normalization

    # Write one line per set: "<label>[<value of nrelts()>]: {<run>, <run>, ...}".
    # A run covering a single code point is shown as one number, any other run
    # as "first..last".
    for label, runs in (("start", ID_START), ("continue", ID_CONTINUE)) :
        pieces = []
        for first, last in runs :
            if last != first :
                pieces.append("%#04X..%#04X" % (first, last))
            else :
                pieces.append("%#04X" % first)
            #end if
        #end for
        sys.stdout.write \
          (
            "%s[%d]: {%s}\n" % (label, runs.nrelts(), ", ".join(pieces))
          )
    #end for
    --- Synchronet 3.21a-Linux NewsLink 1.2