From Newsgroup: comp.lang.python
#!/usr/bin/python3
#+
# Which characters are valid in identifiers?
# See details at
# <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
#-
import sys
import unicodedata as ud
class SeqRuns:
    """Collapse a sequence of integers into runs of consecutive values.

    Each run is stored in ``self.runs`` as an inclusive ``(start, end)``
    tuple, in the order the runs occur in the input. A new run begins
    whenever an element is not exactly one greater than its predecessor.
    """

    def __init__(self, seq):
        """Build the run list from *seq*, an iterable of integers.

        The iterable is consumed exactly once, so a generator is fine.
        """
        runs = []
        start = end = None
        for elt in seq:
            if start is not None and elt != end + 1:
                # Gap in the sequence: close off the run in progress.
                runs.append((start, end))
                start = None
            if start is None:
                start = elt
            end = elt
        if start is not None:
            # Flush the final run, which no later element terminated.
            runs.append((start, end))
        self.runs = runs

    def nrelts(self):
        """Return the total number of elements covered by all runs."""
        # Runs are inclusive at both ends, hence the "+ 1" for each run.
        return sum(last - first + 1 for first, last in self.runs)

    def __len__(self):
        """Return the number of runs (not the number of elements)."""
        return len(self.runs)

    def __iter__(self):
        """Iterate over the inclusive ``(start, end)`` run tuples."""
        return iter(self.runs)
# Every code point Python can represent, 0 through sys.maxunicode inclusive.
UNICODE_RANGE = range(sys.maxunicode + 1)

# Special cases from
# <https://www.unicode.org/Public/13.0.0/ucd/PropList.txt>.
OTHER_ID_START = {0x1885, 0x1886, 0x2118, 0x212E, 0x309B, 0x309C} # Other_ID_Start
OTHER_ID_CONTINUE = \
    ( # Other_ID_Continue
        {0x00B7, 0x0387}
    |
        set(range(0x1369, 0x1371 + 1)) # inclusive range 0x1369..0x1371
    |
        {0x19DA}
    )

# Extra start character on top of the Unicode-derived sets: the underscore.
ID_START_EXTRA = {ord("_")}
# Runs of code points accepted as the first character of an identifier:
# the letter-like general categories, the Other_ID_Start specials, and the
# extra start characters. The two sets are tested separately rather than
# unioned, so that no new set is built for every code point examined.
ID_START = SeqRuns \
    (
        c for c in UNICODE_RANGE
        if
                ud.category(chr(c)) in {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl"}
            or
                c in OTHER_ID_START
            or
                c in ID_START_EXTRA
    )

# Runs of the additional code points accepted after the first character:
# marks, decimal digits, connector punctuation and the Other_ID_Continue
# specials. The filter does not repeat the ID_START categories.
ID_CONTINUE = SeqRuns \
    (
        c for c in UNICODE_RANGE
        if
                ud.category(chr(c)) in {"Mn", "Mc", "Nd", "Pc"}
            or
                c in OTHER_ID_CONTINUE
    )
# identifiers are compared according to NFKC normalization
# Print one line per class in the form "name[count]: {run, run, ...}".
# A run is shown as a single hexadecimal code point, or as an inclusive
# "first..last" pair when it covers more than one code point.
for label, runs in (("start", ID_START), ("continue", ID_CONTINUE)):
    formatted_runs = ", ".join(
        "%#04X..%#04X" % (first, last) if last != first else "%#04X" % first
        for first, last in runs
    )
    sys.stdout.write(
        "%s[%d]: {%s}\n" % (label, runs.nrelts(), formatted_runs)
    )
--- Synchronet 3.21a-Linux NewsLink 1.2