/usr/web/sources/contrib/quanstro/src/runetype/tables.rc

Plan 9 from Bell Labs’s /usr/web/sources/contrib/quanstro/src/runetype/tables.rc

#!/bin/rc

c='
previous mapping included:

	0x3260,	0x327b,	/* ㉠ - ㉻ */
	0x328a,	0x32b0,	/* ㊊ - ㊰ */
	0x32d0,	0x32fe,	/* ㋐ - ㋾ */

in __alpha2[] but these are type symbol; however U+24b6 is
of type symbol, but we include it. this is because there is 
a lowercase mapping.

should we load up all the characters and glark the
decomposition mapping? if we have U+X as <circle> bozo, where bozo
is a letter, then U+X is a letter, too?

or should isalpharune(U+24b6) be false?

if we need to check references, we can just load an array with each
codepoint as long as there aren''t any forward references.

'
c=()

# Program that every generated table is piped through (via Sprint);
# use 8.uconv unless the caller has already set $uconv.
if(~ $#uconv 0)
	uconv=8.uconv
# Data file to read.  Each command-line argument overrides the default
# in turn, so the last argument wins.
UnicodeData=unicodedata
for(i)
	UnicodeData=$i

# Sprint: run the program named by $uconv as a filter from standard
# input to standard output.
fn Sprint {
	$uconv
}

# Unicode: write the records of $UnicodeData to standard output with
# carriage returns removed.  When $rune is set it is handed to grep as
# the pattern (plus any options) selecting which records to keep.  When
# it is unset every record is passed through; a bare grep with no
# pattern would fail and leave every table empty.
fn Unicode {
	if(~ $#rune 0)
		tr -d '\015' < $UnicodeData
	if not
		grep $rune < $UnicodeData | tr -d '\015'
}

hex='
# hex(s): numeric value of the hexadecimal string s.  An optional 0x or
# 0X prefix is skipped and the empty string yields 0.  The names after s
# in the parameter list are locals.  Characters that are not hex digits
# are not rejected: index() returns 0 for them, so each contributes -1.
function hex(s,        base, r, n, i, k, c)
{
	base = 16;
	if(s ~ /^0[xX][0-9a-fA-F]+/)
		s = substr(s, 3);
	n = length(s)
	r = 0
	for (i = 1; i <= n; i++) {
		c = tolower(substr(s, i, 1))
		k = index("0123456789abcdef", c) - 1;
		r = r * base + k
	}
	return r
}
'

# Generate __space2[]: rows of "first, last" rune pairs, one row per run
# of consecutive codepoints that match the space test below.
Unicode | awk -F';' '
' ^ $hex ^ '
# with gawk, hex() could instead be written as:
# function hex (h) {
# 	return strtonum("0x" h);
# }

# forget the run being accumulated; last == 0 means no run is pending.
function init() {
	mark=0;		# decimal of 1st matching in range
	mdes="";	# description of mark.
	last=0;		# last in range
	ldes="";	# last desc.
}

# print the pending run, if there is one, as a single table row.
# a run longer than one rune is commented with both end descriptions.
function flush() {
	if (last && last > mark){
		printf("\t0x%04x, 0x%04x,\t/* %s - %s */\n", mark, last, mdes, ldes);
	} else if (last) {
		printf("\t0x%04x, 0x%04x,\t/* %s */\n", mark, last, ldes);
	}
}

BEGIN {
	# hard-code the values below U+0100.

	print "static";
	print "Rune\t__space2[] =";
	print "{"
	print "\t0x0009,\t0x000a,\t/* tab and newline */";
	print "\t0x0020,\t0x0020,\t/* space */";
	print "\t0x00a0,\t0x00a0,\t/* non-breaking space */";

	init();
}

# everything below U+0100 was written by hand in BEGIN.  test the
# codepoint itself rather than the record number, so that the skip
# stays correct when the input has been filtered and record numbers
# no longer line up with codepoints.
hex($1) < 256 { next; }

"WS" == $5 || $3 ~ /Z[psl]/ || ("Cc" == $3 && "S" == $5) || ("Cf" == $3 && $2 ~ /.* SPACE/) {
	codepoint=hex($1);

	# extend the current run while the codepoints stay consecutive.
	if (last + 1 == codepoint){
		last = codepoint;
		ldes = tolower($2);
		next;
	}

	# otherwise close the previous run and start a new one here.
	flush();
	init();
	mark=last=codepoint;
	ldes=mdes=tolower($2);
}

END {
	flush();
	print "};"
	print ""
}
' | Sprint

# Generate __digit2[]: one "first, last" row per run of consecutive
# codepoints whose third field is Nd.
awk '-F;' '
' ^ $hex ^ '
# with gawk, hex(h) could be strtonum("0x" h) instead.

# no run in progress; 0 doubles as the "nothing pending" marker.
function reset() {
	lo = 0;
	hi = 0;
}

# 6618 is 0x19da, an odd lone digit one that trails a run; drop it
# so the run ends on the rune before it.
function clip(cp) {
	return (cp == 6618) ? 6617 : cp;
}

# print one table row for the run m..l.  only runs of exactly ten
# digits are expected; anything else is flagged with //botch.
function row(m, l) {
	l = clip(l);
	if (l - m != 9)
		printf("//botch");	# only handle base 10
	printf("\t0x%04x, 0x%04x,\t/* \\u%04x\\u%04x\\u%04x\\u%04x\\u%04x\\u%04x\\u%04x\\u%04x\\u%04x\\u%04x */\n", m, l, m, m+1, m+2, m+3, m+4, m+5, m+6, m+7, m+8, m+9);
}

BEGIN {
	print "static";
	print "Rune\t__digit2[] =";
	print "{";

	reset();
}

$3 == "Nd" {
	cp = hex($1);

	# still inside the current run.
	if (cp == hi + 1) {
		hi = cp;
		next;
	}

	# a gap: emit the finished run and open a new one at cp.
	if (hi)
		row(lo, hi);
	lo = hi = cp;
}

END {
	if (hi)
		row(lo, hi);

	print "};";
	print "";
}
' <{Unicode} | Sprint

# Generate __alpha2[] (ranges) followed by __alpha1[] (lone runes) from
# the records whose third field is Lo or Lm.
awk '-F;' '
' ^ $hex ^ '
# with gawk, hex(h) could be strtonum("0x" h) instead.

# no run in progress; 0 doubles as the "nothing pending" marker.
function reset() {
	lo = 0;
	hi = 0;
}

# dispose of the pending run: a real range is printed at once as a row
# of __alpha2, a lone rune is queued in singles for __alpha1.
function flush() {
	if (!hi)
		return;
	if (hi > lo)
		printf("\t0x%04x, 0x%04x,\t/* \\u%04x-\\u%04x */\n", lo, hi, lo, hi);
	else
		singles = singles sprintf("\t0x%04x,\t/* \\u%04x */\n", lo, lo);
}

BEGIN {
	print "static";
	print "Rune\t__alpha2[] =";
	print "{";

	reset();
}

$3 == "Lo" || $3 == "Lm" {
	cp = hex($1);

	# still inside the current run.
	if (cp == hi + 1) {
		hi = cp;
		next;
	}

	# a gap: settle the finished run and open a new one at cp.
	flush();
	lo = hi = cp;
}

END {
	flush();

	print "};";
	print "";

	# the second table holds every rune that never joined a range.
	print "static";
	print "Rune\t__alpha1[] =";
	print "{";

	print singles;

	print "};";
	print "";
}
' <{Unicode} | Sprint

# Generate __toupper2[] (ranges sharing one offset) followed by
# __toupper1[] (lone runes).  The mapping is read from field NF-2.
awk '-F;' '
' ^ $hex ^ '
# with gawk, hex(h) could be strtonum("0x" h) instead.

# no run in progress; 0 in hi doubles as the "nothing pending" marker.
function reset() {
	lo = 0;		# first codepoint of the run
	hi = 0;		# last codepoint of the run
	delta = 0;	# mapped value minus codepoint, constant over a run
	target = 0;	# mapped value of lo
}

# dispose of the pending run: a real range is printed at once as a row
# of __toupper2, a lone rune is queued in singles for __toupper1.
function flush() {
	if (!hi)
		return;
	if (hi > lo)
		printf("\t0x%04x, 0x%04x, 0x%04x,\t/* \\u%04x-\\u%04x, \\u%04x-\\u%04x */\n", lo, hi, target, lo, hi, lo+delta, hi+delta);
	else
		singles = singles sprintf("\t0x%04x, 0x%04x,\t/* \\u%04x, \\u%04x */\n", lo, target, lo, target);
}

BEGIN {
	print "static";
	print "Rune\t__toupper2[] =";
	print "{";

	reset();
}

# Upper(C) may be nil (an empty field hexes to 0); such runes are still
# listed so that islower(c) works.

$3 == "Ll" || length($(NF-2)) > 0 {
	cp = hex($1);
	to = hex($(NF-2));
	d = to - cp;

	# a run continues only while both the codepoints are consecutive
	# and the distance to the mapped rune stays the same.
	if (cp == hi + 1 && d == delta) {
		hi = cp;
		next;
	}

	# otherwise settle the finished run and open a new one at cp.
	flush();
	lo = hi = cp;
	delta = d;
	target = to;
}

END {
	flush();

	print "};";
	print "";

	print "static";
	print "Rune\t__toupper1[] =";
	print "{";

	# FIXME: the pattern ought to name the backslash explicitly, but
	# gawk chokes on \\u0000 and [\\], and [[:punct:]] yields
	# "memory exhausted"; the leading . stands in for the backslash.

	gsub(".u0000", "<nil>", singles);
	print singles;

	print "};";
	print "";
}
' <{Unicode} | Sprint

# Generate __tolower2[] (ranges sharing one offset) followed by
# __tolower1[] (lone runes).  The mapping is read from field NF-1.
awk '-F;' '
' ^ $hex ^ '
# with gawk, hex(h) could be strtonum("0x" h) instead.

# no run in progress; 0 in hi doubles as the "nothing pending" marker.
function reset() {
	lo = 0;		# first codepoint of the run
	hi = 0;		# last codepoint of the run
	delta = 0;	# mapped value minus codepoint, constant over a run
	target = 0;	# mapped value of lo
}

# dispose of the pending run: a real range is printed at once as a row
# of __tolower2, a lone rune is queued in singles for __tolower1.
function flush() {
	if (!hi)
		return;
	if (hi > lo)
		printf("\t0x%04x, 0x%04x, 0x%04x,\t/* \\u%04x-\\u%04x, \\u%04x-\\u%04x */\n", lo, hi, target, lo, hi, lo+delta, hi+delta);
	else
		singles = singles sprintf("\t0x%04x, 0x%04x,\t/* \\u%04x, \\u%04x */\n", lo, target, lo, target);
}

BEGIN {
	print "static";
	print "Rune\t__tolower2[] =";
	print "{";

	reset();
}

# Lower(C) may be nil (an empty field hexes to 0); such runes are still
# listed so that isupper(c) works.

$3 == "Lu" || length($(NF-1)) > 0 {
	cp = hex($1);
	to = hex($(NF-1));
	d = to - cp;

	# a run continues only while both the codepoints are consecutive
	# and the distance to the mapped rune stays the same.
	if (cp == hi + 1 && d == delta) {
		hi = cp;
		next;
	}

	# otherwise settle the finished run and open a new one at cp.
	flush();
	lo = hi = cp;
	delta = d;
	target = to;
}

END {
	flush();

	print "};";
	print "";

	# explain the nil targets to readers of the generated C.
	print "/*";
	print " * We allow the target character to be nil so isupperrune() works,";
	print " * even for bogus unicode that doesn''t have a tolower().";
	print " */"
	print ""

	print "static";
	print "Rune\t__tolower1[] =";
	print "{";

	# FIXME: the pattern ought to name the backslash explicitly, but
	# gawk chokes on \\u0000 and [\\], and [[:punct:]] yields
	# "memory exhausted"; the leading . stands in for the backslash.

	gsub(".u0000", "<nil>", singles);
	print singles;

	print "};";
	print "";
}
' <{Unicode} | Sprint

# Generate __totitle1[]: "from, to" rows for every record whose third
# field is Lt.  The array header and footer are echoed directly and do
# not pass through Sprint; only the rows do.
# NOTE(review): rows are written in file order, with the row keyed by
# field NF-1 ahead of the row keyed by field 15, so the from-values are
# not ascending -- confirm the consumer does not need a sorted table.
# NOTE(review): field NF-2 is never emitted here -- confirm that is
# intended.
echo 'static
Rune	__totitle1[] = 
{'

awk '-F;' '
# print one row mapping rune from to rune to; both are hex strings
# copied verbatim from the record.
function pair(from, to) {
	printf("\t0x%s, 0x%s,\t/* \\u%s, \\u%s */\n", from, to, from, to);
}

$3 == "Lt" {
	pair($(NF-1), $1);
	# add a second row only when the last field is present.
	if (length($NF) > 0)
		pair($15, $1);
}' <{Unicode} | Sprint

echo '};
'
(Return to Plan 9 Home Page)