Plan 9 from Bell Labs’s /usr/web/sources/contrib/steve/root/sys/src/cmd/suggest.c

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


/*
 * suggest.c - generate suggestions for spelling errors.
 *
 * term% spell doc.ms | suggest
 *
 * The triagram permute validation seems a good idea, however in
 * testing it rejected only 20% of the initial suggestions. This
 * is comperable in cpu terms to the overhead in generating the
 * probability tables in the first place. 
 *
%A V. I. Levenshtein
%T Binary codes capable of correcting deletions, insertions, and reversals
%J Soviet Physics Doklady 10
%D 1966
%V 707710

%T Hanging on the Metaphone
%A Lawrence Philips
%J Computer Language
%D December 1990
%P 38
*/
#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <bio.h>
#include <pool.h>

/*
 * These constants are rather arbitary but 
 * to work for my (rather small) corpus.
 */
enum{
	Idx = 3,	// apply Dist below after this many suggestions
	Dist = 2,	// stop offering suggestions if levenshtein is above this
	Maxidx = 8,	// give up after this many suggestions

			// triagramme table
	Trilen = 28,	// side length
	Begword = 0,	// code for begining of word
	Endword = 26,	// code for end of word
	Nalpha = 27,	// code for non alpha character

	Ncode = 4	// length of metaphone hash used	
};

char *Wordfile = "/lib/words";
Biobuf *Words = nil;

uvlong Unknown = 0;	// stats
uvlong Rejected = 0;
uvlong Generated = 0;

uvlong Ntri = 0;	// number of triagrammes
uint ***Tri = nil;	// triagramme counts 

typedef struct Table Table;
typedef struct Result Result;

struct Table {
	char code[Ncode];
	vlong off;
};

struct Result {
	Result *next;
	char code[Ncode];
	char *word;
	int dist;
};

Table *Tab = nil;
int Ntab = 0;

void
metaphone(char *name, char *buf, int len)
{
	int ii, jj, silent, hard, Lng, lastChr;
	char curLtr, prevLtr, nextLtr, nextLtr2, nextLtr3;
	int vowelAfter, vowelBefore, frontvAfter;
	char *p, *p1;
	char wname[60];
	char *ename = wname;
	static char *VOWELS = "AEIOU";
	static char *FRONTV = "EIY";	 /* special cases for letters in FRONT of these */
	static char *VARSON = "CSPTG";	 /* variable sound--those modified by adding an "h" */
	static char *DOUBLE = ".";	 /* let these double letters through */
	static char *excpPAIR = "AGKPW"; /* exceptions "ae-", "gn-", "kn-", "pn-", "wr-" */
	static char *nextLTR = "ENNNR";


	memset(buf, 0, len);
	jj = 0;
	for(ii = 0; name[ii] != '\0'; ii++) {
		if(isalpha(name[ii])) {
			ename[jj] = toupper(name[ii]);
			jj++;
		}
	}
	ename[jj] = '\0';

	if(strlen(ename) == 0)
		return;

	/* if ae, gn, kn, pn, wr then drop the first letter */
	if((p = strchr(excpPAIR, ename[0])) != nil) {
		p1 = nextLTR + (p - excpPAIR);
		if(*p1 == ename[1])
			strcpy(ename, &ename[1]);
	}
	/* change x to s */
	if(ename[0] == 'X')
		ename[0] = 'S';
	/* get rid of the "h" in "wh" */
	if(strncmp(ename, "WH", 2) == 0)
		strcpy(&ename[1], &ename[2]);

	Lng = strlen(ename);
	lastChr = Lng - 1;	/* index to last character in string makes code easier */

	/* Remove an S from the end of the string */
	if(ename[lastChr] == 'S') {
		ename[lastChr] = '\0';
		Lng = strlen(ename);
		lastChr = Lng - 1;
	}

	for(ii = 0; ((strlen(buf) < len) && (ii < Lng)); ii++) {

		curLtr = ename[ii];

		vowelBefore = 0;
		prevLtr = ' ';
		if(ii > 0) {
			prevLtr = ename[ii - 1];
			if(strchr(VOWELS, prevLtr) != nil)
				vowelBefore = 1;
		}
		/* if first letter is a vowel KEEP it */
		if(ii == 0 && (strchr(VOWELS, curLtr) != nil)) {
			strncat(buf, &curLtr, 1);
			continue;
		}

		vowelAfter = 0;
		frontvAfter = 0;
		nextLtr = ' ';
		if(ii < lastChr) {
			nextLtr = ename[ii + 1];
			if(strchr(VOWELS, nextLtr) != nil)
				vowelAfter = 1;
			if(strchr(FRONTV, nextLtr) != nil)
				frontvAfter = 1;
		}
		/* skip double letters except ones in list */
		if(curLtr == nextLtr && (strchr(DOUBLE, nextLtr) == nil))
			continue;

		nextLtr2 = ' ';
		if(ii < (lastChr - 1))
			nextLtr2 = ename[ii + 2];

		nextLtr3 = ' ';
		if(ii < (lastChr - 2))
			nextLtr3 = ename[ii + 3];

		switch (curLtr) {

		case 'B':
			silent = 0;
			if(ii == lastChr && prevLtr == 'M')
				silent = 1;
			if(!silent)
				strncat(buf, &curLtr, 1);
			break;

			/* silent -sci-,-sce-,-scy-; sci-, etc OK */
		case 'C':
			if(!(ii > 1 && prevLtr == 'S' && frontvAfter))

				if(ii > 0 && nextLtr == 'I' && nextLtr2 == 'A')
					strncat(buf, "X", 1);
				else if(frontvAfter)
					strncat(buf, "S", 1);
				else if(ii > 1 && prevLtr == 'S' && nextLtr == 'H')
					strncat(buf, "K", 1);
				else if(nextLtr == 'H')
					if(ii == 0 && (strchr(VOWELS, nextLtr2) == nil))
						strncat(buf, "K", 1);
					else
						strncat(buf, "X", 1);
				else if(prevLtr == 'C')
					strncat(buf, "C", 1);
				else
					strncat(buf, "K", 1);
			break;

		case 'D':
			if(nextLtr == 'G' && (strchr(FRONTV, nextLtr2) != nil))
				strncat(buf, "J", 1);
			else
				strncat(buf, "T", 1);
			break;

		case 'G':
			silent = 0;
			/* SILENT -gh- except for -gh and no vowel after h */
			if((ii < (lastChr - 1) && nextLtr == 'H')
			   && (strchr(VOWELS, nextLtr2) == nil))
				silent = 1;

			if((ii == (lastChr - 3))
			   && nextLtr == 'N' && nextLtr2 == 'E' && nextLtr3 == 'D')
				silent = 1;
			else if((ii == (lastChr - 1)) && nextLtr == 'N')
				silent = 1;

			if(prevLtr == 'D' && frontvAfter)
				silent = 1;

			if(prevLtr == 'G')
				hard = 1;
			else
				hard = 0;

			if(!silent)
				if(frontvAfter && (!hard))
					strncat(buf, "J", 1);
				else
					strncat(buf, "K", 1);
			break;

		case 'H':
			silent = 0;
			if(strchr(VARSON, prevLtr) != nil)
				silent = 1;

			if(vowelBefore && !vowelAfter)
				silent = 1;

			if(!silent)
				strncat(buf, &curLtr, 1);
			break;

		case 'F':
		case 'J':
		case 'L':
		case 'M':
		case 'N':
		case 'R':
			strncat(buf, &curLtr, 1);
			break;

		case 'K':
			if(prevLtr != 'C')
				strncat(buf, &curLtr, 1);
			break;

		case 'P':
			if(nextLtr == 'H')
				strncat(buf, "F", 1);
			else
				strncat(buf, "P", 1);
			break;

		case 'Q':
			strncat(buf, "K", 1);
			break;

		case 'S':
			if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A'))
				strncat(buf, "X", 1);
			else if(nextLtr == 'H')
				strncat(buf, "X", 1);
			else
				strncat(buf, "S", 1);
			break;

		case 'T':
			if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A'))
				strncat(buf, "X", 1);
			else if(nextLtr == 'H')	/* The=0, Tho=T, Withrow=0 */
				if(ii > 0 || (strchr(VOWELS, nextLtr2) != nil))
					strncat(buf, "0", 1);
				else
					strncat(buf, "T", 1);
			else if(!(ii < (lastChr - 2) && nextLtr == 'C' && nextLtr2 == 'H'))
				strncat(buf, "T", 1);
			break;

		case 'V':
			strncat(buf, "F", 1);
			break;

		case 'W':
		case 'Y':
			if(ii < lastChr && vowelAfter)
				strncat(buf, &curLtr, 1);
			break;

		case 'X':
			strncat(buf, "KS", 2);
			break;

		case 'Z':
			strncat(buf, "S", 1);
			break;
		}

	}

	/*
	 * DON'T DO THIS NOW, REMOVING "S" IN BEGINNING HAS the same effect
	 * with plurals, in addition imbedded S's in the Metaphone are included
	 *
	 * Lng = strlen(buf);
	 * lastChr = Lng -1; if (buf[lastChr] == 'S' && Lng >= 3 )
	 *	buf[lastChr] = '\0';
	 */
}



/**********************************************************/

/*
 * Lifted from /sys/src/ape/lib/ap/gen/bsearch.c
 * and stripped of it's ANSI-isms
 */
void*
bsearch(void* key, void* base, int nmemb, int size, int (*compar)(void*, void*))
{
	long i, bot, top, new;
	void *p;

	bot = 0;
	top = bot + nmemb - 1;
	while(bot <= top){
		new = (top + bot)/2;
		p = (char *)base+new*size;
		i = (*compar)(key, p);
		if(i == 0)
			return p;
		if(i > 0)
			bot = new + 1;
		else
			top = new - 1;
	}
	return 0;
}

/**********************************************************/

int
min3(int a, int b, int c)
{
	int min = a;

	if(b < min)
		min = b;
	if(c < min)
		min = c;
	return min;
}

int
levenshtein(char *s, char *t)
{
	static int *d = nil;
	static int alloc = 0;
	int k, i, j, n, m, cost, distance;

	n = strlen(s);
	m = strlen(t);
	if(n == 0 || m == 0)
		return -1;

	/*
	 * This leaks once on exit, but who cares
	 */
	if((m+1)*(n+1) > alloc){
		free(d);
		alloc = (m+1)*(n+1)+32;
		if((d = malloc(sizeof(int)*alloc)) == nil)
			sysfatal("no memory\n");
	}

	m++;
	n++;
	for(k = 0; k < n; k++)
		d[k] = k;
	for(k = 0; k < m; k++)
		d[k*n] = k;
	for(i = 1; i < n; i++)
		for(j = 1; j < m; j++) {
			cost = (s[i-1] == t[j-1])? 0: 1;
			d[j*n+i] = min3(d[(j-1)*n+i]+1, d[j*n+i-1]+1, d[(j -1)*n+i-1]+cost);
#ifdef NOT_DEFINED
			if(i>=2 && j>=2 && (d[j*n+i]-d[(j-2)*n+i-2]==2) && (s[i-2]==t[j-1]) && (s[i-1]==t[j-2]))
                  		d[j*n+i]--;
#endif
		}
	distance = d[n*m-1];
	return distance;
}

/**********************************************************/

int
enc(char c)
{
	if(! isalpha(c))
		return Nalpha;
	if(isupper(c))
		c = tolower(c);
	return c - 'a';
}

void
triagrams(char *word)
{
	char *p;

	if(word[0] == 0 || word[1] == 0)	// too short
		return;

	Ntri++;
	p = word;
	Tri[Begword][enc(p[0])][enc(p[1])]++;
	for(; p[0] && p[1] && p[2]; p++)
		Tri[enc(p[0])][enc(p[1])][enc(p[2])]++;
	Tri[enc(p[0])][enc(p[1])][Endword]++;
}
	
double
triprob(char *p, int l, int i)
{
	double n;

	if(i > l-2)
		return 0.5;	// too short

	if(i == 0)
		n = Tri[Begword][enc(p[0])][enc(p[1])];
	else
	if(i == l-1)
		n = Tri[enc(p[i])][enc(p[i+1])][Endword];
	else
		n = Tri[enc(p[i-1])][enc(p[i+0])][enc(p[i+1])];
	return  n / (double)Ntri;
}

/**********************************************************/

int
tabcmp(void *a, void *b)
{
	Table *x = (Table *)a;
	Table *y = (Table *)b;
	return memcmp(x->code, y->code, Ncode);
}

int
rescmp(void *a, void *b)
{
	Result **x = (Result **)a;
	Result **y = (Result **)b;
	if((*x)->dist == (*y)->dist)
		return strcmp((*x)->word, (*y)->word);
	return (*x)->dist - (*y)->dist;
}

void
ingest(void)
{
	char *p;
	vlong off;
	int alloc;

	off = 0;
	Tab = nil;
	Ntab = alloc = 0;
	while((p = Brdline(Words, '\n')) != nil){
		p[Blinelen(Words) -1] = 0;
	
		triagrams(p);

		if(Ntab >= alloc){
			alloc += 128;
			if((Tab = realloc(Tab, alloc*sizeof(Table))) == nil)
				sysfatal("No memory: %r\n");
		}
		metaphone(p, Tab[Ntab].code, Ncode);
		Tab[Ntab].off = off;
		Ntab++;
		off = Bseek(Words, 0, 1);
	}
	qsort(Tab, Ntab, sizeof(Table), tabcmp);
}

void
results(char *word, Result *root)
{
	int n, i;
	char *prev;
	Result **idx, *r;

	n = 0;
	for(r = root; r; r = r->next)
		n++;
	if((idx = malloc(n * sizeof(Result *))) == nil)
		sysfatal("No memory: %r\n");
	i = 0;
	for(r = root; r; r = r->next)
		idx[i++] = r;
	qsort(idx, n, sizeof(Result *), rescmp);

	print("%-16s → ", word);
	prev = "";
	for(i = 0; i < n; i++){
		r = idx[i];
		if(strcmp(r->word, prev) == 0)
			continue;
		print("%s ", r->word);

		if(r->dist == 0)
			break;

		if((i > Idx && r->dist > Dist) || i > Maxidx)
			break;
		prev = r->word;
	}
	print("\n");
	free(idx);
}


int
lookup(Result **root, char *word, Table *tp)
{
	Result *r;
	char l, *p;
	Table *first, *last;

	for(r = *root; r; r = r->next)
		if(memcmp(r->code, tp->code, Ncode) == 0)		// already tested
			return 0;

	for(first = tp; tabcmp(tp, first) == 0 && first >= Tab; first--)
		continue;
	first++;
	for(last = tp; tabcmp(tp, last) == 0; last++)
		continue;

	for(tp = first; tp < last; tp++){
		Bseek(Words, tp->off, 0);
		if((p = Brdline(Words, '\n')) == nil){
			fprint(2, "%s - read failed at off=%lld: %r\n", Wordfile, tp->off);
			continue;
		}
		l = Blinelen(Words) -2;
		p[l+1] = 0;
		if((r = malloc(sizeof(Result)+strlen(p)+1)) == nil)
			sysfatal("No memory");
		r->dist = levenshtein(word, p);
		memcpy(r->code, tp->code, Ncode);
		r->word = ((char *)r)+sizeof(Result);
		strcpy(r->word, p);
		r->next = *root;
		*root = r;

		if(r->dist == 0)
			return 1;

		/*
		 * Workaround for feature of deroff where it strips
		 * trailing periods from all words, even abbreviations.
		 */
		if(p[l] == '.' && strncmp(p, word, l-1) == 0){
			r->dist = 0;
			return 1;
		}
	}
	return 0;
}

void
permute(char *word)
{
	char *buf;
	int i, j, l;
	Table t, *tp;
	Result *r, *nr, *root;

	root = nil;
	l = strlen(word);
	if((buf = malloc(l+2)) == nil)		// 2 == 1 insertion + string terminator
		sysfatal("No memory %r\n");

	
	/* unchanged */
	metaphone(word, t.code, Ncode);
	if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil){
		Generated++;
		if(lookup(&root, word, tp) == 1)
			Unknown++;
	}

	/* deletion */
	for(i = 0; i < l; i++){
		Generated++;
		strncpy(buf, word, i);
		strcpy(buf+i, word+i+1);
		if(triprob(buf, l, i) == 0){
			Rejected++;
			continue;
		}
		metaphone(buf, t.code, Ncode);
		if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
			lookup(&root, word, tp);
				Unknown++;
	}

	/* modification */
	for(j = 'a'; j <= 'z'; j++){
		for(i = 0; i < l; i++){
			Generated++;
			strcpy(buf, word);
			buf[i] = j;
			if(triprob(buf, l, i) == 0){
				Rejected++;
				continue;
			}
			metaphone(buf, t.code, Ncode);
			if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
				lookup(&root, word, tp);
					Unknown++;
		}
	}

	/* transposition */
	for(j = 0; j < l-1; j++){
		for(i = j+1; i < l; i++){
			Generated++;
			strcpy(buf, word);
			buf[i] = word[j];
			buf[j] = word[i];
			if(triprob(buf, l, i) == 0 || triprob(buf, l, j) == 0){
				Rejected++;
				continue;
			}
			metaphone(buf, t.code, Ncode);
			if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
				lookup(&root, word, tp);
					Unknown++;
		}
	}

	/* insertion */
	for(j = 'a'; j <= 'z'; j++){
		for(i = 0; i < l; i++){
			Generated++;
			strncpy(buf, word, i);
			buf[i+1] = j;
			strcpy(buf+i+1, word+i);
			if(triprob(buf, l, i+1) == 0){
				Rejected++;
				continue;
			}
			metaphone(buf, t.code, Ncode);
			if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
				lookup(&root, word, tp);
					Unknown++;
		}
	}

	results(word, root);
	free(buf);

	for(r = root; r; r = nr){
		nr = r->next;
		free(r);
	}
}

void
usage(void)
{
	fprint(2, "usage: %s [-s] [-d dict] [words]\n", argv0);
	fprint(2, "  -d dict  use dict rather than %s\n", Wordfile);
	fprint(2, "  -s       print stats at exit\n");
	exits("usage");
}

void
main(int argc, char *argv[])
{
	char *p;
	Biobuf bi;
	int i, j, stats;

	stats = 0;
	ARGBEGIN{
	case 's':
		stats++;
		break;
	case 'd':
		Wordfile = EARGF(usage());
		break;
	default:
		usage();
		break;
	}ARGEND;

	if((Words = Bopen(Wordfile, OREAD)) == nil)
		sysfatal("%s - cannot open: %r\n", Wordfile);

	if((Tri = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
		sysfatal("No memory: %r\n");
	for(i = 0; i < Trilen; i++){
		if((Tri[i] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
			sysfatal("No memory: %r\n");
		for(j = 0; j < Trilen; j++)
			if((Tri[i][j] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
				sysfatal("No memory: %r\n");
	}

	ingest();

	if(argc){
		for(i = 0; i < argc; i++)
			permute(argv[i]);
	}
	else{
		Binit(&bi, OREAD, 0);
		while((p = Brdline(&bi, '\n')) != nil){
			for(i = Blinelen(&bi) -1; isspace(p[i]) && i >= 0; i--)
				p[i] = 0;
			for(; isspace(*p); p++)
				continue;
			if(*p == 0)
				continue;
			permute(p);
		}
		Bterm(&bi);
	}

	free(Tab);
	Bterm(Words);

	if(stats)
		print("generated: %llud rejected: %llud unknown: %llud\n",
			Generated, Rejected, Unknown);
	exits(0);
}

Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to webmaster@9p.io.