Plan 9 from Bell Labs’s /usr/web/sources/contrib/steve/root/sys/src/cmd/seft/words.h

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


/*******************************************************************************
*
* Project:      seft (search engine for text)
*
* File:         words.h
*
* Author:       Owen de Kretser (oldk@cs.mu.oz.au)
*
* Organisation: Dept. of CS&SE, University of Melbourne
*
* Date:         April 1999
*
* Purpose:      word and non-word parsing macros
*
* Notes:        adapted from the code by Neil Sharman and Tim Shimmin
* 
*******************************************************************************/

/***** #includes **************************************************************/

#include <sys/types.h>
#include <ctype.h>

/***** #defines and Macros ****************************************************/

#define MAXWORDLEN	15
	/* Maximum length in bytes of any word or non-word. Note that
	   variations to MAXWORDLEN may have dramatic effects on the rest
	   of the program, as the length and the prefix match are packed
	   together into a four bit nibble, and there is no check that
	   this is possible, i.e., leave MAXWORDLEN alone...
	   NOTE(review): the parsing macros visible in this header do not
	   reference MAXWORDLEN; PARSE_WORD and PARSE_WORD_TOLOWER cap
	   their output at MAXSTEMLEN instead. */
#define MAXSTEMLEN	255
	/* Maximum length in bytes of any stem. Note that
	   variations to MAXSTEMLEN may have dramatic effects on the rest
	   of the program, i.e., leave MAXSTEMLEN alone...
	   This is also the cap applied by PARSE_WORD and
	   PARSE_WORD_TOLOWER below. */
#define MAXNUMERIC	4
	/* Maximum number of numeric characters permitted in a word.
	   This avoids long sequences of numbers creating just one
	   word occurrence for each number. At most 10,000 all numeric
	   words will be permitted. */

#define	INAWORD(c)	(isascii(c) && isalnum(c))
	/* The definition of what characters are permitted in a word:
	   ASCII letters and digits. isascii() is tested first, so
	   isalnum() is only reached for values in the ASCII range.
	   The result is 0 or 1. Note that c is evaluated twice, so the
	   argument must not have side effects. */

/* 1 if c is a decimal digit, 0 otherwise (suitable for adding to a counter).
   The argument is converted to unsigned char before it reaches isdigit(),
   because passing a negative value other than EOF (e.g. a high-bit byte read
   through a plain char pointer) to a <ctype.h> function is undefined.
   c is evaluated exactly once. */
#define INNUMBER(c)     (isdigit((unsigned char)(c)) ? 1 : 0)


/*******************************************************************************
*
* Macro: PARSE_WORD
*
* Purpose: extract `pascal' word from text buffer
*
* Arguments:
*   Word - destination byte buffer. Word[0] receives the length and the
*          characters are stored from Word[1] onwards; no terminator is
*          appended. Needs room for MAXSTEMLEN + 1 bytes (and at least 3,
*          because Word[2] is always written).
*   s_in - pointer lvalue into the text; on exit it addresses the first
*          byte that was not consumed.
*   end  - pointer to the last valid byte of the text (inclusive).
*
* Notes:   Bytes are copied while they satisfy INAWORD, up to MAXSTEMLEN
*          bytes, and never beyond end. The bound is tested before *s_in
*          is read, so the byte following end is never accessed.
*          Word[2] is cleared before copying, so it reads as 0 whenever
*          the word is shorter than two bytes (presumably relied upon by a
*          caller - TODO confirm).
*          Word and s_in are evaluated several times: pass plain variables.
*
*******************************************************************************/

#define PARSE_WORD(Word, s_in, end)                           \
  do                                                          \
    {                                                         \
      register unsigned char *wptr = (Word)+1;                \
      register int    length = 0;                             \
                                                              \
      (Word)[2] = '\0';                                       \
      while (length < MAXSTEMLEN && (s_in) <= (end)           \
             && INAWORD(*(s_in)))                             \
        {                                                     \
          *wptr++ = *(s_in)++;                                \
          length++;                                           \
        }                                                     \
      *(Word) = length;                                       \
    }while(0)

/*******************************************************************************
*
* Macro: PARSE_WORD_TOLOWER
*
* Purpose: extract `pascal' word from text buffer (characters tolower'd)
*
* Arguments:
*   Word - destination byte buffer. Word[0] receives the length and the
*          lower-cased characters are stored from Word[1] onwards; no
*          terminator is appended. Needs room for MAXSTEMLEN + 1 bytes.
*   s_in - pointer lvalue into the text; on exit it addresses the first
*          byte that was not consumed.
*   end  - pointer to the last valid byte of the text (inclusive).
*
* Notes:   Bytes are copied while they satisfy INAWORD, up to MAXSTEMLEN
*          bytes, and never beyond end. At most MAXNUMERIC digits are
*          accepted per word: the digit that would exceed the limit is
*          left unconsumed, so it starts the next word.
*          The bound is tested before *s_in is read, so the byte following
*          end is never accessed.
*          Word and s_in are evaluated several times: pass plain variables.
*
*******************************************************************************/

#define PARSE_WORD_TOLOWER(Word, s_in, end)                   \
  do                                                          \
    {                                                         \
      register unsigned char *wptr = (Word)+1;                \
      register int    length = 0;                             \
      register int    numeric = 0;                            \
                                                              \
      while (length < MAXSTEMLEN && (s_in) <= (end))          \
        {                                                     \
          register int c = *(s_in);                           \
                                                              \
          if (!INAWORD(c))                                    \
            break;                                            \
          if ((numeric += INNUMBER(c)) > MAXNUMERIC)          \
            break;                                            \
          *wptr++ = tolower(c);                               \
          length++;                                           \
          (s_in)++;                                           \
        }                                                     \
      *(Word) = length;                                       \
    }while(0)

/*******************************************************************************
*
* Macro: PARSE_NON_WORD
*
* Purpose: extract `pascal' non-word from text buffer 
*
* Arguments:
*   Word - destination byte buffer. Word[0] receives the length and the
*          bytes are stored from Word[1] onwards; no terminator is
*          appended.
*   s_in - pointer lvalue into the text; on exit it addresses the first
*          byte that was not consumed.
*   end  - pointer to the last valid byte of the text (inclusive).
*
* Notes:   Bytes are copied while they do NOT satisfy INAWORD and never
*          beyond end. The bound is tested before *s_in is read, so the
*          byte following end is never accessed.
*          NOTE(review): unlike PARSE_WORD there is no cap on the length,
*          so the caller's buffer must be able to hold the whole run, and
*          a run longer than a byte can count will not be represented
*          correctly in Word[0] - confirm callers guarantee short runs.
*          Word and s_in are evaluated several times: pass plain variables.
*
*******************************************************************************/

#define PARSE_NON_WORD(Word,s_in, end)                        \
  do                                                          \
    {                                                         \
      register unsigned char *wptr = (Word)+1;                \
      register int    length = 0;                             \
                                                              \
      while ((s_in) <= (end) && !INAWORD(*(s_in)))            \
        {                                                     \
          *wptr++ = *(s_in)++;                                \
          length++;                                           \
        }                                                     \
      *(Word) = length;                                       \
    }while(0)

/*******************************************************************************
*
* Macro: PARSE_NON_WORD_DISCARD
*
* Purpose: skip over a non-word in the text buffer without storing it
*
* Arguments:
*   s_in - pointer lvalue into the text; advanced past every leading byte
*          that does not satisfy INAWORD.
*   end  - pointer to the last valid byte of the text (inclusive).
*
* Notes:   The bound is tested before *s_in is read, so the byte following
*          end is never accessed. s_in is evaluated several times: pass a
*          plain variable.
*
*******************************************************************************/

#define PARSE_NON_WORD_DISCARD(s_in, end)                     \
  do                                                          \
    {                                                         \
      while ((s_in) <= (end) && !INAWORD(*(s_in)))            \
        (s_in)++;                                             \
    }while(0)

/*******************************************************************************
*
* Macro: PARSE_NON_WORD_FLAG
*
* Purpose: parse non-word from text buffer, discard word, check for ^B
*
* Arguments:
*   flag - integer lvalue; set to 1 if a ^B ('\002') byte occurs in the
*          skipped non-word, otherwise set to 0.
*   s_in - pointer lvalue into the text; advanced past every leading byte
*          that does not satisfy INAWORD.
*   end  - pointer to the last valid byte of the text (inclusive).
*
* Notes:   The bound is tested before *s_in is read, so the byte following
*          end is never accessed. flag and s_in are evaluated several
*          times: pass plain variables.
*
*******************************************************************************/

#define PARSE_NON_WORD_FLAG(flag,s_in, end)                   \
  do                                                          \
    {                                                         \
      (flag) = 0;                                             \
      while ((s_in) <= (end) && !INAWORD(*(s_in)))            \
        {                                                     \
          if (*(s_in) == '\002')                              \
            {                                                 \
              (flag) = 1;                                     \
            }                                                 \
          (s_in)++;                                           \
        }                                                     \
    }while(0)


Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to webmaster@9p.io.