A Walkthrough for Making Custom Dictionaries for tsearch2

Ben Chobot
06 March 2004

This walkthrough describes how I made a custom dictionary for tsearch2. I had some issues, so the hope is that this document will help others create their own custom dictionaries without running into the problems I did.

I'm going to assume you have a working tsearch2 installation and know how it works; if that's not the case, go read the excellent tsearch2 Guide and tsearch2 Intro.


Table of Contents

Rationale and Requirements
Making the Dictionary
Installing the Dictionary
Source code


Rationale and Requirements

....or, what my dictionary needed to do, and why I needed to make one in the first place.

Once upon a time I had a project where I wanted to let users search music artist names, song titles, album titles, etc. Of course, it would be great if everybody agreed on the way to write such names. Of course, that's not reality - is it "Beach Boys" or "The Beach Boys"? So I started to look at the state of fuzzy searching with Postgres. I found that, untouched, tsearch2 could help me do accurate fuzzy searches for "Beatles" vs. "The Beatles", or "Flock of Seagulls" vs. "A Flock of Seagulls". Unfortunately, it ran into limitations when trying to search for strings with numbers. How could I get "3 Dog Night" to return the same results as "Three Dog Night", "a hundred days off" to be equivalent to "100 days off", or "10000 Maniacs" to be the same as "10,000 Maniacs", "ten thousand maniacs", or even "10 thousand maniacs"?

tsearch2 easily offers a way to map specific token types from a parser to specific dictionaries, so I figured the easiest way to tackle this problem was going to be to convert all numbers into english strings, and then make a ts_vector out of that. For instance, make "3" map to "three", and "10000" and "10,000" both map to "ten thousand". Using such a dictionary, the number of lexemes in the ts_vector would certainly increase, but because most of my strings were going to only be a handful of words at most, that wasn't an issue.

In short, no matter how a user wanted to type "10,000 Maniacs", the ts_vector should always end up being as close as possible to {ten,thousand,maniacs}.


Making the dictionary

Now that we have the requirements, here's how I made my dictionary (which I called num2english). I started with the gendict code. PGSQL_SRC, below, should be replaced with wherever you keep your Postgres source.

cd PGSQL_SRC/contrib/tsearch2/gendict
./config -n num2english
cd PGSQL_SRC/contrib/dict_num2english

You'll see a file in this directory called dict_tmpl.c. I replaced the contents of that file with this code. You'll probably want to use your own code, unless you too want a num2english dictionary.

(Unfortunately, unless you're familiar with making C-language stored procs for Postgres, this is the trickiest part of the entire process. Any issues you'll have are probably going to be because there are no examples out there for what you want to do. All I can suggest is to make use of the mailing lists - it's what they're there for.)

Now that the code is in place, go ahead and make the dictionary:

make
make install


Installing the dictionary

To install the dictionary, we'll need a database. I called mine fuzzy, and entered commands like so:

createdb fuzzy
psql fuzzy < /usr/local/pgsql/share/contrib/tsearch2.sql
psql fuzzy < /usr/local/pgsql/share/contrib/dict_num2english.sql

Of course, if you installed your database in a non-default location, you'll have to modify those psql commands appropriately.

Now, to set up the dictionary in tsearch2. We'll just change the default parsing config for now:

psql fuzzy
select to_tsvector('default','100');
 to_tsvector
-------------
 '100':1
(1 row)

update pg_ts_cfgmap set dict_name = '{num2english}'
	where ts_name = 'default' and tok_alias in ('float','int','uint','sfloat');
UPDATE 4

select reset_tsearch();
NOTICE:  TSearch cache cleaned
 reset_tsearch
---------------
  
(1 row)

select to_tsvector('default','100');
     to_tsvector
---------------------
 'one':1 'hundred':1
(1 row)


Mission accomplished!


Source Code

If you use PostgreSQL version 8.1 or later, you need dict_tmpl-8.1.c instead, because the dictionary interface changed:
lexize() must return a pointer to a TSLexeme array instead of char**.
/* 
 * num2english dictionary by Ben Chobot <bench@silentmedia.com>, based on
 * example of dictionary 
 * Teodor Sigaev <teodor@sigaev.ru>
 *
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "postgres.h"

#include "dict.h"
#include "common.h"

#include "subinclude.h"

/*
 * Table of the English words that cannot be built from smaller pieces:
 * 0 through 19 and the multiples of ten up to 90.  The entries are sorted
 * by ascending value, and for the first twenty entries the array index is
 * the same as the value.  A final entry with an empty name and the value
 * 999 marks the end of the table.
 */
struct nx
{
	char	name[20];	/* English word for the number */
	int		value;		/* the number that the word stands for */
};

static struct nx num2english_numarr[] =
{
	/* 0 - 9 */
	{"zero", 0},
	{"one", 1},
	{"two", 2},
	{"three", 3},
	{"four", 4},
	{"five", 5},
	{"six", 6},
	{"seven", 7},
	{"eight", 8},
	{"nine", 9},

	/* 10 - 19 */
	{"ten", 10},
	{"eleven", 11},
	{"twelve", 12},
	{"thirteen", 13},
	{"fourteen", 14},
	{"fifteen", 15},
	{"sixteen", 16},
	{"seventeen", 17},
	{"eighteen", 18},
	{"nineteen", 19},

	/* multiples of ten */
	{"twenty", 20},
	{"thirty", 30},
	{"forty", 40},
	{"fifty", 50},
	{"sixty", 60},
	{"seventy", 70},
	{"eighty", 80},
	{"ninety", 90},

	/* end marker: larger than any value looked up in this table */
	{"", 999}
};

/*
 * Names of the powers of one thousand: entry n is the word for 1000^n.
 * Entry 0 is empty because the lowest three-digit group carries no
 * denomination.  "quindecillion" (1000^16) was missing from the original
 * list, which shifted every later name down by one power of a thousand.
 */
static char *num2english_denom[]=
{
	"",						/* 1000^0 */
	"thousand",				/* 1000^1 */
	"million",				/* 1000^2 */
	"billion",				/* 1000^3 */
	"trillion",				/* 1000^4 */
	"quadrillion",			/* 1000^5 */
	"quintillion",			/* 1000^6 */
	"sextillion",			/* 1000^7 */
	"septillion",			/* 1000^8 */
	"octillion",			/* 1000^9 */
	"nonillion",			/* 1000^10 */
	"decillion",			/* 1000^11 */
	"undecillion",			/* 1000^12 */
	"duodecillion",			/* 1000^13 */
	"tredecillion",			/* 1000^14 */
	"quattuordecillion",	/* 1000^15 */
	"quindecillion",		/* 1000^16 */
	"sexdecillion",			/* 1000^17 */
	"septendecillion",		/* 1000^18 */
	"octodecillion",		/* 1000^19 */
	"novemdecillion",		/* 1000^20 */
	"vigintillion"			/* 1000^21 */
};


/* Forward declarations of the number-to-words helpers defined further down. */
static char *itowords(long long val);
static char *cvt3(int val);
static char *cvt2(int val);


 /*
  * dinit_num2english: initialisation entry point of the dictionary.
  * Nothing needs to be set up, so a NULL pointer is returned.
  */
 Datum dinit_num2english(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(dinit_num2english);

 Datum
 dinit_num2english(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_POINTER(NULL);
 }

/* Copy the len bytes starting at start into a freshly palloc'd C string. */
static char *
num2english_lexdup(const char *start, size_t len)
{
	char	   *lex = palloc(len + 1);

	memcpy(lex, start, len);
	lex[len] = '\0';
	return lex;
}

PG_FUNCTION_INFO_V1(dlexize_num2english);
Datum dlexize_num2english(PG_FUNCTION_ARGS);

/*
 * dlexize_num2english: turn a numeric token into its English words.
 *
 * Argument 0 is not used.  Argument 1 points at the token text and
 * argument 2 is its length in bytes.
 *
 * The integer part of the token is spelled out with itowords().  If the
 * token contains a '.' followed by at least one more character, the text
 * after the dot is converted the same way and the two halves are joined
 * as "<int words> . <fraction words>", so the dot becomes a lexeme of its
 * own.
 *
 * Returns a palloc'd, NULL-terminated array of palloc'd strings, one per
 * word.  An empty token yields an array holding only the terminator.
 */
Datum
dlexize_num2english(PG_FUNCTION_ARGS)
{
	char	   *in = (char *) PG_GETARG_POINTER(1);
	char	   *txt = pnstrdup(in, PG_GETARG_INT32(2));
	char	  **res;
	char	   *phrase;
	char	   *cursor;
	char	   *last;
	int			lexes = 1;
	int			thisLex = 0;

	if (*txt == '\0')
	{
		pfree(txt);
		res = palloc(sizeof(char *));
		res[0] = NULL;
		PG_RETURN_POINTER(res);
	}

	/*
	 * strtoll() rather than atoll(): identical for in-range input, but it
	 * saturates instead of invoking undefined behaviour when the token does
	 * not fit in a long long.
	 */
	phrase = itowords(strtoll(txt, NULL, 10));

	cursor = strchr(txt, '.');
	if (cursor != NULL && cursor[1] != '\0')
	{
		char	   *intwords = phrase;
		char	   *fracwords = itowords(strtoll(cursor + 1, NULL, 10));
		size_t		need = strlen(intwords) + strlen(" . ") + strlen(fracwords) + 1;

		phrase = palloc(need);
		snprintf(phrase, need, "%s . %s", intwords, fracwords);
		pfree(intwords);
		pfree(fracwords);
	}
	pfree(txt);

	/*
	 * Every space ends a word.  A doubled space is counted twice, which only
	 * makes the array slightly larger than necessary.
	 */
	for (cursor = phrase; *cursor; cursor++)
		if (*cursor == ' ')
			lexes++;

	res = palloc(sizeof(char *) * (lexes + 1));

	for (last = cursor = phrase; *cursor; cursor++)
	{
		if (*cursor != ' ')
			continue;

		res[thisLex++] = num2english_lexdup(last, (size_t) (cursor - last));

		/*
		 * itowords() can emit two spaces in a row (never more); swallow the
		 * second one so it does not produce an empty lexeme.
		 */
		if (cursor[1] == ' ')
			cursor++;
		last = cursor + 1;
	}

	/* the final word is terminated by the end of the string, not a space */
	res[thisLex++] = num2english_lexdup(last, (size_t) (cursor - last));
	res[thisLex] = NULL;

	pfree(phrase);

	PG_RETURN_POINTER(res);
}

/* The code below was taken from http://h21007.www2.hp.com/dspp/tech/tech_TechDocumentDetailPage_IDX/1,1701,3556,00.html 
 and modified slightly to fit in the postgres stored proc framework. It appears to be without copyright. */

/*
 * cvt2: spell out a number below one hundred.
 *
 * Picks the largest table entry that does not exceed val (the search starts
 * at index 1, so "zero" is chosen only when nothing else fits) and, if
 * something is left over, appends a space and the word for the remainder.
 * The remainder is used directly as a table index, which is only valid
 * while it stays in the range 1..9, i.e. for val up to 99.
 *
 * Returns a palloc'd string that the caller must pfree.
 */
static char *cvt2(int val)
{
	char	buf[80];
	char   *out;
	size_t	nbytes;
	int		idx = 1;
	int		ones;

	/* walk forward to the first entry that is too big, then step back */
	while (num2english_numarr[idx].value <= val)
		idx++;
	idx--;

	strcpy(buf, num2english_numarr[idx].name);

	ones = val - num2english_numarr[idx].value;
	if (ones > 0)
	{
		strcat(buf, " ");
		strcat(buf, num2english_numarr[ones].name);
	}

	nbytes = strlen(buf) + 1;
	out = palloc(nbytes);
	memcpy(out, buf, nbytes);
	return out;
}



/*
 * cvt3: spell out a number of up to three digits.
 *
 * The hundreds digit (if non-zero) becomes "<digit> hundred"; whatever is
 * left below one hundred (if non-zero) is handed to cvt2() and appended
 * after a single space.  A value of zero produces an empty string.
 *
 * Returns a palloc'd string that the caller must pfree.
 */
static char *cvt3(int val)
{
	char	buf[80];
	char   *out;
	size_t	nbytes;
	int		hundreds = val / 100;
	int		below = val % 100;

	buf[0] = '\0';

	if (hundreds > 0)
	{
		strcat(buf, num2english_numarr[hundreds].name);
		strcat(buf, " hundred");
		if (below > 0)
			strcat(buf, " ");
	}

	if (below > 0)
	{
		char   *tail = cvt2(below);

		strcat(buf, tail);
		pfree(tail);
	}

	nbytes = strlen(buf) + 1;
	out = palloc(nbytes);
	memcpy(out, buf, nbytes);
	return out;
}

/*
 * itowords: spell out a whole number in English.
 *
 * The magnitude is cut into three-digit groups starting from the least
 * significant end.  Each non-zero group is converted with cvt3() and, for
 * every group but the lowest, followed by its denomination ("thousand",
 * "million", ...).  Groups are prepended to the result as they are
 * produced, so the most significant group ends up first.  Negative values
 * get a leading "negative".
 *
 * Note that a non-zero group which is not the most significant one is
 * preceded by an extra space, so the result can contain two consecutive
 * spaces (never more), e.g. "one thousand  two hundred".
 *
 * The magnitude is kept in an unsigned long long: negating the most
 * negative long long as a signed value overflows, which is undefined
 * behaviour.
 *
 * Returns a palloc'd string that the caller must pfree.
 */
static char *itowords(long long val)
{
	unsigned long long mag;		/* absolute value still to be converted */
	int		tri;				/* current three-digit group */
	int		place = 0;			/* index of the current group's denomination */
	int		neg = 0;			/* set when val is negative */

	/*
	 * Assuming a 64-bit long long, the longest possible result stays under
	 * 250 bytes and the longest single group under 45 bytes, so these
	 * fixed-size buffers cannot overflow.
	 */
	char	temp[255];
	char	word[255];
	char	phrase[100];
	char   *ret;
	size_t	nbytes;

	if (val == 0)
	{
		ret = palloc(sizeof("zero"));
		memcpy(ret, "zero", sizeof("zero"));
		return ret;
	}

	if (val < 0)
	{
		neg = 1;
		/* unsigned arithmetic wraps, so this is well defined for every val */
		mag = 0ULL - (unsigned long long) val;
	}
	else
		mag = (unsigned long long) val;

	word[0] = '\0';

	while (mag > 0)
	{
		phrase[0] = '\0';
		tri = (int) (mag % 1000);	/* last three digits */
		mag /= 1000;				/* base 10 shift by 3 */

		if (tri > 0)
		{
			char   *sub = cvt3(tri);

			strcat(phrase, sub);
			pfree(sub);
			strcat(phrase, " ");

			if (place > 0)
			{
				strcat(phrase, num2english_denom[place]);
				strcat(phrase, " ");
			}
		}
		place++;

		/* put the new group in front of what has been built so far */
		strcpy(temp, word);
		if (mag > 0 && tri > 0)
		{
			/* more significant groups will follow: keep a separator in front */
			strcpy(word, " ");
			strcat(word, phrase);
		}
		else
			strcpy(word, phrase);
		strcat(word, temp);
	}

	if (neg)
	{
		strcpy(temp, word);
		strcpy(word, "negative ");
		strcat(word, temp);
	}

	/* every group ends in a space; drop the one at the very end */
	word[strlen(word) - 1] = '\0';

	nbytes = strlen(word) + 1;
	ret = palloc(nbytes);
	memcpy(ret, word, nbytes);
	return ret;
}