Blame - third_party/sqlite/sqlite-src-3080704/ext/fts1/fts1_tokenizer1.c - mojo-tools

blob: f58fba8f8e613ffed6764ffd0868d9646cb96011 [file] [log] [blame]

James Robinson	fa68617	2015-02-23 18:32:32 -0800	[diff] [blame^]	1	/*
				2	** The author disclaims copyright to this source code.
				3	**
				4	*************************************************************************
				5	** Implementation of the "simple" full-text-search tokenizer.
				6	*/
				7
				8	/*
				9	** The code in this file is only compiled if:
				10	**
				11	** * The FTS1 module is being built as an extension
				12	** (in which case SQLITE_CORE is not defined), or
				13	**
				14	** * The FTS1 module is being built into the core of
				15	** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
				16	*/
				17	#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
				18
				19
				20	#include <assert.h>
				21	#include <stdlib.h>
				22	#include <stdio.h>
				23	#include <string.h>
				24	#include <ctype.h>
				25
				26	#include "fts1_tokenizer.h"
				27
				28	typedef struct simple_tokenizer {
				29	sqlite3_tokenizer base;
				30	char delim[128]; /* flag ASCII delimiters */
				31	} simple_tokenizer;
				32
				33	typedef struct simple_tokenizer_cursor {
				34	sqlite3_tokenizer_cursor base;
				35	const char pInput; / input we are tokenizing */
				36	int nBytes; /* size of the input */
				37	int iOffset; /* current position in pInput */
				38	int iToken; /* index of next token to be returned */
				39	char pToken; / storage for current token */
				40	int nTokenAllocated; /* space allocated to zToken buffer */
				41	} simple_tokenizer_cursor;
				42
				43
				44	/* Forward declaration */
				45	static const sqlite3_tokenizer_module simpleTokenizerModule;
				46
				47	static int isDelim(simple_tokenizer *t, unsigned char c){
				48	return c<0x80 && t->delim[c];
				49	}
				50
				51	/*
				52	** Create a new tokenizer instance.
				53	*/
				54	static int simpleCreate(
				55	int argc, const char * const *argv,
				56	sqlite3_tokenizer **ppTokenizer
				57	){
				58	simple_tokenizer *t;
				59
				60	t = (simple_tokenizer ) calloc(sizeof(t), 1);
				61	if( t==NULL ) return SQLITE_NOMEM;
				62
				63	/* TODO(shess) Delimiters need to remain the same from run to run,
				64	** else we need to reindex. One solution would be a meta-table to
				65	** track such information in the database, then we'd only want this
				66	** information on the initial create.
				67	*/
				68	if( argc>1 ){
				69	int i, n = strlen(argv[1]);
				70	for(i=0; i<n; i++){
				71	unsigned char ch = argv[1][i];
				72	/* We explicitly don't support UTF-8 delimiters for now. */
				73	if( ch>=0x80 ){
				74	free(t);
				75	return SQLITE_ERROR;
				76	}
				77	t->delim[ch] = 1;
				78	}
				79	} else {
				80	/* Mark non-alphanumeric ASCII characters as delimiters */
				81	int i;
				82	for(i=1; i<0x80; i++){
				83	t->delim[i] = !isalnum(i);
				84	}
				85	}
				86
				87	*ppTokenizer = &t->base;
				88	return SQLITE_OK;
				89	}
				90
				91	/*
				92	** Destroy a tokenizer
				93	*/
				94	static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
				95	free(pTokenizer);
				96	return SQLITE_OK;
				97	}
				98
				99	/*
				100	** Prepare to begin tokenizing a particular string. The input
				101	** string to be tokenized is pInput[0..nBytes-1]. A cursor
				102	** used to incrementally tokenize this string is returned in
				103	** *ppCursor.
				104	*/
				105	static int simpleOpen(
				106	sqlite3_tokenizer pTokenizer, / The tokenizer */
				107	const char pInput, int nBytes, / String to be tokenized */
				108	sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */
				109	){
				110	simple_tokenizer_cursor *c;
				111
				112	c = (simple_tokenizer_cursor ) malloc(sizeof(c));
				113	if( c==NULL ) return SQLITE_NOMEM;
				114
				115	c->pInput = pInput;
				116	if( pInput==0 ){
				117	c->nBytes = 0;
				118	}else if( nBytes<0 ){
				119	c->nBytes = (int)strlen(pInput);
				120	}else{
				121	c->nBytes = nBytes;
				122	}
				123	c->iOffset = 0; /* start tokenizing at the beginning */
				124	c->iToken = 0;
				125	c->pToken = NULL; /* no space allocated, yet. */
				126	c->nTokenAllocated = 0;
				127
				128	*ppCursor = &c->base;
				129	return SQLITE_OK;
				130	}
				131
				132	/*
				133	** Close a tokenization cursor previously opened by a call to
				134	** simpleOpen() above.
				135	*/
				136	static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
				137	simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;
				138	free(c->pToken);
				139	free(c);
				140	return SQLITE_OK;
				141	}
				142
				143	/*
				144	** Extract the next token from a tokenization cursor. The cursor must
				145	** have been opened by a prior call to simpleOpen().
				146	*/
				147	static int simpleNext(
				148	sqlite3_tokenizer_cursor pCursor, / Cursor returned by simpleOpen */
				149	const char *ppToken, / OUT: ppToken is the token text /
				150	int pnBytes, / OUT: Number of bytes in token */
				151	int piStartOffset, / OUT: Starting offset of token */
				152	int piEndOffset, / OUT: Ending offset of token */
				153	int piPosition / OUT: Position integer of token */
				154	){
				155	simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;
				156	simple_tokenizer t = (simple_tokenizer ) pCursor->pTokenizer;
				157	unsigned char p = (unsigned char )c->pInput;
				158
				159	while( c->iOffset<c->nBytes ){
				160	int iStartOffset;
				161
				162	/* Scan past delimiter characters */
				163	while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
				164	c->iOffset++;
				165	}
				166
				167	/* Count non-delimiter characters. */
				168	iStartOffset = c->iOffset;
				169	while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
				170	c->iOffset++;
				171	}
				172
				173	if( c->iOffset>iStartOffset ){
				174	int i, n = c->iOffset-iStartOffset;
				175	if( n>c->nTokenAllocated ){
				176	c->nTokenAllocated = n+20;
				177	c->pToken = realloc(c->pToken, c->nTokenAllocated);
				178	if( c->pToken==NULL ) return SQLITE_NOMEM;
				179	}
				180	for(i=0; i<n; i++){
				181	/* TODO(shess) This needs expansion to handle UTF-8
				182	** case-insensitivity.
				183	*/
				184	unsigned char ch = p[iStartOffset+i];
				185	c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
				186	}
				187	*ppToken = c->pToken;
				188	*pnBytes = n;
				189	*piStartOffset = iStartOffset;
				190	*piEndOffset = c->iOffset;
				191	*piPosition = c->iToken++;
				192
				193	return SQLITE_OK;
				194	}
				195	}
				196	return SQLITE_DONE;
				197	}
				198
				199	/*
				200	** The set of routines that implement the simple tokenizer
				201	*/
				202	static const sqlite3_tokenizer_module simpleTokenizerModule = {
				203	0,
				204	simpleCreate,
				205	simpleDestroy,
				206	simpleOpen,
				207	simpleClose,
				208	simpleNext,
				209	};
				210
				211	/*
				212	** Allocate a new simple tokenizer. Return a pointer to the new
				213	** tokenizer in *ppModule
				214	*/
				215	void sqlite3Fts1SimpleTokenizerModule(
				216	sqlite3_tokenizer_module const**ppModule
				217	){
				218	*ppModule = &simpleTokenizerModule;
				219	}
				220
				221	#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */