Blame - third_party/sqlite/src/ext/fts3/fts3_tokenizer.h - mojo-tools

blob: 4a40b2b38503833bf17ded9ee2e2742e86267fc6 [file] [log] [blame]

Scott Graham	d19529d	2014-11-03 15:04:31 -0800	[diff] [blame]	1	/*
				2	** 2006 July 10
				3	**
				4	** The author disclaims copyright to this source code.
				5	**
				6	*************************************************************************
				7	** Defines the interface to tokenizers used by fulltext-search. There
				8	** are three basic components:
				9	**
				10	** sqlite3_tokenizer_module is a singleton defining the tokenizer
				11	** interface functions. This is essentially the class structure for
				12	** tokenizers.
				13	**
				14	** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
				15	** including customization information defined at creation time.
				16	**
				17	** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
				18	** tokens from a particular input.
				19	*/
				20	#ifndef _FTS3_TOKENIZER_H_
				21	#define _FTS3_TOKENIZER_H_
				22
				23	/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
				24	** If tokenizers are to be allowed to call sqlite3_*() functions, then
				25	** we will need a way to register the API consistently.
				26	*/
				27	#include "sqlite3.h"
				28
				29	/*
				30	** Structures used by the tokenizer interface. When a new tokenizer
				31	** implementation is registered, the caller provides a pointer to
				32	** an sqlite3_tokenizer_module containing pointers to the callback
				33	** functions that make up an implementation.
				34	**
				35	** When an fts3 table is created, it passes any arguments passed to
				36	** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
				37	** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
				38	** implementation. The xCreate() function in turn returns an
				39	** sqlite3_tokenizer structure representing the specific tokenizer to
				40	** be used for the fts3 table (customized by the tokenizer clause arguments).
				41	**
				42	** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
				43	** method is called. It returns an sqlite3_tokenizer_cursor object
				44	** that may be used to tokenize a specific input buffer based on
				45	** the tokenization rules supplied by a specific sqlite3_tokenizer
				46	** object.
				47	*/
				typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
				typedef struct sqlite3_tokenizer sqlite3_tokenizer;
				typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

				/*
				** Table of callbacks that makes up one tokenizer implementation. The
				** caller registering a tokenizer provides a pointer to one of these.
				** Every method member is a pointer to a function returning int.
				*/
				struct sqlite3_tokenizer_module {

				  /*
				  ** Structure version. Should always be set to 0 or 1.
				  */
				  int iVersion;

				  /*
				  ** Create a new tokenizer. The values in the argv[] array are the
				  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
				  ** TABLE statement that created the fts3 table. For example, if
				  ** the following SQL is executed:
				  **
				  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
				  **
				  ** then argc is set to 2, and the argv[] array contains pointers
				  ** to the strings "arg1" and "arg2".
				  **
				  ** This method should return either SQLITE_OK (0), or an SQLite error
				  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
				  ** to point at the newly created tokenizer structure. The generic
				  ** sqlite3_tokenizer.pModule variable should not be initialized by
				  ** this callback. The caller will do so.
				  */
				  int (*xCreate)(
				    int argc,                         /* Size of argv array */
				    const char * const *argv,         /* Tokenizer argument strings */
				    sqlite3_tokenizer **ppTokenizer   /* OUT: Created tokenizer */
				  );

				  /*
				  ** Destroy an existing tokenizer. The fts3 module calls this method
				  ** exactly once for each successful call to xCreate().
				  */
				  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

				  /*
				  ** Create a tokenizer cursor to tokenize an input buffer. The caller
				  ** is responsible for ensuring that the input buffer remains valid
				  ** until the cursor is closed (using the xClose() method).
				  */
				  int (*xOpen)(
				    sqlite3_tokenizer *pTokenizer,        /* Tokenizer object */
				    const char *pInput, int nBytes,       /* Input buffer */
				    sqlite3_tokenizer_cursor **ppCursor   /* OUT: Created tokenizer cursor */
				  );

				  /*
				  ** Destroy an existing tokenizer cursor. The fts3 module calls this
				  ** method exactly once for each successful call to xOpen().
				  */
				  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

				  /*
				  ** Retrieve the next token from the tokenizer cursor pCursor. This
				  ** method should either return SQLITE_OK and set the values of the
				  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
				  ** the end of the buffer has been reached, or an SQLite error code.
				  **
				  ** *ppToken should be set to point at a buffer containing the
				  ** normalized version of the token (i.e. after any case-folding and/or
				  ** stemming has been performed). *pnBytes should be set to the length
				  ** of this buffer in bytes. The input text that generated the token is
				  ** identified by the byte offsets returned in *piStartOffset and
				  ** *piEndOffset. *piStartOffset should be set to the index of the first
				  ** byte of the token in the input buffer. *piEndOffset should be set
				  ** to the index of the first byte just past the end of the token in
				  ** the input buffer.
				  **
				  ** The buffer *ppToken is set to point at is managed by the tokenizer
				  ** implementation. It is only required to be valid until the next call
				  ** to xNext() or xClose().
				  */
				  /* TODO(shess) current implementation requires pInput to be
				  ** nul-terminated. This should either be fixed, or pInput/nBytes
				  ** should be converted to zInput.
				  */
				  int (*xNext)(
				    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
				    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
				    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
				    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
				    int *piPosition      /* OUT: Number of tokens returned before this one */
				  );

				  /***********************************************************************
				  ** Methods below this point are only available if iVersion>=1.
				  */

				  /*
				  ** Configure the language id of a tokenizer cursor.
				  */
				  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
				};

				/*
				** A particular tokenizer, as returned through xCreate(). Holds a pointer
				** back to the module (callback table) it belongs to.
				*/
				struct sqlite3_tokenizer {
				  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
				  /* Tokenizer implementations will typically add additional fields */
				};

				/*
				** A cursor over one input buffer, as returned through xOpen(). Holds a
				** pointer back to the tokenizer that generated it.
				*/
				struct sqlite3_tokenizer_cursor {
				  sqlite3_tokenizer *pTokenizer;  /* Tokenizer for this cursor. */
				  /* Tokenizer implementations will typically add additional fields */
				};
				156
				/*
				** Two free-function declarations; their definitions are not part of
				** this header. Both take two ints (iTerm, iCol) and return int. They
				** are not members of struct sqlite3_tokenizer_module.
				** NOTE(review): the names suggest per-term / per-column occurrence
				** counts (table-wide for the "global" variant) -- confirm against
				** the implementation before relying on that.
				*/
				157	int fts3_global_term_cnt(int iTerm, int iCol);
				158	int fts3_term_cnt(int iTerm, int iCol);
				159
				160
				161	#endif /* _FTS3_TOKENIZER_H_ */