sqlite3通过使用fts3虚表支持全文搜索,默认支持simple和porter两种分词器,并提供了接口来自定义分词器。这里我们利用mmseg来构造自定义的中文分词器。 虽然sqlite在fts3_tokenizer.h中提供了各种接口供用户自定义分词器,但其并未提供C函数供用户来注册自定义的分词器,分词器的注册必须使用SQL语句来完成。 SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>); 其中tokenizer-name是分词器的名称,sqlite3_tokenizer_module ptr是一个指向sqlite3_tokenizer_module结构的指针,并且编码为SQL blob。下面是官方给出的注册函数: int registerTokenizer( sqlite3 *db, char *zname, const sqlite3_tokenizer_module *p ){ int rc; sqlite3_stmt *pStmt; const char *zsql = "SELECT fts3_tokenizer(?,?)"; rc = sqlite3_prepare_v2(db, zsql, -1, &pStmt, 0); if( rc!=SQLITE_OK ){ return rc; } sqlite3_bind_text(pStmt, 1, zname, -1, SQLITE_STATIC); sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); sqlite3_step(pStmt); return sqlite3_finalize(pStmt); } 要想实现自定义的分词器,最关键的是得到指向sqlite3_tokenizer_module结构的一个指针,sqlite3_tokenizer_module结构体定义如下: struct sqlite3_tokenizer_module { int iVersion; /* 版本号,必须设置为0 */ int (*xCreate)( /* 创建虚表时自动调用并创建分词器 */ int argc, const char *const *argv, sqlite3_tokenizer **ppTokenizer ); int (*xDestroy)(sqlite3_tokenizer *pTokenizer); /* 数据库连接关闭时自动调用,用于销毁资源 */ int (*xOpen)( /* 插入数据或检索时自动调用以进行分词 */ sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes, sqlite3_tokenizer_cursor **ppCursor ); int (*xClose)(sqlite3_tokenizer_cursor *pCursor); /* 分词结果提取完毕后自动调用 */ int (*xNext)( /* 逐个提取分词结果 */ sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, int *piStartOffset, int *piEndOffset, int *piPosition ); }; 有几点需要注意的是: 1. 分词引擎使用SQL语句注册,意味着每建立一个sqlite连接都必须注册一次分词器,对于需要使用词库的中文分词器来说也意味着巨大的内存消耗。 2. 在检索时,分词结果的提取和语义的解析是交替进行的。例如我们搜索"kanif OR sqlite"的时候,引擎先将全部内容传入到分词器,在调用一次next获取到词kanif后,再将词sqlite传入到分词器,直到全部解析完毕。 3. 由于中文分词本身的特殊性,例如"北京市"很有可能被视为一个完整的词,这样在搜索"北京"的时候就无法获取到结果。如果分词器支持将"北京市"切分为"北京市"和"北京",或者将"十一月"切分为"11月"和"十一",那么需注意(*xNext)函数中的piStartOffset和piEndOffset参数。经测试,在插入数据的时候这两个参数无实际用途,但在查询的时候这两个参数决定了下一次的输入串。 附: #include <assert.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include <sys/types.h> #include "fts3_tokenizer.h" #include "mmseg/mmseg.cpp" static bool loadDic = 
true; typedef struct cus_tokenizer{ sqlite3_tokenizer base; } cus_tokenizer; typedef struct cus_tokenizer_cursor{ sqlite3_tokenizer_cursor base; char *pinput; int nBytes; int iToken; char *pToken; rmmseg::Algorithm *pAlgor; } cus_tokenizer_cursor; voID initmmseg(voID){ if(!loadDic) return; mmseg_load_words("chars.dic"); mmseg_load_words("words.dic"); loadDic =False; } static int cusCreate( int argc,const char * const *argv, sqlite3_tokenizer **ppTokenizer ){ cus_tokenizer *t; t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NulL ) return sqlITE_NOMEM; memset(t,sizeof(*t)); initmmseg(); *ppTokenizer = &t->base; return sqlITE_OK; } static intcusDestroy(sqlite3_tokenizer *pTokenizer){ sqlite3_free(pTokenizer); return sqlITE_OK; } static int cusOpen( sqlite3_tokenizer*pTokenizer, const char *pinput, sqlite3_tokenizer_cursor**ppCursor ){ cus_tokenizer_cursor *c; if(pinput == 0){ nBytes =0; }else if(nBytes < 0) nBytes = (int)strlen(pinput); c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c)); if(c == NulL) return sqlITE_NOMEM; c->iToken =c->nBytes = 0; c->pinput = c->pToken =NulL; c->pAlgor = mmseg_algor_create(pinput,nBytes); c->nBytes = nBytes; *ppCursor = &c->base; return sqlITE_OK; } static intcusClose(sqlite3_tokenizer_cursor *pCursor){ cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor; if(c->pinput != NulL){ sqlite3_free(c->pinput); } if(c->pToken != NulL){ sqlite3_free(c->pToken); } if(c->pAlgor != NulL){ mmseg_algor_destroy(c->pAlgor); } c->pinput = c->pToken =NulL; c->pAlgor = NulL; sqlite3_free(c); return sqlITE_OK; } static int cusNext( sqlite3_tokenizer_cursor *pCursor, const char**ppToken, int*pnBytes, int*piStartOffset, int*pIEndOffset, int*piposition ){ cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor; cus_tokenizer *t = (cus_tokenizer *)pCursor->pTokenizer; if(c->pToken != NulL){ sqlite3_free(c->pToken); c->pToken = NulL; } struct Token token =mmseg_next_token(c->pAlgor); if(token.length != 0 ){ int l =token.length; c->pToken = 
(char *)sqlite3_malloc(l+1); if(c->pToken == NulL) return sqlITE_NOMEM; c->pToken[l] = 0; memcpy(c->pToken,token.text,l); *ppToken =c->pToken; *pnBytes =l; *piStartOffset = token.offset; *pIEndOffset= token.offset + token.length; *piposition= c->iToken++; returnsqlITE_OK; } //一般来说只有插入数据时才会进入到这里 return sqlITE_DONE; } static const sqlite3_tokenizer_module cusTokenizerModule ={ 0, cusCreate, cusDestroy, cusOpen, cusClose, cusNext, }; int registerTokenizer( sqlite3 *db,sqlITE_STATIC); sqlite3_step(pStmt); returnsqlite3_finalize(pStmt); } int main(){ constsqlite3_tokenizer_module *ptr =&cusTokenizerModule; sqlite3*pDB; sqlite3_stmt* stmt; char *errMsg = NulL; const char*zTail; int rc =sqlite3_open("test.sqlite3",&pDB); if(rc){ printf("create error. %s\n",sqlite3_errmsg(pDB)); return rc; } chartoken_name[] = "custoken"; registerTokenizer(pDB,token_name,ptr); rc =sqlite3_exec(pDB,"CREATE VIRTUAL table foo USINGfts3(tokenize=custoken)",&errMsg); if(rc !=sqlITE_OK){ printf("create virtual error,%s\n",errMsg); if(rc !=sqlITE_OK){ printf("create virtual error,errMsg); return rc; } rc =sqlite3_exec(pDB,"INSERT INTO foovalUES('\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82')",&errMsg); if(rc !=sqlITE_OK){ printf("insert value error,errMsg); return rc; } int nrow =0,ncolumn = 0; char**azResult; //二维数组存放结果 sqlite3_get_table(pDB,"SELECT * FROM foo WHERE content MATCH'\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'",&azResult,&nrow,&ncolumn,&errMsg ); int i = 0; printf("row:%d column=%d \n",nrow,ncolumn ); printf("\nThe result of querying is : \n" ); for( i=0 ;i<( nrow + 1 ) * ncolumn ; i++ ) printf( "azResult[%d] = %s\n",i,azResult[i] ); sqlite3_free_table( azResult ); sqlite3_close(pDB); return0; } |
评论列表(0条)