00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <string.h>
00035
00036 #include <ctype.h>
00037
00038 #include "sfksearch.h"
00039
00040
00041
00042
00043 static void * KTRIE_MALLOC( int n )
00044 {
00045 void * p;
00046
00047 p = malloc( n );
00048
00049 memset(p,0,n);
00050
00051 return p;
00052 }
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067 static unsigned char Tnocase[65*1024];
00068
00069
00070
00071
00072 static unsigned char xlatcase[256];
00073
00074
00075
00076
00077 static void init_xlatcase()
00078 {
00079 int i;
00080 static int first=1;
00081
00082 if( !first ) return;
00083
00084 for(i=0;i<256;i++)
00085 {
00086 xlatcase[ i ] = (unsigned char)tolower(i);
00087 }
00088
00089 first=0;
00090 }
00091
00092
00093
00094
00095 static inline void ConvertCaseEx( unsigned char * d, unsigned char *s, int m )
00096 {
00097 int i;
00098 for( i=0; i < m; i++ )
00099 {
00100 d[i] = xlatcase[ s[i] ];
00101 }
00102 }
00103
00104
00105
00106
00107
00108 KTRIE_STRUCT * KTrieNew()
00109 {
00110 KTRIE_STRUCT * ts = (KTRIE_STRUCT*) KTRIE_MALLOC( sizeof(KTRIE_STRUCT) );
00111
00112 if( !ts ) return 0;
00113
00114 memset(ts, 0, sizeof(KTRIE_STRUCT));
00115
00116 init_xlatcase();
00117
00118 ts->memory = sizeof(KTRIE_STRUCT);
00119 ts->nchars = 0;
00120 ts->npats = 0;
00121
00122 return ts;
00123 }
00124
00125
00126
00127
00128 static KTRIEPATTERN * KTrieNewPattern(unsigned char * P, int n)
00129 {
00130 KTRIEPATTERN *p = (KTRIEPATTERN*) KTRIE_MALLOC( sizeof(KTRIEPATTERN) );
00131
00132 if( !p ) return 0;
00133
00134
00135 p->P = (unsigned char*) KTRIE_MALLOC( n );
00136 if( !p->P )
00137 return 0;
00138
00139 ConvertCaseEx( p->P, P, n );
00140
00141
00142 p->Pcase = (unsigned char*) KTRIE_MALLOC( n );
00143 if( !p->Pcase )
00144 return 0;
00145 memcpy( p->Pcase, P, n );
00146
00147 p->n = n;
00148 p->next = 0;
00149
00150 return p;
00151 }
00152
00153
00154
00155
00156 int KTrieAddPattern( KTRIE_STRUCT * ts, unsigned char * P, int n,
00157 int nocase, void * id )
00158 {
00159 KTRIEPATTERN *new;
00160
00161 if( !ts->patrn )
00162 {
00163 new = ts->patrn = KTrieNewPattern( P, n );
00164
00165 if( !new ) return -1;
00166 }
00167 else
00168 {
00169 new = KTrieNewPattern(P, n );
00170
00171 if( !new ) return -1;
00172
00173 new->next = ts->patrn;
00174
00175 ts->patrn = new;
00176 }
00177
00178 new->nocase = nocase;
00179 new->id = id;
00180 new->mnext = NULL;
00181
00182 ts->npats++;
00183 ts->memory += sizeof(KTRIEPATTERN) + 2 * n ;
00184
00185 return 1;
00186 }
00187
00188
00189
00190
00191
00192 static KTRIENODE * KTrieCreateNode(KTRIE_STRUCT * ts)
00193 {
00194 KTRIENODE * t=(KTRIENODE*)KTRIE_MALLOC( sizeof(KTRIENODE) );
00195
00196 if(!t)
00197 return 0;
00198
00199 memset(t,0,sizeof(KTRIENODE));
00200
00201 ts->memory += sizeof(KTRIENODE);
00202
00203 return t;
00204 }
00205
00206
00207
00208
00209
00210 static int KTrieInsert( KTRIE_STRUCT *ts, KTRIEPATTERN * px )
00211 {
00212 int type = 0;
00213 int n = px->n;
00214 unsigned char *P = px->P;
00215 KTRIENODE *root;
00216
00217
00218 if( !ts->root[*P] )
00219 {
00220 ts->root[*P] = root = KTrieCreateNode(ts);
00221 if( !root ) return -1;
00222 root->edge = *P;
00223
00224 }else{
00225
00226 root = ts->root[*P];
00227 }
00228
00229
00230 while( n )
00231 {
00232 if( root->edge == *P )
00233 {
00234 P++;
00235 n--;
00236
00237 if( n && root->child )
00238 {
00239 root=root->child;
00240 }
00241 else
00242 {
00243 type = 0;
00244 break;
00245 }
00246 }
00247 else
00248 {
00249 if( root->sibling )
00250 {
00251 root=root->sibling;
00252 }
00253 else
00254 {
00255 type = 1;
00256 break;
00257 }
00258 }
00259 }
00260
00261
00262
00263
00264 if( n )
00265 {
00266 if( type == 0 )
00267 {
00268
00269
00270
00271 root->child= KTrieCreateNode( ts );
00272 if( ! root->child ) return -1;
00273 root=root->child;
00274 root->edge = *P;
00275 P++;
00276 n--;
00277 ts->nchars++;
00278
00279 }
00280 else
00281 {
00282
00283
00284
00285 root->sibling= KTrieCreateNode( ts );
00286 if( ! root->sibling ) return -1;
00287 root=root->sibling;
00288 root->edge = *P;
00289 P++;
00290 n--;
00291 ts->nchars++;
00292 }
00293 }
00294
00295
00296
00297
00298 while( n )
00299 {
00300 root->child = KTrieCreateNode(ts);
00301 if( ! root->child ) return -1;
00302 root=root->child;
00303 root->edge = *P;
00304 P++;
00305 n--;
00306 ts->nchars++;
00307 }
00308
00309 if( root->pkeyword )
00310 {
00311 px->mnext = root->pkeyword;
00312 root->pkeyword = px;
00313 ts->duplicates++;
00314 }
00315 else
00316 {
00317 root->pkeyword = px;
00318 }
00319
00320 return 0;
00321 }
00322
00323
00324
00325
00326
00327 static void Build_Bad_Character_Shifts( KTRIE_STRUCT * kt )
00328 {
00329 int i,k;
00330 KTRIEPATTERN *plist;
00331
00332
00333 kt->bcSize = 32000;
00334
00335 for( plist=kt->patrn; plist!=NULL; plist=plist->next )
00336 {
00337 if( plist->n < kt->bcSize )
00338 {
00339 kt->bcSize = plist->n;
00340 }
00341 }
00342
00343
00344
00345
00346 for(i=0;i<256;i++)
00347 {
00348 kt->bcShift[i] = (unsigned short)kt->bcSize;
00349 }
00350
00351
00352
00353
00354 for( plist=kt->patrn; plist!=NULL; plist=plist->next )
00355 {
00356 int shift, cindex;
00357
00358 for( k=0; k<kt->bcSize; k++ )
00359 {
00360 shift = kt->bcSize - 1 - k;
00361
00362 cindex = plist->P[ k ];
00363
00364 if( shift < kt->bcShift[ cindex ] )
00365 {
00366 kt->bcShift[ cindex ] = (unsigned short)shift;
00367 }
00368 }
00369 }
00370 }
00371
00372
00373
00374
00375
00376
00377 int KTrieCompile(KTRIE_STRUCT * ts)
00378 {
00379 KTRIEPATTERN * p;
00380
00381
00382
00383
00384
00385
00386
00387 for( p=ts->patrn; p; p=p->next )
00388 {
00389 if( KTrieInsert( ts, p ) )
00390 return -1;
00391 }
00392
00393
00394
00395
00396 Build_Bad_Character_Shifts( ts );
00397
00398
00399
00400
00401
00402
00403 return 0;
00404 }
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423 static inline int KTriePrefixMatch( KTRIE_STRUCT * kt,
00424 unsigned char * T,
00425 unsigned char * Tc,
00426 unsigned char * bT,
00427 int n,
00428 int(*match)( void * id, int index, void * data ),
00429 void * data )
00430 {
00431 KTRIENODE * root = kt->root[ *T ];
00432 int nfound = 0;
00433 KTRIEPATTERN * pk;
00434 int index ;
00435
00436
00437 if( !root ) return 0;
00438
00439 while( n )
00440 {
00441 if( root->edge == *T )
00442 {
00443 T++;
00444 n--;
00445
00446 for( pk = root->pkeyword; pk; pk= pk->mnext )
00447 {
00448 index = (int)(T - bT - pk->n );
00449
00450 if( pk->nocase )
00451 {
00452 nfound++;
00453 if( match( pk->id, index, data ) )
00454 return nfound;
00455 }
00456 else
00457 {
00458 if( !memcmp(pk->Pcase,Tc,pk->n) )
00459 {
00460 nfound++;
00461 if( match( pk->id, index, data ) )
00462 return nfound;
00463 }
00464 }
00465 }
00466
00467 if( n && root->child )
00468 {
00469 root = root->child;
00470 }
00471 else
00472 {
00473 break;
00474 }
00475 }
00476 else
00477 {
00478 if( root->sibling )
00479 {
00480 root = root->sibling;
00481 }
00482 else
00483 {
00484 break;
00485 }
00486 }
00487 }
00488
00489 return nfound;
00490 }
00491
00492
00493
00494
00495 static inline int KTrieSearchNoBC( KTRIE_STRUCT * ks, unsigned char * Tx, int n,
00496 int(*match)( void * id, int index, void * data ), void * data )
00497 {
00498 int nfound = 0;
00499 unsigned char *T, *bT;
00500
00501 ConvertCaseEx( Tnocase, Tx, n );
00502
00503 T = Tnocase;
00504 bT = T;
00505
00506 for( ; n>0 ; n--, T++, Tx++ )
00507 {
00508 nfound += KTriePrefixMatch( ks, T, Tx, bT, n, match, data );
00509 }
00510
00511 return nfound;
00512 }
00513
00514
00515
00516
00517 static inline int KTrieSearchBC( KTRIE_STRUCT * ks, unsigned char * Tx, int n,
00518 int(*match)( void * id, int index, void * data ), void * data )
00519 {
00520 int tshift;
00521 unsigned char *Tend;
00522 unsigned char *T, *bT;
00523 int nfound = 0;
00524 short *bcShift = (short*)ks->bcShift;
00525 int bcSize = ks->bcSize;
00526
00527 ConvertCaseEx( Tnocase, Tx, n );
00528
00529 T = Tnocase;
00530 bT = T;
00531
00532 Tend = T + n - bcSize;
00533
00534 bcSize--;
00535
00536 for( ;T <= Tend; n--, T++, Tx++ )
00537 {
00538 while( (tshift = bcShift[ *( T + bcSize ) ]) > 0 )
00539 {
00540 T += tshift;
00541 Tx += tshift;
00542 if( T > Tend ) return nfound;
00543 }
00544
00545 nfound += KTriePrefixMatch( ks, T, Tx, bT, n, match, data );
00546 }
00547
00548 return nfound;
00549 }
00550
00551
00552
00553
00554 int KTrieSearch( KTRIE_STRUCT * ks, unsigned char * T, int n,
00555 int(*match)( void * id, int index, void * data ), void * data )
00556 {
00557 if( ks->bcSize < 3)
00558 return KTrieSearchNoBC( ks, T, n, match, data );
00559 else
00560 return KTrieSearchBC( ks, T, n, match, data );
00561 }
00562
00563
00564
00565
00566
00567
00568 #ifdef KTRIE_MAIN
00569
00570 char ** gargv;
00571
00572 int trie_nmatches = 0;
00573
00574 int match( unsigned id, int index, void * data )
00575 {
00576 trie_nmatches++;
00577 data = data;
00578 printf("id=%d found at index=%d, %s\n",id,index,gargv[id]);
00579 return 0;
00580 }
00581
00582
00583
00584
00585 int main( int argc, char ** argv )
00586 {
00587 int i;
00588 KTRIE_STRUCT * ts;
00589 int nocase=1;
00590
00591 gargv = argv;
00592
00593 ts = KTrieNew();
00594
00595 if( argc < 3 )
00596 {
00597 printf("%s text pat1 pat2 ... patn [-c(ase-sensitive)\n",argv[0]);
00598 printf("search for keywords-default, or match keywords\n");
00599 exit(0);
00600 }
00601
00602 for(i=1;i<argc;i++)
00603 {
00604 if( strcmp(argv[i],"-c")==0 ) nocase=0;
00605 }
00606
00607 printf("New TRIE created\n");
00608
00609 for(i=2;i<argc;i++)
00610 {
00611 if( argv[i][0]=='-' )
00612 continue;
00613
00614 KTrieAddPattern( ts, (unsigned char *)argv[i], strlen(argv[i]), nocase, i );
00615 }
00616
00617 printf("Patterns added \n");
00618
00619 KTrieCompile( ts );
00620
00621 printf("Patterns compiled \n");
00622 printf("--> %d characters, %d patterns, %d bytes allocated\n",ts->nchars,ts->npats,ts->memory);
00623
00624 printf("Searching...\n");
00625
00626 KTrieSearch( ts, (unsigned char*)argv[1], strlen(argv[1]), match, 0 );
00627
00628 printf("%d matches found\n",trie_nmatches);
00629
00630 printf("normal pgm finish.\n");
00631
00632 return 0;
00633 }
00634
00635 #endif