00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057 #include <stdio.h>
00058 #include <stdlib.h>
00059 #include <string.h>
00060 #include <ctype.h>
00061
00062 #include "acsmx.h"
00063
00064 #define MEMASSERT(p,s) if(!p){fprintf(stderr,"ACSM-No Memory: %s!\n",s);exit(0);}
00065
00066 #ifdef DEBUG_AC
00067 static int max_memory = 0;
00068 #endif
00069
00070
00071
00072
00073
00074
00075 static void *
00076 AC_MALLOC (int n)
00077 {
00078 void *p;
00079 p = malloc (n);
00080 #ifdef DEBUG_AC
00081 if (p)
00082 max_memory += n;
00083 #endif
00084 return p;
00085 }
00086
00087
00088
00089
00090
00091 static void
00092 AC_FREE (void *p)
00093 {
00094 if (p)
00095 free (p);
00096 }
00097
00098
00099
00100
00101
00102 typedef struct _qnode
00103 {
00104 int state;
00105 struct _qnode *next;
00106 }
00107 QNODE;
00108
00109
00110
00111
00112 typedef struct _queue
00113 {
00114 QNODE * head, *tail;
00115 int count;
00116 }
00117 QUEUE;
00118
00119
00120
00121
00122 static void
00123 queue_init (QUEUE * s)
00124 {
00125 s->head = s->tail = 0;
00126 s->count = 0;
00127 }
00128
00129
00130
00131
00132
00133 static void
00134 queue_add (QUEUE * s, int state)
00135 {
00136 QNODE * q;
00137 if (!s->head)
00138 {
00139 q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
00140 MEMASSERT (q, "queue_add");
00141 q->state = state;
00142 q->next = 0;
00143 }
00144 else
00145 {
00146 q = (QNODE *) AC_MALLOC (sizeof (QNODE));
00147 MEMASSERT (q, "queue_add");
00148 q->state = state;
00149 q->next = 0;
00150 s->tail->next = q;
00151 s->tail = q;
00152 }
00153 s->count++;
00154 }
00155
00156
00157
00158
00159
00160 static int
00161 queue_remove (QUEUE * s)
00162 {
00163 int state = 0;
00164 QNODE * q;
00165 if (s->head)
00166 {
00167 q = s->head;
00168 state = q->state;
00169 s->head = s->head->next;
00170 s->count--;
00171 if (!s->head)
00172 {
00173 s->tail = 0;
00174 s->count = 0;
00175 }
00176 AC_FREE (q);
00177 }
00178 return state;
00179 }
00180
00181
00182
00183
00184
00185 static int
00186 queue_count (QUEUE * s)
00187 {
00188 return s->count;
00189 }
00190
00191
00192
00193
00194
00195 static void
00196 queue_free (QUEUE * s)
00197 {
00198 while (queue_count (s))
00199 {
00200 queue_remove (s);
00201 }
00202 }
00203
00204
00205
00206
00207
00208 static unsigned char xlatcase[256];
00209
00210
00211
00212
00213 static void
00214 init_xlatcase ()
00215 {
00216 int i;
00217 for (i = 0; i < 256; i++)
00218 {
00219 xlatcase[i] = toupper (i);
00220 }
00221 }
00222
00223
00224
00225
00226
00227 static inline void
00228 ConvertCase (unsigned char *s, int m)
00229 {
00230 int i;
00231 for (i = 0; i < m; i++)
00232 {
00233 s[i] = xlatcase[s[i]];
00234 }
00235 }
00236
00237
00238
00239
00240
00241 static inline void
00242 ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
00243 {
00244 int i;
00245 for (i = 0; i < m; i++)
00246 {
00247 d[i] = xlatcase[s[i]];
00248 }
00249 }
00250
00251
00252
00253
00254
00255 static ACSM_PATTERN *
00256 CopyMatchListEntry (ACSM_PATTERN * px)
00257 {
00258 ACSM_PATTERN * p;
00259 p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00260 MEMASSERT (p, "CopyMatchListEntry");
00261 memcpy (p, px, sizeof (ACSM_PATTERN));
00262 p->next = 0;
00263 return p;
00264 }
00265
00266
00267
00268
00269
00270
00271 static void
00272 AddMatchListEntry (ACSM_STRUCT * acsm, int state, ACSM_PATTERN * px)
00273 {
00274 ACSM_PATTERN * p;
00275 p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00276 MEMASSERT (p, "AddMatchListEntry");
00277 memcpy (p, px, sizeof (ACSM_PATTERN));
00278 p->next = acsm->acsmStateTable[state].MatchList;
00279 acsm->acsmStateTable[state].MatchList = p;
00280 }
00281
00282
00283
00284
00285
00286 static void
00287 AddPatternStates (ACSM_STRUCT * acsm, ACSM_PATTERN * p)
00288 {
00289 unsigned char *pattern;
00290 int state=0, next, n;
00291 n = p->n;
00292 pattern = p->patrn;
00293
00294
00295
00296
00297 for (; n > 0; pattern++, n--)
00298 {
00299 next = acsm->acsmStateTable[state].NextState[*pattern];
00300 if (next == ACSM_FAIL_STATE)
00301 break;
00302 state = next;
00303 }
00304
00305
00306
00307
00308 for (; n > 0; pattern++, n--)
00309 {
00310 acsm->acsmNumStates++;
00311 acsm->acsmStateTable[state].NextState[*pattern] = acsm->acsmNumStates;
00312 state = acsm->acsmNumStates;
00313 }
00314
00315 AddMatchListEntry (acsm, state, p);
00316 }
00317
00318
00319
00320
00321
00322 static void
00323 Build_NFA (ACSM_STRUCT * acsm)
00324 {
00325 int r, s;
00326 int i;
00327 QUEUE q, *queue = &q;
00328 ACSM_PATTERN * mlist=0;
00329 ACSM_PATTERN * px=0;
00330
00331
00332 queue_init (queue);
00333
00334
00335 for (i = 0; i < ALPHABET_SIZE; i++)
00336 {
00337 s = acsm->acsmStateTable[0].NextState[i];
00338 if (s)
00339 {
00340 queue_add (queue, s);
00341 acsm->acsmStateTable[s].FailState = 0;
00342 }
00343 }
00344
00345
00346 while (queue_count (queue) > 0)
00347 {
00348 r = queue_remove (queue);
00349
00350
00351 for (i = 0; i < ALPHABET_SIZE; i++)
00352 {
00353 int fs, next;
00354 if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
00355 {
00356 queue_add (queue, s);
00357 fs = acsm->acsmStateTable[r].FailState;
00358
00359
00360
00361
00362 while ((next=acsm->acsmStateTable[fs].NextState[i]) ==
00363 ACSM_FAIL_STATE)
00364 {
00365 fs = acsm->acsmStateTable[fs].FailState;
00366 }
00367
00368
00369
00370
00371 acsm->acsmStateTable[s].FailState = next;
00372
00373
00374
00375
00376
00377
00378 for (mlist = acsm->acsmStateTable[next].MatchList;
00379 mlist != NULL ;
00380 mlist = mlist->next)
00381 {
00382 px = CopyMatchListEntry (mlist);
00383
00384 if( !px )
00385 {
00386 printf("*** Out of memory Initializing Aho Corasick in acsmx.c ****");
00387 }
00388
00389
00390 px->next = acsm->acsmStateTable[s].MatchList;
00391 acsm->acsmStateTable[s].MatchList = px;
00392 }
00393 }
00394 }
00395 }
00396
00397
00398 queue_free (queue);
00399 }
00400
00401
00402
00403
00404
00405 static void
00406 Convert_NFA_To_DFA (ACSM_STRUCT * acsm)
00407 {
00408 int r, s;
00409 int i;
00410 QUEUE q, *queue = &q;
00411
00412
00413 queue_init (queue);
00414
00415
00416 for (i = 0; i < ALPHABET_SIZE; i++)
00417 {
00418 s = acsm->acsmStateTable[0].NextState[i];
00419 if (s)
00420 {
00421 queue_add (queue, s);
00422 }
00423 }
00424
00425
00426 while (queue_count (queue) > 0)
00427 {
00428 r = queue_remove (queue);
00429
00430
00431 for (i = 0; i < ALPHABET_SIZE; i++)
00432 {
00433 if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
00434 {
00435 queue_add (queue, s);
00436 }
00437 else
00438 {
00439 acsm->acsmStateTable[r].NextState[i] =
00440 acsm->acsmStateTable[acsm->acsmStateTable[r].FailState].
00441 NextState[i];
00442 }
00443 }
00444 }
00445
00446
00447 queue_free (queue);
00448 }
00449
00450
00451
00452
00453
00454 ACSM_STRUCT * acsmNew ()
00455 {
00456 ACSM_STRUCT * p;
00457 init_xlatcase ();
00458 p = (ACSM_STRUCT *) AC_MALLOC (sizeof (ACSM_STRUCT));
00459 MEMASSERT (p, "acsmNew");
00460 if (p)
00461 memset (p, 0, sizeof (ACSM_STRUCT));
00462 return p;
00463 }
00464
00465
00466
00467
00468
00469 int
00470 acsmAddPattern (ACSM_STRUCT * p, unsigned char *pat, int n, int nocase,
00471 int offset, int depth, void * id, int iid)
00472 {
00473 ACSM_PATTERN * plist;
00474 plist = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00475 MEMASSERT (plist, "acsmAddPattern");
00476 plist->patrn = (unsigned char *) AC_MALLOC (n);
00477 ConvertCaseEx (plist->patrn, pat, n);
00478 plist->casepatrn = (unsigned char *) AC_MALLOC (n);
00479 memcpy (plist->casepatrn, pat, n);
00480 plist->n = n;
00481 plist->nocase = nocase;
00482 plist->offset = offset;
00483 plist->depth = depth;
00484 plist->id = id;
00485 plist->iid = iid;
00486 plist->next = p->acsmPatterns;
00487 p->acsmPatterns = plist;
00488 return 0;
00489 }
00490
00491
00492
00493
00494
00495 int
00496 acsmCompile (ACSM_STRUCT * acsm)
00497 {
00498 int i, k;
00499 ACSM_PATTERN * plist;
00500
00501
00502 acsm->acsmMaxStates = 1;
00503 for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
00504 {
00505 acsm->acsmMaxStates += plist->n;
00506 }
00507 acsm->acsmStateTable =
00508 (ACSM_STATETABLE *) AC_MALLOC (sizeof (ACSM_STATETABLE) *
00509 acsm->acsmMaxStates);
00510 MEMASSERT (acsm->acsmStateTable, "acsmCompile");
00511 memset (acsm->acsmStateTable, 0,
00512 sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);
00513
00514
00515 acsm->acsmNumStates = 0;
00516
00517
00518 for (k = 0; k < acsm->acsmMaxStates; k++)
00519 {
00520 for (i = 0; i < ALPHABET_SIZE; i++)
00521 {
00522 acsm->acsmStateTable[k].NextState[i] = ACSM_FAIL_STATE;
00523 }
00524 }
00525
00526
00527 for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
00528 {
00529 AddPatternStates (acsm, plist);
00530 }
00531
00532
00533 for (i = 0; i < ALPHABET_SIZE; i++)
00534 {
00535 if (acsm->acsmStateTable[0].NextState[i] == ACSM_FAIL_STATE)
00536 {
00537 acsm->acsmStateTable[0].NextState[i] = 0;
00538 }
00539 }
00540
00541
00542 Build_NFA (acsm);
00543
00544
00545 Convert_NFA_To_DFA (acsm);
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555 return 0;
00556 }
00557
00558
00559 static unsigned char Tc[64*1024];
00560
00561
00562
00563
00564 int
00565 acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,
00566 int (*Match) (void * id, int index, void *data), void *data)
00567 {
00568 int state;
00569 ACSM_PATTERN * mlist;
00570 unsigned char *Tend;
00571 ACSM_STATETABLE * StateTable = acsm->acsmStateTable;
00572 int nfound = 0;
00573 unsigned char *T;
00574 int index;
00575
00576
00577 ConvertCaseEx (Tc, Tx, n);
00578 T = Tc;
00579 Tend = T + n;
00580
00581 for (state = 0; T < Tend; T++)
00582 {
00583 state = StateTable[state].NextState[*T];
00584
00585 if( StateTable[state].MatchList != NULL )
00586 {
00587 for( mlist=StateTable[state].MatchList; mlist!=NULL;
00588 mlist=mlist->next )
00589 {
00590 index = T - mlist->n + 1 - Tc;
00591 if( mlist->nocase )
00592 {
00593 nfound++;
00594 if (Match (mlist->id, index, data))
00595 return nfound;
00596 }
00597 else
00598 {
00599 if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
00600 {
00601 nfound++;
00602 if (Match (mlist->id, index, data))
00603 return nfound;
00604 }
00605 }
00606 }
00607 }
00608 }
00609 return nfound;
00610 }
00611
00612
00613
00614
00615
00616 void
00617 acsmFree (ACSM_STRUCT * acsm)
00618 {
00619 int i;
00620 ACSM_PATTERN * mlist, *ilist;
00621 for (i = 0; i < acsm->acsmMaxStates; i++)
00622
00623 {
00624 if (acsm->acsmStateTable[i].MatchList != NULL)
00625
00626 {
00627 mlist = acsm->acsmStateTable[i].MatchList;
00628 while (mlist)
00629
00630 {
00631 ilist = mlist;
00632 mlist = mlist->next;
00633 AC_FREE (ilist);
00634 }
00635 }
00636 }
00637 AC_FREE (acsm->acsmStateTable);
00638 }
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671 int acsmPrintDetailInfo(ACSM_STRUCT * p)
00672 {
00673 return 0;
00674 }
00675
00676 int acsmPrintSummaryInfo(ACSM_STRUCT *acsm )
00677 {
00678 #ifdef XXXXX
00679 char * fsa[]={
00680 "TRIE",
00681 "NFA",
00682 "DFA",
00683 };
00684
00685 ACSM_STRUCT2 * p = &summary.acsm;
00686
00687 if( !summary.num_states )
00688 return;
00689
00690 printf("+--[Pattern Matcher:Aho-Corasick Summary]----------------------\n");
00691 printf("| Alphabet Size : %d Chars\n",p->acsmAlphabetSize);
00692 printf("| Sizeof State : %d bytes\n",sizeof(acstate_t));
00693 printf("| Storage Format : %s \n",sf[ p->acsmFormat ]);
00694 printf("| Num States : %d\n",summary.num_states);
00695 printf("| Num Transitions : %d\n",summary.num_transitions);
00696 printf("| State Density : %.1f%%\n",100.0*(double)summary.num_transitions/(summary.num_states*p->acsmAlphabetSize));
00697 printf("| Finite Automatum : %s\n", fsa[p->acsmFSA]);
00698 if( max_memory < 1024*1024 )
00699 printf("| Memory : %.2fKbytes\n", (float)max_memory/1024 );
00700 else
00701 printf("| Memory : %.2fMbytes\n", (float)max_memory/(1024*1024) );
00702 printf("+-------------------------------------------------------------\n");
00703
00704 #endif
00705 return 0;
00706 }
00707
00708
00709 #ifdef ACSMX_MAIN
00710
00711
00712
00713
00714 unsigned char text[512];
00715
00716
00717
00718
00719 int
00720 MatchFound (unsigned id, int index, void *data)
00721 {
00722 fprintf (stdout, "%s\n", (char *) id);
00723 return 0;
00724 }
00725
00726
00727
00728
00729
00730 int
00731 main (int argc, char **argv)
00732 {
00733 int i, nocase = 0;
00734 ACSM_STRUCT * acsm;
00735 if (argc < 3)
00736
00737 {
00738 fprintf (stderr,
00739 "Usage: acsmx pattern word-1 word-2 ... word-n -nocase\n");
00740 exit (0);
00741 }
00742 acsm = acsmNew ();
00743 strcpy (text, argv[1]);
00744 for (i = 1; i < argc; i++)
00745 if (strcmp (argv[i], "-nocase") == 0)
00746 nocase = 1;
00747 for (i = 2; i < argc; i++)
00748
00749 {
00750 if (argv[i][0] == '-')
00751 continue;
00752 acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase, 0, 0,
00753 argv[i], i - 2);
00754 }
00755 acsmCompile (acsm);
00756 acsmSearch (acsm, text, strlen (text), MatchFound, (void *) 0);
00757 acsmFree (acsm);
00758 printf ("normal pgm end\n");
00759 return (0);
00760 }
00761 #endif
00762