diff options
| author | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
|---|---|---|
| committer | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
| commit | d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch) | |
| tree | a4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/dfa.c | |
| parent | 9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff) | |
| download | plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip | |
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/dfa.c')
| -rw-r--r-- | src/cmd/upas/bayes/dfa.c | 800 |
1 files changed, 800 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/dfa.c b/src/cmd/upas/bayes/dfa.c new file mode 100644 index 00000000..46695efd --- /dev/null +++ b/src/cmd/upas/bayes/dfa.c @@ -0,0 +1,800 @@ +#include <u.h> +#include <libc.h> +#include <bin.h> +#include <bio.h> +#include <regexp.h> +#include "/sys/src/libregexp/regcomp.h" +#include "dfa.h" + +void rdump(Reprog*); +void dump(Dreprog*); + +/* + * Standard NFA determinization and DFA minimization. + */ +typedef struct Deter Deter; +typedef struct Reiset Reiset; + +void ddump(Deter*); + +/* state of determinization */ +struct Deter +{ + jmp_buf kaboom; /* jmp on error */ + + Bin *bin; /* bin for temporary allocations */ + + Reprog *p; /* program being determinized */ + uint ninst; /* number of instructions in program */ + + Reiset *alloc; /* chain of all Reisets */ + Reiset **last; + + Reiset **hash; /* hash of all Reisets */ + uint nhash; + + Reiset *tmp; /* temporaries for walk */ + uchar *bits; + + Rune *c; /* ``interesting'' characters */ + uint nc; +}; + +/* set of Reinsts: perhaps we should use a bit list instead of the indices? */ +struct Reiset +{ + uint *inst; /* indices of instructions in set */ + uint ninst; /* size of set */ + + Reiset *next; /* d.alloc chain */ + Reiset *hash; /* d.hash chain */ + Reiset **delta; /* where to go on each interesting char */ + uint id; /* assigned id during minimization */ + uint isfinal; /* is an accepting (final) state */ +}; + +static Reiset* +ralloc(Deter *d, int ninst) +{ + Reiset *t; + + t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0); + if(t == nil) + longjmp(d->kaboom, 1); + t->delta = (Reiset**)&t[1]; + t->inst = (uint*)&t->delta[2*d->nc]; + return t; +} + +/* find the canonical form a given Reiset */ +static Reiset* +findreiset(Deter *d, Reiset *s) +{ + int i, szinst; + uint h; + Reiset *t; + + h = 0; + for(i=0; i<s->ninst; i++) + h = h*1000003 + s->inst[i]; + h %= d->nhash; + + szinst = s->ninst*sizeof(s->inst[0]); + for(t=d->hash[h]; t; t=t->hash) + if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0) + return t; + + t = ralloc(d, s->ninst); + t->hash = d->hash[h]; + d->hash[h] = t; + + *d->last = t; + d->last = &t->next; + t->next = 0; + + t->ninst = s->ninst; + memmove(t->inst, s->inst, szinst); + + /* delta is filled in later */ + + return t; +} + +/* convert bits to a real reiset */ +static Reiset* +bits2reiset(Deter *d, uchar *bits) +{ + int k; + Reiset *s; + + s = d->tmp; + s->ninst = 0; + for(k=0; k<d->ninst; k++) + if(bits[k]) + s->inst[s->ninst++] = k; + return findreiset(d, s); +} + +/* add n to state set; if n < k, need to go around again */ +static int +add(int n, uchar *bits, int k) +{ + if(bits[n]) + return 0; + bits[n] = 1; + return n < k; +} + +/* update bits to follow all the empty (non-character-related) transitions possible */ +static void +followempty(Deter *d, uchar *bits, int bol, int eol) +{ + int again, k; + Reinst *i; + + do{ + again = 0; + for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){ + if(!bits[k]) + continue; + switch(i->type){ + case RBRA: + case LBRA: + again |= add(i->next - d->p->firstinst, bits, k); + break; + case OR: + again |= add(i->left - d->p->firstinst, bits, k); + again |= add(i->right - d->p->firstinst, bits, k); + break; + case BOL: + if(bol) + again |= add(i->next - d->p->firstinst, bits, k); + break; + case EOL: + if(eol) + again |= add(i->next - d->p->firstinst, bits, k); + break; + } + } + }while(again); + + /* + * Clear bits for useless transitions. We could do this during + * the switch above, but then we have no guarantee of termination + * if we get a loop in the regexp. + */ + for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){ + if(!bits[k]) + continue; + switch(i->type){ + case RBRA: + case LBRA: + case OR: + case BOL: + case EOL: + bits[k] = 0; + break; + } + } +} + +/* + * Where does s go if it sees rune r? + * Eol is true if a $ matches the string at the position just after r. + */ +static Reiset* +transition(Deter *d, Reiset *s, Rune r, uint eol) +{ + int k; + uchar *bits; + Reinst *i, *inst0; + Rune *rp, *ep; + + bits = d->bits; + memset(bits, 0, d->ninst); + + inst0 = d->p->firstinst; + for(k=0; k < s->ninst; k++){ + i = inst0 + s->inst[k]; + switch(i->type){ + default: + werrstr("bad reprog: got type %d", i->type); + longjmp(d->kaboom, 1); + case RBRA: + case LBRA: + case OR: + case BOL: + case EOL: + werrstr("internal error: got type %d", i->type); + longjmp(d->kaboom, 1); + + case RUNE: + if(r == i->r) + bits[i->next - inst0] = 1; + break; + case ANY: + if(r != L'\n') + bits[i->next - inst0] = 1; + break; + case ANYNL: + bits[i->next - inst0] = 1; + break; + case NCCLASS: + if(r == L'\n') + break; + /* fall through */ + case CCLASS: + ep = i->cp->end; + for(rp = i->cp->spans; rp < ep; rp += 2) + if(rp[0] <= r && r <= rp[1]) + break; + if((rp < ep) ^! (i->type == CCLASS)) + bits[i->next - inst0] = 1; + break; + case END: + break; + } + } + + followempty(d, bits, r=='\n', eol); + return bits2reiset(d, bits); +} + +static int +countinst(Reprog *pp) +{ + int n; + Reinst *l; + + n = 0; + l = pp->firstinst; + while(l++->type) + n++; + return n; +} + +static void +set(Deter *d, u32int **tab, Rune r) +{ + u32int *u; + + if((u = tab[r/4096]) == nil){ + u = binalloc(&d->bin, 4096/8, 1); + if(u == nil) + longjmp(d->kaboom, 1); + tab[r/4096] = u; + } + u[(r%4096)/32] |= 1<<(r%32); +} + +/* + * Compute the list of important characters. + * Other characters behave like the ones that surround them. + */ +static void +findchars(Deter *d, Reprog *p) +{ + u32int *tab[65536/4096], *u, x; + Reinst *i; + Rune *rp, *ep; + int k, m, n, a; + + memset(tab, 0, sizeof tab); + set(d, tab, 0); + set(d, tab, 0xFFFF); + for(i=p->firstinst; i->type; i++){ + switch(i->type){ + case ANY: + set(d, tab, L'\n'-1); + set(d, tab, L'\n'); + set(d, tab, L'\n'+1); + break; + case RUNE: + set(d, tab, i->r-1); + set(d, tab, i->r); + set(d, tab, i->r+1); + break; + case NCCLASS: + set(d, tab, L'\n'-1); + set(d, tab, L'\n'); + set(d, tab, L'\n'+1); + /* fall through */ + case CCLASS: + ep = i->cp->end; + for(rp = i->cp->spans; rp < ep; rp += 2){ + set(d, tab, rp[0]-1); + set(d, tab, rp[0]); + set(d, tab, rp[1]); + set(d, tab, rp[1]+1); + } + break; + } + } + + n = 0; + for(k=0; k<nelem(tab); k++){ + if((u = tab[k]) == nil) + continue; + for(m=0; m<4096/32; m++){ + if((x = u[m]) == 0) + continue; + for(a=0; a<32; a++) + if(x&(1<<a)) + n++; + } + } + + d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0); + if(d->c == 0) + longjmp(d->kaboom, 1); + d->nc = n; + + n = 0; + for(k=0; k<nelem(tab); k++){ + if((u = tab[k]) == nil) + continue; + for(m=0; m<4096/32; m++){ + if((x = u[m]) == 0) + continue; + for(a=0; a<32; a++) + if(x&(1<<a)) + d->c[n++] = k*4096+m*32+a; + } + } + + d->c[n] = 0; + if(n != d->nc) + abort(); +} + +/* + * convert the Deter and Reisets into a Dreprog. + * if dp and c are nil, just return the count of Drecases needed. + */ +static int +buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c) +{ + int i, j, id, n, nn; + Dreinst *di; + Reiset *s; + + nn = 0; + di = 0; + for(i=0; i<nid; i++){ + s = id2set[i]; + if(c){ + di = &dp->inst[i]; + di->isfinal = s->isfinal; + } + n = 0; + id = -1; + for(j=0; j<2*d->nc; j++){ + if(s->delta[j]->id != id){ + id = s->delta[j]->id; + if(c){ + c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc]; + c[n].next = &dp->inst[id]; + } + n++; + } + } + if(c){ + if(n == 1 && c[0].next == di) + di->isloop = 1; + di->c = c; + di->nc = n; + c += n; + } + nn += n; + } + return nn; +} + +Dreprog* +dregcvt(Reprog *p) +{ + uchar *bits; + uint again, n, nid, id; + Deter d; + Reiset **id2set, *s, *t, *start[4]; + Dreprog *dp; + Drecase *c; + + memset(&d, 0, sizeof d); + + if(setjmp(d.kaboom)){ + binfree(&d.bin); + return nil; + } + + d.p = p; + d.ninst = countinst(p); + + d.last = &d.alloc; + + n = d.ninst; + /* round up to power of two; this loop is the least of our efficiency problems */ + while(n&(n-1)) + n++; + d.nhash = n; + d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1); + + /* get list of important runes */ + findchars(&d, p); + +#ifdef DUMP + print("relevant chars are: «%S»\n", d.c+1); +#endif + + d.bits = bits = binalloc(&d.bin, d.ninst, 0); + d.tmp = ralloc(&d, d.ninst); + + /* + * Convert to DFA + */ + + /* 4 start states, depending on initial bol, eol */ + for(n=0; n<4; n++){ + memset(bits, 0, d.ninst); + bits[p->startinst - p->firstinst] = 1; + followempty(&d, bits, n&1, n&2); + start[n] = bits2reiset(&d, bits); + } + + /* explore the reiset space */ + for(s=d.alloc; s; s=s->next) + for(n=0; n<2*d.nc; n++) + s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc); + +#ifdef DUMP + nid = 0; + for(s=d.alloc; s; s=s->next) + s->id = nid++; + ddump(&d); +#endif + + /* + * Minimize. + */ + + /* first class division is final or not */ + for(s=d.alloc; s; s=s->next){ + s->isfinal = 0; + for(n=0; n<s->ninst; n++) + if(p->firstinst[s->inst[n]].type == END) + s->isfinal = 1; + s->id = s->isfinal; + } + + /* divide states with different transition tables in id space */ + nid = 2; + do{ + again = 0; + for(s=d.alloc; s; s=s->next){ + id = -1; + for(t=s->next; t; t=t->next){ + if(s->id != t->id) + continue; + for(n=0; n<2*d.nc; n++){ + /* until we finish the for(t) loop, s->id and id are same */ + if((s->delta[n]->id == t->delta[n]->id) + || (s->delta[n]->id == s->id && t->delta[n]->id == id) + || (s->delta[n]->id == id && t->delta[n]->id == s->id)) + continue; + break; + } + if(n == 2*d.nc) + continue; + if(id == -1) + id = nid++; + t->id = id; + again = 1; + } + } + }while(again); + +#ifdef DUMP + ddump(&d); +#endif + + /* build dreprog */ + id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1); + if(id2set == nil) + longjmp(d.kaboom, 1); + for(s=d.alloc; s; s=s->next) + id2set[s->id] = s; + + n = buildprog(&d, id2set, nid, nil, nil); + dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1); + if(dp == nil) + longjmp(d.kaboom, 1); + c = (Drecase*)&dp->inst[nid]; + buildprog(&d, id2set, nid, dp, c); + + for(n=0; n<4; n++) + dp->start[n] = &dp->inst[start[n]->id]; + dp->ninst = nid; + + binfree(&d.bin); + return dp; +} + +int +dregexec(Dreprog *p, char *s, int bol) +{ + Rune r; + ulong rr; + Dreinst *i; + Drecase *c, *ec; + int best, n; + char *os; + + i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)]; + best = -1; + os = s; + for(; *s; s+=n){ + if(i->isfinal) + best = s - os; + if(i->isloop){ + if(i->isfinal) + return strlen(os); + else + return best; + } + if((*s&0xFF) < Runeself){ + r = *s; + n = 1; + }else + n = chartorune(&r, s); + c = i->c; + ec = c+i->nc; + rr = r; + if(s[n] == '\n' || s[n] == '\0') + rr |= 0x10000; + for(; c<ec; c++){ + if(c->start > rr){ + i = c[-1].next; + goto Out; + } + } + i = ec[-1].next; + Out:; + } + if(i->isfinal) + best = s - os; + return best; +} + + +#ifdef DUMP +void +ddump(Deter *d) +{ + int i, id; + Reiset *s; + + for(s=d->alloc; s; s=s->next){ + print("%d ", s->id); + id = -1; + for(i=0; i<2*d->nc; i++){ + if(id != s->delta[i]->id){ + if(i==0) + print(" ["); + else if(i/d->nc) + print(" [%C$", d->c[i%d->nc]); + else + print(" [%C", d->c[i%d->nc]); + print(" %d]", s->delta[i]->id); + id = s->delta[i]->id; + } + } + print("\n"); + } +} + +void +rdump(Reprog *pp) +{ + Reinst *l; + Rune *p; + + l = pp->firstinst; + do{ + print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type, + l->left-pp->firstinst, l->right-pp->firstinst); + if(l->type == RUNE) + print("\t%C\n", l->r); + else if(l->type == CCLASS || l->type == NCCLASS){ + print("\t["); + if(l->type == NCCLASS) + print("^"); + for(p = l->cp->spans; p < l->cp->end; p += 2) + if(p[0] == p[1]) + print("%C", p[0]); + else + print("%C-%C", p[0], p[1]); + print("]\n"); + } else + print("\n"); + }while(l++->type); +} + +void +dump(Dreprog *pp) +{ + int i, j; + Dreinst *l; + + print("start %ld %ld %ld %ld\n", + pp->start[0]-pp->inst, + pp->start[1]-pp->inst, + pp->start[2]-pp->inst, + pp->start[3]-pp->inst); + + for(i=0; i<pp->ninst; i++){ + l = &pp->inst[i]; + print("%d:", i); + for(j=0; j<l->nc; j++){ + print(" ["); + if(j == 0) + if(l->c[j].start != 1) + abort(); + if(j != 0) + print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : ""); + print("-"); + if(j != l->nc-1) + print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : ""); + print("] %ld", l->c[j].next - pp->inst); + } + if(l->isfinal) + print(" final"); + if(l->isloop) + print(" loop"); + print("\n"); + } +} + + +void +main(int argc, char **argv) +{ + int i; + Reprog *p; + Dreprog *dp; + + i = 1; + p = regcomp(argv[i]); + if(p == 0){ + print("=== %s: bad regexp\n", argv[i]); + } + // print("=== %s\n", argv[i]); + // rdump(p); + dp = dregcvt(p); + print("=== dfa\n"); + dump(dp); + + for(i=2; i<argc; i++) + print("match %d\n", dregexec(dp, argv[i], 0)); + exits(0); +} +#endif + +void +Bprintdfa(Biobuf *b, Dreprog *p) +{ + int i, j, nc; + + Bprint(b, "# dreprog\n"); + nc = 0; + for(i=0; i<p->ninst; i++) + nc += p->inst[i].nc; + Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc, + p->start[0]-p->inst, p->start[1]-p->inst, + p->start[2]-p->inst, p->start[3]-p->inst); + for(i=0; i<p->ninst; i++){ + Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc); + for(j=0; j<p->inst[i].nc; j++) + Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst); + Bprint(b, "\n"); + } +} + +static char* +egetline(Biobuf *b, int c, jmp_buf jb) +{ + char *p; + + p = Brdline(b, c); + if(p == nil) + longjmp(jb, 1); + p[Blinelen(b)-1] = '\0'; + return p; +} + +static void +egetc(Biobuf *b, int c, jmp_buf jb) +{ + if(Bgetc(b) != c) + longjmp(jb, 1); +} + +static int +egetnum(Biobuf *b, int want, jmp_buf jb) +{ + int c; + int n, first; + + n = 0; + first = 1; + while((c = Bgetc(b)) != Beof){ + if(c < '0' || c > '9'){ + if(want == 0){ + Bungetc(b); + c = 0; + } + if(first || c != want){ + werrstr("format error"); + longjmp(jb, 1); + } + return n; + } + n = n*10 + c - '0'; + first = 0; + } + werrstr("unexpected eof"); + longjmp(jb, 1); + return -1; +} + +Dreprog* +Breaddfa(Biobuf *b) +{ + char *s; + int ninst, nc; + jmp_buf jb; + Dreprog *p; + Drecase *c; + Dreinst *l; + int j, k; + + p = nil; + if(setjmp(jb)){ + free(p); + return nil; + } + + s = egetline(b, '\n', jb); + if(strcmp(s, "# dreprog") != 0){ + werrstr("format error"); + longjmp(jb, 1); + } + + ninst = egetnum(b, ' ', jb); + nc = egetnum(b, ' ', jb); + + p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1); + if(p == nil) + longjmp(jb, 1); + c = (Drecase*)&p->inst[ninst]; + + p->start[0] = &p->inst[egetnum(b, ' ', jb)]; + p->start[1] = &p->inst[egetnum(b, ' ', jb)]; + p->start[2] = &p->inst[egetnum(b, ' ', jb)]; + p->start[3] = &p->inst[egetnum(b, '\n', jb)]; + + for(j=0; j<ninst; j++){ + l = &p->inst[j]; + l->isfinal = egetnum(b, ' ', jb); + l->isloop = egetnum(b, ' ', jb); + l->nc = egetnum(b, 0, jb); + l->c = c; + for(k=0; k<l->nc; k++){ + egetc(b, ' ', jb); + c->start = egetnum(b, ' ', jb); + c->next = &p->inst[egetnum(b, 0, jb)]; + c++; + } + egetc(b, '\n', jb); + } + return p; +} |
