summaryrefslogtreecommitdiffstats
path: root/src/cmd/upas/bayes/dfa.c
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2005-10-29 16:26:32 +0000
committerrsc <devnull@localhost>2005-10-29 16:26:32 +0000
commitd1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch)
treea4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/dfa.c
parent9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff)
downloadplan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz
plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/dfa.c')
-rw-r--r--src/cmd/upas/bayes/dfa.c800
1 files changed, 800 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/dfa.c b/src/cmd/upas/bayes/dfa.c
new file mode 100644
index 00000000..46695efd
--- /dev/null
+++ b/src/cmd/upas/bayes/dfa.c
@@ -0,0 +1,800 @@
+#include <u.h>
+#include <libc.h>
+#include <bin.h>
+#include <bio.h>
+#include <regexp.h>
+#include "/sys/src/libregexp/regcomp.h"
+#include "dfa.h"
+
+void rdump(Reprog*);
+void dump(Dreprog*);
+
+/*
+ * Standard NFA determinization and DFA minimization.
+ */
+typedef struct Deter Deter;
+typedef struct Reiset Reiset;
+
+void ddump(Deter*);
+
+/* state of determinization */
+struct Deter
+{
+ jmp_buf kaboom; /* jmp on error */
+
+ Bin *bin; /* bin for temporary allocations */
+
+ Reprog *p; /* program being determinized */
+ uint ninst; /* number of instructions in program */
+
+ Reiset *alloc; /* chain of all Reisets */
+ Reiset **last;
+
+ Reiset **hash; /* hash of all Reisets */
+ uint nhash;
+
+ Reiset *tmp; /* temporaries for walk */
+ uchar *bits;
+
+ Rune *c; /* ``interesting'' characters */
+ uint nc;
+};
+
+/* set of Reinsts: perhaps we should use a bit list instead of the indices? */
+struct Reiset
+{
+ uint *inst; /* indices of instructions in set */
+ uint ninst; /* size of set */
+
+ Reiset *next; /* d.alloc chain */
+ Reiset *hash; /* d.hash chain */
+ Reiset **delta; /* where to go on each interesting char */
+ uint id; /* assigned id during minimization */
+ uint isfinal; /* is an accepting (final) state */
+};
+
+static Reiset*
+ralloc(Deter *d, int ninst)
+{
+ Reiset *t;
+
+ t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
+ if(t == nil)
+ longjmp(d->kaboom, 1);
+ t->delta = (Reiset**)&t[1];
+ t->inst = (uint*)&t->delta[2*d->nc];
+ return t;
+}
+
+/* find the canonical form a given Reiset */
+static Reiset*
+findreiset(Deter *d, Reiset *s)
+{
+ int i, szinst;
+ uint h;
+ Reiset *t;
+
+ h = 0;
+ for(i=0; i<s->ninst; i++)
+ h = h*1000003 + s->inst[i];
+ h %= d->nhash;
+
+ szinst = s->ninst*sizeof(s->inst[0]);
+ for(t=d->hash[h]; t; t=t->hash)
+ if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
+ return t;
+
+ t = ralloc(d, s->ninst);
+ t->hash = d->hash[h];
+ d->hash[h] = t;
+
+ *d->last = t;
+ d->last = &t->next;
+ t->next = 0;
+
+ t->ninst = s->ninst;
+ memmove(t->inst, s->inst, szinst);
+
+ /* delta is filled in later */
+
+ return t;
+}
+
+/* convert bits to a real reiset */
+static Reiset*
+bits2reiset(Deter *d, uchar *bits)
+{
+ int k;
+ Reiset *s;
+
+ s = d->tmp;
+ s->ninst = 0;
+ for(k=0; k<d->ninst; k++)
+ if(bits[k])
+ s->inst[s->ninst++] = k;
+ return findreiset(d, s);
+}
+
+/* add n to state set; if n < k, need to go around again */
+static int
+add(int n, uchar *bits, int k)
+{
+ if(bits[n])
+ return 0;
+ bits[n] = 1;
+ return n < k;
+}
+
+/* update bits to follow all the empty (non-character-related) transitions possible */
+static void
+followempty(Deter *d, uchar *bits, int bol, int eol)
+{
+ int again, k;
+ Reinst *i;
+
+ do{
+ again = 0;
+ for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
+ if(!bits[k])
+ continue;
+ switch(i->type){
+ case RBRA:
+ case LBRA:
+ again |= add(i->next - d->p->firstinst, bits, k);
+ break;
+ case OR:
+ again |= add(i->left - d->p->firstinst, bits, k);
+ again |= add(i->right - d->p->firstinst, bits, k);
+ break;
+ case BOL:
+ if(bol)
+ again |= add(i->next - d->p->firstinst, bits, k);
+ break;
+ case EOL:
+ if(eol)
+ again |= add(i->next - d->p->firstinst, bits, k);
+ break;
+ }
+ }
+ }while(again);
+
+ /*
+ * Clear bits for useless transitions. We could do this during
+ * the switch above, but then we have no guarantee of termination
+ * if we get a loop in the regexp.
+ */
+ for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
+ if(!bits[k])
+ continue;
+ switch(i->type){
+ case RBRA:
+ case LBRA:
+ case OR:
+ case BOL:
+ case EOL:
+ bits[k] = 0;
+ break;
+ }
+ }
+}
+
+/*
+ * Where does s go if it sees rune r?
+ * Eol is true if a $ matches the string at the position just after r.
+ */
+static Reiset*
+transition(Deter *d, Reiset *s, Rune r, uint eol)
+{
+ int k;
+ uchar *bits;
+ Reinst *i, *inst0;
+ Rune *rp, *ep;
+
+ bits = d->bits;
+ memset(bits, 0, d->ninst);
+
+ inst0 = d->p->firstinst;
+ for(k=0; k < s->ninst; k++){
+ i = inst0 + s->inst[k];
+ switch(i->type){
+ default:
+ werrstr("bad reprog: got type %d", i->type);
+ longjmp(d->kaboom, 1);
+ case RBRA:
+ case LBRA:
+ case OR:
+ case BOL:
+ case EOL:
+ werrstr("internal error: got type %d", i->type);
+ longjmp(d->kaboom, 1);
+
+ case RUNE:
+ if(r == i->r)
+ bits[i->next - inst0] = 1;
+ break;
+ case ANY:
+ if(r != L'\n')
+ bits[i->next - inst0] = 1;
+ break;
+ case ANYNL:
+ bits[i->next - inst0] = 1;
+ break;
+ case NCCLASS:
+ if(r == L'\n')
+ break;
+ /* fall through */
+ case CCLASS:
+ ep = i->cp->end;
+ for(rp = i->cp->spans; rp < ep; rp += 2)
+ if(rp[0] <= r && r <= rp[1])
+ break;
+ if((rp < ep) ^! (i->type == CCLASS))
+ bits[i->next - inst0] = 1;
+ break;
+ case END:
+ break;
+ }
+ }
+
+ followempty(d, bits, r=='\n', eol);
+ return bits2reiset(d, bits);
+}
+
+static int
+countinst(Reprog *pp)
+{
+ int n;
+ Reinst *l;
+
+ n = 0;
+ l = pp->firstinst;
+ while(l++->type)
+ n++;
+ return n;
+}
+
+static void
+set(Deter *d, u32int **tab, Rune r)
+{
+ u32int *u;
+
+ if((u = tab[r/4096]) == nil){
+ u = binalloc(&d->bin, 4096/8, 1);
+ if(u == nil)
+ longjmp(d->kaboom, 1);
+ tab[r/4096] = u;
+ }
+ u[(r%4096)/32] |= 1<<(r%32);
+}
+
+/*
+ * Compute the list of important characters.
+ * Other characters behave like the ones that surround them.
+ */
+static void
+findchars(Deter *d, Reprog *p)
+{
+ u32int *tab[65536/4096], *u, x;
+ Reinst *i;
+ Rune *rp, *ep;
+ int k, m, n, a;
+
+ memset(tab, 0, sizeof tab);
+ set(d, tab, 0);
+ set(d, tab, 0xFFFF);
+ for(i=p->firstinst; i->type; i++){
+ switch(i->type){
+ case ANY:
+ set(d, tab, L'\n'-1);
+ set(d, tab, L'\n');
+ set(d, tab, L'\n'+1);
+ break;
+ case RUNE:
+ set(d, tab, i->r-1);
+ set(d, tab, i->r);
+ set(d, tab, i->r+1);
+ break;
+ case NCCLASS:
+ set(d, tab, L'\n'-1);
+ set(d, tab, L'\n');
+ set(d, tab, L'\n'+1);
+ /* fall through */
+ case CCLASS:
+ ep = i->cp->end;
+ for(rp = i->cp->spans; rp < ep; rp += 2){
+ set(d, tab, rp[0]-1);
+ set(d, tab, rp[0]);
+ set(d, tab, rp[1]);
+ set(d, tab, rp[1]+1);
+ }
+ break;
+ }
+ }
+
+ n = 0;
+ for(k=0; k<nelem(tab); k++){
+ if((u = tab[k]) == nil)
+ continue;
+ for(m=0; m<4096/32; m++){
+ if((x = u[m]) == 0)
+ continue;
+ for(a=0; a<32; a++)
+ if(x&(1<<a))
+ n++;
+ }
+ }
+
+ d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
+ if(d->c == 0)
+ longjmp(d->kaboom, 1);
+ d->nc = n;
+
+ n = 0;
+ for(k=0; k<nelem(tab); k++){
+ if((u = tab[k]) == nil)
+ continue;
+ for(m=0; m<4096/32; m++){
+ if((x = u[m]) == 0)
+ continue;
+ for(a=0; a<32; a++)
+ if(x&(1<<a))
+ d->c[n++] = k*4096+m*32+a;
+ }
+ }
+
+ d->c[n] = 0;
+ if(n != d->nc)
+ abort();
+}
+
+/*
+ * convert the Deter and Reisets into a Dreprog.
+ * if dp and c are nil, just return the count of Drecases needed.
+ */
+static int
+buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
+{
+ int i, j, id, n, nn;
+ Dreinst *di;
+ Reiset *s;
+
+ nn = 0;
+ di = 0;
+ for(i=0; i<nid; i++){
+ s = id2set[i];
+ if(c){
+ di = &dp->inst[i];
+ di->isfinal = s->isfinal;
+ }
+ n = 0;
+ id = -1;
+ for(j=0; j<2*d->nc; j++){
+ if(s->delta[j]->id != id){
+ id = s->delta[j]->id;
+ if(c){
+ c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
+ c[n].next = &dp->inst[id];
+ }
+ n++;
+ }
+ }
+ if(c){
+ if(n == 1 && c[0].next == di)
+ di->isloop = 1;
+ di->c = c;
+ di->nc = n;
+ c += n;
+ }
+ nn += n;
+ }
+ return nn;
+}
+
+Dreprog*
+dregcvt(Reprog *p)
+{
+ uchar *bits;
+ uint again, n, nid, id;
+ Deter d;
+ Reiset **id2set, *s, *t, *start[4];
+ Dreprog *dp;
+ Drecase *c;
+
+ memset(&d, 0, sizeof d);
+
+ if(setjmp(d.kaboom)){
+ binfree(&d.bin);
+ return nil;
+ }
+
+ d.p = p;
+ d.ninst = countinst(p);
+
+ d.last = &d.alloc;
+
+ n = d.ninst;
+ /* round up to power of two; this loop is the least of our efficiency problems */
+ while(n&(n-1))
+ n++;
+ d.nhash = n;
+ d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
+
+ /* get list of important runes */
+ findchars(&d, p);
+
+#ifdef DUMP
+ print("relevant chars are: «%S»\n", d.c+1);
+#endif
+
+ d.bits = bits = binalloc(&d.bin, d.ninst, 0);
+ d.tmp = ralloc(&d, d.ninst);
+
+ /*
+ * Convert to DFA
+ */
+
+ /* 4 start states, depending on initial bol, eol */
+ for(n=0; n<4; n++){
+ memset(bits, 0, d.ninst);
+ bits[p->startinst - p->firstinst] = 1;
+ followempty(&d, bits, n&1, n&2);
+ start[n] = bits2reiset(&d, bits);
+ }
+
+ /* explore the reiset space */
+ for(s=d.alloc; s; s=s->next)
+ for(n=0; n<2*d.nc; n++)
+ s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
+
+#ifdef DUMP
+ nid = 0;
+ for(s=d.alloc; s; s=s->next)
+ s->id = nid++;
+ ddump(&d);
+#endif
+
+ /*
+ * Minimize.
+ */
+
+ /* first class division is final or not */
+ for(s=d.alloc; s; s=s->next){
+ s->isfinal = 0;
+ for(n=0; n<s->ninst; n++)
+ if(p->firstinst[s->inst[n]].type == END)
+ s->isfinal = 1;
+ s->id = s->isfinal;
+ }
+
+ /* divide states with different transition tables in id space */
+ nid = 2;
+ do{
+ again = 0;
+ for(s=d.alloc; s; s=s->next){
+ id = -1;
+ for(t=s->next; t; t=t->next){
+ if(s->id != t->id)
+ continue;
+ for(n=0; n<2*d.nc; n++){
+ /* until we finish the for(t) loop, s->id and id are same */
+ if((s->delta[n]->id == t->delta[n]->id)
+ || (s->delta[n]->id == s->id && t->delta[n]->id == id)
+ || (s->delta[n]->id == id && t->delta[n]->id == s->id))
+ continue;
+ break;
+ }
+ if(n == 2*d.nc)
+ continue;
+ if(id == -1)
+ id = nid++;
+ t->id = id;
+ again = 1;
+ }
+ }
+ }while(again);
+
+#ifdef DUMP
+ ddump(&d);
+#endif
+
+ /* build dreprog */
+ id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
+ if(id2set == nil)
+ longjmp(d.kaboom, 1);
+ for(s=d.alloc; s; s=s->next)
+ id2set[s->id] = s;
+
+ n = buildprog(&d, id2set, nid, nil, nil);
+ dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
+ if(dp == nil)
+ longjmp(d.kaboom, 1);
+ c = (Drecase*)&dp->inst[nid];
+ buildprog(&d, id2set, nid, dp, c);
+
+ for(n=0; n<4; n++)
+ dp->start[n] = &dp->inst[start[n]->id];
+ dp->ninst = nid;
+
+ binfree(&d.bin);
+ return dp;
+}
+
+int
+dregexec(Dreprog *p, char *s, int bol)
+{
+ Rune r;
+ ulong rr;
+ Dreinst *i;
+ Drecase *c, *ec;
+ int best, n;
+ char *os;
+
+ i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
+ best = -1;
+ os = s;
+ for(; *s; s+=n){
+ if(i->isfinal)
+ best = s - os;
+ if(i->isloop){
+ if(i->isfinal)
+ return strlen(os);
+ else
+ return best;
+ }
+ if((*s&0xFF) < Runeself){
+ r = *s;
+ n = 1;
+ }else
+ n = chartorune(&r, s);
+ c = i->c;
+ ec = c+i->nc;
+ rr = r;
+ if(s[n] == '\n' || s[n] == '\0')
+ rr |= 0x10000;
+ for(; c<ec; c++){
+ if(c->start > rr){
+ i = c[-1].next;
+ goto Out;
+ }
+ }
+ i = ec[-1].next;
+ Out:;
+ }
+ if(i->isfinal)
+ best = s - os;
+ return best;
+}
+
+
+#ifdef DUMP
+void
+ddump(Deter *d)
+{
+ int i, id;
+ Reiset *s;
+
+ for(s=d->alloc; s; s=s->next){
+ print("%d ", s->id);
+ id = -1;
+ for(i=0; i<2*d->nc; i++){
+ if(id != s->delta[i]->id){
+ if(i==0)
+ print(" [");
+ else if(i/d->nc)
+ print(" [%C$", d->c[i%d->nc]);
+ else
+ print(" [%C", d->c[i%d->nc]);
+ print(" %d]", s->delta[i]->id);
+ id = s->delta[i]->id;
+ }
+ }
+ print("\n");
+ }
+}
+
+void
+rdump(Reprog *pp)
+{
+ Reinst *l;
+ Rune *p;
+
+ l = pp->firstinst;
+ do{
+ print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
+ l->left-pp->firstinst, l->right-pp->firstinst);
+ if(l->type == RUNE)
+ print("\t%C\n", l->r);
+ else if(l->type == CCLASS || l->type == NCCLASS){
+ print("\t[");
+ if(l->type == NCCLASS)
+ print("^");
+ for(p = l->cp->spans; p < l->cp->end; p += 2)
+ if(p[0] == p[1])
+ print("%C", p[0]);
+ else
+ print("%C-%C", p[0], p[1]);
+ print("]\n");
+ } else
+ print("\n");
+ }while(l++->type);
+}
+
+void
+dump(Dreprog *pp)
+{
+ int i, j;
+ Dreinst *l;
+
+ print("start %ld %ld %ld %ld\n",
+ pp->start[0]-pp->inst,
+ pp->start[1]-pp->inst,
+ pp->start[2]-pp->inst,
+ pp->start[3]-pp->inst);
+
+ for(i=0; i<pp->ninst; i++){
+ l = &pp->inst[i];
+ print("%d:", i);
+ for(j=0; j<l->nc; j++){
+ print(" [");
+ if(j == 0)
+ if(l->c[j].start != 1)
+ abort();
+ if(j != 0)
+ print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
+ print("-");
+ if(j != l->nc-1)
+ print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
+ print("] %ld", l->c[j].next - pp->inst);
+ }
+ if(l->isfinal)
+ print(" final");
+ if(l->isloop)
+ print(" loop");
+ print("\n");
+ }
+}
+
+
+void
+main(int argc, char **argv)
+{
+ int i;
+ Reprog *p;
+ Dreprog *dp;
+
+ i = 1;
+ p = regcomp(argv[i]);
+ if(p == 0){
+ print("=== %s: bad regexp\n", argv[i]);
+ }
+ // print("=== %s\n", argv[i]);
+ // rdump(p);
+ dp = dregcvt(p);
+ print("=== dfa\n");
+ dump(dp);
+
+ for(i=2; i<argc; i++)
+ print("match %d\n", dregexec(dp, argv[i], 0));
+ exits(0);
+}
+#endif
+
+void
+Bprintdfa(Biobuf *b, Dreprog *p)
+{
+ int i, j, nc;
+
+ Bprint(b, "# dreprog\n");
+ nc = 0;
+ for(i=0; i<p->ninst; i++)
+ nc += p->inst[i].nc;
+ Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc,
+ p->start[0]-p->inst, p->start[1]-p->inst,
+ p->start[2]-p->inst, p->start[3]-p->inst);
+ for(i=0; i<p->ninst; i++){
+ Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
+ for(j=0; j<p->inst[i].nc; j++)
+ Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
+ Bprint(b, "\n");
+ }
+}
+
+static char*
+egetline(Biobuf *b, int c, jmp_buf jb)
+{
+ char *p;
+
+ p = Brdline(b, c);
+ if(p == nil)
+ longjmp(jb, 1);
+ p[Blinelen(b)-1] = '\0';
+ return p;
+}
+
+static void
+egetc(Biobuf *b, int c, jmp_buf jb)
+{
+ if(Bgetc(b) != c)
+ longjmp(jb, 1);
+}
+
+static int
+egetnum(Biobuf *b, int want, jmp_buf jb)
+{
+ int c;
+ int n, first;
+
+ n = 0;
+ first = 1;
+ while((c = Bgetc(b)) != Beof){
+ if(c < '0' || c > '9'){
+ if(want == 0){
+ Bungetc(b);
+ c = 0;
+ }
+ if(first || c != want){
+ werrstr("format error");
+ longjmp(jb, 1);
+ }
+ return n;
+ }
+ n = n*10 + c - '0';
+ first = 0;
+ }
+ werrstr("unexpected eof");
+ longjmp(jb, 1);
+ return -1;
+}
+
+Dreprog*
+Breaddfa(Biobuf *b)
+{
+ char *s;
+ int ninst, nc;
+ jmp_buf jb;
+ Dreprog *p;
+ Drecase *c;
+ Dreinst *l;
+ int j, k;
+
+ p = nil;
+ if(setjmp(jb)){
+ free(p);
+ return nil;
+ }
+
+ s = egetline(b, '\n', jb);
+ if(strcmp(s, "# dreprog") != 0){
+ werrstr("format error");
+ longjmp(jb, 1);
+ }
+
+ ninst = egetnum(b, ' ', jb);
+ nc = egetnum(b, ' ', jb);
+
+ p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
+ if(p == nil)
+ longjmp(jb, 1);
+ c = (Drecase*)&p->inst[ninst];
+
+ p->start[0] = &p->inst[egetnum(b, ' ', jb)];
+ p->start[1] = &p->inst[egetnum(b, ' ', jb)];
+ p->start[2] = &p->inst[egetnum(b, ' ', jb)];
+ p->start[3] = &p->inst[egetnum(b, '\n', jb)];
+
+ for(j=0; j<ninst; j++){
+ l = &p->inst[j];
+ l->isfinal = egetnum(b, ' ', jb);
+ l->isloop = egetnum(b, ' ', jb);
+ l->nc = egetnum(b, 0, jb);
+ l->c = c;
+ for(k=0; k<l->nc; k++){
+ egetc(b, ' ', jb);
+ c->start = egetnum(b, ' ', jb);
+ c->next = &p->inst[egetnum(b, 0, jb)];
+ c++;
+ }
+ egetc(b, '\n', jb);
+ }
+ return p;
+}