\datethis
@s octa int
@*Intro. This program is the twelfth in a series of exploratory studies by
which I'm attempting to gain first-hand experience with OBDD structures, as I
prepare Section 7.1.4 of {\sl The Art of Computer Programming}.
Again it's quite different from its predecessors: This one implements a
new approach to finding an optimum ordering for the variables of a given
Boolean function or set of functions.
The new approach is based on QDDs (quasi-reduced BDDs),
which undergo the ``jump-up'' operation but not the normal
synthesis operations of a traditional BDD package. It requires no
hashing or garbage collection.
The given function is specified explicitly by generating its QDD.
As a demonstration, we implement the $2^m$-way multiplexer
$M_m(x_1,\ldots,x_m;x_{m+1},\ldots,x_{m+2^m})$ here,
because it exhibits great extremes of BDD size under different orderings.
But with \.{CWEB} change files any other function can be substituted.
If desired, optimization will be restricted to a subrange of the possible
levels, keeping variables at the top and bottom of the BDD in place.
We assume that |botvar-topvar| is at most 24.
@d mm 2 /* order of MUX in this demonstration version */
@d nn (mm+(1<=1| */
@d botvar nn /* last variable whose order will be varied (must be |<=nn| */
@d nnn (botvar+1-topvar) /* variables being permuted (at most 25) */
@#
@d o mems++ /* count a memory access to an octabyte */
@d oo mems+=2 /* or two */
@d ooo mems+=3 /* or three */
@d oooo mems+=4 /* or four, wow */
@c
#include
#include
@@;
@@;
unsigned long long mems;
@@;
main () {
register int h,i,j,k,l,lo,hi,jj,kk,var,cycle;
octa x;
@;
for (cycle=1;cycle<1<<(nnn-1);cycle++) {
if (cycle%interval==0) {
printf("Beginning cycle %d (%llu mems so far)\n",cycle,mems);
fflush(stdout);
}
@;
}
@;
printf("Altogether %llu mems.\n",mems);
}
@ @=
typedef unsigned long long octa; /* an octabyte */
@* QDD representation. The quasi-reduced binary decision decision
for a Boolean function of $n$ variables has $q_k$ nodes on level~$k$,
for $0\le k\le n$, one for every distinct subfunction that can arise
by hard-wiring constant values for the initial variables
$(x_1,\ldots,x_k)$. The sequence $(q_0,\ldots,q_n)$ is
called the function's {\it quasi-profile}.
The maximum value of $q_k$ is $\min(t2^k,2^{2^{n-k}})$ when there are
$t$ output functions of $n$ input variables. In particular,
the maximum quasi-profile when $n=25$ and $t=1$ is
$$(1,2,4,\ldots,2^{19},2^{20},2^{16},2^8,2^4,2^2,2);$$
its potentially biggest element, $2^{20}$, occurs when $k=20$.
In this implementation the nodes on level $k$ are numbered from 0 to
$q_k-1$, and we store them consecutively in a big array |node[k]|.
Each node contains three fields, (|lo|,|hi|,|dep|), packed into a
64-bit word. The |lo| and |hi| fields, 20 bits each, point to nodes
at level~$k+1$; the |dep| field, which occupies the other 24 bits,
represents the set of variables on which this subfunction depends.
Level $n$ is special; it has the two ``sink'' nodes 0 and~1,
which are represented by |(0,0,0)| and |(1,1,0)|, respectively.
For example, suppose $n=3$ and $f(x_1,x_2,x_3)=(\bar x_1\land x_2)
\lor(x_1\land x_3)$. The nodes at level~2, which correspond to
branching on~$x_3$, are
|node[2][0]=(0,0,0)|,
|node[2][1]=(1,1,0)|, and
|node[2][2]=(0,1,0x4)|, representing the respective subfunctions 0, 1,~$x_3$.
The nodes at level~1, which correspond to branching on~$x_2$, are
|node[1][0]=(0,1,0x2)| and
|node[1][1]=(2,2,0x4)|, representing the subfunctions $x_2$ and $x_3$.
And there's one node at level~0, namely
|node[0][0]=(0,1,0x7)|.
@=
octa *node[nnn+1]; /* the node arrays */
int qq[nnn+1]; /* the quasi-profile: $|qq|[k]=q_{k-topvar+1}$ */
@ To launch this structure, we first need to allocate the |node| arrays.
They are actually created only for levels |topvar-1| through |botvar|.
@=
for (k=0;k<=nnn;k++) {
j=worksize,kk=k+topvar-1;
if (nn-kk<5 && j>1<<(1<<(nn-kk))) j=1<<(1<<(nn-kk));
if (kk<20 && ((worksize/outs)>>kk)>0 && (j>outs*(1<>1|, and I treat level~0 as a special case.
This saves time even when $n<25$.
(If I wanted to handle more than 25 variables, I could use a more
elaborate scheme in which the packing conventions vary from level to level.
But 25 is plenty for me at the moment.)
@d pack(lo,hi,dp) (((((octa)(dp)<<20)+(lo))<<20)+(hi))
@d lofield(x) (((x)>>20)&0xfffff)
@d hifield(x) ((x)&0xfffff)
@d depfield(x) (((x)>>40)<<1)
@d extrabit(k,j) (k==0? lofield(node[0][j])!=hifield(node[0][j]):0)
@ Once the |node| arrays exist, we can set up the initial QDD.
Here I implement the familiar function
$$M_m(x_1,\ldots,x_m;x_{m+1},\ldots,x_{m+2^m})\;=\;
x_{m+1+(x_1\ldots x_m)_2}.$$
(In fact, the example $f(x_1,x_2,x_3)$ above is $M_1(x_1;x_2,x_3)$.)
Only the |lo| and |hi| fields are initialized here, because we will use
the reduction routine to compute appropriate |dep|s.
The following code assumes that |topvar=1| and |botvar=nn|.
@=
for (k=0;k;
@ For diagnostic purposes, I might want to pretty-print the current QDD.
@=
void print_level(int k) {
register int j;
printf("level %d (x%d):\n",k,map[k]+topvar);
for (j=0;j=
typedef struct {@+unsigned int l,r;@+} pair;
@ @=
pair work[2*worksize],head[worksize]; /* the big workspace */
@ We use |j+1| to link to entry |j|, so that null links are
distinguishable from links to entry~0.
@=
void print_work(int k) {
register int lo,hi;
printf("Current workspace for level %d:\n",k);
for (lo=0;lo=
void reduce(int k) {
register int lo,hi,dep,p,nextp,q;
q=0; /* the number of nodes created so far */
for (o,lo=0;lo;
}
for (p=head[lo].r;p;p=nextp) {
o,nextp=work[p-1].r,hi=work[p-1].l;
o,head[hi].l=0; /* clean up */
}
o,head[lo].r=0; /* reset the |lo| list to empty */
}
o,qq[k]=q;
}
@ @=
{
if (q>=worksize) {
fprintf(stderr,"Sorry, level %d of the QDD is too big (worksize=%d)!\n",
k,worksize);
exit(-3);
}
if (lo!=hi)
oo,dep=depfield(node[k+1][lo] | node[k+1][hi])|(1<>1);
o,clone[p-1]=q;
o,head[hi].l=++q;
}
@ @=
int clone[2*worksize]; /* the new node addresses */
@ Let's illustrate |reduce| by using it to tidy up the initial QDD.
The main point of interest is that we must remap the pointers on level~$k-1$
after level~$k$ has been reduced.
@d makework(j,lo,hi) work[j].l=hi,work[j].r=head[lo].r,head[lo].r=j+1
@=
for (k=nnn-1;;k--) {
for (o,j=0;jj$, takes the variable
at level~$k$ of the branching structure and moves it up to level~$j$,
sliding the previous occupants of levels $(j,j+1,\ldots,k-1)$
down one notch. For example, suppose $n=7$ and we do jump-up $5\to2$.
Before the operation, levels 0 thru~6 represent branching on
variables $(x_1,x_2,x_3,x_4,x_5,x_6,x_7)$, respectively; after jumping, they
represent branching on $(x_1,x_2,x_6,x_3,x_4,x_5,x_7)$.
The |dep| fields are always based on the assumption that level $k$ branches
on variable $x_{k+1}$. An auxiliary |map| table records the results of
previous jumps; level~$k$ actually branches on variable
$x_{map[k]+topvar}$ of the original unpermuted function. We also maintain
a |bitmap| array, where |bitmap[k]| contains the bits that encode the set
$\{|map|[0],\ldots,|map|[k-1]\}$.
@=
int map[nnn]; /* the current permutation */
int bitmap[nnn]; /* its initial dependency sets */
@ @=
for (j=k=0;j=
o,var=map[kk];
for (k=kk;k>jj;k--) {
@;
reduce(k);
oo,map[k]=map[k-1];
oo,bitmap[k]=bitmap[k-1]|(1<;
@ Simple but cool list processing does the trick here.
@=
for (o,j=0;j=
for (o,j=0;j=
int b[1<1$, take the
sequence for $n-1$ and place $n$ at the bottom; then jump $n$ up
to the top; then use the sequence for $n-1$ on the {\it lower\/}
$n-1$ elements. This idea turns out to be equivalent to the following:
The $k$th jump-up is $\nu k+\rho k\to\nu k-1$, for $1\le k<2^{n-1}$.
(In this formula
$\nu k$ denotes the number of 1s in the binary representation of~$k$,
and $\rho k$ denotes the number of 0s at the right end of that representation.)
For example, when $n=4$ the jumps are $1\to0$, $2\to0$, $2\to1$, $3\to0$,
$2\to1$, $3\to1$, $3\to2$; the permutations are
$$\thinmuskip=\thickmuskip
1234,2134,3214,3124,4312,4132,4213,4231.$$
The idea is that all $k$-combinations that don't contain~$n$ appear
in the first half; those that do contain~$n$ appear in the second half.
@=
for (jj=0,kk=1,k=cycle;(k&1)==0;kk++) k>>=1; /* compute $\rho k$ */
for (k&=k-1;k;jj++,kk++) k&=k-1; /* compute $\nu k$ */
@;
for (k=jj+1;k<=kk;k++)
@;
@ We also need to gather statistics from the initial QDD.
@=
@;
for (k=1;k;
@ Every node in |node[k]| is marked with a |dep| field, telling which
of the remaining variables it depends on. More precisely, if the |dep|
contains the bit |1<=
{
switch (((nnn-2)>>3)*4+((k-1)>>3)) {
case 2*4+0:@+for (j=0;j<256;j++) ooo,count1[j]=count2[j]=count3[j]=0;
for (o,j=0;j>56]++;
oo,count2[(x>>48)&0xff]++;
oo,count3[(x>>40)&0xff]++;
};@+break;
case 2*4+1:@+for (j=0;j<256;j++) oo,count1[j]=count2[j]=0;
for (o,j=0;j>56]++;
oo,count2[(x>>48)&0xff]++;
};@+break;
case 2*4+2:@+for (j=0;j<256;j++) o,count1[j]=0;
for (o,j=0;j>56]++;
};@+break;
case 1*4+0:@+for (j=0;j<256;j++) oo,count2[j]=count3[j]=0;
for (o,j=0;j>48)&0xff]++;
oo,count3[(x>>40)&0xff]++;
};@+break;
case 1*4+1:@+for (j=0;j<256;j++) o,count2[j]=0;
for (o,j=0;j>48)&0xff]++;
};@+break;
case 0*4+0:@+for (j=0;j<256;j++) o,count3[j]=0;
for (o,j=0;j>40)&0xff]++;
};@+break;
}
for (j=k;j;
}
@ @=
int count1[256],count2[256],count3[256]; /* bit pattern counts */
@ @=
{
l=1<<((j-1)&0x7),i=0;
if (j<9) {
for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count3[h];
}@+else if (j<17) {
for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count2[h];
}@+else for (i=0,h=l;h<256;h=(h+1)|l) o,i+=count1[h];
oooo,b[bitmap[k]][map[j]]=i;
}
@ At this point we have the initial QDD, with |map[j]=j| for all |j|.
@=
for (j=0;j>k)&1;
}
@ At last comes the final reckoning: We can compute the minimum cost
that can be achieved when subset |X| occurs at the top of the BDD,
for $1\le X<2^n$.
While we're at it, we might as well compute the maximum cost too.
@=
for (k=1;k<1<h) h=l,lo=j;
}
cost[k]=h,routing[k]=lo;
}
printf("Pessimum ordering (cost %d+externals) can be achieved thus:\n",
cost[(1<=
int cost[1<