📄 gb_gates.w

📁 模拟器提供了一个简单易用的平台
💻 W
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
@<Mark all gates that are used in some output@>={  for (v=g->vertices;v!=sentinel;v=v->foo) v->lnk=NULL;  for (a=g->outs;a;a=a->next) {    v=a->tip;    if (is_boolean(v)) continue;    if (v->typ=='=')      v=a->tip=v->alt;    if (v->typ=='C') { /* this output is constant, so make it boolean */      a->tip=(Vertex*)v->bit;      continue;    }    @<Mark all gates that are used to compute |v|@>;  }}@ @<Mark all gates that are used to compute |v|@>=if (v->lnk==NULL) {  v->lnk=sentinel;   /* |v| now represents the top of the stack of nodes to be marked */  do@+{    n++;    b=v->arcs;    if (v->typ=='L') {      u=v->alt; /* latch vertices have a ``hidden'' dependency */      if (u<v) n++; /* latched input value will get a special gate */      if (u->lnk==NULL) {        u->lnk=v->lnk;        v=u;      }@+else v=v->lnk;    }@+else v=v->lnk;    for (;b;b=b->next) {      u=b->tip;      if (u->lnk==NULL) {        u->lnk=v;        v=u;      }    }  }@+while (v!=sentinel);}@ It is easier to copy a directed acyclic graph than to copy a general graph,but we do have to contend with the feedback in latches.@d reverse_arc_list(@!alist)  {@+for (aa=alist,b=NULL;aa;b=aa,aa=a) {      a=aa->next;      aa->next=b;     }   alist=b;@+}@<Copy all marked gates to a new graph@>=new_graph=gb_new_graph(n);if (new_graph==NULL) {  gb_recycle(g);  panic(no_room+2); /* out of memory */}strcpy(new_graph->id,g->id);strcpy(new_graph->util_types,"ZZZIIVZZZZZZZA");next_vert=new_graph->vertices;for (v=g->vertices,latch_ptr=NULL;v!=sentinel;v=v->foo) {  if (v->lnk) { /* yes, |v| is marked */    u=v->lnk=next_vert++; /* make note of where we've copied it */    @<Make |u| a copy of |v|; put it on the latch list if it's a latch@>;  }}@<Fix up the |alt| fields of the newly copied latches@>;reverse_arc_list(g->outs);for (a=g->outs;a;a=a->next) {  b=gb_virgin_arc();  b->tip=is_boolean(a->tip)? a->tip: a->tip->lnk;  b->next=new_graph->outs;  new_graph->outs=b;}@ @<Make |u| a copy of |v|; put it on the latch list if it's a latch@>=u->name=gb_save_string(v->name);u->typ=v->typ;if (v->typ=='L') {  u->alt=latch_ptr;@+latch_ptr=v;}reverse_arc_list(v->arcs);for (a=v->arcs;a;a=a->next)  gb_new_arc(u,a->tip->lnk,a->len);@ @<Fix up the |alt| fields of the newly copied latches@>=while (latch_ptr) {  u=latch_ptr->lnk; /* the copy of a latch */  v=u->alt;  u->alt=latch_ptr->alt->lnk;  latch_ptr=v;  if (u->alt<u) @<Replace |u->alt| by a new gate that copies an input@>;}@ Suppose we had a latch whose value was originally the {\sc AND} oftwo inputs, where one of those inputs has now been set to~1. Then thelatch should still refer to a subsequent gate, equal to the value of theother input on the previous cycle. We create such a gate here, makingit an {\sc OR} of two identical inputs. We do this because we're not supposedto leave any |'='| in the result of |reduce|, and because every {\sc OR}is supposed to have at least two inputs.@<Replace |u->alt| by a new gate that copies an input@>={  v=u->alt; /* the input gate that should be copied for latching */  u->alt=next_vert++;  sprintf(name_buf,"%s>%s",v->name,u->name);  u=u->alt;  u->name=gb_save_string(name_buf);  u->typ=OR;  gb_new_arc(u,v,DELAY);@+gb_new_arc(u,v,DELAY);}@* Parallel multiplication. Now comes the |prod| routine,which constructs a rather different network of gates, based this timeon a divide-and-conquer paradigm. Let's take a breather before we tackle it.(Deep breath.)The subroutine call |prod(m,n)| createsa network for the binary multiplication of unsigned|m|-bit numbers by |n|-bit numbers, assuming that |m>=2| and |n>=2|.There is no upper limit on the sizes of |m| and~|n|, except of coursethe limits imposed by the size of memory in which this routine is run.The overall strategy used by |prod| is to start with a generalizedgate graph for multiplication in which many of the gates areidentically zero or copies of other gates.  Then the |reduce| routinewill perform local optimizations leading to the desired result. Sincethere are no latches, some of the complexities of the general |reduce|routine are avoided.All of the |AND|, |OR|, and |XOR| gates of the network returned by|prod| have exactly two inputs. The depth of the circuit (i.e., thelength of its longest path) is $3\log m/\!\log 1.5 + \log(m+n)/\!\log\phi+O(1)$, where $\phi=(1+\sqrt5\,)/2$ is the golden ratio. The grand totalnumber of gates is $6mn+5m^2+O\bigl((m+n)\log(m+n)\bigr)$.There is a demonstration program called {\sc MULTIPLY} that uses |prod| tocompute products of large integers.@<The |prod| routine@>=Graph* prod(m,n)  unsigned long m,n; /* lengths of the binary numbers to be multiplied */{@+@<Local variables for |prod|@>@;@#  if (m<2) m=2;  if (n<2) n=2;  @<Allocate space for a temporary graph |g| and for auxiliary tables@>;  @<Fill |g| with generalized gates that do parallel multiplication@>;  if (gb_trouble_code) {    gb_recycle(g);@+panic(alloc_fault); /* too big */  }  g=reduce(g);  return g; /* if |g==NULL|, the |panic_code| was set by |reduce| */}@ The divide-and-conquer recurrences used in this network lead to interestingpatterns. First we use a method for parallel column addition that reducesthe sum of three numbers to the sum of two numbers. Repeated use of thisreduction makes it possible to reduce the sum of |m| numbers to a sum ofjust two numbers, with a total circuit depth that satisfies therecurrence $T(3N)=T(2N)+O(1)$. Then when the result has been reducedto a sum of two numbers, we use a parallel addition scheme based onrecursively ``golden sectioning the data''; in other words, the recursionpartitions the data into two parts such that the ratio of the larger partto the smaller part is approximately $\phi$. This technique proves to beslightly better than a binary partition would be, both asymptotically andfor small values of~$m+n$.\def\flog{\mathop{\rm flog}\nolimits}We define $\flog N$, the Fibonacci logarithm of~$N$, to be the smallest@^Fibonacci, Leonardo, numbers@>nonnegative integer~$k$ such that $N\le F_{k+1}$. Let $N=m+n$. Our paralleladder for two numbers of $N$ bits will turn out to have depth at most$2+\flog N$. The unreduced graph~|g| in our circuit for multiplicationwill have fewer than $(6m+3\flog N)N$ gates.@<Allocate space for a temporary graph |g| and for auxiliary tables@>=m_plus_n=m+n;@+@<Compute $f=\flog(m+n)$@>;g=gb_new_graph((6*m-7+3*f)*m_plus_n);if (g==NULL) panic(no_room); /* out of memory before we're even started */sprintf(g->id,"prod(%lu,%lu)",m,n);strcpy(g->util_types,"ZZZIIVZZZZZZZA");long_tables=gb_typed_alloc(2*m_plus_n+f,long,g->aux_data);vert_tables=gb_typed_alloc(f*m_plus_n,Vertex*,g->aux_data);if (gb_trouble_code) {  gb_recycle(g);  panic(no_room+1); /* out of memory trying to create auxiliary tables */}@ @<Local variables for |prod|@>=unsigned long m_plus_n; /* guess what this variable holds */long f; /* initially $\flog(m+n)$, later flog of other things */Graph *g; /* graph of generalized gates, to be reduced eventually */long *long_tables; /* beginning of auxiliary array of |long| numbers */Vertex **vert_tables; /* beginning of auxiliary array of gate pointers */@ @<Compute $f=\flog(m+n)$@>=f=4;@+j=3;@+k=5; /* $j=F_f$, $k=F_{f+1}$ */while (k<m_plus_n) {  k=k+j;  j=k-j;  f++;}@ The well-known formulas for a ``full adder,''$$ x+y+z=s+2c,\qquad   \hbox{where $s=x\oplus y\oplus z$ and $c=xy\lor yz\lor zx$},$$can be applied to each bit of an $N$-bit number, thereby providing uswith a way to reduce the sum of three numbers to the sum of two.The input gates of our network will be called $x_0$, $x_1$, \dots,~$x_{m-1}$,$y_0$,~$y_1$, \dots,~$y_{n-1}$, and the outputs will be called$z_0$, $z_1$, \dots,~$z_{m+n-1}$. The logic of the |prod| network will compute$$(z_{m+n-1}\ldots z_1z_0)_2=(x_{m-1}\ldots x_1x_0)_2\cdot                             (y_{n-1}\ldots y_1y_0)_2\,,$$by first considering the product to be the $m$-fold sum$A_0+A_1+\cdots+A_{m-1}$, where$$A_j=2^jx_j\cdot(y_{n-1}\ldots y_1y_0)_2\,,\qquad 0\le j<m.$$Then the three-to-two rule for addition is used to define furthernumbers $A_m$, $A_{m+1}$, \dots,~$A_{3m-5}$ by the scheme$$A_{m+2j}+A_{m+2j+1}=A_{3j}+A_{3j+1}+A_{3j+2}\,,\qquad 0\le j\le m-3.$$[A similar but slightly less efficient scheme was used by Pratt andStockmeyer in {\sl Journal of Computer and System Sciences \bf12} (1976),@^Pratt, Vaughan Ronald@>@^Stockmeyer, Larry Joseph@>Proposition~5.3. The recurrence used here is related to the Josephus@^Josephus, Flavius, problem@>problem with step-size~3; see {\sl Concrete Mathematics},{\mathhexbox278}3.3.]For this purpose, we compute intermediate results $P_j$, $Q_j$, and~$R_j$by the rules$$\eqalign{P_j&=A_{3j}\oplus A_{3j+1}\,;\cr           Q_j&=A_{3j}\land A_{3j+1}\,;\cr      A_{m+2j}&=P_j\oplus A_{3j+2}\,;\cr           R_j&=P_j\land A_{3j+2}\,;\cr    A_{m+2j+1}&=2(Q_j\lor R_j)\,.\cr}$$Finally we let$$\eqalign{U&=A_{3m-6}\oplus A_{3m-5}\,,\cr           V&=A_{3m-6}\land A_{3m-5}\,;\cr}$$these are the values that would be $P_{m-2}$ and $Q_{m-2}$ if the previousformulas were allowed to run past $j=m-3$. The final result$Z=(z_{m+n-1}\ldots z_1z_0)_2$ can now be expressed as$$Z=U+2V\,.$$The gates of the first part of the network are conveniently obtainedin groups of $N=m+n$, representing the bits of the quantities $A_j$,$P_j$, $Q_j$, $R_j$, $U$, and~$V$. We will put the least significant bitof $A_j$ in gate position |g->vertices+a(j)*N|, where $a(j)=j+1$ for$0\le j<m$ and $a(m+2j+t)=m+5j+3+2t$ for $0\le j\le m-3$, $0\le t\le1$.@<Fill |g| with generalized gates that do parallel multiplication@>=next_vert=g->vertices;start_prefix("X");@+x=first_of(m,'I');start_prefix("Y");@+y=first_of(n,'I');@<Define $A_j$ for $0\le j<m$@>;@<Define $P_j$, $Q_j$, $A_{m+2j}$, $R_j$, and $A_{m+2j+1}$  for $0\le j\le m-3$@>;@<Define $U$ and $V$@>;@<Compute the final result $Z$ by parallel addition@>;@ @<Local variables for |prod|@>=register long i,j,k,l; /* all-purpose indices */register Vertex *v; /* current vertex of interest */Vertex *x,*y; /* least-significant bits of the input gates */Vertex *alpha,*beta; /* least-significant bits of arguments */@ @<Define $A_j$ for $0\le j<m$@>=for (j=0; j<m; j++) {  numeric_prefix('A',j);  for (k=0; k<j; k++) {    v=new_vert('C');@+v->bit=0; /* this gate is the constant 0 */  }  for (k=0; k<n; k++)    make2(AND,x+j,y+k);  for (k=j+n; k<m_plus_n; k++) {    v=new_vert('C');@+v->bit=0; /* this gate is the constant 0 */  }}@ Since |m| is |unsigned|, it is necessary to say `|j<m-2|' here insteadof `|j<=m-3|'.@d a_pos(j) (j<m? j+1: m+5*((j-m)>>1)+3+(((j-m)&1)<<1))@<Define $P_j$, $Q_j$, $A_{m+2j}$, $R_j$, and $A_{m+2j+1}$...@>=for (j=0; j<m-2; j++) {  alpha=g->vertices+(a_pos(3*j)*m_plus_n);  beta=g->vertices+(a_pos(3*j+1)*m_plus_n);  numeric_prefix('P',j);  for (k=0; k<m_plus_n; k++)    make2(XOR,alpha+k,beta+k);  numeric_prefix('Q',j);  for (k=0; k<m_plus_n; k++)    make2(AND,alpha+k,beta+k);  alpha=next_vert-2*m_plus_n;  beta=g->vertices+(a_pos(3*j+2)*m_plus_n);  numeric_prefix('A',(long)m+2*j);  for (k=0; k<m_plus_n; k++)    make2(XOR,alpha+k,beta+k);  numeric_prefix('R',j);  for (k=0; k<m_plus_n; k++)    make2(AND,alpha+k,beta+k);  alpha=next_vert-3*m_plus_n;  beta=next_vert-m_plus_n;  numeric_prefix('A',(long)m+2*j+1);  v=new_vert('C');@+v->bit=0; /* another 0, it multiplies $Q\lor R$ by 2 */  for (k=0; k<m_plus_n-1; k++)    make2(OR,alpha+k,beta+k);}@ Actually $v_{m+n-1}$ will never be used (it has to be zero); but wecompute it anyway. We don't have to worry about such nitty gritty detailsbecause |reduce| will get rid of all the obvious redundancy.@<Define $U$ and $V$@>=alpha=g->vertices+(a_pos(3*m-6)*m_plus_n);beta=g->vertices+(a_pos(3*m-5)*m_plus_n);start_prefix("U");for (k=0; k<m_plus_n; k++)  make2(XOR,alpha+k,beta+k);start_prefix("V");for (k=0; k<m_plus_n; k++)  make2(AND,alpha+k,beta+k);@* Parallel addition. It's time now to take another deep breath. Wehave finished the parallel multiplier except for one last step, thedesign of a parallel adder.The adder is based on the following theory:We want to perform the binary addition$$\vbox{\halign{\
上一页 1 2 3 45
💿 文件大小 1340 K
👤 上传用户 zyhunicom
📂 所属分类网络
🏷️ 相关标签

#模拟
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -