📄 athlon.md
字号:
;; AMD Athlon Scheduling
;;
;; The Athlon contains three pipelined FP units, three integer units and
;; three address generation units.
;;
;; The predecode logic determines the boundaries of instructions in the
;; 64-byte cache line.  So the cache line straddling problem of K6 might be
;; an issue here as well, but it is not noted in the documentation.
;;
;; Three DirectPath instruction decoders and only one VectorPath decoder
;; are available.  They can decode three DirectPath instructions or one
;; VectorPath instruction per cycle.
;; Decoded macro instructions are then passed to the 72-entry instruction
;; control unit, which passes them to the specialized integer (18-entry)
;; and fp (36-entry) schedulers.
;;
;; The load/store queue unit is not attached to the schedulers but
;; communicates with all the execution units separately instead.

;; Decoder class of an insn.  The default expression below yields "vector"
;; for the listed insn types, for pushes whose operand 1 is a memory
;; operand, and for XFmode fmov loads/stores; everything else defaults to
;; "direct".
(define_attr "athlon_decode" "direct,vector,double"
  (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,leave")
           (const_string "vector")
         (and (eq_attr "type" "push")
              (match_operand 1 "memory_operand" ""))
           (const_string "vector")
         (and (eq_attr "type" "fmov")
              (and (eq_attr "memory" "load,store")
                   (eq_attr "mode" "XF")))
           (const_string "vector")]
        (const_string "direct")))

;;
;;           decode0 decode1 decode2
;;                 \    |   /
;;    instruction control unit (72 entry scheduler)
;;                |                        |
;;      integer scheduler (18)         stack map
;;     /  |    |    |    |   \        stack rename
;;  ieu0 agu0 ieu1 agu1 ieu2 agu2      scheduler
;;    |  agu0  |   agu1      agu2    register file
;;    |      \ |    |       /         |     |     |
;;     \      /\    |     /         fadd  fmul  fstore
;;       \  /    \  |   /           fadd  fmul  fstore
;;       imul  load/store (2x)      fadd  fmul  fstore

(define_automaton "athlon,athlon_load,athlon_mult,athlon_fp")
(define_cpu_unit "athlon-decode0" "athlon")
(define_cpu_unit "athlon-decode1" "athlon")
(define_cpu_unit "athlon-decode2" "athlon")
(define_cpu_unit "athlon-decodev" "athlon")

;; Model the fact that a double-decoded instruction may take 2 cycles
;; to decode when decoder2 and, in the next cycle, decoder0 are used
;; (this is needed to allow a throughput of 1.5 double-decoded
;; instructions per cycle).
;;
;; In order to avoid dependence between the reservation of the decoder
;; and other units, we model the decoder as a two-stage fully pipelined unit
;; and only a double-decoded instruction may occupy the unit in the first
;; cycle.  With this scheme, however, two double instructions can be issued
;; in cycle0.
;;
;; Avoid this by using a presence set requiring decoder0 to be allocated
;; too.  Vector decoded instructions then can't be issued when
;; modeled as consuming decoder0+decoder1+decoder2.
;; We solve that by a specialized vector decoder unit and an exclusion set.
(presence_set "athlon-decode2" "athlon-decode0")
(exclusion_set "athlon-decodev" "athlon-decode0,athlon-decode1,athlon-decode2")
(define_reservation "athlon-vector" "nothing,athlon-decodev")
(define_reservation "athlon-direct0" "nothing,athlon-decode0")
(define_reservation "athlon-direct" "nothing, (athlon-decode0 | athlon-decode1 | athlon-decode2)")

;; Double instructions behave like two direct instructions.
(define_reservation "athlon-double" "((athlon-decode2, athlon-decode0) | (nothing,(athlon-decode0 + athlon-decode1)) | (nothing,(athlon-decode1 + athlon-decode2)))")

;; The agu and ieu units result in extremely large automata and
;; in our approximation they are hardly filled in.  Only the ieu
;; unit can be, as the issue rate is 3 and the agu unit is always used
;; first in the insn reservations.  Skip the models.

;(define_cpu_unit "athlon-ieu0" "athlon_ieu")
;(define_cpu_unit "athlon-ieu1" "athlon_ieu")
;(define_cpu_unit "athlon-ieu2" "athlon_ieu")
;(define_reservation "athlon-ieu" "(athlon-ieu0 | athlon-ieu1 | athlon-ieu2)")
(define_reservation "athlon-ieu" "nothing")
(define_cpu_unit "athlon-ieu0" "athlon")
;(define_cpu_unit "athlon-agu0" "athlon_agu")
;(define_cpu_unit "athlon-agu1" "athlon_agu")
;(define_cpu_unit "athlon-agu2" "athlon_agu")
;(define_reservation "athlon-agu" "(athlon-agu0 | athlon-agu1 | athlon-agu2)")
(define_reservation "athlon-agu" "nothing")

(define_cpu_unit "athlon-mult" "athlon_mult")
(define_cpu_unit "athlon-load0" "athlon_load")
(define_cpu_unit "athlon-load1" "athlon_load")
(define_reservation "athlon-load" "athlon-agu, (athlon-load0 | athlon-load1),nothing")
;; 128bit SSE instructions issue two loads at once
(define_reservation "athlon-load2" "athlon-agu, (athlon-load0 + athlon-load1),nothing")

(define_reservation "athlon-store" "(athlon-load0 | athlon-load1)")
;; 128bit SSE instructions issue two stores at once
(define_reservation "athlon-store2" "(athlon-load0 + athlon-load1)")


;; The FP operations start to execute at stage 12 in the pipeline, while
;; integer operations start to execute at stage 9 for Athlon and 11 for K8.
;; Compensate the difference for Athlon because it results in significantly
;; smaller automata.
(define_reservation "athlon-fpsched" "nothing,nothing,nothing")
;; The floating point loads.
(define_reservation "athlon-fpload" "(athlon-fpsched + athlon-load)")
(define_reservation "athlon-fpload2" "(athlon-fpsched + athlon-load2)")
(define_reservation "athlon-fploadk8" "(athlon-fpsched + athlon-load)")
(define_reservation "athlon-fpload2k8" "(athlon-fpsched + athlon-load2)")


;; The three fp units are fully pipelined with latency of 3
(define_cpu_unit "athlon-fadd" "athlon_fp")
(define_cpu_unit "athlon-fmul" "athlon_fp")
(define_cpu_unit "athlon-fstore" "athlon_fp")
; APPLE LOCAL begin mainline 2006-04-19 4434601
(define_reservation "athlon-fany" "(athlon-fstore | athlon-fmul | athlon-fadd)")
(define_reservation "athlon-faddmul" "(athlon-fadd | athlon-fmul)")

;; Vector operations usually consume many of the pipes.
(define_reservation "athlon-fvector" "(athlon-fadd + athlon-fmul + athlon-fstore)")


;; Jump instructions are executed in the branch unit, completely
;; transparently to us.
(define_insn_reservation "athlon_branch" 0
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (eq_attr "type" "ibr"))
  "athlon-direct,athlon-ieu")
(define_insn_reservation "athlon_call" 0
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (eq_attr "type" "call,callv"))
  "athlon-vector,athlon-ieu")

;; Latency of push operation is 3 cycles, but ESP value is available
;; earlier
(define_insn_reservation "athlon_push" 2
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (eq_attr "type" "push"))
  "athlon-direct,athlon-agu,athlon-store")
;; NOTE(review): the cpu set of "athlon_pop" also covers k8 and generic64,
;; so its condition matches every insn that "athlon_pop_k8" is written for.
;; The neighbouring "athlon_leave" / "athlon_leave_k8" pair uses disjoint
;; cpu sets ("athlon" vs "k8,generic64").  Presumably the earlier
;; reservation wins and "athlon_pop_k8" is never selected -- TODO confirm
;; and, if so, restrict "athlon_pop" to "athlon".
(define_insn_reservation "athlon_pop" 4
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (eq_attr "type" "pop"))
  "athlon-vector,athlon-load,athlon-ieu")
(define_insn_reservation "athlon_pop_k8" 3
  (and (eq_attr "cpu" "k8,generic64")
       (eq_attr "type" "pop"))
  "athlon-double,(athlon-ieu+athlon-load)")
(define_insn_reservation "athlon_leave" 3
  (and (eq_attr "cpu" "athlon")
       (eq_attr "type" "leave"))
  "athlon-vector,(athlon-ieu+athlon-load)")
(define_insn_reservation "athlon_leave_k8" 3
  (and (eq_attr "cpu" "k8,generic64")
       (eq_attr "type" "leave"))
  "athlon-double,(athlon-ieu+athlon-load)")

;; Lea executes in AGU unit with 2 cycles latency.
(define_insn_reservation "athlon_lea" 2
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (eq_attr "type" "lea"))
  "athlon-direct,athlon-agu,nothing")

;; Mul executes in special multiplier unit attached to IEU0
(define_insn_reservation "athlon_imul" 5
  (and (eq_attr "cpu" "athlon")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "none,unknown")))
  "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0")
;; ??? Widening multiply is vector or double.
;; The DImode variants are listed ahead of the mode-independent k8 variants
;; whose conditions also match DImode multiplies.
(define_insn_reservation "athlon_imul_k8_DI" 4
  (and (eq_attr "cpu" "k8,generic64")
       (and (eq_attr "type" "imul")
            (and (eq_attr "mode" "DI")
                 (eq_attr "memory" "none,unknown"))))
  "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
(define_insn_reservation "athlon_imul_k8" 3
  (and (eq_attr "cpu" "k8,generic64")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "none,unknown")))
  "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0")
(define_insn_reservation "athlon_imul_mem" 8
  (and (eq_attr "cpu" "athlon")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "load,both")))
  "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu")
(define_insn_reservation "athlon_imul_mem_k8_DI" 7
  (and (eq_attr "cpu" "k8,generic64")
       (and (eq_attr "type" "imul")
            (and (eq_attr "mode" "DI")
                 (eq_attr "memory" "load,both"))))
  "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu")
(define_insn_reservation "athlon_imul_mem_k8" 6
  (and (eq_attr "cpu" "k8,generic64")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "load,both")))
  "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu")

;; Idiv cannot execute in parallel with other instructions.  Treating it
;; as a short-latency vector instruction is a good approximation that keeps
;; the scheduler from trying too hard to hide its latency by overlapping it
;; with other instructions.
;; ??? Experiments show that the idiv can overlap with roughly 6 cycles
;; of the other code

(define_insn_reservation "athlon_idiv" 6
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "none,unknown")))
  "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))")
(define_insn_reservation "athlon_idiv_mem" 9
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "load,both")))
  "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))")
;; The parallelism of string instructions is not documented.  Model it the
;; same way as idiv to create smaller automata.  This probably does not
;; matter much.
(define_insn_reservation "athlon_str" 6
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "type" "str")
            (eq_attr "memory" "load,both,store")))
  "athlon-vector,athlon-load,athlon-ieu0*6")

(define_insn_reservation "athlon_idirect" 1
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "direct")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "none,unknown"))))
  "athlon-direct,athlon-ieu")
(define_insn_reservation "athlon_ivector" 2
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "vector")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "none,unknown"))))
  "athlon-vector,athlon-ieu,athlon-ieu")
(define_insn_reservation "athlon_idirect_loadmov" 3
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "load")))
  "athlon-direct,athlon-load")
(define_insn_reservation "athlon_idirect_load" 4
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "direct")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "load"))))
  "athlon-direct,athlon-load,athlon-ieu")
(define_insn_reservation "athlon_ivector_load" 6
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "vector")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "load"))))
  "athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
(define_insn_reservation "athlon_idirect_movstore" 1
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "store")))
  "athlon-direct,athlon-agu,athlon-store")
(define_insn_reservation "athlon_idirect_both" 4
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "direct")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "both"))))
  "athlon-direct,athlon-load, athlon-ieu,athlon-store, athlon-store")
(define_insn_reservation "athlon_ivector_both" 6
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "vector")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "both"))))
  "athlon-vector,athlon-load, athlon-ieu, athlon-ieu, athlon-store")
(define_insn_reservation "athlon_idirect_store" 1
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "direct")
            (and (eq_attr "unit" "integer,unknown")
                 (eq_attr "memory" "store"))))
  "athlon-direct,(athlon-ieu+athlon-agu), athlon-store")
(define_insn_reservation "athlon_ivector_store" 2
  (and (eq_attr "cpu" "athlon,k8,generic64")
       (and (eq_attr "athlon_decode" "vector")
            (and (eq_attr "unit" "integer,unknown")
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -