📄 iq_segmented.cc
字号:
/* * Copyright (c) 2001, 2002, 2003, 2004, 2005 * The Regents of The University of Michigan * All Rights Reserved * * This code is part of the M5 simulator, developed by Nathan Binkert, * Erik Hallnor, Steve Raasch, and Steve Reinhardt, with contributions * from Ron Dreslinski, Dave Greene, Lisa Hsu, Kevin Lim, Ali Saidi, * and Andrew Schultz. * * Permission is granted to use, copy, create derivative works and * redistribute this software and such derivative works for any * purpose, so long as the copyright notice above, this grant of * permission, and the disclaimer below appear in all copies made; and * so long as the name of The University of Michigan is not used in * any advertising or publicity pertaining to the use or distribution * of this software without specific, written prior authorization. * * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION FROM THE * UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY PURPOSE, AND * WITHOUT WARRANTY BY THE UNIVERSITY OF MICHIGAN OF ANY KIND, EITHER * EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE. THE REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE * LIABLE FOR ANY DAMAGES, INCLUDING DIRECT, SPECIAL, INDIRECT, * INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM * ARISING OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF SUCH * DAMAGES. 
*/

#include <iomanip>
#include <sstream>

#include "base/cprintf.hh"
#include "base/statistics.hh"
#include "encumbered/cpu/full/cpu.hh"
#include "encumbered/cpu/full/create_vector.hh"
#include "encumbered/cpu/full/dep_link.hh"
#include "encumbered/cpu/full/iq/iqueue.hh"
#include "encumbered/cpu/full/iq/segmented/iq_segmented.hh"
#include "encumbered/cpu/full/iq/segmented/seg_chain.hh"
#include "encumbered/cpu/full/ls_queue.hh"
#include "encumbered/cpu/full/reg_info.hh"
#include "mem/mem_interface.hh"
#include "sim/builder.hh"
#include "sim/eventq.hh"
#include "sim/stats.hh"

using namespace std;

//  Compile-time configuration switches for the segmented IQ model
#define DEBUG_PROMOTION 0
#define USE_NEW_SELF_TIME_CODE 1
#define SANITY_CHECKING 1
#define use_mod_pushdown 1
#define use_mod_bypassing 1
#define bypass_slot_checking 0

// Always begin self-timing when the chain head issues
#define load_chain_st 0

// Begin self-timing only if Segment 0 is less than half-full
// of not-ready instructions
#define s0_st_limit 0

#define DUMP_CHINFO 0

//==========================================================================
//
//  The "segmented" instruction queue implementation
//
//==========================================================================

//
//  The constructor
//
//  Copies the configuration parameters into members, allocates the
//  per-segment bookkeeping arrays and the master instruction list, and
//  zero-initializes the statistics counters.  Per-segment objects are
//  NOT built here -- that happens in init(), once the CPU pointer (and
//  therefore max_chains) is available.
//
SegmentedIQ::SegmentedIQ(string n,
                         unsigned _num_segments,
                         unsigned _max_chain_depth,
                         unsigned _segment_size,
                         unsigned _segment_thresh,
                         bool en_pri,
                         bool _use_bypassing,
                         bool _use_pushdown,
                         bool _use_pipelined_prom)
    : BaseIQ(n)
{
    string s;

    //
    //  Save params
    //
    num_segments = _num_segments;
    max_chain_depth = _max_chain_depth;
    segment_size = _segment_size;
    segment_thresh = _segment_thresh;
    enable_priority = en_pri;
    use_bypassing = _use_bypassing;
    use_pushdown = _use_pushdown;
    use_pipelined_prom = _use_pipelined_prom;

    // The two self-timing heuristics are mutually exclusive
    if (s0_st_limit && load_chain_st)
        fatal("SegmentedIQ: You really don't want to use both ld_chain_st "
              "AND s0_st_limit");

    last_new_chain = 0;

    //  Total queue capacity is the sum of all segment capacities
    total_size = num_segments * segment_size;
    set_size(total_size);

    last_segment = num_segments - 1;
    //  Sentinel value: one past any valid segment index
    deadlock_seg_flag = num_segments + 1;

    //
    //  Allocate the IQStations
    //
    //  [total_size elements, allocated, doesn't grow]
    //
    active_instructions = new iq_list(total_size, true, 0);

    //
    //  Initialize now that we have the parameters
    //
    seg_thresholds = new unsigned[num_segments];
    free_slot_info = new unsigned[num_segments];

    //  An array of pointers to segment_t
    queue_segment = new segment_ptr_t[num_segments];

    //  Initialize Deadlock
    deadlock_recovery_mode = false;
    deadlock_slot = 0;
    dedlk_promotion_count = 0;
    dedlk_issue_count = 0;

    iq_heads = 0;

    //    head_count = 0;

    //
    //  Statistics
    //
    //  Push-down stats are per segment *boundary*, hence num_segments - 1
    pushdown_events = new Stats::Scalar<>[num_segments - 1];
    pushdown_count = new Stats::Scalar<>[num_segments - 1];

    total_pd_events = 0;
    total_pd_count = 0;

    deadlock_events = 0;
    deadlock_cycles = 0;
    last_deadlock = 0;

    total_ready_count = 0;
    cum_delay = 0;
    st_limit_events = 0;
    rob_chain_heads = 0;
    seg0_prom_early_count = 0;
}

//
//  Second-phase initialization, called once the CPU is known.
//
//  Reads max_chains from the CPU, builds the per-segment objects (each
//  with its cumulative promotion threshold), and prints the
//  configuration banner to cerr.
//
//  _cpu: owning CPU;  dw/iw/qn: forwarded to BaseIQ::init()
//
void
SegmentedIQ::init(FullCPU *_cpu, unsigned dw, unsigned iw, unsigned qn)
{
    unsigned cum_thresh = 0;

    //  Call the base-class init
    BaseIQ::init(_cpu, dw, iw, qn);

    num_chains = cpu->max_chains;

    total_insts = 0;
    for (int i = 0; i < cpu->number_of_threads; ++i)
        insts[i] = 0;

    for (unsigned i = 0; i < num_segments; ++i) {
        stringstream s;

        //  Thresholds accumulate: segment i promotes at i * segment_thresh
        cum_thresh += segment_thresh;
        seg_thresholds[i] = cum_thresh;

        //  Segment name is "<queue-name>:NN" (zero-padded index)
        s << name() << ":" << setw(2) << setfill('0') << i << ends;

        queue_segment[i] = new segment_t(this, s.str(), i, num_segments,
                                         segment_size, num_chains, cum_thresh,
                                         use_pipelined_prom, enable_priority);
    }

    total_thresh = cum_thresh;

    ccprintf(cerr,
             "******************************************\n"
             " IQ Model : Segmented\n"
             " %6u Segments\n"
             " (size=%u, delta-thresh=%u)\n"
             " (%u Total slots)\n"
             " %6u Chains (maximum)\n"
             " %6u Maximum chain length\n"
             "*****************************************\n"
             "\n",
             num_segments, segment_size, segment_thresh, total_size,
             num_chains, max_chain_depth);
}

//
//  Destructor: releases the arrays and per-segment objects allocated in
//  the constructor and in init().
//
//  NOTE(review): hm_predictor is allocated once in buildSharedInfo() and
//  handed to every cluster via setSharedInfo(); deleting it here looks
//  like a double-free when more than one cluster exists, and the
//  similarly-shared lr_predictor is not deleted at all -- confirm
//  intended ownership before changing.
//
SegmentedIQ::~SegmentedIQ()
{
    delete[] pushdown_events;
    delete[] pushdown_count;

    for (int i = 0; i < num_segments; ++i)
        delete queue_segment[i];
    delete[] queue_segment;

    delete[] free_slot_info;
    delete[] seg_thresholds;
    delete active_instructions;
    delete hm_predictor;
}

//============================================================================
//
//  Shared information structure...
//
//  This structure holds information which must be shared between all
//  clusters, or between the clusters and the dispatch stage.
//
//  The buildSharedInfo() method is called for IQ[0], then the structure
//  address is passed to any remaining clusters via the setSharedInfo()
//  method
//
ClusterSharedInfo *
SegmentedIQ::buildSharedInfo()
{
    ClusterSharedInfo *rv = new ClusterSharedInfo;

    //
    //  Build the Chain Info Table
    //
    rv->ci_table = new SegChainInfoTable(num_chains, cpu->numIQueues,
                                         num_segments, use_pipelined_prom);
    //  Hit/miss predictor: saturating counters (12-bit index, 4-bit
    //  counters, range 0..13, threshold 1)
    rv->hm_predictor = new SaturatingCounterPred(name() + ":HMP", "miss",
                                                 "hit", 12, 4, 0, 1, 13);
    //  Left/right predictor: 10-bit index, default counter geometry
    rv->lr_predictor = new SaturatingCounterPred(name() + ":LRP", "0", "1",
                                                 10);
    rv->total_chains = num_chains;

    //  make a copy of the info locally
    setSharedInfo(rv);

    return rv;
}

//
//  Adopt a shared-info structure built by another cluster (or by our
//  own buildSharedInfo()).  Caches the table/predictor pointers locally
//  for convenience and propagates them to every segment.
//
void
SegmentedIQ::setSharedInfo(ClusterSharedInfo *p)
{
    shared_info = p;

    //  for local use...
    reg_info_table = p->ri_table;
    chain_info = static_cast<SegChainInfoTable *>(p->ci_table);
    hm_predictor = static_cast<SaturatingCounterPred *>(p->hm_predictor);
    lr_predictor = static_cast<SaturatingCounterPred *>(p->lr_predictor);

    //  tell all the segments where this info is...
    for (int i = 0; i < num_segments; ++i) {
        queue_segment[i]->reg_info_table = reg_info_table;
        queue_segment[i]->chain_info = chain_info;
    }
}

//============================================================================
//
//  Add an instruction to the queue:
//
//  All instructions "live" in the "active_instructions" list.  We pass
//  iterators to these entries around instead of moving the data.
This// also makes it possible to walk dependence chains at writeback time//SegmentedIQ::iq_iteratorSegmentedIQ::add_impl(DynInst *inst, InstSeqNum seq, ROBStation *rob, RegInfoElement *ri, NewChainInfo *new_chain){ IQStation rs; // We'll fill in these fields then copy the object unsigned follows_chain = 1000; unsigned n_chains = 0; // // If we're in the process of recovering from a deadlock condition, // then disable instruction dispatch // if (deadlock_recovery_mode) return 0; rs.inst = inst; rs.in_LSQ = false; rs.ea_comp = inst->isMemRef(); rs.seq = seq; // The dispatch sequence, not fetch sequence rs.queued = false; rs.squashed = false; //rs.blocked = false; rs.dispatch_timestamp = curTick; rs.lsq_entry = 0; // may be changed by dispatch() rs.tag = inst->fetch_seq; rs.rob_entry = rob; rob->head_of_chain = false; assert(!new_chain->out_of_chains); // Now that we're sure we'll add this instruction... ++total_insts; ++insts[inst->thread_number]; // Insert this info into instruction record for (int i = 0; i < TheISA::MaxInstSrcRegs; ++i) rs.idep_info[i] = new_chain->idep_info[i]; rs.head_of_chain = new_chain->head_of_chain; rs.head_chain = new_chain->head_chain; rs.pred_last_op_index = new_chain->pred_last_op_index; rs.lr_prediction = new_chain->lr_prediction; // // Add this instruction to the active list so we have someplace to link // ideps, etc // iq_iterator p = active_instructions->add_tail(rs); p->hm_prediction = new_chain->hm_prediction; rob->hm_prediction = p->hm_prediction; // // Determine which segment this instruction will be placed in // // Result is determined by bypassing mode, etc. 
// unsigned destination = choose_dest_segment(p); // // Now, link into producing instructions and predict issue/writeback // unsigned op_lat = cpu->FUPools[0]->getLatency(rs.opClass()); Tick max_wb = 0; Tick max_chained_op_rdy_time = 0; unsigned max_depth = 0, max_delay = 0; Tick max_op_ready_time = 0; // Adjust the instruction latency if this is the EA-Comp portion of // a LOAD instruciton if (rs.inst->isLoad()) { op_lat += cache_hit_latency; } // // This hunk of code not only links the input deps, but also returns // the expected cycle that that dep will write-back. We use these // values to determine the latest WB time. The latest WB time will // be used to determine the earliest Issue cycle, and all other time // calculations follow from there. // StaticInstPtr<TheISA> si = rs.inst->staticInst; StaticInstPtr<TheISA> eff_si = rs.ea_comp ? si->eaCompInst() : si; p->num_ideps = 0; for (int i = 0; i < eff_si->numSrcRegs(); ++i) { Tick this_wb = link_idep(p, eff_si->srcRegIdx(i)); // Find the latest predicted ops-ready time from all inputs // (we'll over-write this later with the chained value, if // we decide that we're chained) if (this_wb > max_op_ready_time) max_op_ready_time = this_wb; // If this op is chained & not-ready, we have to look at it if (!p->idep_ready[i] && new_chain->idep_info[i].chained) { ++n_chains; if (this_wb > max_chained_op_rdy_time) { follows_chain = new_chain->idep_info[i].follows_chain; max_depth = new_chain->idep_info[i].chain_depth; max_delay = new_chain->idep_info[i].delay; max_chained_op_rdy_time = this_wb; } } } // This shouldn't be necessary, since we should never look past // num_ideps in the array, but there are too many loops that go // all the way to TheISA::MaxNumSrcRegs. 
for (int i = p->num_ideps; i < TheISA::MaxInstSrcRegs; ++i) { p->idep_ptr[i] = 0; p->idep_reg[i] = 0; p->idep_ready[i] = true; } // If we have ANY chains, we want to use the chained time if (n_chains) max_op_ready_time = max_chained_op_rdy_time; // Now, calculate the predicted issue cycle... // Note that we can't issue until the cycle AFTER the instruction arrives // in segment zero. if (max_wb < curTick + destination + 1) p->pred_issue_cycle = curTick + destination + 1; else p->pred_issue_cycle = max_wb; // ... and store it rob->pred_issue_cycle = p->pred_issue_cycle; // Use that to calculate the predicted WB cycle rob->pred_wb_cycle = p->pred_issue_cycle + op_lat; p->pred_ready_time = max_op_ready_time; // // Decide whether we want the output of this inst to be chained from // an incoming chain... Wierdness due to the possiblity of following // multiple chains (this forces an instruction following multiple // chains to be a "head")... // bool chained = false; if (n_chains > 1) { // We'd better be the head of a new chain! } else if (n_chains == 1) { chained = true; } else { // Operands are self-timed (or ready) // ==> The result register should be marked as self-timed // (ie. not chained) } inst_depth_dist.sample(max_depth); inst_depth_lat_dist.sample(max_delay); // // "pred_wb_time" is an prediction of when this instructions // RESULT VALUES will be ready... We put this value into the // register-info table. // Tick pred_wb_time; if (p->ops_ready()) { p->ready_timestamp = curTick; delay_at_ops_rdy_dist.sample(0); } pred_wb_time = rob->pred_wb_cycle; // // If this instruction is the head of a chain // if (p->head_of_chain) { ++iq_heads; // decremented when head leaves the IQ/LSQ // Make sure the creator seq number matches correctly... 
if (!rs.ea_comp) { (*chain_info)[p->head_chain].creator = seq; } else { // the LSQ portion of this (must be a store) will generate value (*chain_info)[p->head_chain].creator = seq + 1; } (*chain_info)[p->head_chain].created_ts = curTick; (*chain_info)[p->head_chain].head_level = destination; // // The ROB element is what actually does the writeback, since the // instruction will have been removed from the queue at issue... // rob->head_of_chain = true; rob->head_chain = p->head_chain; // // Set the chain depth to one // (*chain_info)[p->head_chain].chain_depth = 0; max_depth = 0; // Put this plus one into the reg_info struct } for (int i = 0; i < TheISA::MaxInstSrcRegs; ++i) { if (new_chain->idep_info[i].chained) { // Let's see just how long this chain is... // if we're using a register created near the head of the chain, // our depth may not be the deepest of all chained instructions SegChainInfoEntry &info = (*chain_info)[new_chain->idep_info[i].follows_chain]; if (max_depth >= info.chain_depth) info.chain_depth = max_depth + 1; } }#if 0 // we don't actually set hmp_func anywhere... // // HMP: Add latency to the result of this load if we predict a load miss // if (use_hm_predictor && hmp_func == HMP_LATENCY && inst->isLoad()) { // If we predict this to be a miss if (p->hm_prediction == MA_CACHE_MISS) pred_wb_time += MISS_ADDITIONAL_LATENCY; } // // Special handling if we're doing BOTH... // (prediction has already been made and stored in ROB... don't // want to make _another_ one -- first in choose_chain() ) // if (use_hm_predictor && hmp_func == HMP_BOTH && inst->isLoad()) { // If we predict this to be a miss if (p->hm_prediction == MA_CACHE_MISS) pred_wb_time += MISS_ADDITIONAL_LATENCY; }#endif#if DUMP_CHINFO cout << "@ " << curTick << ": #" << seq << " head of C#"; if (rs.head_of_chain) cout << rs.head_chain;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -