📄 modsched.java
字号:
initOpUseLists(prologNode, chipDef); MasterScheduler proSched = new MasterScheduler(prologNode, _opSel, chipDef); proSched.schedule(graph); //sort the instructions so that they will be easier to read in the dotty //sort(prologNode.getInstructions()); } //do all the same for the epilog except, ... if(eList.size()>0) { BlockNode epilogNode = (BlockNode)graph.addNode(); LabelOperand blockName = bNode.getLabel(); String blockNameStr = blockName.getFullName(); epilogNode.setName("epilog_" + blockNameStr); //remap the kernal's out edges so they go from the epilog instead of //from the kernal and set their predicate to true Set out_edges = new HashSet(); out_edges.addAll(bNode.getOutEdges()); BooleanEquation edgePredicate = null; for (Iterator it = out_edges.iterator(); it.hasNext();) { BlockEdge outEdge = (BlockEdge)it.next(); if(!outEdge.isBackwardEdge()) { BlockEdge newEdgeTmp = (BlockEdge)graph.newEdge(); outEdge.setSource(epilogNode); newEdgeTmp.setPredicate(new BooleanEquation(true)); graph.replaceEdge(outEdge, newEdgeTmp); } else { BooleanEquation outEdgeCopy = (BooleanEquation)outEdge.getPredicate().clone(); edgePredicate = new BooleanEquation(outEdgeCopy.not()); } } //BooleanEquation newKernalToEpiPred = new BooleanEquation(edgePredicate); //create a new edge from the kernal to the epilog, with the kernal exit //predicate BlockEdge newEpiEdge = (BlockEdge)graph.addEdge(bNode, epilogNode); epilogNode.addInstructions(new ArrayList(eList.getInstructions())); //newEpiEdge.setPredicate(newKernalToEpiPred); newEpiEdge.setPredicate(edgePredicate); if(!chipDef.isDummyBoard()) AnalyzeHrdWrConstraintsPass.iICalc.calcLogicSpaceReq(graph, chipDef, GlobalOptions.conserveArea); initOpUseLists(epilogNode, chipDef); MasterScheduler epiSched = new MasterScheduler(epilogNode, _opSel, chipDef); epiSched.schedule(graph); //sort(epilogNode.getInstructions()); } } } //end class MuxAdder /** * This little class was created to pass these three numbers between methods. * They are used for creating the prolog and epilog (see below for how). */ public class NumberPasser { /** the length of the unrolled loop body */ public float schedLength; /** the modulo scheduled execution time of the last instruction in the unrolled loop. */ public float lastInstExecTime; /** the modulo scheduled execution time for the first instruction in the unrolled loop */ public float fstStartTime; } //end class NumberPasser /** a factor for determining how many times to attempt modulo scheduling for any given II. */ private OperationSelection _opSel; public ModSched(OperationSelection opSel, BlockNode node) { super(node); _opSel = opSel; } /** Sometimes primals are stored to in the loop body without any predicate. This is fine within the kernal, but in the epilog, I need to know which of the final iterations needs to be stored. I need this for the control logic for the mux tree. This method, adds the loop control predicate to all primal stores in the loop (that is the loop exit predicate, so that stores would only occur when the loop exit condition becomes true). _oldPreds is a hashmap that I use to save the old predicates, so they can be restored within the kernal after scheduling, where key = instruction value = old predicate (most likely "true" or this method would be ignoring the instruction) */ private HashMap _oldPreds = new HashMap(); public void addPredsToPrimalStores(BlockNode bNode) { //HashSet storeInsts = new HashSet(); //HashMap opsToReplace = new HashMap(); //HashMap primsToReplace = new HashMap(); Set out_edges = new HashSet(); out_edges.addAll(bNode.getOutEdges()); BooleanEquation edgePredicate = null; for (Iterator it = out_edges.iterator(); it.hasNext();) { BlockEdge outEdge = (BlockEdge)it.next(); if(outEdge.isBackwardEdge()) { BooleanEquation edgePredicateCopy = (BooleanEquation)outEdge.getPredicate().clone(); edgePredicate = edgePredicateCopy.not(); } } for (Iterator it1 = bNode.getInstructions().iterator(); it1.hasNext();) { Instruction inst = (Instruction)it1.next(); if(Store.conforms(inst)) { Operand out = Store.getDestination(inst); //Operand in = Store.getValue(inst); if((out.isPrimal())&& (inst.getPredicate().listBooleanOperands().size()==0)) { BooleanEquation oldPred = (BooleanEquation)inst.getPredicate().clone(); inst.setPredicate(edgePredicate); _oldPreds.put(inst, oldPred); //storeInsts.add(inst); //Operand newOp = Operand.nextBlock(out.getFullName()); //opsToReplace.put(out, newOp); //opsToReplace.put(in, newOp); //inst.replaceOperand(in, newOp); } } } /*for (Iterator it1 = bNode.getInstructions().iterator(); it1.hasNext();) { Instruction inst = (Instruction)it1.next(); if(!storeInsts.contains(inst)) { for (Iterator it2 = opsToReplace.keySet().iterator(); it2.hasNext();) { Operand op = (Operand)it2.next(); Operand newOp = (Operand)opsToReplace.get(op); inst.replaceOperand(op, newOp); } } }*/ } /** This method changes the predicates back to what they were before addPredsToPrimalStores */ public void changePrimalStoresBack(MSchedHash modSched) { for (Iterator vIt = modSched.getInstSet().iterator(); vIt.hasNext();) { MSchedHash.MSchedInstObject instObj = (MSchedHash.MSchedInstObject) vIt.next(); Instruction inst = instObj.inst; if(_oldPreds.containsKey(inst)) inst.setPredicate((BooleanEquation)_oldPreds.get(inst)); } } /** This is the top level method for the modulo scheduler. It prepares the loop body, calls iterativeSchedule to schedule it, and calls all the methods to create and edit the prolog and epilog. */ public void moduloScheduler(BlockNode node, ChipDef chipInfo, BlockGraph graph) { //add predicates to primal stores: //addPredsToPrimalStores(node); //MuxTableGenerator does some initial analysis of the loop to add store //instructions to save the predicates used by primal store instructions //(please refer to MuxTableGenerator's comments for more) MuxTableGenerator muxGen = new MuxTableGenerator(); HashSet instSet = new HashSet(node.getInstructions()); Set out_edges = new HashSet(); out_edges.addAll(node.getOutEdges()); BooleanEquation loopExitPredicate = null; for (Iterator it = out_edges.iterator(); it.hasNext();) { BlockEdge outEdge = (BlockEdge)it.next(); if(outEdge.isBackwardEdge()) { BooleanEquation edgePredicateCopy = (BooleanEquation)outEdge.getPredicate().clone(); loopExitPredicate = edgePredicateCopy.not(); } } ArrayList newInsts = new ArrayList(muxGen.getKernalPreds(instSet, loopExitPredicate)); //perform operation selection on the new instructions for (Iterator it = newInsts.iterator(); it.hasNext(); ) { Instruction instr = (Instruction)it.next(); _opSel.select(instr); } node.addInstructions(newInsts); //create a dependence flow graph for the node with the new instructions //DependenceFlowGraph dfg = node.getDepFlowGraph(); DependenceFlowGraph dfg = new DependenceFlowGraph(graph.getName() + "_" + node.getLabel().getName() + "_mod_scheduled"); //create the MSchedHash and tell it to read in the instructions from the //node MSchedHash modSched = new MSchedHash(chipInfo, dfg, _opSel, node); modSched.readInList(node); for (Iterator itin = muxGen.savePredOps.keySet().iterator(); itin.hasNext(); ) { Instruction i = (Instruction)itin.next(); Operand op = (Operand)muxGen.savePredOps.get(i); modSched.addNewIns(i, op); } for (Iterator itin = muxGen.saveSuccOps.keySet().iterator(); itin.hasNext(); ) { Instruction i = (Instruction)itin.next(); if(i==null) {continue;} Operand op = (Operand)muxGen.saveSuccOps.get(i); modSched.addNewOuts(i, op); } dfg.generateGraph(node, modSched); node.saveDepFlowGraph(dfg); //dfg.writeDotFile("trouble_maker"); //do hardware analysis: if(!chipInfo.isDummyBoard()) AnalyzeHrdWrConstraintsPass.iICalc.calcLogicSpaceReq(graph, chipInfo, GlobalOptions.conserveArea); chipInfo.saveNode(node); initOpUseLists(node, chipInfo); //find minimum ii: chipInfo.resetPorts(); int mResII = node.getII(); chipInfo.resetPorts(); int mII = modSched.calc_MII(mResII); //schedule: int ii = mII; /*System.out.println("before scheduling "); modSched.printSchedule(); System.out.println(" "); System.out.println(" "); System.out.println(" "); System.out.println(" ");*/ chipInfo.resetPorts(); while((!modSched.iterativeSchedule(ii, (int)(GlobalOptions.budgetRatio * modSched.size())))&& (ii<=modSched.getMaxRunTime())) { ii++; //these are various initializations to reset hardware usage so that //chipInfo won't tell the scheduler there are hardware conflicts where //there aren't any, because it is remembering instructions scheduled to //times in the last attempt at scheduling chipInfo.resetPorts(); chipInfo.completeInitialize(); chipInfo.saveNode(node); initOpUseLists(node, chipInfo); modSched.calcMinDistMat(ii); } /*System.err.println("ii " + ii); System.err.println("modSched.getMaxRunTime() " + modSched.getMaxRunTime());*/ //if ii gets above maxRunTime, it failed if(ii > Math.round(modSched.getMaxRunTime())) throw new ScheduleException("modulo scheduling failed! exiting.."); //save ii and the fact that this is modulo scheduled for the synthesizer: node.setII(ii); node.setIsModuloScheduled(true); //I'm leaving a lot of these print statements in (but commented out), //because it is often very useful to track the changes to the schedule /*System.out.println("used ii of " + ii); System.out.println("b4 ls and ss "); modSched.printSchedule(); System.out.println(" "); System.out.println(" ");*/ //save the predicate store and logic created above: //modSched.savePredLogicInsts(muxGen.getPredLogic()); //make all mini loops in the schedule the same length modSched.stretchOutLoops(ii); //put in loads and stores at the top and bottom end of the mod scheduled //block to transfer data between iterations //dfg.writeDotFile("b4addls"); modSched.addLoadsAndStores(ii); //dfg.writeDotFile("aftaaddls"); /*System.out.println("after ls and ss "); modSched.printSchedule(); System.out.println(" "); System.out.println(" "); System.out.println("unrolled schedule ");*/ //now it's time to create the prolog and epilog. First, we need to unroll //the schedule. NumberPasser nums = new NumberPasser(); InstructionList unrolledList = modSched.getUnrolledSched(nums, ii); float time = ii - 1 - modSched.getStoreTime(); //add the actual predicate store instructions to the kernal modSched.addAllInstsToTime(time, muxGen.getPredStores()); //put the old predicates back in place in the kernal changePrimalStoresBack(modSched); //save the schedule times to the instructions modSched.scheduleList(node); //sort(node.getInstructions()); /*node.removeAllInstructions(); ArrayList schedList = new ArrayList(modSched.getInstructions()); sort(schedList); node.addInstructions(schedList); sort(schedList);*/ //clean up some memory: modSched = null; /*unrolledList.printSchedule(); System.out.println(" "); System.out.println(" "); System.out.println("unrolled schedule after remove stuff");*/ //delete the predicate store, logic instructions from the unrolled list, //because they are for the kernal only (to understand this please look at //MuxTableGenerator.java) unrolledList.removeInsts(newInsts); /*unrolledList.printSchedule(); System.out.println(" "); System.out.println(" ");*/ MuxAdder muxAdder = new MuxAdder(); /** ****************************************************************** prolog creation ****************************************************************** */ InstructionList proList = new InstructionList(); /** the prolog consists of start of the first few iterations of the loop up until the point where full parallelism is possible at which point the kernal can run by itself. To know which instructions should go in the prolog, we start with the unrolled loop. For each iteration we need to copy all the instructions that would execute before full parallelism occurred to the prolog. That is, if we took the original loop body and started it in staggerred starts, at some point there would be a repeating group of instructions which come from different iterations of the main loop. The repeating pattern is the kernal of the modulo scheduled loop. The part before this repeating pattern is the prolog. To find the prolog we need to find each of these iterations, which are all the iterations before the first one which can start within the kernal and also end in it. The first iteration before that will run within the kernal for int(length_of_unrolled_loop/ii)-1 iterations of the kernal, where only the start of the loop will be outside the kernal. This is very hard to explain. Let's try an example. If we have the following schedule for a loop: 11 10 9 8 7 6 5 4 3 2 1 0 where each, number represents a cycle, and for each of these cycles there is a set of instructions being executed. Now let's say we can pipeline this loop with an II of 4. That means we can run it like this: 11 10 9 8 7 11 6 10
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -