📄 napi_howto.txt
字号:
rx_size = rx_status >> 16; pkt_size = rx_size - 4; /* process errors */ if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) || (!(rx_status & RxStatusOK))) { netdrv_rx_err (rx_status, dev, tp, ioaddr); return; } if (--rx_work_limit < 0) break; /* grab a skb */ skb = dev_alloc_skb (pkt_size + 2); if (skb) { . . netif_rx (skb); . . } else { /* OOM */ /*seems very driver specific ... some just pass whatever is on the ring already. */ } /* move to the next skb on the ring */ entry = (++tp->cur_rx) % RX_RING_SIZE; received++ ; } /* store current ring pointer state */ tp->cur_rx = cur_rx; /* Refill the Rx ring buffers if they are needed */ refill_rx_ring(); . .}-------------------------------------------------------------------We change it to a new one below; note the additional parameter inthe call.-------------------------------------------------------------------/* this is called by the network core */static int my_poll (struct net_device *dev, int *budget){ struct my_private *tp = (struct my_private *)dev->priv; rx_ring = tp->rx_ring; cur_rx = tp->cur_rx; int entry = cur_rx % RX_BUF_LEN; /* maximum packets to send to the stack *//************************ note note *********************************/ int rx_work_limit = dev->quota;/************************ end note note *********************************/ do { // outer beginning loop starts here clear_rx_status_register_bit(); while (rx_ring_not_empty) { u32 rx_status; unsigned int rx_size; unsigned int pkt_size; struct sk_buff *skb; /* read size+status of next frame from DMA ring buffer */ /* the number 16 and 4 are just examples */ rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset)); rx_size = rx_status >> 16; pkt_size = rx_size - 4; /* process errors */ if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) || (!(rx_status & RxStatusOK))) { netdrv_rx_err (rx_status, dev, tp, ioaddr); return 1; }/************************ note note *********************************/ if (--rx_work_limit < 0) { /* we got packets, but no quota */ /* store 
current ring pointer state */ tp->cur_rx = cur_rx; /* Refill the Rx ring buffers if they are needed */ refill_rx_ring(dev); goto not_done; }/********************** end note **********************************/ /* grab a skb */ skb = dev_alloc_skb (pkt_size + 2); if (skb) { . ./************************ note note *********************************/ netif_receive_skb (skb);/********************** end note **********************************/ . . } else { /* OOM */ /*seems very driver specific ... common is just pass whatever is on the ring already. */ } /* move to the next skb on the ring */ entry = (++tp->cur_rx) % RX_RING_SIZE; received++ ; } /* store current ring pointer state */ tp->cur_rx = cur_rx; /* Refill the Rx ring buffers if they are needed */ refill_rx_ring(dev); /* no packets on ring; but new ones can arrive since we last checked */ status = read_interrupt_status_reg(); if (rx status is not set) { /* If something arrives in this narrow window, an interrupt will be generated */ goto done; } /* done! at least thats what it looks like ;-> if new packets came in after our last check on status bits they'll be caught by the while check and we go back and clear them since we havent exceeded our quota */ } while (rx_status_is_set); done:/************************ note note *********************************/ dev->quota -= received; *budget -= received; /* If RX ring is not full we are out of memory. */ if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) goto oom; /* we are happy/done, no more packets on ring; put us back to where we can start processing interrupts again */ netif_rx_complete(dev); enable_rx_and_rxnobuf_ints(); /* The last op happens after poll completion. Which means the following: * 1. it can race with disabling irqs in irq handler (which are done to * schedule polls) * 2. it can race with dis/enabling irqs in other poll threads * 3. 
if an irq is raised after the beginning of the outer beginning * loop (marked in the code above), it will be immediately * triggered here. * * Summarizing: the logic may result in some redundant irqs both * due to races in masking and due to too late acking of already * processed irqs. The good news: no events are ever lost. */ return 0; /* done */
not_done: if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 || tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) refill_rx_ring(dev); if (!received) { printk("received==0\n"); received = 1; } dev->quota -= received; *budget -= received; return 1; /* not_done */
oom: /* Start timer, stop polling, but do not enable rx interrupts. */ start_poll_timer(dev); return 0; /* we'll take it from here so tell core "done" */
/************************ End note note *********************************/ }
-------------------------------------------------------------------
From above we note that:
0) rx_work_limit = dev->quota
1) refill_rx_ring() is in charge of clearing the bit for rxnobuff when it does the work.
2) We have a done and not_done state.
3) Instead of netif_rx() we call netif_receive_skb() to pass the skb.
4) We have a new way of handling the oom condition.
5) A new outer for (;;) loop has been added. This serves the purpose of ensuring that if a new packet has come in after we are all set and done, and we have not exceeded our quota, we continue sending packets up.
-----------------------------------------------------------
Poll timer code will need to do the following:
a) if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 || tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) refill_rx_ring(dev); /* If RX ring is not full we are still out of memory. Restart the timer again. Else we re-add ourselves to the master poll list. 
*/ if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) restart_timer(); else netif_rx_schedule(dev); /* we are back on the poll list */

5) dev->close() and dev->suspend() issues
==========================================
The driver writer needn't worry about this. The top net layer takes care of it.

6) Adding new Stats to /proc
=============================
In order to debug some of the new features, we introduce new stats that need to be collected.
TODO: Fill this later.

APPENDIX 1: discussion on using ethernet HW FC
==============================================
Most chips with FC only send a pause packet when they run out of Rx buffers. Since packets are pulled off the DMA ring by a softirq in NAPI, if the system is slow in grabbing them and we have a high input rate (faster than the system's capacity to remove packets), then theoretically there will only be one rx interrupt for all packets during a given packet storm. Under low load, we might have a single interrupt per packet. FC should be programmed to apply in the case when the system can't pull out packets fast enough, i.e. send a pause only when you run out of rx buffers.
Note FC in itself is a good solution but we have found it to not be much of a commodity feature (both in NICs and switches) and hence falls under the same category as using NIC based mitigation. Also experiments indicate that it's much harder to resolve the resource allocation issue (aka lazy receiving that NAPI offers) and hence quantifying its usefulness proved harder. In any case, FC works even better with NAPI but is not necessary.

APPENDIX 2: the "rotting packet" race-window avoidance scheme
=============================================================
There are two types of associations seen here:
1) status/int which honors level triggered IRQ
If a status bit for receive or rxnobuff is set and the corresponding interrupt-enable bit is not on, then no interrupts will be generated. 
However, as soon as the "interrupt-enable" bit is unmasked, an immediate interrupt is generated [assuming the status bit was not turned off].
Generally the concept of level triggered IRQs in association with a status and interrupt-enable CSR register set is used to avoid the race.
If we take the example of the tulip: "pending work" is indicated by the status bit (CSR5 in tulip). The corresponding interrupt bit (CSR7 in tulip) might be turned off (but CSR5 will continue to be turned on with new packet arrivals even if we clear it the first time). Very important is the fact that if we turn the interrupt bit on when status is set, an immediate irq is triggered.
If we cleared the rx ring and proclaimed there was "no more work to be done" and then went on to do a few other things, then when we enable interrupts there is a possibility that a new packet might sneak in during this phase. It helps to look at the pseudo code for the tulip poll routine:
--------------------------
 do { ACK; while (ring_is_not_empty()) { work-work-work if quota is exceeded: exit, no touching irq status/mask } /* No packets, but new can arrive while we are doing this*/ CSR5 := read if (CSR5 is not set) { /* If something arrives in this narrow window here, * where the comments are ;-> irq will be generated */ unmask irqs; exit poll; } } while (rx_status_is_set);
------------------------
The CSR5 bit of interest is only the rx status. If you look at the last if statement: you just finished grabbing all the packets from the rx ring .. you check if the status bit says there's more packets just in ... 
it says none; you then enable rx interrupts again; if a new packet just came in during this check, we are counting that CSR5 will be set in that small window of opportunity and that by re-enabling interrupts, we would actually trigger an interrupt to register the new packet for processing.
[The above description may be very verbose; if you have better wording that will make this more understandable, please suggest it.]
2) non-capable hardware
These do not generally respect level triggered IRQs. Normally, irqs may be lost while being masked and the only way to leave poll is to do a double check for new input after netif_rx_complete() is invoked and re-enable polling (after seeing this new input).
Sample code:
--------- . .
restart_poll: while (ring_is_not_empty()) { work-work-work if quota is exceeded: exit, not touching irq status/mask } . . . enable_rx_interrupts() netif_rx_complete(dev); if (ring_has_new_packet() && netif_rx_reschedule(dev, received)) { disable_rx_and_rxnobufs() goto restart_poll } while (rx_status_is_set);
---------
Basically netif_rx_complete() removes us from the poll list, but because a new packet which will never be caught due to the possibility of a race might come in, we attempt to re-add ourselves to the poll list.

APPENDIX 3: Scheduling issues
==============================
As seen, NAPI moves processing to softirq level. Linux uses ksoftirqd as the general solution to schedule softirqs to run before the next interrupt and by putting them under scheduler control. Also this prevents consecutive softirqs from monopolizing the CPU. This also has the effect that the priority of ksoftirqd needs to be considered when running very CPU-intensive applications and networking, to get the proper softirq/user balance. Increasing ksoftirqd priority to 0 (eventually more) is reported to cure problems with low network performance at high CPU load.
Most used processes in a GIGE router:
USER PID %CPU %MEM SIZE RSS TTY STAT START TIME COMMAND
root 3 0.2 0.0 0 0 ? 
RWN Aug 15 602:00 (ksoftirqd_CPU0)
root 232 0.0 7.9 41400 40884 ? S Aug 15 74:12 gated
--------------------------------------------------------------------
Relevant sites:
==================
ftp://robur.slu.se/pub/Linux/net-development/NAPI/
--------------------------------------------------------------------
TODO: Write net-skeleton.c driver.
-------------------------------------------------------------
Authors:
========
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Jamal Hadi Salim <hadi@cyberus.ca>
Robert Olsson <Robert.Olsson@data.slu.se>
Acknowledgements:
================
People who made this document better:
Lennert Buytenhek <buytenh@gnu.org>
Andrew Morton <akpm@zip.com.au>
Manfred Spraul <manfred@colorfullife.com>
Donald Becker <becker@scyld.com>
Jeff Garzik <jgarzik@pobox.com>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -