ip_vs_core.c
来自「linux 内核源代码」· C语言 代码 · 共 1,153 行 · 第 1/2 页
C
1,153 行
/* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the Netfilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms * and others. * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs * Harald Welte don't use nfcache * */#include <linux/module.h>#include <linux/kernel.h>#include <linux/ip.h>#include <linux/tcp.h>#include <linux/icmp.h>#include <net/ip.h>#include <net/tcp.h>#include <net/udp.h>#include <net/icmp.h> /* for icmp_send */#include <net/route.h>#include <linux/netfilter.h>#include <linux/netfilter_ipv4.h>#include <net/ip_vs.h>EXPORT_SYMBOL(register_ip_vs_scheduler);EXPORT_SYMBOL(unregister_ip_vs_scheduler);EXPORT_SYMBOL(ip_vs_skb_replace);EXPORT_SYMBOL(ip_vs_proto_name);EXPORT_SYMBOL(ip_vs_conn_new);EXPORT_SYMBOL(ip_vs_conn_in_get);EXPORT_SYMBOL(ip_vs_conn_out_get);#ifdef CONFIG_IP_VS_PROTO_TCPEXPORT_SYMBOL(ip_vs_tcp_conn_listen);#endifEXPORT_SYMBOL(ip_vs_conn_put);#ifdef CONFIG_IP_VS_DEBUGEXPORT_SYMBOL(ip_vs_get_debug_level);#endif/* ID used in ICMP lookups */#define icmp_id(icmph) (((icmph)->un).echo.id)const char *ip_vs_proto_name(unsigned proto){ static char buf[20]; switch (proto) { case IPPROTO_IP: return "IP"; case IPPROTO_UDP: return "UDP"; case IPPROTO_TCP: return "TCP"; case IPPROTO_ICMP: return "ICMP"; default: sprintf(buf, "IP_%d", proto); return buf; }}void ip_vs_init_hash_table(struct list_head *table, int rows){ while (--rows >= 0) INIT_LIST_HEAD(&table[rows]);}static inline voidip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb){ struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); dest->stats.inpkts++; dest->stats.inbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); dest->svc->stats.inpkts++; dest->svc->stats.inbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); ip_vs_stats.inpkts++; ip_vs_stats.inbytes += skb->len; spin_unlock(&ip_vs_stats.lock); }}static inline voidip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb){ struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); dest->stats.outpkts++; dest->stats.outbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); dest->svc->stats.outpkts++; dest->svc->stats.outbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); ip_vs_stats.outpkts++; ip_vs_stats.outbytes += skb->len; spin_unlock(&ip_vs_stats.lock); }}static inline voidip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc){ spin_lock(&cp->dest->stats.lock); cp->dest->stats.conns++; spin_unlock(&cp->dest->stats.lock); spin_lock(&svc->stats.lock); svc->stats.conns++; spin_unlock(&svc->stats.lock); spin_lock(&ip_vs_stats.lock); ip_vs_stats.conns++; spin_unlock(&ip_vs_stats.lock);}static inline intip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_protocol *pp){ if (unlikely(!pp->state_transition)) return 0; return pp->state_transition(cp, direction, skb, pp);}/* * IPVS persistent scheduling function * It creates a connection entry according to its template if exists, * or selects a server and creates a connection entry plus a template. * Locking: we are svc user (svc->refcnt), so we hold all dests too * Protocols supported: TCP, UDP */static struct ip_vs_conn *ip_vs_sched_persist(struct ip_vs_service *svc, const struct sk_buff *skb, __be16 ports[2]){ struct ip_vs_conn *cp = NULL; struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ __be32 snet; /* source network of the client, after masking */ /* Mask saddr with the netmask to adjust template granularity */ snet = iph->saddr & svc->netmask; IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " "mnet %u.%u.%u.%u\n", NIPQUAD(iph->saddr), ntohs(ports[0]), NIPQUAD(iph->daddr), ntohs(ports[1]), NIPQUAD(snet)); /* * As far as we know, FTP is a very complicated network protocol, and * it uses control connection and data connections. For active FTP, * FTP server initialize data connection to the client, its source port * is often 20. For passive FTP, FTP server tells the clients the port * that it passively listens to, and the client issues the data * connection. In the tunneling or direct routing mode, the load * balancer is on the client-to-server half of connection, the port * number is unknown to the load balancer. So, a conn template like * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * is created for other persistent services. */ if (ports[1] == svc->port) { /* Check if a template already exists */ if (svc->port != FTPPORT) ct = ip_vs_ct_in_get(iph->protocol, snet, 0, iph->daddr, ports[1]); else ct = ip_vs_ct_in_get(iph->protocol, snet, 0, iph->daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* * No template found or the dest of the connection * template is not available. */ dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); return NULL; } /* * Create a template like <protocol,caddr,0, * vaddr,vport,daddr,dport> for non-ftp service, * and <protocol,caddr,0,vaddr,0,daddr,0> * for ftp service. */ if (svc->port != FTPPORT) ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, ports[1], dest->addr, dest->port, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) return NULL; ct->timeout = svc->timeout; } else { /* set destination with the found template */ dest = ct->dest; } dport = dest->port; } else { /* * Note: persistent fwmark-based services and persistent * port zero service are handled here. * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> */ if (svc->fwmark) ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, htonl(svc->fwmark), 0); else ct = ip_vs_ct_in_get(iph->protocol, snet, 0, iph->daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* * If it is not persistent port zero, return NULL, * otherwise create a connection template. */ if (svc->port) return NULL; dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); return NULL; } /* * Create a template according to the service */ if (svc->fwmark) ct = ip_vs_conn_new(IPPROTO_IP, snet, 0, htonl(svc->fwmark), 0, dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) return NULL; ct->timeout = svc->timeout; } else { /* set destination with the found template */ dest = ct->dest; } dport = ports[1]; } /* * Create a new connection according to the template */ cp = ip_vs_conn_new(iph->protocol, iph->saddr, ports[0], iph->daddr, ports[1], dest->addr, dport, 0, dest); if (cp == NULL) { ip_vs_conn_put(ct); return NULL; } /* * Add its control */ ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); ip_vs_conn_stats(cp, svc); return cp;}/* * IPVS main scheduling function * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP */struct ip_vs_conn *ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb){ struct ip_vs_conn *cp = NULL; struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, iph->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) return NULL; /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) return ip_vs_sched_persist(svc, skb, pptr); /* * Non-persistent service */ if (!svc->fwmark && pptr[1] != svc->port) { if (!svc->port) IP_VS_ERR("Schedule: port zero only supported " "in persistent services, " "check your ipvs configuration\n"); return NULL; } dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } /* * Create a connection entry. */ cp = ip_vs_conn_new(iph->protocol, iph->saddr, pptr[0], iph->daddr, pptr[1], dest->addr, dest->port?dest->port:pptr[1], 0, dest); if (cp == NULL) return NULL; IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), NIPQUAD(cp->daddr), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp;}/* * Pass or drop the packet. * Called by ip_vs_in, when the virtual service is available but * no destination is available for a new connection. */int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp){ __be16 _ports[2], *pptr; struct iphdr *iph = ip_hdr(skb); pptr = skb_header_pointer(skb, iph->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) { ip_vs_service_put(svc); return NF_DROP; } /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is RTN_UNICAST (and not local), then create a cache_bypass connection entry */ if (sysctl_ip_vs_cache_bypass && svc->fwmark && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { int ret, cs; struct ip_vs_conn *cp; ip_vs_service_put(svc); /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); cp = ip_vs_conn_new(iph->protocol, iph->saddr, pptr[0], iph->daddr, pptr[1], 0, 0, IP_VS_CONN_F_BYPASS, NULL); if (cp == NULL) return NF_DROP; /* statistics */ ip_vs_in_stats(cp, skb); /* set state */ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); /* transmit the first SYN packet */ ret = cp->packet_xmit(skb, cp, pp); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } /* * When the virtual ftp service is presented, packets destined * for other services on the VIP may get here (except services * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { ip_vs_service_put(svc); return NF_ACCEPT; } ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and * release the socket buffer. * Since it is in IP layer, the TCP socket is not actually * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP;}/* * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING * chain, and is used for VS/NAT. * It detects packets for VS/NAT connections and sends the packets * immediately. This can avoid that iptable_nat mangles the packets * for VS/NAT. */static unsigned int ip_vs_post_routing(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)){ if (!skb->ipvs_property) return NF_ACCEPT; /* The packet was sent from IPVS, exit this chain */ return NF_STOP;}__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset){ return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));}static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user){ int err = ip_defrag(skb, user); if (!err) ip_send_check(ip_hdr(skb)); return err;}/* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in */void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout){ struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { iph->saddr = cp->vaddr; ip_send_check(iph); ciph->daddr = cp->vaddr; ip_send_check(ciph); } else { iph->daddr = cp->daddr; ip_send_check(iph); ciph->saddr = cp->daddr; ip_send_check(ciph); } /* the TCP/UDP port */ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { __be16 *ports = (void *)ciph + ciph->ihl*4; if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->checksum = 0; icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP");}/* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections, * forward to the right destination host if relevant. * Currently handles error types - unreachable, quench, ttl exceeded. * (Only used in VS/NAT) */static int ip_vs_out_icmp(struct sk_buff *skb, int *related){ struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_conn *cp;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?