X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=qemu%2Froms%2Fipxe%2Fsrc%2Fdrivers%2Fnet%2Fipoib.c;fp=qemu%2Froms%2Fipxe%2Fsrc%2Fdrivers%2Fnet%2Fipoib.c;h=1b53917769637994dc5d9576656a28fefc8e3bb3;hb=e44e3482bdb4d0ebde2d8b41830ac2cdb07948fb;hp=0000000000000000000000000000000000000000;hpb=9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00;p=kvmfornfv.git diff --git a/qemu/roms/ipxe/src/drivers/net/ipoib.c b/qemu/roms/ipxe/src/drivers/net/ipoib.c new file mode 100644 index 000000000..1b5391776 --- /dev/null +++ b/qemu/roms/ipxe/src/drivers/net/ipoib.c @@ -0,0 +1,919 @@ +/* + * Copyright (C) 2007 Michael Brown . + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +FILE_LICENCE ( GPL2_OR_LATER ); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** @file + * + * IP over Infiniband + */ + +/** Number of IPoIB send work queue entries */ +#define IPOIB_NUM_SEND_WQES 2 + +/** Number of IPoIB receive work queue entries */ +#define IPOIB_NUM_RECV_WQES 4 + +/** Number of IPoIB completion entries */ +#define IPOIB_NUM_CQES 8 + +/** An IPoIB device */ +struct ipoib_device { + /** Network device */ + struct net_device *netdev; + /** Underlying Infiniband device */ + struct ib_device *ibdev; + /** Completion queue */ + struct ib_completion_queue *cq; + /** Queue pair */ + struct ib_queue_pair *qp; + /** Local MAC */ + struct ipoib_mac mac; + /** Broadcast MAC */ + struct ipoib_mac broadcast; + /** Joined to IPv4 broadcast multicast group + * + * This flag indicates whether or not we have initiated the + * join to the IPv4 broadcast multicast group. + */ + int broadcast_joined; + /** IPv4 broadcast multicast group membership */ + struct ib_mc_membership broadcast_membership; + /** REMAC cache */ + struct list_head peers; +}; + +/** Broadcast IPoIB address */ +static struct ipoib_mac ipoib_broadcast = { + .flags__qpn = htonl ( IB_QPN_BROADCAST ), + .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }, +}; + +/** Link status for "broadcast join in progress" */ +#define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING ) +#define EINFO_EINPROGRESS_JOINING __einfo_uniqify \ + ( EINFO_EINPROGRESS, 0x01, "Joining" ) + +/** Human-readable message for the link status */ +struct errortab ipoib_errors[] __errortab = { + __einfo_errortab ( EINFO_EINPROGRESS_JOINING ), +}; + +/**************************************************************************** + * + * IPoIB REMAC cache + * + **************************************************************************** + */ + +/** An IPoIB REMAC cache entry */ +struct ipoib_peer { + /** List of REMAC cache entries */ + struct list_head list; + /** Remote Ethermet MAC */ + struct ipoib_remac remac; + /** MAC address */ + struct ipoib_mac mac; +}; + +/** + * Find IPoIB MAC from REMAC + * + * @v ipoib IPoIB device + * @v remac Remote Ethernet MAC + * @ret mac IPoIB MAC (or NULL if not found) + */ +static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib, + const struct ipoib_remac *remac ) { + struct ipoib_peer *peer; + + /* Check for broadcast REMAC */ + if ( is_broadcast_ether_addr ( remac ) ) + return &ipoib->broadcast; + + /* Try to find via REMAC cache */ + list_for_each_entry ( peer, &ipoib->peers, list ) { + if ( memcmp ( remac, &peer->remac, + sizeof ( peer->remac ) ) == 0 ) { + /* Move peer to start of list */ + list_del ( &peer->list ); + list_add ( &peer->list, &ipoib->peers ); + return &peer->mac; + } + } + + DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n", + ipoib, eth_ntoa ( remac ) ); + return NULL; +} + +/** + * Add IPoIB MAC to REMAC cache + * + * @v ipoib IPoIB device + * @v remac Remote Ethernet MAC + * @v mac IPoIB MAC + * @ret rc Return status code + */ +static int ipoib_map_remac ( struct ipoib_device *ipoib, + const struct ipoib_remac *remac, + const struct ipoib_mac *mac ) { + struct ipoib_peer *peer; + + /* Check for existing entry in REMAC cache */ + list_for_each_entry ( peer, &ipoib->peers, list ) { + if ( memcmp ( remac, &peer->remac, + sizeof ( peer->remac ) ) == 0 ) { + /* Move peer to start of list */ + list_del ( &peer->list ); + list_add ( &peer->list, &ipoib->peers ); + /* Update MAC */ + memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); + return 0; + } + } + + /* Create new entry */ + peer = malloc ( sizeof ( *peer ) ); + if ( ! peer ) + return -ENOMEM; + memcpy ( &peer->remac, remac, sizeof ( peer->remac ) ); + memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); + list_add ( &peer->list, &ipoib->peers ); + + return 0; +} + +/** + * Flush REMAC cache + * + * @v ipoib IPoIB device + */ +static void ipoib_flush_remac ( struct ipoib_device *ipoib ) { + struct ipoib_peer *peer; + struct ipoib_peer *tmp; + + list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) { + list_del ( &peer->list ); + free ( peer ); + } +} + +/** + * Discard some entries from the REMAC cache + * + * @ret discarded Number of cached items discarded + */ +static unsigned int ipoib_discard_remac ( void ) { + struct ib_device *ibdev; + struct ipoib_device *ipoib; + struct ipoib_peer *peer; + unsigned int discarded = 0; + + /* Try to discard one cache entry for each IPoIB device */ + for_each_ibdev ( ibdev ) { + ipoib = ib_get_ownerdata ( ibdev ); + list_for_each_entry_reverse ( peer, &ipoib->peers, list ) { + list_del ( &peer->list ); + free ( peer ); + discarded++; + break; + } + } + + return discarded; +} + +/** IPoIB cache discarder */ +struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = { + .discard = ipoib_discard_remac, +}; + +/**************************************************************************** + * + * IPoIB link layer + * + **************************************************************************** + */ + +/** + * Initialise IPoIB link-layer address + * + * @v hw_addr Hardware address + * @v ll_addr Link-layer address + */ +static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) { + const uint8_t *guid = hw_addr; + uint8_t *eth_addr = ll_addr; + uint8_t guid_mask = IPOIB_GUID_MASK; + unsigned int i; + + /* Extract bytes from GUID according to mask */ + for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) { + if ( guid_mask & 0x80 ) + *(eth_addr++) = *guid; + } +} + +/** IPoIB protocol */ +struct ll_protocol ipoib_protocol __ll_protocol = { + .name = "IPoIB", + .ll_proto = htons ( ARPHRD_ETHER ), + .hw_addr_len = sizeof ( union ib_guid ), + .ll_addr_len = ETH_ALEN, + .ll_header_len = ETH_HLEN, + .push = eth_push, + .pull = eth_pull, + .init_addr = ipoib_init_addr, + .ntoa = eth_ntoa, + .mc_hash = eth_mc_hash, + .eth_addr = eth_eth_addr, + .eui64 = eth_eui64, + .flags = LL_NAME_ONLY, +}; + +/** + * Allocate IPoIB device + * + * @v priv_size Size of driver private data + * @ret netdev Network device, or NULL + */ +struct net_device * alloc_ipoibdev ( size_t priv_size ) { + struct net_device *netdev; + + netdev = alloc_netdev ( priv_size ); + if ( netdev ) { + netdev->ll_protocol = &ipoib_protocol; + netdev->ll_broadcast = eth_broadcast; + netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE; + } + return netdev; +} + +/**************************************************************************** + * + * IPoIB translation layer + * + **************************************************************************** + */ + +/** + * Translate transmitted ARP packet + * + * @v netdev Network device + * @v iobuf Packet to be transmitted (with no link-layer headers) + * @ret rc Return status code + */ +static int ipoib_translate_tx_arp ( struct net_device *netdev, + struct io_buffer *iobuf ) { + struct ipoib_device *ipoib = netdev->priv; + struct arphdr *arphdr = iobuf->data; + struct ipoib_mac *target_ha = NULL; + void *sender_pa; + void *target_pa; + + /* Do nothing unless ARP contains eIPoIB link-layer addresses */ + if ( arphdr->ar_hln != ETH_ALEN ) + return 0; + + /* Fail unless we have room to expand packet */ + if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) - + ETH_ALEN ) ) ) { + DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n", + ipoib ); + return -ENOBUFS; + } + + /* Look up REMAC, if applicable */ + if ( arphdr->ar_op == ARPOP_REPLY ) { + target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr )); + if ( ! target_ha ) + return -ENXIO; + } + + /* Construct new packet */ + iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); + sender_pa = arp_sender_pa ( arphdr ); + target_pa = arp_target_pa ( arphdr ); + arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND ); + arphdr->ar_hln = sizeof ( ipoib->mac ); + memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); + memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); + memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) ); + memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) ); + if ( target_ha ) { + memcpy ( arp_target_ha ( arphdr ), target_ha, + sizeof ( *target_ha ) ); + } + + return 0; +} + +/** + * Translate transmitted packet + * + * @v netdev Network device + * @v iobuf Packet to be transmitted (with no link-layer headers) + * @v net_proto Network-layer protocol (in network byte order) + * @ret rc Return status code + */ +static int ipoib_translate_tx ( struct net_device *netdev, + struct io_buffer *iobuf, uint16_t net_proto ) { + + switch ( net_proto ) { + case htons ( ETH_P_ARP ) : + return ipoib_translate_tx_arp ( netdev, iobuf ); + case htons ( ETH_P_IP ) : + /* No translation needed */ + return 0; + default: + /* Cannot handle other traffic via eIPoIB */ + return -ENOTSUP; + } +} + +/** + * Translate received ARP packet + * + * @v netdev Network device + * @v iobuf Received packet (with no link-layer headers) + * @v remac Constructed Remote Ethernet MAC + * @ret rc Return status code + */ +static int ipoib_translate_rx_arp ( struct net_device *netdev, + struct io_buffer *iobuf, + struct ipoib_remac *remac ) { + struct ipoib_device *ipoib = netdev->priv; + struct arphdr *arphdr = iobuf->data; + void *sender_pa; + void *target_pa; + int rc; + + /* Do nothing unless ARP contains IPoIB link-layer addresses */ + if ( arphdr->ar_hln != sizeof ( ipoib->mac ) ) + return 0; + + /* Create REMAC cache entry */ + if ( ( rc = ipoib_map_remac ( ipoib, remac, + arp_sender_ha ( arphdr ) ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n", + ipoib, strerror ( rc ) ); + return rc; + } + + /* Construct new packet */ + sender_pa = arp_sender_pa ( arphdr ); + target_pa = arp_target_pa ( arphdr ); + arphdr->ar_hrd = htons ( ARPHRD_ETHER ); + arphdr->ar_hln = ETH_ALEN; + memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); + memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); + memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN ); + memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN ); + if ( arphdr->ar_op == ARPOP_REPLY ) { + /* Assume received replies were directed to us */ + memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN ); + } + iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); + + return 0; +} + +/** + * Translate received packet + * + * @v netdev Network device + * @v iobuf Received packet (with no link-layer headers) + * @v remac Constructed Remote Ethernet MAC + * @v net_proto Network-layer protocol (in network byte order) + * @ret rc Return status code + */ +static int ipoib_translate_rx ( struct net_device *netdev, + struct io_buffer *iobuf, + struct ipoib_remac *remac, + uint16_t net_proto ) { + + switch ( net_proto ) { + case htons ( ETH_P_ARP ) : + return ipoib_translate_rx_arp ( netdev, iobuf, remac ); + case htons ( ETH_P_IP ) : + /* No translation needed */ + return 0; + default: + /* Cannot handle other traffic via eIPoIB */ + return -ENOTSUP; + } +} + +/**************************************************************************** + * + * IPoIB network device + * + **************************************************************************** + */ + +/** + * Transmit packet via IPoIB network device + * + * @v netdev Network device + * @v iobuf I/O buffer + * @ret rc Return status code + */ +static int ipoib_transmit ( struct net_device *netdev, + struct io_buffer *iobuf ) { + struct ipoib_device *ipoib = netdev->priv; + struct ib_device *ibdev = ipoib->ibdev; + struct ethhdr *ethhdr; + struct ipoib_hdr *ipoib_hdr; + struct ipoib_mac *mac; + struct ib_address_vector dest; + uint16_t net_proto; + int rc; + + /* Sanity check */ + if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) { + DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib ); + return -EINVAL; + } + + /* Attempting transmission while link is down will put the + * queue pair into an error state, so don't try it. + */ + if ( ! ib_link_ok ( ibdev ) ) + return -ENETUNREACH; + + /* Strip eIPoIB header */ + ethhdr = iobuf->data; + net_proto = ethhdr->h_protocol; + iob_pull ( iobuf, sizeof ( *ethhdr ) ); + + /* Identify destination address */ + mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) ); + if ( ! mac ) + return -ENXIO; + + /* Translate packet if applicable */ + if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 ) + return rc; + + /* Prepend real IPoIB header */ + ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) ); + ipoib_hdr->proto = net_proto; + ipoib_hdr->reserved = 0; + + /* Construct address vector */ + memset ( &dest, 0, sizeof ( dest ) ); + dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK ); + dest.gid_present = 1; + memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) ); + if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) { + /* Path not resolved yet */ + return rc; + } + + return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf ); +} + +/** + * Handle IPoIB send completion + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v iobuf I/O buffer + * @v rc Completion status code + */ +static void ipoib_complete_send ( struct ib_device *ibdev __unused, + struct ib_queue_pair *qp, + struct io_buffer *iobuf, int rc ) { + struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); + + netdev_tx_complete_err ( ipoib->netdev, iobuf, rc ); +} + +/** + * Handle IPoIB receive completion + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v dest Destination address vector, or NULL + * @v source Source address vector, or NULL + * @v iobuf I/O buffer + * @v rc Completion status code + */ +static void ipoib_complete_recv ( struct ib_device *ibdev __unused, + struct ib_queue_pair *qp, + struct ib_address_vector *dest, + struct ib_address_vector *source, + struct io_buffer *iobuf, int rc ) { + struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); + struct net_device *netdev = ipoib->netdev; + struct ipoib_hdr *ipoib_hdr; + struct ethhdr *ethhdr; + struct ipoib_remac remac; + uint16_t net_proto; + + /* Record errors */ + if ( rc != 0 ) { + netdev_rx_err ( netdev, iobuf, rc ); + return; + } + + /* Sanity check */ + if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) { + DBGC ( ipoib, "IPoIB %p received packet too short to " + "contain IPoIB header\n", ipoib ); + DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) ); + netdev_rx_err ( netdev, iobuf, -EIO ); + return; + } + if ( ! source ) { + DBGC ( ipoib, "IPoIB %p received packet without address " + "vector\n", ipoib ); + netdev_rx_err ( netdev, iobuf, -ENOTTY ); + return; + } + + /* Strip real IPoIB header */ + ipoib_hdr = iobuf->data; + net_proto = ipoib_hdr->proto; + iob_pull ( iobuf, sizeof ( *ipoib_hdr ) ); + + /* Construct source address from remote QPN and LID */ + remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA ); + remac.lid = htons ( source->lid ); + + /* Translate packet if applicable */ + if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac, + net_proto ) ) != 0 ) { + netdev_rx_err ( netdev, iobuf, rc ); + return; + } + + /* Prepend eIPoIB header */ + ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) ); + memcpy ( ðhdr->h_source, &remac, sizeof ( ethhdr->h_source ) ); + ethhdr->h_protocol = net_proto; + + /* Construct destination address */ + if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid, + sizeof ( dest->gid ) ) == 0 ) ) { + /* Broadcast GID; use the Ethernet broadcast address */ + memcpy ( ðhdr->h_dest, eth_broadcast, + sizeof ( ethhdr->h_dest ) ); + } else { + /* Assume destination address is local Ethernet MAC */ + memcpy ( ðhdr->h_dest, netdev->ll_addr, + sizeof ( ethhdr->h_dest ) ); + } + + /* Hand off to network layer */ + netdev_rx ( netdev, iobuf ); +} + +/** IPoIB completion operations */ +static struct ib_completion_queue_operations ipoib_cq_op = { + .complete_send = ipoib_complete_send, + .complete_recv = ipoib_complete_recv, +}; + +/** + * Allocate IPoIB receive I/O buffer + * + * @v len Length of buffer + * @ret iobuf I/O buffer, or NULL + * + * Some Infiniband hardware requires 2kB alignment of receive buffers + * and provides no way to disable header separation. The result is + * that there are only four bytes of link-layer header (the real IPoIB + * header) before the payload. This is not sufficient space to insert + * an eIPoIB link-layer pseudo-header. + * + * We therefore allocate I/O buffers offset to start slightly before + * the natural alignment boundary, in order to allow sufficient space. + */ +static struct io_buffer * ipoib_alloc_iob ( size_t len ) { + struct io_buffer *iobuf; + size_t reserve_len; + + /* Calculate additional length required at start of buffer */ + reserve_len = ( sizeof ( struct ethhdr ) - + sizeof ( struct ipoib_hdr ) ); + + /* Allocate buffer */ + iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len ); + if ( iobuf ) { + iob_reserve ( iobuf, reserve_len ); + } + return iobuf; +} + +/** IPoIB queue pair operations */ +static struct ib_queue_pair_operations ipoib_qp_op = { + .alloc_iob = ipoib_alloc_iob, +}; + +/** + * Poll IPoIB network device + * + * @v netdev Network device + */ +static void ipoib_poll ( struct net_device *netdev ) { + struct ipoib_device *ipoib = netdev->priv; + struct ib_device *ibdev = ipoib->ibdev; + + /* Poll Infiniband device */ + ib_poll_eq ( ibdev ); + + /* Poll the retry timers (required for IPoIB multicast join) */ + retry_poll(); +} + +/** + * Handle IPv4 broadcast multicast group join completion + * + * @v ibdev Infiniband device + * @v qp Queue pair + * @v membership Multicast group membership + * @v rc Status code + * @v mad Response MAD (or NULL on error) + */ +void ipoib_join_complete ( struct ib_device *ibdev __unused, + struct ib_queue_pair *qp __unused, + struct ib_mc_membership *membership, int rc, + union ib_mad *mad __unused ) { + struct ipoib_device *ipoib = container_of ( membership, + struct ipoib_device, broadcast_membership ); + + /* Record join status as link status */ + netdev_link_err ( ipoib->netdev, rc ); +} + +/** + * Join IPv4 broadcast multicast group + * + * @v ipoib IPoIB device + * @ret rc Return status code + */ +static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { + int rc; + + if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp, + &ipoib->broadcast_membership, + &ipoib->broadcast.gid, + ipoib_join_complete ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n", + ipoib, strerror ( rc ) ); + return rc; + } + ipoib->broadcast_joined = 1; + + return 0; +} + +/** + * Leave IPv4 broadcast multicast group + * + * @v ipoib IPoIB device + */ +static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) { + + if ( ipoib->broadcast_joined ) { + ib_mcast_leave ( ipoib->ibdev, ipoib->qp, + &ipoib->broadcast_membership ); + ipoib->broadcast_joined = 0; + } +} + +/** + * Handle link status change + * + * @v ibdev Infiniband device + */ +static void ipoib_link_state_changed ( struct ib_device *ibdev ) { + struct net_device *netdev = ib_get_ownerdata ( ibdev ); + struct ipoib_device *ipoib = netdev->priv; + int rc; + + /* Leave existing broadcast group */ + ipoib_leave_broadcast_group ( ipoib ); + + /* Update MAC address based on potentially-new GID prefix */ + memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix, + sizeof ( ipoib->mac.gid.s.prefix ) ); + + /* Update broadcast GID based on potentially-new partition key */ + ipoib->broadcast.gid.words[2] = + htons ( ibdev->pkey | IB_PKEY_FULL ); + + /* Set net device link state to reflect Infiniband link state */ + rc = ib_link_rc ( ibdev ); + netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) ); + + /* Join new broadcast group */ + if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && + ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) { + DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: " + "%s\n", ipoib, strerror ( rc ) ); + netdev_link_err ( netdev, rc ); + return; + } +} + +/** + * Open IPoIB network device + * + * @v netdev Network device + * @ret rc Return status code + */ +static int ipoib_open ( struct net_device *netdev ) { + struct ipoib_device *ipoib = netdev->priv; + struct ib_device *ibdev = ipoib->ibdev; + int rc; + + /* Open IB device */ + if ( ( rc = ib_open ( ibdev ) ) != 0 ) { + DBGC ( ipoib, "IPoIB %p could not open device: %s\n", + ipoib, strerror ( rc ) ); + goto err_ib_open; + } + + /* Allocate completion queue */ + ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op ); + if ( ! ipoib->cq ) { + DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n", + ipoib ); + rc = -ENOMEM; + goto err_create_cq; + } + + /* Allocate queue pair */ + ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES, + ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq, + &ipoib_qp_op ); + if ( ! ipoib->qp ) { + DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n", + ipoib ); + rc = -ENOMEM; + goto err_create_qp; + } + ib_qp_set_ownerdata ( ipoib->qp, ipoib ); + + /* Update MAC address with QPN */ + ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn ); + + /* Fill receive rings */ + ib_refill_recv ( ibdev, ipoib->qp ); + + /* Fake a link status change to join the broadcast group */ + ipoib_link_state_changed ( ibdev ); + + return 0; + + ib_destroy_qp ( ibdev, ipoib->qp ); + err_create_qp: + ib_destroy_cq ( ibdev, ipoib->cq ); + err_create_cq: + ib_close ( ibdev ); + err_ib_open: + return rc; +} + +/** + * Close IPoIB network device + * + * @v netdev Network device + */ +static void ipoib_close ( struct net_device *netdev ) { + struct ipoib_device *ipoib = netdev->priv; + struct ib_device *ibdev = ipoib->ibdev; + + /* Flush REMAC cache */ + ipoib_flush_remac ( ipoib ); + + /* Leave broadcast group */ + ipoib_leave_broadcast_group ( ipoib ); + + /* Remove QPN from MAC address */ + ipoib->mac.flags__qpn = 0; + + /* Tear down the queues */ + ib_destroy_qp ( ibdev, ipoib->qp ); + ib_destroy_cq ( ibdev, ipoib->cq ); + + /* Close IB device */ + ib_close ( ibdev ); +} + +/** IPoIB network device operations */ +static struct net_device_operations ipoib_operations = { + .open = ipoib_open, + .close = ipoib_close, + .transmit = ipoib_transmit, + .poll = ipoib_poll, +}; + +/** + * Probe IPoIB device + * + * @v ibdev Infiniband device + * @ret rc Return status code + */ +static int ipoib_probe ( struct ib_device *ibdev ) { + struct net_device *netdev; + struct ipoib_device *ipoib; + int rc; + + /* Allocate network device */ + netdev = alloc_ipoibdev ( sizeof ( *ipoib ) ); + if ( ! netdev ) + return -ENOMEM; + netdev_init ( netdev, &ipoib_operations ); + ipoib = netdev->priv; + ib_set_ownerdata ( ibdev, netdev ); + netdev->dev = ibdev->dev; + memset ( ipoib, 0, sizeof ( *ipoib ) ); + ipoib->netdev = netdev; + ipoib->ibdev = ibdev; + INIT_LIST_HEAD ( &ipoib->peers ); + + /* Extract hardware address */ + memcpy ( netdev->hw_addr, &ibdev->gid.s.guid, + sizeof ( ibdev->gid.s.guid ) ); + + /* Set local MAC address */ + memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid, + sizeof ( ipoib->mac.gid.s.guid ) ); + + /* Set default broadcast MAC address */ + memcpy ( &ipoib->broadcast, &ipoib_broadcast, + sizeof ( ipoib->broadcast ) ); + + /* Register network device */ + if ( ( rc = register_netdev ( netdev ) ) != 0 ) + goto err_register_netdev; + + return 0; + + err_register_netdev: + netdev_nullify ( netdev ); + netdev_put ( netdev ); + return rc; +} + +/** + * Remove IPoIB device + * + * @v ibdev Infiniband device + */ +static void ipoib_remove ( struct ib_device *ibdev ) { + struct net_device *netdev = ib_get_ownerdata ( ibdev ); + + unregister_netdev ( netdev ); + netdev_nullify ( netdev ); + netdev_put ( netdev ); +} + +/** IPoIB driver */ +struct ib_driver ipoib_driver __ib_driver = { + .name = "IPoIB", + .probe = ipoib_probe, + .notify = ipoib_link_state_changed, + .remove = ipoib_remove, +};