/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

/* Licence declaration for this file */
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <ipxe/errortab.h>
#include <ipxe/malloc.h>
#include <ipxe/if_arp.h>
#include <ipxe/arp.h>
#include <ipxe/if_ether.h>
#include <ipxe/ethernet.h>
#include <ipxe/ip.h>
#include <ipxe/iobuf.h>
#include <ipxe/netdevice.h>
#include <ipxe/infiniband.h>
#include <ipxe/ib_pathrec.h>
#include <ipxe/ib_mcast.h>
#include <ipxe/retry.h>
#include <ipxe/ipoib.h>

/* Disambiguate the various error causes
 *
 * Each ENXIO variant below is made unique so that the three "missing
 * REMAC" failure paths in this file can be told apart.
 */
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )
#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )

/** Number of IPoIB send work queue entries */
#define IPOIB_NUM_SEND_WQES 2

/** Number of IPoIB receive work queue entries */
#define IPOIB_NUM_RECV_WQES 4

/** Number of IPoIB completion entries */
#define IPOIB_NUM_CQES 8

76 /** An IPoIB device */
79 struct net_device *netdev;
80 /** Underlying Infiniband device */
81 struct ib_device *ibdev;
82 /** Completion queue */
83 struct ib_completion_queue *cq;
85 struct ib_queue_pair *qp;
89 struct ipoib_mac broadcast;
90 /** Joined to IPv4 broadcast multicast group
92 * This flag indicates whether or not we have initiated the
93 * join to the IPv4 broadcast multicast group.
96 /** IPv4 broadcast multicast group membership */
97 struct ib_mc_membership broadcast_membership;
99 struct list_head peers;
102 /** Broadcast IPoIB address */
103 static struct ipoib_mac ipoib_broadcast = {
104 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
105 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
106 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
/** Link status for "broadcast join in progress" */
#define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
#define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
	( EINFO_EINPROGRESS, 0x01, "Joining" )

114 /** Human-readable message for the link status */
115 struct errortab ipoib_errors[] __errortab = {
116 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
119 static struct net_device_operations ipoib_operations;
/****************************************************************************
 *
 * IPoIB REMAC cache
 *
 ****************************************************************************
 */

128 /** An IPoIB REMAC cache entry */
130 /** List of REMAC cache entries */
131 struct list_head list;
132 /** Remote Ethermet MAC */
133 struct ipoib_remac remac;
135 struct ipoib_mac mac;
139 * Find IPoIB MAC from REMAC
141 * @v ipoib IPoIB device
142 * @v remac Remote Ethernet MAC
143 * @ret mac IPoIB MAC (or NULL if not found)
145 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
146 const struct ipoib_remac *remac ) {
147 struct ipoib_peer *peer;
149 /* Check for broadcast or multicast REMAC. We transmit
150 * multicasts as broadcasts for simplicity.
152 if ( is_multicast_ether_addr ( remac ) )
153 return &ipoib->broadcast;
155 /* Try to find via REMAC cache */
156 list_for_each_entry ( peer, &ipoib->peers, list ) {
157 if ( memcmp ( remac, &peer->remac,
158 sizeof ( peer->remac ) ) == 0 ) {
159 /* Move peer to start of list */
160 list_del ( &peer->list );
161 list_add ( &peer->list, &ipoib->peers );
166 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
167 ipoib, eth_ntoa ( remac ) );
172 * Add IPoIB MAC to REMAC cache
174 * @v ipoib IPoIB device
175 * @v remac Remote Ethernet MAC
177 * @ret rc Return status code
179 static int ipoib_map_remac ( struct ipoib_device *ipoib,
180 const struct ipoib_remac *remac,
181 const struct ipoib_mac *mac ) {
182 struct ipoib_peer *peer;
184 /* Check for existing entry in REMAC cache */
185 list_for_each_entry ( peer, &ipoib->peers, list ) {
186 if ( memcmp ( remac, &peer->remac,
187 sizeof ( peer->remac ) ) == 0 ) {
188 /* Move peer to start of list */
189 list_del ( &peer->list );
190 list_add ( &peer->list, &ipoib->peers );
192 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
197 /* Create new entry */
198 peer = malloc ( sizeof ( *peer ) );
201 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
202 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
203 list_add ( &peer->list, &ipoib->peers );
211 * @v ipoib IPoIB device
213 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
214 struct ipoib_peer *peer;
215 struct ipoib_peer *tmp;
217 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
218 list_del ( &peer->list );
224 * Discard some entries from the REMAC cache
226 * @ret discarded Number of cached items discarded
228 static unsigned int ipoib_discard_remac ( void ) {
229 struct net_device *netdev;
230 struct ipoib_device *ipoib;
231 struct ipoib_peer *peer;
232 unsigned int discarded = 0;
234 /* Try to discard one cache entry for each IPoIB device */
235 for_each_netdev ( netdev ) {
237 /* Skip non-IPoIB devices */
238 if ( netdev->op != &ipoib_operations )
240 ipoib = netdev->priv;
242 /* Discard least recently used cache entry (if any) */
243 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
244 list_del ( &peer->list );
254 /** IPoIB cache discarder */
255 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
256 .discard = ipoib_discard_remac,
/****************************************************************************
 *
 * IPoIB link layer
 *
 ****************************************************************************
 */

267 * Initialise IPoIB link-layer address
269 * @v hw_addr Hardware address
270 * @v ll_addr Link-layer address
272 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
273 const uint8_t *guid = hw_addr;
274 uint8_t *eth_addr = ll_addr;
275 uint8_t guid_mask = IPOIB_GUID_MASK;
278 /* Extract bytes from GUID according to mask */
279 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
280 if ( guid_mask & 0x80 )
281 *(eth_addr++) = *guid;
285 /** IPoIB protocol */
286 struct ll_protocol ipoib_protocol __ll_protocol = {
288 .ll_proto = htons ( ARPHRD_ETHER ),
289 .hw_addr_len = sizeof ( union ib_guid ),
290 .ll_addr_len = ETH_ALEN,
291 .ll_header_len = ETH_HLEN,
294 .init_addr = ipoib_init_addr,
296 .mc_hash = eth_mc_hash,
297 .eth_addr = eth_eth_addr,
299 .flags = LL_NAME_ONLY,
303 * Allocate IPoIB device
305 * @v priv_size Size of driver private data
306 * @ret netdev Network device, or NULL
308 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
309 struct net_device *netdev;
311 netdev = alloc_netdev ( priv_size );
313 netdev->ll_protocol = &ipoib_protocol;
314 netdev->ll_broadcast = eth_broadcast;
315 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
/****************************************************************************
 *
 * IPoIB translation layer
 *
 ****************************************************************************
 */

328 * Translate transmitted ARP packet
330 * @v netdev Network device
331 * @v iobuf Packet to be transmitted (with no link-layer headers)
332 * @ret rc Return status code
334 static int ipoib_translate_tx_arp ( struct net_device *netdev,
335 struct io_buffer *iobuf ) {
336 struct ipoib_device *ipoib = netdev->priv;
337 struct arphdr *arphdr = iobuf->data;
338 struct ipoib_mac *target_ha = NULL;
342 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
343 if ( arphdr->ar_hln != ETH_ALEN )
346 /* Fail unless we have room to expand packet */
347 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
349 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
354 /* Look up REMAC, if applicable */
355 if ( arphdr->ar_op == ARPOP_REPLY ) {
356 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
358 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
359 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
360 return -ENXIO_ARP_REPLY;
364 /* Construct new packet */
365 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
366 sender_pa = arp_sender_pa ( arphdr );
367 target_pa = arp_target_pa ( arphdr );
368 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
369 arphdr->ar_hln = sizeof ( ipoib->mac );
370 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
371 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
372 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
373 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
375 memcpy ( arp_target_ha ( arphdr ), target_ha,
376 sizeof ( *target_ha ) );
383 * Translate transmitted packet
385 * @v netdev Network device
386 * @v iobuf Packet to be transmitted (with no link-layer headers)
387 * @v net_proto Network-layer protocol (in network byte order)
388 * @ret rc Return status code
390 static int ipoib_translate_tx ( struct net_device *netdev,
391 struct io_buffer *iobuf, uint16_t net_proto ) {
393 switch ( net_proto ) {
394 case htons ( ETH_P_ARP ) :
395 return ipoib_translate_tx_arp ( netdev, iobuf );
396 case htons ( ETH_P_IP ) :
397 /* No translation needed */
400 /* Cannot handle other traffic via eIPoIB */
406 * Translate received ARP packet
408 * @v netdev Network device
409 * @v iobuf Received packet (with no link-layer headers)
410 * @v remac Constructed Remote Ethernet MAC
411 * @ret rc Return status code
413 static int ipoib_translate_rx_arp ( struct net_device *netdev,
414 struct io_buffer *iobuf,
415 struct ipoib_remac *remac ) {
416 struct ipoib_device *ipoib = netdev->priv;
417 struct arphdr *arphdr = iobuf->data;
422 /* Do nothing unless ARP contains IPoIB link-layer addresses */
423 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
426 /* Create REMAC cache entry */
427 if ( ( rc = ipoib_map_remac ( ipoib, remac,
428 arp_sender_ha ( arphdr ) ) ) != 0 ) {
429 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
430 ipoib, strerror ( rc ) );
434 /* Construct new packet */
435 sender_pa = arp_sender_pa ( arphdr );
436 target_pa = arp_target_pa ( arphdr );
437 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
438 arphdr->ar_hln = ETH_ALEN;
439 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
440 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
441 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
442 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
443 if ( arphdr->ar_op == ARPOP_REPLY ) {
444 /* Assume received replies were directed to us */
445 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
447 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
453 * Translate received packet
455 * @v netdev Network device
456 * @v iobuf Received packet (with no link-layer headers)
457 * @v remac Constructed Remote Ethernet MAC
458 * @v net_proto Network-layer protocol (in network byte order)
459 * @ret rc Return status code
461 static int ipoib_translate_rx ( struct net_device *netdev,
462 struct io_buffer *iobuf,
463 struct ipoib_remac *remac,
464 uint16_t net_proto ) {
466 switch ( net_proto ) {
467 case htons ( ETH_P_ARP ) :
468 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
469 case htons ( ETH_P_IP ) :
470 /* No translation needed */
473 /* Cannot handle other traffic via eIPoIB */
/****************************************************************************
 *
 * IPoIB network device
 *
 ****************************************************************************
 */

486 * Transmit packet via IPoIB network device
488 * @v netdev Network device
489 * @v iobuf I/O buffer
490 * @ret rc Return status code
492 static int ipoib_transmit ( struct net_device *netdev,
493 struct io_buffer *iobuf ) {
494 struct ipoib_device *ipoib = netdev->priv;
495 struct ib_device *ibdev = ipoib->ibdev;
496 struct ethhdr *ethhdr;
498 struct ipoib_hdr *ipoib_hdr;
499 struct ipoib_mac *mac;
500 struct ib_address_vector dest;
505 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
506 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
510 /* Attempting transmission while link is down will put the
511 * queue pair into an error state, so don't try it.
513 if ( ! ib_link_ok ( ibdev ) )
516 /* Strip eIPoIB header */
517 ethhdr = iobuf->data;
518 net_proto = ethhdr->h_protocol;
519 iob_pull ( iobuf, sizeof ( *ethhdr ) );
521 /* Identify destination address */
522 mac = ipoib_find_remac ( ipoib, ( ( void * ) ethhdr->h_dest ) );
524 /* Generate a new ARP request (if possible) to trigger
525 * population of the REMAC cache entry.
527 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
528 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
529 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
530 "packet type %04x\n", ipoib,
531 eth_ntoa ( ethhdr->h_dest ),
532 ntohs ( net_proto ) );
533 return -ENXIO_NON_IPV4;
536 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
537 &iphdr->dest, &iphdr->src ) ) !=0){
538 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
539 ipoib, eth_ntoa ( ethhdr->h_dest ),
540 inet_ntoa ( iphdr->dest ) );
541 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
545 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
546 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
547 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
548 return -ENXIO_ARP_SENT;
551 /* Translate packet if applicable */
552 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
555 /* Prepend real IPoIB header */
556 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
557 ipoib_hdr->proto = net_proto;
558 ipoib_hdr->reserved = 0;
560 /* Construct address vector */
561 memset ( &dest, 0, sizeof ( dest ) );
562 dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
563 dest.gid_present = 1;
564 memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
565 if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
566 /* Path not resolved yet */
570 return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
574 * Handle IPoIB send completion
576 * @v ibdev Infiniband device
578 * @v iobuf I/O buffer
579 * @v rc Completion status code
581 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
582 struct ib_queue_pair *qp,
583 struct io_buffer *iobuf, int rc ) {
584 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
586 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
590 * Handle IPoIB receive completion
592 * @v ibdev Infiniband device
594 * @v dest Destination address vector, or NULL
595 * @v source Source address vector, or NULL
596 * @v iobuf I/O buffer
597 * @v rc Completion status code
599 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
600 struct ib_queue_pair *qp,
601 struct ib_address_vector *dest,
602 struct ib_address_vector *source,
603 struct io_buffer *iobuf, int rc ) {
604 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
605 struct net_device *netdev = ipoib->netdev;
606 struct ipoib_hdr *ipoib_hdr;
607 struct ethhdr *ethhdr;
608 struct ipoib_remac remac;
613 netdev_rx_err ( netdev, iobuf, rc );
618 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
619 DBGC ( ipoib, "IPoIB %p received packet too short to "
620 "contain IPoIB header\n", ipoib );
621 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
622 netdev_rx_err ( netdev, iobuf, -EIO );
626 DBGC ( ipoib, "IPoIB %p received packet without address "
628 netdev_rx_err ( netdev, iobuf, -ENOTTY );
632 /* Strip real IPoIB header */
633 ipoib_hdr = iobuf->data;
634 net_proto = ipoib_hdr->proto;
635 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
637 /* Construct source address from remote QPN and LID */
638 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
639 remac.lid = htons ( source->lid );
641 /* Translate packet if applicable */
642 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
643 net_proto ) ) != 0 ) {
644 netdev_rx_err ( netdev, iobuf, rc );
648 /* Prepend eIPoIB header */
649 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
650 memcpy ( ðhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
651 ethhdr->h_protocol = net_proto;
653 /* Construct destination address */
654 if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
655 sizeof ( dest->gid ) ) == 0 ) ) {
656 /* Broadcast GID; use the Ethernet broadcast address */
657 memcpy ( ðhdr->h_dest, eth_broadcast,
658 sizeof ( ethhdr->h_dest ) );
660 /* Assume destination address is local Ethernet MAC */
661 memcpy ( ðhdr->h_dest, netdev->ll_addr,
662 sizeof ( ethhdr->h_dest ) );
665 /* Hand off to network layer */
666 netdev_rx ( netdev, iobuf );
669 /** IPoIB completion operations */
670 static struct ib_completion_queue_operations ipoib_cq_op = {
671 .complete_send = ipoib_complete_send,
672 .complete_recv = ipoib_complete_recv,
676 * Allocate IPoIB receive I/O buffer
678 * @v len Length of buffer
679 * @ret iobuf I/O buffer, or NULL
681 * Some Infiniband hardware requires 2kB alignment of receive buffers
682 * and provides no way to disable header separation. The result is
683 * that there are only four bytes of link-layer header (the real IPoIB
684 * header) before the payload. This is not sufficient space to insert
685 * an eIPoIB link-layer pseudo-header.
687 * We therefore allocate I/O buffers offset to start slightly before
688 * the natural alignment boundary, in order to allow sufficient space.
690 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
691 struct io_buffer *iobuf;
694 /* Calculate additional length required at start of buffer */
695 reserve_len = ( sizeof ( struct ethhdr ) -
696 sizeof ( struct ipoib_hdr ) );
698 /* Allocate buffer */
699 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
701 iob_reserve ( iobuf, reserve_len );
706 /** IPoIB queue pair operations */
707 static struct ib_queue_pair_operations ipoib_qp_op = {
708 .alloc_iob = ipoib_alloc_iob,
712 * Poll IPoIB network device
714 * @v netdev Network device
716 static void ipoib_poll ( struct net_device *netdev ) {
717 struct ipoib_device *ipoib = netdev->priv;
718 struct ib_device *ibdev = ipoib->ibdev;
720 /* Poll Infiniband device */
721 ib_poll_eq ( ibdev );
723 /* Poll the retry timers (required for IPoIB multicast join) */
728 * Handle IPv4 broadcast multicast group join completion
730 * @v ibdev Infiniband device
732 * @v membership Multicast group membership
734 * @v mad Response MAD (or NULL on error)
736 void ipoib_join_complete ( struct ib_device *ibdev __unused,
737 struct ib_queue_pair *qp __unused,
738 struct ib_mc_membership *membership, int rc,
739 union ib_mad *mad __unused ) {
740 struct ipoib_device *ipoib = container_of ( membership,
741 struct ipoib_device, broadcast_membership );
743 /* Record join status as link status */
744 netdev_link_err ( ipoib->netdev, rc );
748 * Join IPv4 broadcast multicast group
750 * @v ipoib IPoIB device
751 * @ret rc Return status code
753 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
756 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
757 &ipoib->broadcast_membership,
758 &ipoib->broadcast.gid,
759 ipoib_join_complete ) ) != 0 ) {
760 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
761 ipoib, strerror ( rc ) );
764 ipoib->broadcast_joined = 1;
770 * Leave IPv4 broadcast multicast group
772 * @v ipoib IPoIB device
774 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
776 if ( ipoib->broadcast_joined ) {
777 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
778 &ipoib->broadcast_membership );
779 ipoib->broadcast_joined = 0;
784 * Handle link status change
786 * @v ibdev Infiniband device
788 static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
789 struct net_device *netdev = ib_get_ownerdata ( ibdev );
790 struct ipoib_device *ipoib = netdev->priv;
793 /* Leave existing broadcast group */
795 ipoib_leave_broadcast_group ( ipoib );
797 /* Update MAC address based on potentially-new GID prefix */
798 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
799 sizeof ( ipoib->mac.gid.s.prefix ) );
801 /* Update broadcast GID based on potentially-new partition key */
802 ipoib->broadcast.gid.words[2] =
803 htons ( ibdev->pkey | IB_PKEY_FULL );
805 /* Set net device link state to reflect Infiniband link state */
806 rc = ib_link_rc ( ibdev );
807 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
809 /* Join new broadcast group */
810 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
811 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
812 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
813 "%s\n", ipoib, strerror ( rc ) );
814 netdev_link_err ( netdev, rc );
820 * Open IPoIB network device
822 * @v netdev Network device
823 * @ret rc Return status code
825 static int ipoib_open ( struct net_device *netdev ) {
826 struct ipoib_device *ipoib = netdev->priv;
827 struct ib_device *ibdev = ipoib->ibdev;
831 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
832 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
833 ipoib, strerror ( rc ) );
837 /* Allocate completion queue */
838 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
840 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
846 /* Allocate queue pair */
847 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
848 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
851 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
856 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
858 /* Update MAC address with QPN */
859 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
861 /* Fill receive rings */
862 ib_refill_recv ( ibdev, ipoib->qp );
864 /* Fake a link status change to join the broadcast group */
865 ipoib_link_state_changed ( ibdev );
869 ib_destroy_qp ( ibdev, ipoib->qp );
871 ib_destroy_cq ( ibdev, ipoib->cq );
879 * Close IPoIB network device
881 * @v netdev Network device
883 static void ipoib_close ( struct net_device *netdev ) {
884 struct ipoib_device *ipoib = netdev->priv;
885 struct ib_device *ibdev = ipoib->ibdev;
887 /* Flush REMAC cache */
888 ipoib_flush_remac ( ipoib );
890 /* Leave broadcast group */
891 ipoib_leave_broadcast_group ( ipoib );
893 /* Remove QPN from MAC address */
894 ipoib->mac.flags__qpn = 0;
896 /* Tear down the queues */
897 ib_destroy_qp ( ibdev, ipoib->qp );
899 ib_destroy_cq ( ibdev, ipoib->cq );
902 /* Close IB device */
906 /** IPoIB network device operations */
907 static struct net_device_operations ipoib_operations = {
909 .close = ipoib_close,
910 .transmit = ipoib_transmit,
917 * @v ibdev Infiniband device
918 * @ret rc Return status code
920 static int ipoib_probe ( struct ib_device *ibdev ) {
921 struct net_device *netdev;
922 struct ipoib_device *ipoib;
925 /* Allocate network device */
926 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
929 netdev_init ( netdev, &ipoib_operations );
930 ipoib = netdev->priv;
931 ib_set_ownerdata ( ibdev, netdev );
932 netdev->dev = ibdev->dev;
933 memset ( ipoib, 0, sizeof ( *ipoib ) );
934 ipoib->netdev = netdev;
935 ipoib->ibdev = ibdev;
936 INIT_LIST_HEAD ( &ipoib->peers );
938 /* Extract hardware address */
939 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
940 sizeof ( ibdev->gid.s.guid ) );
942 /* Set local MAC address */
943 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
944 sizeof ( ipoib->mac.gid.s.guid ) );
946 /* Set default broadcast MAC address */
947 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
948 sizeof ( ipoib->broadcast ) );
950 /* Register network device */
951 if ( ( rc = register_netdev ( netdev ) ) != 0 )
952 goto err_register_netdev;
957 netdev_nullify ( netdev );
958 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 */
static void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev = ib_get_ownerdata ( ibdev );

	/* Unregister, neutralise and drop our reference */
	unregister_netdev ( netdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}

976 struct ib_driver ipoib_driver __ib_driver = {
978 .probe = ipoib_probe,
979 .notify = ipoib_link_state_changed,
980 .remove = ipoib_remove,