TLC Lustre Repository Browser

Viewing: gnilnd_conn.c

// SPDX-License-Identifier: GPL-2.0

/* Copyright (C) 2012 Cray, Inc.
 *
 * Copyright (c) 2014, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org.
 *
 * Author: Nic Henke <nic@cray.com>
 * Author: James Shimek <jshimek@cray.com>
 */

#include "gnilnd.h"
#include <linux/swap.h>

/*
 * Fill in the SMSG attributes that are common to every mailbox: the
 * mailbox type, the maximum message size and the credit count taken from
 * the kgn_mbox_credits tunable. The per-mailbox fields (buffer, offset,
 * memory handle, buffer size) are left untouched.
 */
void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
	smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
	smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
	smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
}

/*
 * Register the memory behind @fma_blk with the GNI device, attaching it to
 * the device's receive FMA CQ, and update the device mapping counters.
 *
 * Returns 0 on success and -ENOMEM when registration fails. Unless the
 * kgn_reg_fail_timeout tunable is GNILND_REGFAILTO_DISABLE, the first
 * failure arms a deadline (kept in a function-static variable) and a later
 * failure after that deadline trips an assertion. Any success disarms it.
 */
int
kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
{
	/* registration-failure deadline in jiffies; 0 means not armed */
	static unsigned long    reg_to;
	int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
	__u32                   flags = GNI_MEM_READWRITE;
	gni_return_t            rrc;

	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
		flags |= GNI_MEM_PHYS_CONT;

	fma_blk->gnm_hold_timeout = 0;

	/* a non-zero handle means the block is still (or already) mapped */
	LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL,
		 "fma_blk %px dirty\n", fma_blk);

	rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
				   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
				   flags, &fma_blk->gnm_hndl);
	if (rrc == GNI_RC_SUCCESS) {
		/* registration works again, disarm the failure deadline */
		reg_to = 0;

		/* PHYS_CONT memory isn't really mapped, at least not in GART,
		 * so only non-PHYS blocks count towards the mapped bytes -
		 * but every mapping chews up a MDD
		 */
		if (fma_blk->gnm_state != GNILND_FMABLK_PHYS)
			atomic64_add(fma_blk->gnm_blk_size,
				     &device->gnd_nbytes_map);

		atomic_inc(&device->gnd_n_mdd);
		/* nfmablk is live (mapped) blocks */
		atomic_inc(&device->gnd_nfmablk);

		RETURN(0);
	}

	/* registration failed */
	if (rfto != GNILND_REGFAILTO_DISABLE) {
		if (reg_to != 0) {
			LASSERTF(!time_after(jiffies, reg_to),
				 "FATAL:fmablk registration has failed for %ld seconds.\n",
				 cfs_duration_sec(jiffies - reg_to) + rfto);
		} else {
			reg_to = jiffies + cfs_time_seconds(rfto);
		}
	}

	CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
		fma_blk, fma_blk->gnm_mbox_size, flags);
	RETURN(-ENOMEM);
}

/*
 * Allocate a new FMA memory block, register it with the device and append
 * it to device->gnd_fma_buffs.
 *
 * When use_phys is non-zero the block memory comes from
 * kgnilnd_data.kgn_mbox_cache and the block is marked GNILND_FMABLK_PHYS;
 * otherwise it is allocated with kgnilnd_vzalloc(), sized from the
 * kgn_mbox_per_block tunable, and marked GNILND_FMABLK_VIRT.
 *
 * Returns 0 when a block was added, and ALSO when gnd_fmablk_vers changed
 * while waiting for gnd_fmablk_mutex (nothing is allocated in that case,
 * the caller is expected to re-scan the list). Returns -ENOMEM on any
 * allocation failure (or, on CONFIG_CRAY_XT service nodes, when free pages
 * are below the configured limit), or the error from kgnilnd_map_fmablk().
 */
int
kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
{
	int                     rc = 0;
	int                     num_mbox;
	kgn_fma_memblock_t     *fma_blk;
	gni_smsg_attr_t         smsg_attr;
	unsigned long           fmablk_vers;

#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
	/* We allocate large blocks of memory here potentially leading
	 * to memory exhaustion during massive reconnects during a network
	 * outage. Limit the amount of fma blocks to use by always keeping
	 * a percent of pages free initially set to 25% of total memory. */
	if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
		LCONSOLE_INFO("Exceeding free page limit of %ld. "
			      "Free pages available %ld\n",
			      kgnilnd_data.free_pages_limit,
			      nr_free_pages());
		return -ENOMEM;
	}
#endif
	/* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
	 * to this allocation code. Everyone will sample the version
	 * before and after getting the mutex. If it has changed,
	 * we'll bail out to check the lists again - this indicates that
	 * some sort of change was made to the lists and it is possible
	 * that there is a mailbox for us to find now. This should prevent
	 * a ton of spinning in the case where there are lots of threads
	 * that need a yet-to-be-allocated mailbox for a connection. */

	fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
	mutex_lock(&device->gnd_fmablk_mutex);

	if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
		/* version changed while we were waiting for semaphore,
		 * we'll recheck the lists assuming something nice happened */
		mutex_unlock(&device->gnd_fmablk_mutex);
		return 0;
	}

	/* block descriptor; the mailbox memory itself is allocated below */
	LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
	if (fma_blk == NULL) {
		CNETERR("could not allocate fma block descriptor\n");
		rc = -ENOMEM;
		GOTO(out, rc);
	}

	INIT_LIST_HEAD(&fma_blk->gnm_bufflist);

	kgnilnd_setup_smsg_attr(&smsg_attr);

	gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);

	LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);

	/* gni_smsg_buff_size_needed calculates the base mailbox size and since
	 * we want to hold kgn_peer_credits worth of messages in both directions,
	 * we add PAYLOAD to grow the mailbox size
	 */

	fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;

	/* we'll only use physical during preallocate at startup -- this keeps it nice and
	 * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
	 * as reallocating them is tough if there is memory fragmentation */

	if (use_phys) {
		fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
		if (fma_blk->gnm_block == NULL) {
			CNETERR("could not allocate physical SMSG mailbox memory\n");
			rc = -ENOMEM;
			GOTO(free_desc, rc);
		}
		/* fixed block size: number of mailboxes is whatever fits */
		fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
		num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;

		LASSERTF(num_mbox >= 1,
			 "num_mbox %d blk_size %u mbox_size %d\n",
			  num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);

		fma_blk->gnm_state = GNILND_FMABLK_PHYS;

	} else {
		/* fixed mailbox count from the tunable: block size follows */
		num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
		fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;

		LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
			 "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
			 num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
			 *kgnilnd_tunables.kgn_mbox_per_block);

		fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
		if (fma_blk->gnm_block == NULL) {
			CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
			rc = -ENOMEM;
			GOTO(free_desc, rc);
		}

		fma_blk->gnm_state = GNILND_FMABLK_VIRT;
	}

	/* allocate just enough space for the bits to track the mailboxes */
	CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
	if (fma_blk->gnm_bit_array == NULL) {
		CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
		       sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
		rc = -ENOMEM;
		GOTO(free_blk, rc);
	}
	bitmap_zero(fma_blk->gnm_bit_array, num_mbox);

	/* now that the num_mbox is set based on allocation type, get debug
	 * info setup
	 * */
	CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
	if (fma_blk->gnm_mbox_info == NULL) {
		CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
		       sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
		rc = -ENOMEM;
		GOTO(free_bit, rc);
	}

	/* register the block memory with the device */
	rc = kgnilnd_map_fmablk(device, fma_blk);
	if (rc) {
		GOTO(free_info, rc);
	}

	/* every mailbox in a fresh block is available */
	fma_blk->gnm_next_avail_mbox = 0;
	fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;

	CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
		"mbox_size %d MDD %#llx.%#llx\n",
		fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
		fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
		fma_blk->gnm_hndl.qword2);

	/* lock Is protecting data structures, not semaphore */

	spin_lock(&device->gnd_fmablk_lock);
	list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);

	/* toggle under the lock so once they change the list is also
	 * ready for others to traverse */
	atomic_inc(&device->gnd_fmablk_vers);

	spin_unlock(&device->gnd_fmablk_lock);

	mutex_unlock(&device->gnd_fmablk_mutex);

	return 0;

	/* error unwind: each label releases what was set up before the
	 * failing step, then falls through to the earlier ones */
free_info:
	CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
free_bit:
	CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
free_blk:
	/* gnm_state records which allocator the block memory came from */
	if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
		kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
	} else {
		kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
	}
free_desc:
	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
	mutex_unlock(&device->gnd_fmablk_mutex);
	return rc;
}

/*
 * Deregister the memory of @fma_blk from the device and fix up the device
 * mapping counters. A deregistration failure is fatal (assertion).
 *
 * A hold timeout is requested from the deregistration when mailboxes are
 * still held (and we are not shutting down) or when a stack reset is in
 * progress. Non-PHYS blocks become GNILND_FMABLK_IDLE; PHYS blocks keep
 * their state and only get their memory handle zeroed during a reset.
 */
void
kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
	gni_return_t            grc;

	/* if some held, set hold_timeout from conn timeouts used in this block
	 * but not during shutdown, then just nuke and pave
	 * During a stack reset, we need to deregister with a hold timeout
	 * set so we don't use the same mdd after reset is complete */
	if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
	    kgnilnd_data.kgn_in_reset)
		fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;

	/* the block is changing state: bump the version so that the proc
	 * code knows its view of the list is stale */
	atomic_inc(&dev->gnd_fmablk_vers);

	grc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl,
				     fma_blk->gnm_hold_timeout);

	CDEBUG(grc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
	       "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
		"hold_timeout %d\n",
	       fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
	       fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
	       fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
	       fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);

	LASSERTF(grc == GNI_RC_SUCCESS,
		"tried to double unmap or something bad, fma_blk %px (rrc %d)\n",
		fma_blk, grc);

	/* the MDD is dropped right away when no hold timeout was used, or
	 * for a PHYS block during reset; otherwise it is accounted as held */
	if (!fma_blk->gnm_hold_timeout ||
	    (kgnilnd_data.kgn_in_reset &&
	     fma_blk->gnm_state == GNILND_FMABLK_PHYS))
		atomic_dec(&dev->gnd_n_mdd);
	else
		atomic_inc(&dev->gnd_n_mdd_held);

	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
		/* PHYS blocks don't get mapped, so no byte accounting here.
		 * In stack reset, clear the MDD handle as we'll re-use the
		 * fma_blk after reset so we don't have to drop/allocate
		 * all of those physical blocks */
		if (kgnilnd_data.kgn_in_reset)
			fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
	} else {
		atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
		fma_blk->gnm_state = GNILND_FMABLK_IDLE;
	}

	/* gnd_nfmablk counts mapped blocks, so it drops here */
	atomic_dec(&dev->gnd_nfmablk);
}


/*
 * Release everything owned by @fma_blk - the held MDD (if a hold timeout
 * was set), the mailbox memory, the per-mailbox info array, the bitmap and
 * the descriptor itself - and unlink it from the device list.
 *
 * All mailboxes of the block must be available again (asserted).
 * The pointer is invalid after this call returns.
 *
 * needs lock on gnd_fmablk_lock to cover gnd_fma_buffs
 */
void
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
	LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
		 "fma_blk %px@%d free in bad state (%d): blk total %d avail %d held %d\n",
		 fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
		fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);

	/* the list is about to change, bump the version */
	atomic_inc(&dev->gnd_fmablk_vers);

	/* a non-zero hold timeout means the block was unmapped with its
	 * MDD held; drop that accounting now */
	if (fma_blk->gnm_hold_timeout) {
		CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
			"mbox_size %d\n",
			fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
			fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);

		/* We leave MDD dangling over stack reset */
		if (!kgnilnd_data.kgn_in_reset) {
			kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
		}
		/* ignoring the return code - if kgni/ghal can't find it
		 * it must be released already */
		atomic_dec(&dev->gnd_n_mdd_held);
		atomic_dec(&dev->gnd_n_mdd);
	}

	/* we cant' free the gnm_block until all the conns have released their
	 * purgatory holds. While we have purgatory holds, we might check the conn
	 * RX mailbox during the CLOSING process. It is possible that kgni might
	 * try to look into the RX side for credits when sending the CLOSE msg too */
	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
		/* PHYS block memory came from the mailbox slab cache */
		LIBCFS_MEM_MSG(fma_blk->gnm_block, fma_blk->gnm_mbox_size, "free");
		kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
	} else {
		kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
	}
	fma_blk->gnm_state = GNILND_FMABLK_FREED;

	list_del(&fma_blk->gnm_bufflist);

	CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
	CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
			   BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}

/*
 * Try to claim a free SMSG mailbox for @conn from the blocks already on
 * the device list.
 *
 * On success conn->gnc_mbox_id and conn->gnc_fma_blk are set, the conn's
 * SMSG attributes (conn->gnpr_smsg_attr) describe the claimed mailbox and
 * the mailbox memory is zeroed. When no block has a free mailbox nothing
 * is modified, so the caller detects failure by msg_buffer still holding
 * whatever it was set to before the call.
 *
 * Takes and releases dev->gnd_fmablk_lock.
 */
void
kgnilnd_find_free_mbox(kgn_conn_t *conn)
{
	kgn_device_t            *dev = conn->gnc_device;
	gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
	kgn_fma_memblock_t      *fma_blk;
	kgn_mbox_info_t         *mbox = NULL;
	int                     id;

	spin_lock(&dev->gnd_fmablk_lock);

	list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
			    gnm_bufflist) {
		/* skip blocks with nothing available and blocks whose state
		 * is GNILND_FMABLK_IDLE or lower */
		if (fma_blk->gnm_avail_mboxs <= 0 ||
		    fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
			continue;
		}
		/* look in bitarray for available mailbox */
		do {
			/* search starts at the hint, not at bit 0 */
			id = find_next_zero_bit(
				fma_blk->gnm_bit_array,
				fma_blk->gnm_num_mboxs,
				fma_blk->gnm_next_avail_mbox);
		      if (id == fma_blk->gnm_num_mboxs &&
			  fma_blk->gnm_next_avail_mbox != 0) {
				/* wrap around */
				fma_blk->gnm_next_avail_mbox = 0;
			} else {
				break;
			}
		} while (1);

		/* gnm_avail_mboxs > 0, so a clear bit must have been found */
		LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
			 id, fma_blk->gnm_num_mboxs);
		set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
		conn->gnc_mbox_id = id;

		/* advance the search hint, wrapping at the end of the block */
		fma_blk->gnm_next_avail_mbox =
			(id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
		fma_blk->gnm_avail_mboxs--;
		conn->gnc_fma_blk = fma_blk;

		kgnilnd_setup_smsg_attr(smsg_attr);

		/* the mailbox is the id-th gnm_mbox_size slice of the block */
		smsg_attr->msg_buffer = fma_blk->gnm_block;
		smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
		smsg_attr->mem_hndl = fma_blk->gnm_hndl;
		smsg_attr->buff_size = fma_blk->gnm_mbox_size;

		/* We'll set the hndl to zero for PHYS blocks unmapped during stack
		 * reset and re-use the same fma_blk after stack reset. This ensures we've
		 * properly mapped it before we use it */
		LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL,
			"unmapped fma_blk %px, state %d\n",
			 fma_blk, fma_blk->gnm_state);

		CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
			"allocating SMSG mbox %d buf %p "
			"offset %u hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			smsg_attr->msg_buffer, smsg_attr->mbox_offset,
			fma_blk->gnm_hndl.qword1,
			fma_blk->gnm_hndl.qword2);

		/* per-mailbox debug bookkeeping */
		mbox = &fma_blk->gnm_mbox_info[id];
		mbox->mbx_create_conn_memset = jiffies;
		mbox->mbx_nallocs++;
		mbox->mbx_nallocs_total++;

		/* zero mbox to remove any old data from our last use.
		 * this better be safe, if not our purgatory timers
		 * are too short or a peer really is misbehaving */
		memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
		       0, smsg_attr->buff_size);
		break;
	}

	spin_unlock(&dev->gnd_fmablk_lock);
}

/*
 * Get an SMSG mailbox for @conn, allocating a new (virtual) FMA block each
 * time the existing blocks turn out to be full.
 *
 * Returns 0 once conn->gnpr_smsg_attr.msg_buffer has been set by
 * kgnilnd_find_free_mbox(), or the error from kgnilnd_alloc_fmablk().
 */
int
kgnilnd_setup_mbox(kgn_conn_t *conn)
{
	gni_smsg_attr_t         *attr = &conn->gnpr_smsg_attr;
	int                      rc = 0;

	/* msg_buffer doubles as the "mailbox found" indicator */
	attr->msg_buffer = NULL;

	for (;;) {
		kgnilnd_find_free_mbox(conn);
		if (attr->msg_buffer != NULL)
			break;

		/* nothing free in the existing buffers, make a new one;
		 * for runtime allocations, we only want vmalloc */
		rc = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
		if (rc) {
			CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
				conn, rc);
			break;
		}
	}

	return rc;
}

/*
 * Give back (or put on hold) the SMSG mailbox owned by @conn.
 *
 * purgatory_hold selects the action:
 *   > 0  - keep the mailbox but count it as held on its block
 *   == 0 - free the mailbox
 *   < 0  - the mailbox was held before, free it now
 *
 * When a mailbox is freed the conn's gnc_fma_blk is cleared. For non-PHYS
 * blocks this may also unmap the block (all mailboxes held or available)
 * and free it (all mailboxes available, none held).
 *
 * No-op if the conn never got a mailbox (msg_buffer is NULL).
 * Takes and releases dev->gnd_fmablk_lock.
 */
void
kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
{
	kgn_device_t           *dev = conn->gnc_device;
	gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
	kgn_fma_memblock_t     *fma_blk = NULL;
	kgn_mbox_info_t        *mbox = NULL;
	int                     found = 0;
	int                     id;

	/* if we failed to setup mbox and now destroying conn */
	if (smsg_attr->msg_buffer == NULL) {
		return;
	}

	id = conn->gnc_mbox_id;

	spin_lock(&dev->gnd_fmablk_lock);
	/* make sure our conn points at a valid fma_blk
	 * We use this instead of a mem block search out of smsg_attr
	 * because we could have freed a block for fma_blk #1 but the fma_blk
	 * is still in the list for a purgatory hold. This would induce a false
	 * match if that same block gets reallocated to fma_blk #2 */
	list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
		if (fma_blk == conn->gnc_fma_blk) {
			found = 1;
			break;
		}
	}
	/* fma_blk is only a valid block pointer when found is set */
	LASSERTF(found,
		"unable to find conn 0x%p with gnc_fma_blk %px anywhere in the world\n",
		 conn, conn->gnc_fma_blk);

	LASSERTF(id < fma_blk->gnm_num_mboxs,
		"bad id %d max %d\n",
		id, fma_blk->gnm_num_mboxs);

	/* < 0 - was held, now free it
	 * == 0 - just free it
	 * > 0 - hold it for now */
	if (purgatory_hold == 0) {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
		fma_blk->gnm_avail_mboxs++;

	} else if (purgatory_hold > 0) {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

		fma_blk->gnm_held_mboxs++;
		/* remember the largest conn timeout among held mailboxes */
		fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
						 conn->gnc_timeout);
	} else {
		CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
			"hndl %#llx.%#llx\n",
			conn, smsg_attr, fma_blk, id,
			fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);

		/* moves from held to available */
		fma_blk->gnm_held_mboxs--;
		fma_blk->gnm_avail_mboxs++;
	}

	if (purgatory_hold <= 0) {
		/* if kgni is retransmitting, freeing the smsg block before the EP
		 * is destroyed gets messy. Bug 768295. */
		LASSERTF(conn->gnc_ephandle == NULL,
			 "can't release mbox before EP is nuked. conn 0x%p\n", conn);

		mbox = &fma_blk->gnm_mbox_info[id];
		mbox->mbx_release_from_purgatory = jiffies;

		/* clear conn gnc_fmablk if it is gone - this allows us to
		 * not worry about state so much in kgnilnd_destroy_conn
		 * and makes the guaranteed cleanup of the resources easier */
		/* NOTE: the assertion expression itself clears the bit */
		LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
			"conn %px bit %d already cleared in fma_blk %px\n",
			 conn, id, fma_blk);
		conn->gnc_fma_blk = NULL;
		mbox->mbx_nallocs--;
	}

	/* fault injection hook: force the block state to VIRT */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
		CERROR("LBUGs in your future: forcibly marking fma_blk %p "
		       "as mapped\n", fma_blk);
		fma_blk->gnm_state = GNILND_FMABLK_VIRT;
	}

	/* we don't release or unmap PHYS blocks as part of the normal cycle --
	 * those are controlled manually from startup/shutdown */
	if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
		/* we can unmap once all are unused (held or avail)
		 * but check hold_timeout to make sure we are not trying to double
		 * unmap this buffer. If there was no hold_timeout set due to
		 * held_mboxs, we'll free the mobx here shortly and won't have to
		 * worry about catching a double free for a 'clean' fma_blk */
		if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
		    (!fma_blk->gnm_hold_timeout)) {
			kgnilnd_unmap_fmablk(dev, fma_blk);
		}

		/* But we can only free once they are all avail */
		if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
		    fma_blk->gnm_held_mboxs == 0) {
			/* all mailboxes are released, free fma_blk */
			/* fma_blk must not be touched after this call */
			kgnilnd_free_fmablk_locked(dev, fma_blk);
		}
	}

	spin_unlock(&dev->gnd_fmablk_lock);
}

/*
 * Return the total number of mailboxes contained in the
 * GNILND_FMABLK_PHYS blocks on the device list. The list is walked under
 * gnd_fmablk_lock.
 */
int
kgnilnd_count_phys_mbox(kgn_device_t *device)
{
	kgn_fma_memblock_t     *blk;
	int                     nmbox = 0;

	spin_lock(&device->gnd_fmablk_lock);
	list_for_each_entry(blk, &device->gnd_fma_buffs, gnm_bufflist) {
		if (blk->gnm_state != GNILND_FMABLK_PHYS)
			continue;

		nmbox += blk->gnm_num_mboxs;
	}
	spin_unlock(&device->gnd_fmablk_lock);

	RETURN(nmbox);
}

/*
 * Keep allocating physical FMA blocks until the device holds at least
 * kgn_nphys_mbox physical mailboxes.
 *
 * Returns 0 once the target is reached, or the first error returned by
 * kgnilnd_alloc_fmablk().
 */
int
kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
{
	int     rc;

	for (;;) {
		/* enough physical mailboxes already? */
		if (kgnilnd_count_phys_mbox(device) >=
		    *kgnilnd_tunables.kgn_nphys_mbox)
			RETURN(0);

		rc = kgnilnd_alloc_fmablk(device, 1);
		if (rc)
			break;
	}

	CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
		kgnilnd_count_phys_mbox(device), rc);
	RETURN(rc);
}

/*
 * Map every GNILND_FMABLK_PHYS block on the device list, stopping at the
 * first failure.
 *
 * Returns 0 if all physical blocks were mapped (or there were none),
 * otherwise the error from kgnilnd_map_fmablk().
 */
int
kgnilnd_map_phys_fmablk(kgn_device_t *device)
{
	kgn_fma_memblock_t     *blk;
	int                     rc = 0;

	/* the mutex keeps this to a single thread, just in case */
	mutex_lock(&device->gnd_fmablk_mutex);
	spin_lock(&device->gnd_fmablk_lock);

	list_for_each_entry(blk, &device->gnd_fma_buffs, gnm_bufflist) {
		if (blk->gnm_state != GNILND_FMABLK_PHYS)
			continue;

		rc = kgnilnd_map_fmablk(device, blk);
		if (rc != 0)
			break;
	}

	spin_unlock(&device->gnd_fmablk_lock);
	mutex_unlock(&device->gnd_fmablk_mutex);

	RETURN(rc);
}

/*
 * Unmap every block on the device's FMA block list, whatever its state.
 */
void
kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{
	struct list_head        *pos;

	/* the mutex keeps this to a single thread, just in case */
	mutex_lock(&device->gnd_fmablk_mutex);
	spin_lock(&device->gnd_fmablk_lock);

	list_for_each(pos, &device->gnd_fma_buffs) {
		kgnilnd_unmap_fmablk(device,
				     list_entry(pos, kgn_fma_memblock_t,
						gnm_bufflist));
	}

	spin_unlock(&device->gnd_fmablk_lock);
	mutex_unlock(&device->gnd_fmablk_mutex);
}

/*
 * Free every GNILND_FMABLK_PHYS block on the device list. The "safe" list
 * iterator is used because freeing a block unlinks it from the list.
 */
void
kgnilnd_free_phys_fmablk(kgn_device_t *device)
{
	kgn_fma_memblock_t      *blk;
	kgn_fma_memblock_t      *next;

	/* the mutex keeps this to a single thread, just in case */
	mutex_lock(&device->gnd_fmablk_mutex);
	spin_lock(&device->gnd_fmablk_lock);

	list_for_each_entry_safe(blk, next, &device->gnd_fma_buffs, gnm_bufflist) {
		if (blk->gnm_state != GNILND_FMABLK_PHYS)
			continue;

		kgnilnd_free_fmablk_locked(device, blk);
	}

	spin_unlock(&device->gnd_fmablk_lock);
	mutex_unlock(&device->gnd_fmablk_mutex);
}

/* kgnilnd dgram nid->struct management */

/*
 * Map @nid to its bucket in the device's datagram hash table: the NID is
 * truncated to unsigned int and reduced modulo the kgn_peer_hash_size
 * tunable.
 */
static inline struct list_head *
kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
{
	unsigned int nid_lo = (unsigned int)nid;
	unsigned int bucket = nid_lo % *kgnilnd_tunables.kgn_peer_hash_size;

	RETURN(&dev->gnd_dgrams[bucket]);
}


/*
 * Look up the datagram addressed to @dst_nid whose state is not beyond
 * GNILND_DGRAM_POSTED. Returns the first match in the hash bucket, or
 * NULL if there is none.
 *
 * needs dev->gnd_dgram_lock held
 */
kgn_dgram_t *
kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
{
	struct list_head *bucket = kgnilnd_nid2dgramlist(dev, dst_nid);
	kgn_dgram_t      *cur;

	list_for_each_entry(cur, bucket, gndg_list) {
		/* if state > POSTED, we are already handling
		 * cancel/completion, so such entries never match */
		if (cur->gndg_conn_out.gncr_dstnid == dst_nid &&
		    cur->gndg_state <= GNILND_DGRAM_POSTED) {
			CDEBUG(D_NET, "got dgram [%p] -> %s\n",
			       cur, libcfs_nid2str(dst_nid));
			return cur;
		}
	}

	return NULL;
}

/*
 * Find the datagram for @dst_nid and cancel it, all under
 * dev->gnd_dgram_lock.
 *
 * Returns 0 when a datagram was found (and handed to
 * kgnilnd_cancel_dgram_locked()), 1 when none was found.
 */
int
kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
{
	kgn_dgram_t     *match;
	int              not_found;

	spin_lock(&dev->gnd_dgram_lock);

	match = kgnilnd_find_dgram_locked(dev, dst_nid);
	not_found = (match == NULL);
	if (!not_found)
		kgnilnd_cancel_dgram_locked(match);

	spin_unlock(&dev->gnd_dgram_lock);

	RETURN(not_found);
}

/*
 * Fill in the header of a connection request datagram and, for
 * GNILND_CONNREQ_REQ, the GNI parameters including a freshly allocated
 * SMSG mailbox for @conn.
 *
 * Several CFS_FAIL_CHECK fault-injection hooks can corrupt individual
 * header fields on purpose.
 *
 * Returns 0, or the error from kgnilnd_setup_mbox() (REQ type only).
 */
int
kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
		     lnet_nid_t srcnid, lnet_nid_t dstnid,
		     kgn_connreq_type_t type)
{
	kgn_gniparams_t *req_params;
	int              rc = 0;

	/* ensure we haven't violated max datagram size */
	BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);

	/* no need to zero out, we do that when allocating dgram */
	connreq->gncr_magic = GNILND_MSG_MAGIC;

	/* fault injection: send a bogus source or destination NID */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID))
		srcnid = 0xABADBABE;
	else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID))
		dstnid = 0xDEFEC8ED;

	connreq->gncr_srcnid = srcnid;
	connreq->gncr_dstnid = dstnid;

	/* each field has its own fault-injection check, in this order */
	connreq->gncr_version = CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO) ?
				99 : GNILND_CONNREQ_VERSION;
	connreq->gncr_type = CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO) ?
			     99 : type;
	connreq->gncr_peerstamp = CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO) ?
				  0 : kgnilnd_data.kgn_peerstamp;
	connreq->gncr_connstamp = CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO) ?
				  0 : conn->gnc_my_connstamp;
	connreq->gncr_timeout = CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO) ?
				0 : conn->gnc_timeout;

	/* the rest pack the data into the payload in other places */
	if (type != GNILND_CONNREQ_REQ)
		return rc;

	req_params = &connreq->gncr_gnparams;
	req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
	req_params->gnpr_cqid = conn->gnc_cqid;

	/* allocate mailbox for this connection */
	rc = kgnilnd_setup_mbox(conn);
	if (rc != 0)
		CERROR("Failed to setup FMA mailbox (%d)\n", rc);

	/* the attributes are copied even when the mailbox setup failed */
	req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;

	/* XXX Nic: TBD - checksum computation */

	return rc;
}

/*
 * Validate the incoming connection request stored in
 * dgram->gndg_conn_in and convert it to host byte order if the peer has
 * the opposite endianness.
 *
 * Returns 0 when the request is good. Otherwise:
 *   -EBADF     bad magic, NID mismatch on a directed datagram, or the
 *              CFS_FAIL_GNI_CONNREQ_DROP fault injection
 *   -EBADSLT   wildcard datagram whose dstnid is not the NID of the net
 *   -EPROTO    bad version, unknown type, zero stamps or too small timeout
 *   or the error returned by kgnilnd_find_net() (-ESHUTDOWN / -ENONET)
 *   or by kgnilnd_set_conn_params().
 */
int
kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
{
	kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
	int                      swab, rc = 0;
	kgn_net_t               *net;

	/* the following fields must be handled in a backwards compatible
	 * manner to ensure we can always send and interpret NAKs */

	if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
	    connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
		/* Unexpected magic! */
		CERROR("Unexpected magic %08x\n",
		       connreq->gncr_magic);
		return -EBADF;
	}

	/* a byte-swapped magic means the peer has the other endianness */
	swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
	if (swab) {
		__swab32s(&connreq->gncr_magic);
		__swab32s(&connreq->gncr_cksum);
		__swab16s(&connreq->gncr_type);
		__swab16s(&connreq->gncr_version);
		__swab32s(&connreq->gncr_timeout);
		__swab64s(&connreq->gncr_srcnid);
		__swab64s(&connreq->gncr_dstnid);
		__swab64s(&connreq->gncr_peerstamp);
		__swab64s(&connreq->gncr_connstamp);
	}

	/* Do NOT return anything but -EBADF before we munge
	 * connreq->gncr_srcnid - we need that to send the nak */

	if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
		lnet_nid_t      incoming = connreq->gncr_srcnid;

		/* even if the incoming packet is hosed, we know who we sent
		 * the original and can set the srcnid so that we can properly
		 * look up our peer to close the loop on this connreq. We still use
		 * -EBADF to prevent a NAK - just in case there are issues with
		 * the payload coming from a random spot, etc. */
		connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;

		if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
				LNET_NIDADDR(incoming)) {
			/* we got a datagram match for the wrong nid... */
			CERROR("matched datagram 0x%p with srcnid %s "
				"(%x), expecting %s (%x)\n",
				dgram,
				libcfs_nid2str(incoming),
				LNET_NIDADDR(incoming),
				libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
				LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
			return -EBADF;
		}
	} else {
		/* if we have a wildcard datagram it should match an
		 * incoming "active" datagram that should have a fully formed
		 * srcnid and dstnid. If we couldn't unpack it, we drop as
		 * corrupted packet, otherwise we'll just verify that the dstnid
		 * matches the NID for the NET that the dgram was posted */

		/* make sure their wildcard didn't match ours, that is unpossible */
		LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
			 "dgram 0x%p from %s, connreq 0x%p; "
			 "wildcard matched wildcard \n", dgram,
			 libcfs_nid2str(connreq->gncr_srcnid), connreq);

		rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);

		if (rc == -ESHUTDOWN) {
			CERROR("Looking up network: device is in shutdown\n");
			return rc;
		} else if (rc == -ENONET) {
			CERROR("Connection data from %s: she sent "
			"dst_nid %s, but net lookup failed on "
			"dgram 0x%p@%s\n",
			libcfs_nid2str(connreq->gncr_srcnid),
			libcfs_nid2str(connreq->gncr_dstnid),
			dgram, kgnilnd_dgram_type2str(dgram));
			return rc;
		}

		if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
		    connreq->gncr_dstnid) {
			CERROR("Bad connection data from %s: she sent "
			       "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
			       libcfs_nid2str(connreq->gncr_srcnid),
			       libcfs_nid2str(connreq->gncr_dstnid),
			       libcfs_nidstr(&net->gnn_ni->ni_nid),
			       dgram, kgnilnd_dgram_type2str(dgram));
			kgnilnd_net_decref(net);
			return -EBADSLT;
		}

		/* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. */
		kgnilnd_net_decref(net);
	}

	if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
		CERROR("Unexpected version %d\n", connreq->gncr_version);
		return -EPROTO;
	}

	/* XXX Nic: TBD - checksum validation */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
		return -EBADF;
	}

	/* the type-specific payload is only swabbed once the header,
	 * including the type, is known to be sane */
	if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
		__u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;

		__swab32s(&connreq->gncr_gnparams.gnpr_host_id);
		__swab32s(&connreq->gncr_gnparams.gnpr_cqid);
		__swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
		__swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
		__swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
		__swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
		__swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
		/* msg_buffer is a pointer, so it is swabbed through an
		 * integer copy and then stored back - without the store the
		 * peer's buffer address would stay in the wrong byte order */
		__swab64s(&msg_addr);
		connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer =
			(void *)(unsigned long)msg_addr;
		__swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
		__swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
	} else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
		__swab32s(&connreq->gncr_nakdata.gnnd_errno);
	}

	/* since we use a unique instance ID for each network, the driver
	 * will take care of dropping datagrams if we don't have that network.
	 */

	/* few more idiot software or configuration checks */

	switch (connreq->gncr_type) {
	case GNILND_CONNREQ_REQ:
		/* wire up EP and SMSG block - this will check the incoming data
		 * and barf a NAK back if need to */
		rc = kgnilnd_set_conn_params(dgram);
		if (rc)
			return rc;
		break;
	case GNILND_CONNREQ_NAK:
	case GNILND_CONNREQ_CLOSE:
		break;
	default:
		CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
		return -EPROTO;
	}

	if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
		CERROR("Recived bad timestamps peer %llu conn %llu\n",
		connreq->gncr_peerstamp, connreq->gncr_connstamp);
		return -EPROTO;
	}

	if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
		CERROR("Received timeout %d < MIN %d\n",
		       connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
		return -EPROTO;
	}

	return 0;
}

/*
 * Allocate a zeroed datagram from the dgram slab cache (GFP_ATOMIC),
 * initialise it as a GNILND_DGRAM_USED datagram of the given @type and
 * account for it in dev->gnd_ndgrams.
 *
 * Returns 0 with *dgramp set, or -ENOMEM (in which case *dgramp is not
 * written).
 */
int
kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
{
	kgn_dgram_t         *new_dgram;

	new_dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
	if (!new_dgram)
		return -ENOMEM;

	new_dgram->gndg_magic = GNILND_DGRAM_MAGIC;
	new_dgram->gndg_type = type;
	new_dgram->gndg_state = GNILND_DGRAM_USED;
	INIT_LIST_HEAD(&new_dgram->gndg_list);

	atomic_inc(&dev->gnd_ndgrams);

	CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
		" %d\n",
		sizeof(*new_dgram), new_dgram,
		kgnilnd_dgram_type2str(new_dgram),
		atomic_read(&dev->gnd_ndgrams));

	*dgramp = new_dgram;
	return 0;
}

/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
 * returns < 0 on dgram to be cleaned up
 * > 0 on dgram that isn't done yet
 * == 0 on dgram that is ok and needs connreq processing
 *
 * Concretely: GNI_POST_COMPLETED gives 0, GNI_POST_PENDING gives the
 * positive EINPROGRESS, GNI_POST_TERMINATED gives -ECANCELED and
 * GNI_POST_TIMEOUT gives -ETIMEDOUT. Any other post_state is fatal (LBUG).
 * For the two negative results, a dgram in state GNILND_DGRAM_CANCELED
 * also has its device's gnd_canceled_dgrams count dropped. */
int
kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
{
	int rc = 0;

	switch (post_state) {
	case GNI_POST_COMPLETED:
		/* normal state for dgrams that need actual processing */
		/* GOTO to avoid processing dgram as canceled/done */
		GOTO(process_out, rc);

	case GNI_POST_PENDING:
		/* we should only see this if we are testing a WC dgram after a
		 * cancel - it means that it needs a full cycle of waiting
		 * for kgni_sm_task to finish moving it to TERMINATED */
		LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
			  (dgram->gndg_state == GNILND_DGRAM_CANCELED),
			 "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
			 dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
			 dgram->gndg_state, kgnilnd_dgram_state2str(dgram));

		/* positive RC as this dgram isn't done yet */
		rc = EINPROGRESS;

		/* GOTO as this isn't done yet */
		GOTO(process_out, rc);
		break;

	case GNI_POST_TERMINATED:
		/* we've called cancel and it is done or remote guy called cancel and
		 * we've received it on a WC dgram */
#if 0
		/* we are seeing weird terminations on non WC dgrams when we have not
		 * canceled them */

		LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
			 dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
			"dgram 0x%p with bad state %d(%s) or dst nid %s\n",
			dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
			libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
#endif

		CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
		       dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");

		rc =  -ECANCELED;
		break;

	case GNI_POST_TIMEOUT:
		/* we could have a timeout on a wildcard dgram too - if
		 * we got the incoming request but the remote node beefed
		 * before kgni could send the match data back. We'll just error
		 * on the active case and bail out gracefully */
		if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
			CNETERR("hardware timeout for connect to "
			       "%s after %lu seconds. Is node dead?\n",
			       libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
			       cfs_duration_sec(jiffies - dgram->gndg_post_time));
		}

		rc = -ETIMEDOUT;
		break;

	default:
		CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
		LBUG();
	}

	/* only the TERMINATED and TIMEOUT cases reach this point; the other
	 * cases jump straight to process_out */

	/* now finish cleaning up a dgram that is canceled/terminated and needs to
	 * go away */

	/* If this was actively canceled, drop the count now that we are processing */
	if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
		atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
		/* caller responsible for gndg_list removal */
	}

process_out:

	RETURN(rc);
}

/* Cancel a posted dgram. Needs dev->gnd_dgram_lock held.
 *
 * Does nothing unless the dgram is in GNILND_DGRAM_POSTED state. Otherwise
 * the dgram is marked CANCELED and, unless its conn already reached
 * GNILND_CONN_ESTABLISHED, kgni is asked to cancel it by id and the
 * device's gnd_canceled_dgrams count is raised.
 *
 * Wildcard (WC_REQ) dgrams are additionally tested right away: if kgni
 * reports NO_MATCH, or kgnilnd_process_dgram() returns <= 0, the dgram is
 * marked DONE and the count dropped again; a positive rc leaves it CANCELED
 * and pending. The dgram is never freed or unlinked here - that is the
 * caller's job. */
void
kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
{
	gni_return_t            grc;

	if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
		return;
	}

	LASSERTF(dgram->gndg_conn != NULL,
		 "dgram 0x%p with NULL conn\n", dgram);

	/* C.E - WC dgrams could be canceled immediately but
	 * if there was some match pending, we need to call
	 * test_by_id to clear it out. If that test returns
	 * POST_PENDING, it is half done and needs to go along
	 * with the rest of dgrams and go through a kgni_sm_task cycle
	 * and deliver a GNI_POST_TERMINATED event before they
	 * are actually canceled */

	dgram->gndg_state = GNILND_DGRAM_CANCELED;

	if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
		/* we don't need to cancel_by_id if the datagram was good;
		 * note the canceled count is not raised on this path */
		return;
	}

	/* let folks know there are outstanding cancels */
	atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
	/* leave on nid list until cancel is done for debugging fun */
	grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);

	/* if we don't get success here, we have hosed up the dgram tracking
	 * code and need to bail out */
	LASSERTF(grc == GNI_RC_SUCCESS,
		 "postdata_cancel returned %d for conn 0x%p to %s\n",
		 grc, dgram->gndg_conn,
		 dgram->gndg_conn->gnc_peer ?
		  libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
		  : "<?>");

	CDEBUG(D_NETTRACE,
		"canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
		dgram, dgram->gndg_conn,
		dgram->gndg_conn->gnc_ephandle);

	if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
		gni_post_state_t         post_state;
		int                      rc = 0;
		__u32                    remote_addr = 0, remote_id = 0;

		grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
						     (__u64)dgram, &post_state,
						     &remote_addr, &remote_id);

		LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
			 "bad grc %d from test_by_id on dgram 0x%p\n",
			grc, dgram);

		/* if WC was canceled immediately, we get NO_MATCH, if needs to go
		 * through full cycle, we get SUCCESS and need to parse post_state */

		CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
			"remote_addr %u remote_id %u\n", grc, dgram,
			kgnilnd_dgram_type2str(dgram),
			post_state, remote_addr, remote_id);

		if (grc == GNI_RC_NO_MATCH) {
			/* she's gone, reduce count and move along */
			dgram->gndg_state = GNILND_DGRAM_DONE;
			atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
			RETURN_EXIT;
		}

		rc = kgnilnd_process_dgram(dgram, post_state);

		/* NOTE(review): when kgnilnd_process_dgram() returns < 0 for a dgram
		 * still in CANCELED state it has already decremented
		 * gnd_canceled_dgrams, and the branch below decrements it again -
		 * confirm that TERMINATED/TIMEOUT cannot be reported here */
		if (rc <= 0) {
			/* if for some weird reason we get a valid dgram back, just mark as done
			 * so we can drop it and move along.
			 * C.E - if it was completed, we'll just release the conn/mbox
			 * back into the pool and it'll get reused. That said, we should only
			 * be canceling a WC dgram on stack reset or shutdown, so that is moot */
			dgram->gndg_state = GNILND_DGRAM_DONE;
			atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);

			/* caller context responsible for calling kgnilnd_release_dgram() */
		} else {
			/* still pending, let it simmer until golden brown and delicious */
		}
	}

	/* for non WC dgrams, they are still on the nid list but marked canceled waiting
	 * for kgni to return their ID to us via probe - that is when we'll complete their
	 * cancel processing */
}

/* Drop the reference @dgram holds on its conn, if it has one, and clear
 * the pointer so a second call is a no-op. */
void
kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
{
	kgn_conn_t *held = dgram->gndg_conn;

	/* nothing attached, nothing to release */
	if (held == NULL)
		return;

	kgnilnd_conn_decref(held);
	dgram->gndg_conn = NULL;
}

/* Return @dgram to the dgram slab cache and drop @dev's dgram count.
 *
 * @dev    device whose gnd_ndgrams counter accounts for this dgram
 * @dgram  dgram to free; must be in GNILND_DGRAM_USED or GNILND_DGRAM_DONE
 *         state (asserted)
 *
 * The magic is overwritten before the free so stale ids handed back by the
 * driver can be detected. The dgram must not be touched after this call. */
void
kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
{
	LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
		 dgram->gndg_state == GNILND_DGRAM_DONE,
		 "dgram 0x%p with bad state %s\n",
		 dgram, kgnilnd_dgram_state2str(dgram));

	/* bit of poisoning to help detect bad driver data */
	dgram->gndg_magic = 0x6f5a6b5f;
	atomic_dec(&dev->gnd_ndgrams);

	/* log BEFORE handing the memory back: kgnilnd_dgram_type2str() is
	 * passed the dgram itself, so calling it after kmem_cache_free()
	 * would read freed memory */
	CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
	       " ndgrams %d\n",
	       sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
	       atomic_read(&dev->gnd_ndgrams));

	kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
}

/* Allocate a dgram plus a fresh conn for it, pack the connreq and post it
 * to kgni.
 *
 * @dev      device to post on
 * @dstnid   target nid, or LNET_NID_ANY to post a wildcard
 * @type     GNILND_CONNREQ_REQ or GNILND_CONNREQ_NAK (a NAK may not target
 *           LNET_NID_ANY); any other type is fatal (LBUG)
 * @data_rc  errno placed in the NAK payload; only used when @type is NAK
 *
 * Returns 0 once the dgram is posted and linked on the device's dgram list
 * in POSTED state. On failure returns -ENOMEM, -ESRCH, -ECONNABORTED,
 * -EBADR, or the non-zero rc from kgnilnd_create_conn() /
 * kgnilnd_pack_connreq(). The cleanup at post_failed (drop conn ref, free
 * dgram) only runs when rc < 0. */
int
kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
		   int data_rc)
{
	int              rc = 0;
	kgn_dgram_t     *dgram = NULL;
	kgn_dgram_t     *tmpdgram;
	kgn_dgram_type_t dgtype;
	gni_return_t     grc;
	__u64            srcnid;
	ENTRY;

	/* map the connreq type (and wildcard-ness) onto a dgram type */
	switch (type) {
	case GNILND_CONNREQ_REQ:
		if (dstnid == LNET_NID_ANY)
			dgtype = GNILND_DGRAM_WC_REQ;
		else
			dgtype = GNILND_DGRAM_REQ;
		break;
	case GNILND_CONNREQ_NAK:
		LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
		dgtype = GNILND_DGRAM_NAK;
		break;
	default:
		CERROR("unknown connreq type %d\n", type);
		LBUG();
	}

	rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
	if (rc < 0) {
		rc = -ENOMEM;
		GOTO(post_failed, rc);
	}

	rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
	if (rc) {
		GOTO(post_failed, rc);
	}

	if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
		/* clear buffer for sanity on reuse of wildcard */
		memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
	}

	if (dstnid == LNET_NID_ANY) {
		/* set here to reset any dgram re-use */
		dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
	} else {
		__u32            host_id;

		/* targeted dgram: resolve the NIC address and bind the EP to it */
		rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
		if (rc <= 0) {
			rc = -ESRCH;
			GOTO(post_failed, rc);
		}

		dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;

		/* don't need to serialize, there are no CQs for the dgram
		 * EP on the kgn_net_t */
		grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);

		if (grc != GNI_RC_SUCCESS) {
			rc = -ECONNABORTED;
			GOTO(post_failed, rc);
		}

	}

	/* If we are posting wildcards post using a net of 0, otherwise we'll use the
	 * net of the destination node.
	 */

	if (dstnid == LNET_NID_ANY) {
		srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
	} else {
		srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
	}

	rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
				  srcnid, dstnid, type);
	if (rc) {
		GOTO(post_failed, rc);
	}

	if (type == GNILND_CONNREQ_NAK)
		dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;

	/* timestamp used when reporting how long a timed-out post took */
	dgram->gndg_post_time = jiffies;

	/* XXX Nic: here is where we'd add in logical network multiplexing */

	CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
	       dgram, kgnilnd_dgram_type2str(dgram),
	       libcfs_nid2str(srcnid),
	       libcfs_nid2str(dstnid), dev->gnd_id);

	/* this allocates memory, can't hold locks across;
	 * the dgram pointer itself is used as the post id */
	grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
				   &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
				   &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
				   (__u64)dgram);

	if (grc != GNI_RC_SUCCESS) {
		CNETERR("dropping failed dgram post id 0x%p type %s"
			" reqtype %s to %s: rc %d\n",
			dgram, kgnilnd_dgram_type2str(dgram),
			kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
			libcfs_nid2str(dstnid), grc);
		rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
		GOTO(post_failed, rc);
	}

	/* we don't need to add earlier - if someone does del_peer during post,
	 * that peer will get marked as unlinked and the callers will take care of it.
	 * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
	 * the completed dgram later when we can't find a peer to stuff it into */

	spin_lock(&dev->gnd_dgram_lock);

	/* make sure we are not double posting targeted dgrams
	 * - we can multiple post WC dgrams to help with processing speed */
	if (dstnid != LNET_NID_ANY) {
		tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);

		LASSERTF(tmpdgram == NULL,
			"dgram 0x%p->%s already posted\n",
			 dgram, libcfs_nid2str(dstnid));
	}

	/* unmunge dstnid to help processing code cope... */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
		dgram->gndg_conn_out.gncr_dstnid = dstnid;
	}

	/* link on the per-nid dgram list and publish as POSTED under the lock */
	list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
	dgram->gndg_state = GNILND_DGRAM_POSTED;
	spin_unlock(&dev->gnd_dgram_lock);

post_failed:
	/* the success path falls through here with rc == 0 and keeps the dgram */
	if (rc < 0 && dgram != NULL) {
		kgnilnd_cleanup_dgram(dgram);
		kgnilnd_free_dgram(dev, dgram);
	}

	RETURN(rc);
}

/* Release a dgram the caller owns (already off the dgram list).
 *
 * @dev       device the dgram belongs to
 * @dgram     dgram to release
 * @shutdown  non-zero when called from the shutdown and stack reset
 *            threads; skips parking the conn of a canceled dgram in
 *            purgatory
 *
 * A dgram still in POSTED state is canceled here and, if that leaves it
 * CANCELED, it is NOT freed - it stays around until kgni reports it back.
 * In every other case the conn ref is dropped, a wildcard is reposted when
 * neither kgn_wc_kill nor kgn_in_reset is set, and the dgram is freed. */
void
kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
{
	/* The conns of canceled active dgrams need to be put in purgatory so
	 * we don't reuse the mailbox */
	if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
		kgn_peer_t *peer;
		kgn_conn_t *conn = dgram->gndg_conn;
		lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;

		/* DONE so the cancel call below is a no-op and the dgram
		 * gets freed at the end of this function */
		dgram->gndg_state = GNILND_DGRAM_DONE;

		/* During shutdown we've already removed the peer so we don't
		 * need to add a peer. During stack reset we don't care about
		 * MDDs since they are all released. */
		if (!shutdown) {
			write_lock(&kgnilnd_data.kgn_peer_conn_lock);
			peer = kgnilnd_find_peer_locked(nid);

			if (peer != NULL) {
				CDEBUG(D_NET, "adding peer's conn with nid %s "
					"to purgatory\n", libcfs_nid2str(nid));
				/* conn and peer each gain a ref for the
				 * peer's conn list linkage made below */
				kgnilnd_conn_addref(conn);
				conn->gnc_peer = peer;
				kgnilnd_peer_addref(peer);
				kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
				conn->gnc_state = GNILND_CONN_CLOSED;
				list_add_tail(&conn->gnc_list,
					      &peer->gnp_conns);
				kgnilnd_add_purgatory_locked(conn,
							     conn->gnc_peer);
				kgnilnd_schedule_conn(conn);
			}
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		}
	}

	/* only acts on dgrams still in POSTED state */
	spin_lock(&dev->gnd_dgram_lock);
	kgnilnd_cancel_dgram_locked(dgram);
	spin_unlock(&dev->gnd_dgram_lock);

	kgnilnd_cleanup_dgram(dgram);

	/* if the dgram is 'canceled' it needs to wait until the event
	 * comes up from kgni that tells us it is safe to release */
	if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
		dgram->gndg_state = GNILND_DGRAM_DONE;

		LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);

		/* if it is a wildcard and we are in an appropriate state, repost
		 * the wildcard */

		if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
		    (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
			int     rerc;

			rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
			if (rerc != 0) {
				/* We failed to repost the WC dgram for some reason
				 * mark it so the repost system attempts to repost */
				kgnilnd_admin_addref(dev->gnd_nwcdgrams);
			}
		}

		/* always free the old dgram */
		kgnilnd_free_dgram(dev, dgram);
	}
}


/* Probe @dev for a ready dgram, take it off the dgram list and classify it.
 *
 * @dev     device to probe
 * @dgramp  out: the dgram that was found (see return values for when it
 *          is written)
 *
 * Returns:
 *   0        nothing was ready; *dgramp untouched
 *   1        dgram completed and is in PROCESSING state; *dgramp set and
 *            the caller owns it
 *   < 0      from kgnilnd_process_dgram(): dgram needs cleaning up;
 *            *dgramp set, state is DONE or still CANCELED, caller must
 *            release it
 *   -EINVAL  test_by_id failed: the dgram has ALREADY been released here
 *            and *dgramp is NOT written - callers must not release it
 *            again */
int
kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
{
	kgn_dgram_t             *dgram = NULL;
	gni_post_state_t         post_state;
	gni_return_t             grc;
	int                      rc = 0;
	__u64                    readyid;
	__u32                    remote_addr = 0, remote_id = 0;
	ENTRY;

	/* Probe with the lock held. That way if we get a dgram we don't have it canceled
	 * between finding the ready dgram and grabbing the lock to remove it from the
	 * list. Otherwise we could be left in an inconsistent state. We own the dgram
	 * once it's off the list so we don't need to worry about others changing it at
	 * that point. */
	spin_lock(&dev->gnd_dgram_lock);
	grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
	if (grc != GNI_RC_SUCCESS) {
		spin_unlock(&dev->gnd_dgram_lock);
		/* return 0 to indicate nothing happened */
		RETURN(0);
	}

	CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
		readyid, dev);

	/* the post id is the dgram pointer itself (see the post path) */
	dgram = (kgn_dgram_t *)readyid;

	LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
		 "dgram 0x%p from id %#llx with bad magic %x\n",
		 dgram, readyid, dgram->gndg_magic);

	LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
		 dgram->gndg_state == GNILND_DGRAM_CANCELED,
		 "dgram 0x%p with bad state %s\n",
		 dgram, kgnilnd_dgram_state2str(dgram));

	LASSERTF(!list_empty(&dgram->gndg_list),
		 "dgram 0x%p with bad list state %s type %s\n",
		 dgram, kgnilnd_dgram_state2str(dgram),
		 kgnilnd_dgram_type2str(dgram));

	/* now we know that the datagram structure is ok, so pull off list */
	list_del_init(&dgram->gndg_list);

	/* while we have the gnd_dgram_lock and BEFORE we call test_by_id
	 * change the state from POSTED to PROCESSING to ensure that
	 * nobody cancels it after we've pulled it from the wire */
	if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
		dgram->gndg_state = GNILND_DGRAM_PROCESSING;
	}

	LASSERTF(dgram->gndg_conn != NULL,
		"dgram 0x%p with NULL conn\n", dgram);

	grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
					     (__u64)dgram, &post_state,
					     &remote_addr, &remote_id);

	/* we now "own" this datagram */
	spin_unlock(&dev->gnd_dgram_lock);

	LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
		 " id %llu was ready\n", readyid);

	CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
		"remote_addr %u remote_id %u\n", grc, dgram,
		kgnilnd_dgram_type2str(dgram),
		post_state, remote_addr, remote_id);

	if (unlikely(grc != GNI_RC_SUCCESS)) {
		CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
			dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
			grc);
		rc = -EINVAL;
		GOTO(probe_for_out, rc);
	}

	rc = kgnilnd_process_dgram(dgram, post_state);

	/* we should never get probe finding a dgram for us and then it
	 * being a WC dgram that is still in the middle of processing */
	LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
		 rc, dgram, post_state);

	if (rc == 0) {
		/* dgram is good enough for the data to be used */
		dgram->gndg_state = GNILND_DGRAM_PROCESSING;
		/* fake rc to mark that we've done something */
		rc = 1;
	} else {
		/* let kgnilnd_release_dgram take care of canceled dgrams */
		if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
			dgram->gndg_state = GNILND_DGRAM_DONE;
		}
	}

	*dgramp = dgram;
	RETURN(rc);

probe_for_out:

	/* error path: the dgram is released here and never handed to the
	 * caller, so *dgramp is left as the caller passed it in */
	kgnilnd_release_dgram(dev, dgram, 0);
	RETURN(rc);
}

/* Post the configured number (kgn_nwildcard) of wildcard dgrams on @dev.
 *
 * Returns the rc of the last successful post, -EINVAL as soon as one post
 * fails (remaining wildcards are not attempted), or -ENOENT when
 * kgn_nwildcard is zero and nothing was posted. */
int
kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
{
	int     nposted = 0;
	/* stays -ENOENT if the loop body never runs */
	int     rc = -ENOENT;
	ENTRY;

	while (nposted < *kgnilnd_tunables.kgn_nwildcard) {
		rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
		if (rc < 0) {
			CERROR("error %d: could not post wildcard datagram # %d\n",
				rc, nposted);
			rc = -EINVAL;
			GOTO(failed, rc);
		}
		nposted++;
	}

failed:
	RETURN(rc);
}

/* Cancel every outstanding targeted (non-wildcard) dgram whose destination
 * nid is on @net.
 *
 * Must only be called while the net is shutting down or the LND is in
 * stack reset (asserted). Takes the device's gnd_dgram_lock for the walk.
 * Always returns 0. */
int
kgnilnd_cancel_net_dgrams(kgn_net_t *net)
{
	kgn_device_t *dev = net->gnn_dev;
	kgn_dgram_t *dg, *dgN;
	int i;
	ENTRY;

	/* we want to cancel any outstanding dgrams - we don't want to rely
	 * on del_peer_or_conn catching all of them. This helps protect us in cases
	 * where we don't quite keep the peer->dgram mapping in sync due to some
	 * race conditions */

	LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
		 "called with LND invalid state: net shutdown %d "
		 "in reset %d\n", net->gnn_shutdown,
		 kgnilnd_data.kgn_in_reset);

	spin_lock(&dev->gnd_dgram_lock);

	/* walk every hash bucket of the device's dgram table */
	for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
		list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {

			/* skip nids not on our net or are wildcards */
			if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
				net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
				continue;

			kgnilnd_cancel_dgram_locked(dg);
		}
	}

	spin_unlock(&dev->gnd_dgram_lock);

	RETURN(0);
}

/* Cancel all outstanding wildcard dgrams on @dev.
 *
 * Only valid during stack reset or once kgn_wc_kill is set (asserted).
 * Wildcards that end up DONE straight away are collected under the lock
 * and released after it is dropped; the rest wait for kgni to report them.
 * Always returns 0. */
int
kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
{
	kgn_dgram_t *dg, *dgN;
	LIST_HEAD(zombies);
	ENTRY;

	/* Time to kill the outstanding WC's
	 * WC's exist on net 0 only but match on any net...
	 */

	LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
		"called with LND invalid state: WC shutdown %d "
		"in reset %d\n", kgnilnd_data.kgn_wc_kill,
		kgnilnd_data.kgn_in_reset);

	spin_lock(&dev->gnd_dgram_lock);

	/* keep pulling wildcards until the lookup comes back empty */
	while ((dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY)) != NULL) {
		LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
			 "dgram 0x%p->%s with bad type %d (%s)\n",
			dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
			dg->gndg_type, kgnilnd_dgram_type2str(dg));

		kgnilnd_cancel_dgram_locked(dg);

		/* a WC may already be DONE after the cancel; park those on
		 * the local list so they can be released without the lock */
		if (dg->gndg_state == GNILND_DGRAM_DONE)
			list_move_tail(&dg->gndg_list, &zombies);
	}

	spin_unlock(&dev->gnd_dgram_lock);

	/* release needs the dgram off any list first */
	list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
		list_del_init(&dg->gndg_list);
		kgnilnd_release_dgram(dev, dg, 1);
	}

	RETURN(0);
}

/* Cancel every outstanding non-wildcard dgram on @dev, whatever net it
 * targets.
 *
 * Only valid from base shutdown, i.e. with kgn_wc_kill set (asserted).
 * Takes the device's gnd_dgram_lock for the walk. Always returns 0. */
int
kgnilnd_cancel_dgrams(kgn_device_t *dev)
{
	kgn_dgram_t *dg, *dgN;
	int i;
	ENTRY;

	/* Cancel any outstanding non wildcard datagrams regardless
	 * of which net they are on as we are in base shutdown and
	 * don't care about connecting anymore.
	 */

	LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");

	spin_lock(&dev->gnd_dgram_lock);

	/* visit ALL kgn_peer_hash_size buckets - stopping one short would
	 * leave targeted dgrams hashed into the last bucket un-canceled */
	for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
		list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
			if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
				kgnilnd_cancel_dgram_locked(dg);
		}
	}

	spin_unlock(&dev->gnd_dgram_lock);

	RETURN(0);
}


/* Block until @dev has no canceled dgrams outstanding, draining and
 * releasing whatever dgrams kgni reports ready in the meantime.
 *
 * Loops while gnd_canceled_dgrams is non-zero; the body runs at least
 * once. A D_WARNING (instead of D_NET) message is emitted whenever the
 * pass counter is a power of two. */
void
kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
{
	int             i = 4;
	int             rc;
	gni_return_t    grc;
	__u64           readyid;
	kgn_dgram_t    *dgram;

	/* use do while to get at least one check run to allow
	 * regression test for 762072 to hit bug if there */

	/* This function races with the dgram mover during shutdown so it is possible for
	 * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
	 * dgram mover thread instead of inside of this function.
	 */

	/* This should only be called from within shutdown, baseshutdown, or stack reset.
	 * there are no assertions here to verify since base_shutdown has nothing in it we can check
	 * the net is gone by then.
	 */

	do {
		i++;
		/* (i & -i) == i is true only for powers of two */
		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
			"Waiting for %d canceled datagrams to clear on device %d\n",
			atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);

		/* wait for something to become ready
		 * NOTE(review): an earlier comment said "check once a second"
		 * but the timeout passed is 250 - confirm the unit */
		grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
		       250, &readyid);

		if (grc != GNI_RC_SUCCESS)
			continue;

		CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
			readyid, dev->gnd_id, dev);

		/* kgnilnd_probe_for_dgram() can return < 0 after releasing the
		 * dgram itself and without writing *dgramp, so start from NULL
		 * on every pass and only release what was actually handed back;
		 * otherwise we would release a stale or uninitialized pointer */
		dgram = NULL;
		rc = kgnilnd_probe_for_dgram(dev, &dgram);
		if (rc != 0 && dgram != NULL) {
			/* if we got a valid dgram or one that is now done, clean up */
			kgnilnd_release_dgram(dev, dgram, 1);
		}
	} while (atomic_read(&dev->gnd_canceled_dgrams));
}

/* Start an active connect to @peer by posting a targeted REQ dgram.
 *
 * The peer must be active and in GNILND_PEER_CONNECT state; it is moved to
 * GNILND_PEER_POSTING while the dgram is posted and to GNILND_PEER_POSTED
 * once that succeeded.
 *
 * Returns:
 *   0        dgram posted, connect is in flight
 *   ESTALE   (positive) raced with the peer being unlinked or marked
 *            GNILND_PEER_NEEDS_DEATH; in the post-post case the dgram is
 *            canceled here
 *   < 0      posting failed; the errno is also stored in
 *            peer->gnp_last_dgram_errno */
int
kgnilnd_start_connect(kgn_peer_t *peer)
{
	int              rc = 0;
	/* sync point for kgnilnd_del_peer_locked - do an early check to
	 * catch the most common hits where del_peer is done by the
	 * time we get here */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
		while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
	}

	write_lock(&kgnilnd_data.kgn_peer_conn_lock);
	if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
		/* raced with peer getting unlinked */
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		rc = ESTALE;
		GOTO(out, rc);
	}
	peer->gnp_connecting = GNILND_PEER_POSTING;
	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

	set_mb(peer->gnp_last_dgram_time, jiffies);
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
		while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
	}

	/* fault injection: skip the real post and fake its result */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
		while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
		rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
	} else {
		rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
					peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
	}
	if (rc < 0) {
		set_mb(peer->gnp_last_dgram_errno, rc);
		GOTO(failed, rc);
	}

	/* while we're posting someone could have decided this peer/dgram needed to
	 * die a quick death, so we check for state change and process accordingly */

	write_lock(&kgnilnd_data.kgn_peer_conn_lock);
	if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
		if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
			peer->gnp_connecting = GNILND_PEER_KILL;
		}
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		/* positive RC to avoid dgram cleanup - we'll have to
		 * wait for the kgni GNI_POST_TERMINATED event to
		 * finish cleaning up */
		rc = ESTALE;
		kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
		GOTO(out, rc);
	}
	peer->gnp_connecting = GNILND_PEER_POSTED;
	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
	/* reaper thread will take care of any timeouts */
	CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
	       libcfs_nid2str(peer->gnp_nid), rc);

	RETURN(rc);

failed:
	CDEBUG(D_NET, "connect to %s failed: rc %d \n",
	       libcfs_nid2str(peer->gnp_nid), rc);
out:
	RETURN(rc);
}

/* Turn a completed connreq dgram into an established connection: find or
 * create the peer for the remote nid, reject duplicates, wire the conn
 * into the peer and conn tables, queue a NOOP plus any TXs that were
 * blocked waiting for a connection, and notify LNet.
 *
 * Returns:
 *   0          conn established
 *   ECANCELED  (positive) active connect completed but the existing peer
 *              was no longer waiting for it; dropped
 *   EALREADY   (positive) kgnilnd_conn_isdup_locked() reported a duplicate
 *   other      non-zero rc from kgnilnd_create_peer_safe()
 *
 * On the 0 path the dgram is marked GNILND_DGRAM_DONE so that releasing it
 * does not try to cancel it. */
int
kgnilnd_finish_connect(kgn_dgram_t *dgram)
{
	kgn_conn_t        *conn = dgram->gndg_conn;
	lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
	struct lnet_nid    peer_nid;
	kgn_peer_t        *new_peer, *peer = NULL;
	kgn_tx_t          *tx;
	kgn_tx_t          *txn;
	kgn_mbox_info_t   *mbox;
	int                rc;
	int                nstale;

	/* try to find a peer that matches the nid we got in the connreq
	 * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
	 * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */

	/* assume this is a new peer  - it makes locking cleaner when it isn't */
	/* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */

	rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
	if (rc != 0) {
		CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
		return rc;
	}

	write_lock(&kgnilnd_data.kgn_peer_conn_lock);

	/* this transfers ref from create_peer to the kgn_peer table */
	kgnilnd_add_peer_locked(her_nid, new_peer, &peer);

	/* if we found an existing peer, is it really ready for a new conn ? */
	if (peer != new_peer) {
		/* if this was an active connect attempt but we can't find a peer waiting for it
		 * we will dump in the trash */

		if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
			CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
			       libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			rc = ECANCELED;
			GOTO(out, rc);
		}

		/* check to see if we can catch a connecting peer before it is
		 * removed from the connd_peers list - if not, we need to
		 * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
		if (peer->gnp_connecting != GNILND_PEER_IDLE) {
			spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
			if (!list_empty(&peer->gnp_connd_list)) {
				list_del_init(&peer->gnp_connd_list);
				/* drop connd ref */
				kgnilnd_peer_decref(peer);
			}
			spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
			/* clear rc to make sure we don't have fake error */
			rc = 0;
		}

		/* no matter what, we are no longer waiting to connect this peer now */
		peer->gnp_connecting = GNILND_PEER_IDLE;

		/* Refuse to duplicate an existing connection (both sides might try to
		 * connect at once).  NB we return success!  We _are_ connected so we
		 * _don't_ have any blocked txs to complete with failure. */
		rc = kgnilnd_conn_isdup_locked(peer, conn);
		if (rc != 0) {
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
			      libcfs_nid2str(her_nid), rc);
			rc = EALREADY;
			GOTO(out, rc);
		}
	}

	if (peer->gnp_state == GNILND_PEER_DOWN) {
		CNETERR("Received connection request from down nid %s\n",
			libcfs_nid2str(her_nid));
	}

	peer->gnp_state = GNILND_PEER_UP;
	/* the returned count is stored but not used further in this function */
	nstale = kgnilnd_close_stale_conns_locked(peer, conn);

	/* either way with peer (new or existing), we are ok with ref counts here as the
	 * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
	 * ref for the peer table. */

	/* at this point, the connection request is a winner */

	/* mark 'DONE' to avoid cancel being called from release */
	dgram->gndg_state = GNILND_DGRAM_DONE;

	/* initialise timestamps before reaper looks at them */
	conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;

	/* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
	 * immediately send a NOOP in the reaper thread during the call to
	 * kgnilnd_check_conn_timeouts_locked
	 */
	conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
	conn->gnc_state = GNILND_CONN_ESTABLISHED;

	/* save the dgram type used to establish this connection */
	conn->gnc_dgram_type = dgram->gndg_type;

	/* refs are not transferred from dgram to tables, so increment to
	 * take ownership */
	kgnilnd_conn_addref(conn);
	kgnilnd_peer_addref(peer);
	conn->gnc_peer = peer;
	list_add_tail(&conn->gnc_list, &peer->gnp_conns);

	kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
	list_add_tail(&conn->gnc_hashlist,
		      kgnilnd_cqid2connlist(conn->gnc_cqid));
	kgnilnd_data.kgn_conn_version++;

	/* Don't send NOOP if fail_loc is set
	 */
	if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
		tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
					lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
		if (tx == NULL) {
			/* not fatal: the conn stays up without the initial NOOP */
			CNETERR("can't get TX to initiate NOOP to %s\n",
				libcfs_nid2str(peer->gnp_nid));
		} else {
			kgnilnd_queue_tx(conn, tx);
		}
	}

	/* Schedule all packets blocking for a connection */
	list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
		/* lock held here is the peer_conn lock */
		kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
		kgnilnd_queue_tx(conn, tx);
	}

	/* If this is an active connection lets mark its timestamp on the MBoX */
	if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
		mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
		/* conn->gnc_last_rx is jiffies it better exist as it was just set */
		mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
	}

	/* Bug 765042: wake up scheduler for a race with finish_connect and
	 * complete_conn_closed with a conn in purgatory
	 * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
	 * we just check for set and then clear */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
		cfs_fail_loc = 0x0;
		/* get scheduler thread moving again */
		kgnilnd_schedule_device(conn->gnc_device);
	}

	CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
	       conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);

	/* make sure we reset peer reconnect interval now that we have a good conn */
	kgnilnd_peer_alive(peer);
	peer->gnp_reconnect_interval = 0;

	/* clear the unlink attribute; if we don't clear it kgnilnd_del_conn_or_peer will wait
	 * on the atomic forever
	 */
	if (peer->gnp_pending_unlink) {
		peer->gnp_pending_unlink = 0;
		kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
		CDEBUG(D_NET, "Clearing peer unlink %p\n",peer);
	}

	/* add ref to make it hang around until after we drop the lock */
	kgnilnd_conn_addref(conn);

	/* Once the peer_conn lock is dropped, the conn could actually move into
	 * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
	 * lock until we are really done */
	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

	/* Notify LNET that we now have a working connection to this peer.
	 * This is a Cray extension to the "standard" LND behavior.
	 */
	lnet_nid4_to_nid(peer->gnp_nid, &peer_nid);
	lnet_notify(peer->gnp_net->gnn_ni, &peer_nid, true, true,
		    ktime_get_seconds());

	/* drop our 'hold' ref */
	kgnilnd_conn_decref(conn);

out:
	RETURN(rc);
}

/* Post a NAK dgram carrying @error to @dst_nid on @dev.
 *
 * @dst_nid must not be LNET_NID_ANY (asserted). A failed post is only
 * logged; nothing is reported back to the caller. */
void
kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
{
	int              post_rc;
	ENTRY;

	LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));

	CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);

	post_rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);

	/* best effort: note the failure and carry on */
	if (post_rc < 0) {
		CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), post_rc);
	}

	EXIT;
}

/* Handle a NAK received in @dgram for a connect we attempted.
 *
 * Returns:
 *   -EBADSLT  no peer is known for the NAK's source nid
 *   0         peer is idle and has no conns; NAK dropped
 *   rc of kgnilnd_close_stale_conns_locked()  peer is idle with conns
 *             (logged as the number of connections closed)
 *   rc of kgnilnd_find_and_cancel_dgram()     peer is connecting but not
 *             on the connd list
 *   -EAGAIN   peer is on the connd list, i.e. a newer connect attempt is
 *             queued; NAK dropped */
int
kgnilnd_process_nak(kgn_dgram_t *dgram)
{
	kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
	lnet_nid_t         src_nid = connreq->gncr_srcnid;
	int                errno = connreq->gncr_nakdata.gnnd_errno;
	kgn_peer_t        *peer;
	int                rc = 0;

	write_lock(&kgnilnd_data.kgn_peer_conn_lock);

	peer = kgnilnd_find_peer_locked(src_nid);
	if (peer == NULL) {
		/* we likely dropped him from bad data when we processed
		 * the original REQ */
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		return -EBADSLT;
	}

	/* need to check peerstamp/connstamp against the ones we find
	 * to make sure we don't close new (and good?) conns that we
	 * formed after this connreq failed */
	if (peer->gnp_connecting == GNILND_PEER_IDLE) {
		kgn_conn_t        conn;

		if (list_empty(&peer->gnp_conns)) {
			/* assume already processed datagram and it barfed up
			 * on this side too */
			CDEBUG(D_NET, "dropping NAK from %s; "
			       "peer %s is already not connected\n",
				libcfs_nid2str(connreq->gncr_srcnid),
				libcfs_nid2str(connreq->gncr_dstnid));
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			return 0;
		}

		/* stub up a connection with the connreq XXX_stamps to allow
		 * us to use close_stale_conns_locked
		 * NOTE(review): only these four fields of the on-stack stub are
		 * set, the rest is uninitialized - presumably
		 * kgnilnd_close_stale_conns_locked reads nothing else; verify */
		conn.gnc_peerstamp = connreq->gncr_peerstamp;
		conn.gnc_my_connstamp = connreq->gncr_connstamp;
		conn.gnc_peer_connstamp = connreq->gncr_connstamp;
		conn.gnc_device = peer->gnp_net->gnn_dev;

		rc = kgnilnd_close_stale_conns_locked(peer, &conn);

		LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
			"closed %d connections\n",
			libcfs_nid2str(connreq->gncr_srcnid),
			libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
	} else {
		spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);

		if (list_empty(&peer->gnp_connd_list)) {
			/* if peer isn't on waiting list, try to find one to nuke */
			rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
							   peer->gnp_nid);

			if (rc) {
				LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
					"canceled pending connect request\n",
					libcfs_nid2str(connreq->gncr_srcnid),
					libcfs_nid2str(connreq->gncr_dstnid), errno);
			}

			/* if we can't find a waiting dgram, we just drop the nak - the conn
			 * connect must have failed (didn't find conn above and clear connecting
			 * -- so nothing to do besides drop */
		} else {
			/* peer is on list, meaning it is a new connect attempt from the one
			 * we started that generated the NAK - so just drop NAK */

			/* use negative to prevent error message */
			rc = -EAGAIN;
		}
		spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
	}

	/* done with the peer; rc carries the result of whichever branch ran */
	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

	return rc;
}

/*
 * Unpack the connection request carried in @dgram and act on its type.
 *
 * @dgram:     completed datagram whose gndg_conn_in holds the request
 * @needs_nak: set to 1 when the caller should answer with a NAK; never
 *             cleared here
 *
 * Returns the rc of the unpack/finish_connect/process_nak step that ran,
 * or -EINVAL for an unrecognised request type.
 */
int
kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
{
	int                      result;

	result = kgnilnd_unpack_connreq(dgram);
	if (result < 0) {
		/* every unpack failure except -EBADF is answered with a NAK
		 * (only NAK if we have good srcnid to use) */
		if (result != -EBADF)
			*needs_nak = 1;
		RETURN(result);
	}

	if (dgram->gndg_conn_in.gncr_type == GNILND_CONNREQ_NAK) {
		/* leave straight away to prevent the reconnect bump; this
		 * path never touches *needs_nak */
		result = kgnilnd_process_nak(dgram);
		return result;
	}

	if (dgram->gndg_conn_in.gncr_type == GNILND_CONNREQ_REQ) {
		/* wire up peer & conn, send queued TX */
		result = kgnilnd_finish_connect(dgram);

		/* any failure to finish the connect is NAKed back */
		if (result < 0)
			*needs_nak = 1;

		RETURN(result);
	}

	/* neither REQ nor NAK: complain and NAK it */
	CERROR("unexpected connreq type %s (%d) from %s\n",
		kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
		dgram->gndg_conn_in.gncr_type,
		libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
	*needs_nak = 1;
	result = -EINVAL;

	RETURN(result);
}

/*
 * Probe @dev for one completed datagram, process it, release it, and
 * propagate any failure to the peer it was addressed to.
 *
 * The probe is skipped (treated as "nothing found") when the
 * CFS_FAIL_GNI_PAUSE_DGRAM_COMP fail check fires.
 *
 * On a negative rc, and when the outgoing destination NID of the datagram
 * is not LNET_NID_ANY, the matching peer (if found) is pulled off the connd
 * list, set back to GNILND_PEER_IDLE, has the errno recorded and its
 * reconnect interval increased, and is then notified outside the lock.
 * If processing asked for a NAK, one is sent to the source NID of the
 * incoming request.
 *
 * Returns 0 when the probe found nothing, 1 otherwise.
 */
int
kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
{
	int                      rc;
	int                      needs_nak = 0;
	lnet_nid_t               nak_dstnid = LNET_NID_ANY;
	lnet_nid_t               orig_dstnid;
	kgn_dgram_t             *dgram = NULL;
	kgn_peer_t              *peer;
	ENTRY;

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
		rc = 0;
	} else {
		rc = kgnilnd_probe_for_dgram(dev, &dgram);
	}

	if (rc == 0) {
		RETURN(0);
	} else if (rc < 0) {
		/* skip processing; the LASSERTF at inform_peer requires that
		 * the probe still handed back a dgram on this path */
		GOTO(inform_peer, rc);
	} else {
		/* rc > 0 means it did something, reset for this func  */
		rc = 0;
	}

	switch (dgram->gndg_type) {
	case GNILND_DGRAM_WC_REQ:
	case GNILND_DGRAM_REQ:
		rc = kgnilnd_process_connreq(dgram, &needs_nak);
		break;
	case GNILND_DGRAM_NAK:
		/* completion of a NAK we sent - nothing to do but trace it */
		CDEBUG(D_NETTRACE, "NAK to %s done\n",
			libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
		break;
	default:
		/* logged only; rc stays 0 so no peer is informed below */
		CERROR("unknown datagram type %s (%d)\n",
		       kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
		break;
	}

	/* stash data to use after releasing current datagram */
	/* don't stash net - we are operating on a net already,
	 * so the lock on rw_net_lock is sufficient */

	nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;

inform_peer:
	LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);

	/* copy out before the release below - dgram is not touched after it */
	orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;

	kgnilnd_release_dgram(dev, dgram, 0);

	CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
	       libcfs_nid2str(orig_dstnid), rc);

	/* if this was a WC_REQ that matched an existing peer, it'll get marked done
	 * in kgnilnd_finish_connect - if errors are from before we get to there,
	 * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
	if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
		/* if we have a negative rc, we want to find a peer to inform about
		 * the bad connection attempt. Sorry buddy, better luck next time! */

		write_lock(&kgnilnd_data.kgn_peer_conn_lock);
		peer = kgnilnd_find_peer_locked(orig_dstnid);

		if (peer != NULL) {
			/* add ref to make sure he stays around past the possible unlink
			 * so we can tell LNet about him */
			kgnilnd_peer_addref(peer);

			/* if he still cares about the outstanding connect */
			if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
				/* check if he is on the connd list and remove.. */
				spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
				if (!list_empty(&peer->gnp_connd_list)) {
					list_del_init(&peer->gnp_connd_list);
					/* drop connd ref */
					kgnilnd_peer_decref(peer);
				}
				spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);

				/* clear gnp_connecting so we don't have a non-connecting peer
				 * on gnd_connd_list */
				peer->gnp_connecting = GNILND_PEER_IDLE;

				/* remember why the last datagram failed */
				set_mb(peer->gnp_last_dgram_errno, rc);

				kgnilnd_peer_increase_reconnect_locked(peer);
			}
		}
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

		/* now that we are outside the lock, tell Mommy */
		if (peer != NULL) {
			kgnilnd_peer_notify(peer, rc, 0);
			/* drop the ref taken under the lock above */
			kgnilnd_peer_decref(peer);
		}
	}

	/* nak_dstnid is the source NID of the incoming request stashed above */
	if (needs_nak) {
		kgnilnd_send_nak(dev, nak_dstnid, rc);
	}

	RETURN(1);
}

/*
 * Walk the datagram hash buckets of @dev under gnd_dgram_lock and cancel
 * every posted, non-wildcard datagram that has been outstanding for at
 * least kgn_timeout seconds.
 *
 * NOTE(review): the bucket loop stops one short of kgn_peer_hash_size, so
 * the highest bucket index is never visited - confirm against how
 * gnd_dgrams is sized and hashed.
 */
void
kgnilnd_reaper_dgram_check(kgn_device_t *dev)
{
	kgn_dgram_t    *cur, *next;
	int             bucket;

	spin_lock(&dev->gnd_dgram_lock);

	for (bucket = 0;
	     bucket < (*kgnilnd_tunables.kgn_peer_hash_size - 1);
	     bucket++) {
		list_for_each_entry_safe(cur, next, &dev->gnd_dgrams[bucket],
					 gndg_list) {
			unsigned long            now = jiffies;
			unsigned long            timeout;

			/* no timeouts while the network is mucked or shutting
			 * down; this only leaves the current bucket, the
			 * check is repeated for the next one */
			if (kgnilnd_check_hw_quiesce())
				break;

			/* only datagrams that are actually posted ... */
			if (cur->gndg_state != GNILND_DGRAM_POSTED)
				continue;

			/* ... and wildcard requests are never timed out */
			if (cur->gndg_type == GNILND_DGRAM_WC_REQ)
				continue;

			CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
				"state %s conn 0x%p to %s age %lus\n",
				cur, kgnilnd_dgram_type2str(cur),
				kgnilnd_dgram_state2str(cur), cur->gndg_conn,
				libcfs_nid2str(cur->gndg_conn_out.gncr_dstnid),
				cfs_duration_sec(now - cur->gndg_post_time));

			timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);

			/* expired once 'now' has reached post time + timeout */
			if (time_after_eq(now, (cur->gndg_post_time + timeout))) {
				CNETERR("%s datagram to %s timed out @ %lus dgram "
					"0x%p state %s conn 0x%p\n",
					kgnilnd_dgram_type2str(cur),
					libcfs_nid2str(cur->gndg_conn_out.gncr_dstnid),
					cfs_duration_sec(now - cur->gndg_post_time),
					cur, kgnilnd_dgram_state2str(cur),
					cur->gndg_conn);

				kgnilnd_cancel_dgram_locked(cur);
			}
		}
	}

	spin_unlock(&dev->gnd_dgram_lock);
}


/* use a thread for the possibly long-blocking wait_by_id to prevent
 * stalling the global workqueues */
int
kgnilnd_dgram_waitq(void *arg)
{
	kgn_device_t     *dev = (kgn_device_t *) arg;
	char              name[16];
	gni_return_t      grc;
	__u64             readyid;
	DEFINE_WAIT(mover_done);

	snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);

	/* all gnilnd threads need to run fairly urgently */
	set_user_nice(current, *kgnilnd_tunables.kgn_nice);

	/* we dont shut down until the device shuts down ... */
	while (!kgnilnd_data.kgn_shutdown) {
		/* to quiesce or to not quiesce, that is the question */
		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
			KGNILND_SPIN_QUIESCE;
		}

		while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}

		/* check once a second */
		grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
						       1000, &readyid);

		if (grc == GNI_RC_SUCCESS) {
			CDEBUG(D_INFO, "waking up dgram mover thread\n");
			kgnilnd_schedule_dgram(dev);

			/* wait for dgram thread to ping us before spinning again */
			prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
					TASK_INTERRUPTIBLE);

			/* don't sleep if we need to quiesce */
			if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
				schedule();
			}
			finish_wait(&dev->gnd_dgping_waitq, &mover_done);
		}
	}

	kgnilnd_thread_fini();
	return 0;
}

/*
 * Start connects for the peers queued on dev->gnd_connd_peers.
 *
 * Peers are taken from the head of the list one at a time, for as long as
 * the list is non-empty and jiffies is before @deadline. gnd_connd_lock is
 * dropped around kgnilnd_start_connect() and retaken before the next pass.
 *
 * Outcome per peer, by the rc of kgnilnd_start_connect():
 *   rc >= 0   - the connd reference is dropped.
 *   -ENOMEM   - a peer still in GNILND_PEER_POSTING is set back to
 *               GNILND_PEER_CONNECT and re-queued at the tail; otherwise it
 *               must be GNILND_PEER_NEEDS_DEATH and is moved to
 *               GNILND_PEER_KILL with its reference dropped. The loop then
 *               stops and the peer is not counted.
 *   other < 0 - a peer still in GNILND_PEER_POSTING goes back to
 *               GNILND_PEER_IDLE with its reconnect interval increased;
 *               otherwise NEEDS_DEATH becomes KILL. The reference is dropped.
 *
 * Returns the number of peers processed, excluding one that hit -ENOMEM.
 */
int
kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
{
	int                      did_something = 0, rc;
	kgn_peer_t              *peer = NULL;

	spin_lock(&dev->gnd_connd_lock);

	/* Active connect - we added this in kgnilnd_launch_tx */
	while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
		peer = list_first_entry(&dev->gnd_connd_peers,
					kgn_peer_t, gnp_connd_list);

		/* ref for connd removed in if/else below */
	       list_del_init(&peer->gnp_connd_list);

		/* gnp_connecting and membership on gnd_connd_peers should be
		 * done coherently to avoid double adding, etc */
		/* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
		 * to get the peer to gnp_connecting in the first place. We just need to
		 * rely on gnd_connd_lock to serialize someone pulling him from the list
		 * BEFORE clearing gnp_connecting */
		LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
			 peer, libcfs_nid2str(peer->gnp_nid));

		/* the connect attempt runs without the connd lock held */
		spin_unlock(&dev->gnd_connd_lock);

		CDEBUG(D_NET, "processing connect to %s\n",
		       libcfs_nid2str(peer->gnp_nid));

		did_something += 1;
		rc = kgnilnd_start_connect(peer);

		if (likely(rc >= 0)) {
			/* 0 on success, positive on 'just drop peer' errors */
			kgnilnd_peer_decref(peer);
		} else if (rc == -ENOMEM) {
			/* if we are out of wildcards, add back to
			 * connd_list - then break out and we'll try later
			 * if other errors, we'll bail & cancel pending tx */
			write_lock(&kgnilnd_data.kgn_peer_conn_lock);
			if (peer->gnp_connecting == GNILND_PEER_POSTING) {
				peer->gnp_connecting = GNILND_PEER_CONNECT;
				/* taken here and deliberately left held: the
				 * break below lands on the final spin_unlock */
				spin_lock(&dev->gnd_connd_lock);
				list_add_tail(&peer->gnp_connd_list,
					      &dev->gnd_connd_peers);
			} else {
				/* connecting changed while we were posting */

				LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
					" state 0x%p->%s, connecting %d\n",
					peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
				peer->gnp_connecting = GNILND_PEER_KILL;
				/* as above: held across the break so the
				 * unlock after the loop stays balanced */
				spin_lock(&dev->gnd_connd_lock);
				/* remove the peer ref from the connd list */
				kgnilnd_peer_decref(peer);
				/* let the system handle itself */
			}
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			/* the datagrams are a global pool,
			 * so break out of trying and hope some free
			 * up soon */
			did_something -= 1;
			break;
		} else {
			/* something bad happened, you lose */
			CNETERR("could not start connecting to %s "
				"rc %d: Will retry until TX timeout\n",
			       libcfs_nid2str(peer->gnp_nid), rc);
			/* It didnt post so just set connecting back to zero now.
			 * The reaper will reattempt the connection if it needs too.
			 * If the peer needs death set it so the reaper will cleanup.
			 */
			write_lock(&kgnilnd_data.kgn_peer_conn_lock);
			if (peer->gnp_connecting == GNILND_PEER_POSTING) {
				peer->gnp_connecting = GNILND_PEER_IDLE;
				kgnilnd_peer_increase_reconnect_locked(peer);
			} else {
				LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
					" state 0x%p->%s, connecting %d\n",
					peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
				peer->gnp_connecting = GNILND_PEER_KILL;
			}
			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);

			/* hold onto ref until we are really done - if it was
			 * unlinked this could result in a destroy */
			kgnilnd_peer_decref(peer);
		}
		/* retake the connd lock for the next list_empty() check */
		spin_lock(&dev->gnd_connd_lock);
	}

	spin_unlock(&dev->gnd_connd_lock);
	RETURN(did_something);
}

/*
 * Post wildcard (LNET_NID_ANY) connection-request datagrams on @dev, up to
 * the value of dev->gnd_nwcdgrams sampled on entry. Each successful post
 * decrements that counter; the first failed post ends the attempt.
 *
 * Returns the number of datagrams posted.
 */
int
kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
{
	int posted = 0;
	int wanted = atomic_read(&dev->gnd_nwcdgrams);
	ENTRY;

	while (posted < wanted) {
		int	post_rc;

		post_rc = kgnilnd_post_dgram(dev, LNET_NID_ANY,
					     GNILND_CONNREQ_REQ, 0);
		if (post_rc != 0) {
			/* give up for now; the remainder stays counted in
			 * gnd_nwcdgrams */
			CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
				post_rc, dev->gnd_id);
			break;
		}

		kgnilnd_admin_decref(dev->gnd_nwcdgrams);
		posted++;
	}

	RETURN(posted);
}

/* Wraps a kernel timer together with the device it belongs to, so that the
 * timer callback can get from the timer back to the device to wake. */
struct kgnilnd_dgram_timer {
	/* embedded timer; the callback recovers the wrapper through it */
	struct timer_list timer;
	/* device whose gnd_dgram_waitq is woken when the timer fires */
	kgn_device_t *dev;
};

/*
 * Timer callback: map the timer argument back to its enclosing
 * kgnilnd_dgram_timer and wake whoever sleeps on that device's
 * gnd_dgram_waitq.
 */
static void
kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
{
	struct kgnilnd_dgram_timer *poke;

	poke = cfs_from_timer(poke, arg, timer);
	wake_up(&poke->dev->gnd_dgram_waitq);
}

/* use single thread for dgrams - should be sufficient for performance */
/*
 * Datagram mover thread body; @arg is the kgn_device_t to service.
 *
 * Until kgn_shutdown is set, each pass:
 *   - processes one completed datagram (under kgn_net_rw_sem, read side),
 *   - starts outbound connects until 'deadline',
 *   - runs the datagram reaper when next_purge_check is due and schedules
 *     the next check kgn_new_min_timeout / 4 seconds out,
 *   - reposts wildcard datagrams.
 * If any of that did work, or the next purge check is already due, and the
 * deadline has not passed, it loops again at once. Otherwise it arms a
 * timer for the remaining time, wakes gnd_dgping_waitq and sleeps on
 * gnd_dgram_waitq; the deadline is restarted after each sleep.
 *
 * Always returns 0.
 */
int
kgnilnd_dgram_mover(void *arg)
{
	kgn_device_t            *dev = (kgn_device_t *)arg;
	char                     name[16];
	int                      rc, did_something;
	unsigned long            next_purge_check = jiffies - 1;
	/* signed on purpose: jiffies left until next_purge_check, which must
	 * read as <= 0 once that time has been reached or passed */
	long                     timeout;
	struct kgnilnd_dgram_timer timer;
	unsigned long deadline = 0;
	DEFINE_WAIT(wait);

	/* NOTE(review): name is formatted here but not referenced again in
	 * this function */
	snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);

	/* all gnilnd threads need to run fairly urgently */
	set_user_nice(current, *kgnilnd_tunables.kgn_nice);

	/* we are ok not locking for these variables as the dgram waitq threads
	 * will block both due to tying up net (kgn_shutdown) and the completion
	 * event for the dgram_waitq (kgn_quiesce_trigger) */
	deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
	while (!kgnilnd_data.kgn_shutdown) {
		/* Safe: kgn_shutdown only set when quiescent */

		/* race with stack reset - we want to hold off seeing any new incoming dgrams
		 * so we can force a dirty WC dgram for Bug 762072 - put right before
		 * quiesce check so that it'll go right into that and not do any
		 * dgram mucking */
		CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);

		/* to quiesce or to not quiesce, that is the question */
		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
			KGNILND_SPIN_QUIESCE;
		}
		did_something = 0;

		CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);

		/* process any newly completed dgrams */
		down_read(&kgnilnd_data.kgn_net_rw_sem);

		rc = kgnilnd_probe_and_process_dgram(dev);
		if (rc > 0) {
			did_something += rc;
		}

		up_read(&kgnilnd_data.kgn_net_rw_sem);

		CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
			(*kgnilnd_tunables.kgn_dgram_timeout + 1));
		/* start new outbound dgrams */
		did_something += kgnilnd_start_outbound_dgrams(dev, deadline);

		/* find dead dgrams */
		if (time_after_eq(jiffies, next_purge_check)) {
			/* these don't need to be checked that often */
			kgnilnd_reaper_dgram_check(dev);

			next_purge_check = (long) jiffies +
				      cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
		}

		did_something += kgnilnd_repost_wc_dgrams(dev);

		/* careful with the jiffy wrap... the difference is taken as a
		 * signed value so an overdue check comes out negative */
		timeout = (long)(next_purge_check - jiffies);

		CDEBUG(D_INFO, "did %d timeout %ld next %lu jiffies %lu\n",
		       did_something, timeout, next_purge_check, jiffies);

		/* more work is likely, or the purge check is due: go around
		 * again without sleeping, unless the deadline forces a
		 * schedule */
		if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
			did_something = 0;
			continue;
		}

		prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);

		/* on-stack timer, set up afresh for every sleep and torn
		 * down by timer_delete_sync() below */
		cfs_timer_setup(&timer.timer,
				kgnilnd_dgram_poke_with_stick,
				dev, 0);
		timer.dev = dev;
		mod_timer(&timer.timer, (long) jiffies + timeout);

		/* last second chance for others to poke us */
		did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);

		/* check flag variables before committing even if we
		 * did something; if we are after the deadline call
		 * schedule */
		if ((!did_something || time_after(jiffies, deadline)) &&
		    !kgnilnd_data.kgn_shutdown &&
		    !kgnilnd_data.kgn_quiesce_trigger) {
			CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
			       timeout, cfs_duration_sec(timeout));
			/* let the waitq thread resume probing while we sleep */
			wake_up(&dev->gnd_dgping_waitq);
			schedule();
			CDEBUG(D_INFO, "awake after schedule\n");
			/* a fresh deadline starts after every sleep */
			deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
		}

		timer_delete_sync(&timer.timer);
		finish_wait(&dev->gnd_dgram_waitq, &wait);
	}

	kgnilnd_thread_fini();
	return 0;
}