[Cbe-oss-dev] [patch 15/16] powerpc and x86_64: Initial port of PCIe endpoint device driver for Axon

Mon Mar 17 15:45:04 EST 2008

On Saturday 08 March 2008, Murali Iyer wrote:
> Subject: [patch 15/16] AXON PCIe Network (apnet) driver
> 
> High level description:
> 	Provides ethernet like interface per Axon. The Virtual Network interface
> will be called apnet?. Primary functzions like memory allocation, interrupt
> (mailbox and msi), DMA transfers are handled by the base Axon driver that is
> not discussed in this patch.
> 	Pre-allocates receive SKBs and populates the data pointer in the
> descriptors for remote side to DMA it. NAPI poll function uses netif_receive_skb
> to indicate availability of the packet to the network stack and allocates
> new receive SKB for the slot. Always pushes the data to the remote side,
> meaning transmit function of the local system locates the free descriptor on
> the remote system and uses DMA to send the packet. Trasmit function sends a
> message (interrupt) to the remote system that acts as a receive interrupt.
> 	APNET driver requests base driver a shared memory area to store buffer
> descriptor configuration and receive descriptors.
> Approximately it needs about 2KB of shared memory for the default configuration
> of 128 descriptors of 16 bytes each.
> 	MAC addresses are generated randomly using random_ether_addr() and 
> supports set_mac_address function to change the default MAC. This might be
> useful to enable bonding driver over apnet and ease dhcpd set-up as well.
> 
>    View of shared memory area as follows.
> 	  -----------------
> 	 | Configuration   |
> 	  -----------------
> 	 | Descriptor 1    |
> 	  -----------------
> 	 | Descriptor 2    |
> 	  -----------------
> 	 | ...........     |
> 	  -----------------
> 	 | Descriptor n-1  |
> 	  -----------------
> 	 | Descriptor n    |
> 	  -----------------
> 
> 	 ------------------------------------------------------------------
> 	|		|		|				   |
> 	|Ctrl/Sts 32b 	| Pkt. len 32b  | Physical Address of pkt data 64b |
> 	|		|		|				   |
> 	 ------------------------------------------------------------------
> 		Sample Descriptor
> 

I think it would be good to also post this driver to the 
netdev at vger.kernel.org mailing list next time, because we don't
have many network experts around here.

> +
> +/* number of Axon devices found in the system for network interfaces */
> +static	int	num_apnet_if;
> +
> +/* Store apnet's private data per interface */
> +static struct apnet	*apnet_interface[NUM_APNET];
> +

Shouldn't be needed. The only place where you access these
is the deregistration of the driver, and that automatically
walks the device list and calls the driver->remove function.

> +/**
> + * apnet_poll - NAPI receive poll function
> + * @ndev - net device structure
> + * @budget - number of packets to process
> + *
> + * returns 0 if there are no more packets to process, otherwise 1
> + * This is the main receive function that retrives and passes receive
> + * packets to the network stack. Updates various status, enables remote
> + * notification call back if there are no more packets in the buffer.
> + */
> +static int apnet_poll(struct net_device *ndev, int *budget)
> +{
> +	int npkts = 0, quota = min(ndev->quota, *budget);
> +	__be32 plen, data;
> +	struct sk_buff *skb;
> +	struct apnet *adev = netdev_priv(ndev);
> +	struct apnet_bd *bd = apnet_get_rx_bd(adev);
> +
> +	while (npkts < quota) {
> +		if (!bd)
> +			break;	/* no valid packets to receive */
> +		plen = read32be(&bd->data_len);
> +		if (unlikely(plen > adev->rx_cfg.mtu)) {
> +			pr_err("%s: RX MTU size error. %d > %d\n", \
> +				ndev->name, plen, adev->rx_cfg.mtu);
> +			adev->nstats.rx_errors++;
> +			goto release_slot;
> +		}
> +		/* Will ever be empty? */
> +		if (!(adev->rx_skb[adev->rx_slot])) {
> +			pr_err("%s: Trying to receive empty skb at slot %d\n",\
> +				ndev->name, adev->rx_slot);
> +			adev->nstats.rx_dropped++;
> +			goto release_slot;
> +		}
> +		skb = adev->rx_skb[adev->rx_slot];
> +		mb();	/* make sure rx packet in memory */
> +		skb_put(skb, plen);
> +		skb->dev = ndev;
> +		skb->protocol = eth_type_trans(skb, ndev);
> +		skb->ip_summed = CHECKSUM_UNNECESSARY;
> +		netif_receive_skb(skb);
> +		adev->astats.inflight_rx_skbs--;
> +		axon_dma_unmap_single(adev->if_num, bd->data_ptr,
> +				adev->rx_cfg.mtu, DMA_FROM_DEVICE);
> +		if (apnet_alloc_rx_skb(ndev, adev->rx_slot))
> +			adev->nstats.rx_dropped++;
> +		adev->nstats.rx_packets++;
> +		adev->nstats.rx_bytes += plen;
> +release_slot:
> +		data = read32be(&bd->ctrl);
> +		data &= ~APNET_RX_CTRL_FULL;
> +		write32be(data, &bd->ctrl);
> +		npkts++;	/* includes good and dropped pkts */
> +		/* update to the next rx slot */
> +		adev->rx_slot++;
> +		bd = apnet_get_rx_bd(adev);
> +	}
> +	*budget -= npkts;
> +	ndev->quota -= npkts;
> +	if (!bd) {
> +		axon_enable_notify_callback(APNET_MSG_TYPE, adev->if_num);
> +		netif_rx_complete(ndev);
> +		adev->astats.in_rx_poll = 0;
> +		mb();		/* make sure update completes */
> +		return 0;	/* no more packets */
> +	}
> +	return 1;	/* more pkts are in rx buffer */
> +}

Do I see this right that you call axon_enable_notify_callback() to switch
on/off the local delivery of callback interrupts, but not the actual calling
of the interrupt handler?

> +static void apnet_tx_timeout(struct net_device *ndev)
> +{
> +       struct apnet *adev = netdev_priv(ndev);
> +
> +       if (apnet_tx_ring_status(adev))
> +               return;
> +
> +       if (adev->astats.tx_dma_err)    /* return if we got DMA error */
> +               return;
> +
> +       if (apnet_get_tx_bd(adev))
> +               netif_wake_queue(ndev); /* found free slot to xmit */
> +       else
> +               axon_send_remote_notification(APNET_MSG_TYPE,
> adev->if_num);
> +       return; 

If the DMA failed, do you really want to send a remote notification?
Doesn't that cause the broken packet to get received by the remote
side?

> +
> +/**
> + * apnet_tx_dma_complete - this function called when DMA completes
> + * @if_num - interface number
> + * @tx_status - apnet tx status structure.
> + *
> + * returns 0 on success
> + * This is a callback function that DMA engines calls back when it
> + * completes requested transfer. Status updated and remote notification
> + * sent to indicate packet availability
> + */
> +static int apnet_tx_dma_complete(int if_num, void *tx_status)
> +{
> +	struct apnet_tx_status	*tx_sts = tx_status;
> +
> +	netif_tx_lock_bh(tx_sts->ndev);
> +	axon_dma_unmap_single(if_num,
> +			      tx_sts->handle,
> +			      tx_sts->skb->len,
> +			      DMA_TO_DEVICE);
> +	dev_kfree_skb(tx_sts->skb);
> +	tx_sts->sts &= ~cpu_to_be32(APNET_RX_CTRL_FULL);
> +	netif_tx_unlock_bh(tx_sts->ndev);
> +	/* generate remote interrupt to indicate packet arrival */
> +	axon_send_remote_notification(APNET_MSG_TYPE, if_num);
> +	return 0;
> +}
> +

If I read this correctly, you generate a local interrupt for every frame
you transmit, and from this handler, generate another remote interrupt
to notify of the packet being transmitted if that is enabled?

IIRC, you get a lot of tx interrupts this way, and it would be good
to reduce this number. Can you send the dma-complete event to the
remote end instead? That would first of all reduced the latency
of the initial packet rx, and also mean that you can consolidate
multiple tx-complete events in a single interrupt sent back from
the receiving end.

> +/**
> + * apnet_set_mac_addr - sets new MAC address for the interface
> + * @ndev - net device structure
> + * @sockaddr - socket address structure
> + *
> + * returns 0 on success, <0 on failure
> + * Allows to change MAC address of the interface.
> + * example : ifconfig apnet0 hw ether 8a:c2:51:47:69:de
> + */
> +static int apnet_set_mac_addr(struct net_device *ndev, void *sockaddr)
> +{
> +	struct sockaddr *addr = sockaddr;
> +
> +	if (!is_valid_ether_addr(addr->sa_data))
> +		return -EINVAL;
> +
> +	pr_info("%s: Current MAC address %02X:%02X:%02X:%02X:%02X:%02X\n",
> +		ndev->name,
> +		ndev->dev_addr[0], ndev->dev_addr[1],
> +		ndev->dev_addr[2], ndev->dev_addr[3],
> +		ndev->dev_addr[4], ndev->dev_addr[5]);
> +	memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len);
> +	pr_info("%s: New MAC address %02X:%02X:%02X:%02X:%02X:%02X\n",
> +		ndev->name,
> +		ndev->dev_addr[0], ndev->dev_addr[1],
> +		ndev->dev_addr[2], ndev->dev_addr[3],
> +		ndev->dev_addr[4], ndev->dev_addr[5]);
> +	return 0;
> +}
> +

remember to make these messages pr_debug at some point. When debugging,
it shouldn't make a difference, and later they get annoying.

> +static inline __be64 read64be(__be64 *addr)
> +{
> +	__be64 tmp = be64_to_cpu(__raw_readq(addr));
> +	rmb();	/* make sure read ordering */
> +	return tmp;
> +}
> +
> +static inline void write64be(__be64 data, __be64 *addr)
> +{
> +	__raw_writeq(cpu_to_be64(data), addr);
> +	wmb(); /* make sure write ordering */
> +}

The use of __raw_writeq in device drivers is a bit suspicious,
as the __iomem token is not necessarily really a pointer, depending
on some compile-time options. Also, your read functions have
somewhat less strict ordering rules than the in_be64 function used
by the regular accessors.

Checking this code with sparse should give you a lot of warnings
about address space mismatches. I think it would be much better
to have separate functions for accessing local and remote memory,
even if that means some amount of duplication in code using it.

> +#define pr_err(fmt, arg...) \
> +	printk(KERN_ERR fmt, ##arg)

This is already defined in kernel.h

	Arnd <><