[Cbe-oss-dev] [patch 05/16] powerpc and x86_64: Initial port of PCIe endpoint device driver for Axon

Tue Mar 11 17:24:09 EST 2008

On Saturday 08 March 2008, Murali Iyer wrote:
> Subject: [patch 05/16] axon driver's DMA interface
> 
> Description:
> 	This patch provides generic interface to the axon dmax 
> 	dma engine.

Instead of this one line description, it would be good to copy
the introductory comment form the source file here.

Have you tried using the dma-engine API for this?
Olof has recently posted an excellent driver for this API
for a similar DMA controller used on PA-Semi processors, see
http://patchwork.ozlabs.org/linuxppc/patch?id=17222

> +const uint32_t chnl_err_mask[] = {RI0, RI1, RI2, RI3};
> +const uint32_t chnl_sg_status_mask[] = {SG0, SG1, SG2, SG3};
> +const uint32_t chnl_busy_mask[] = {CB0, CB1, CB2, CB3};
> +const uint32_t chnl_status_mask[] =
> +	{0x88000880, 0x40800440, 0x20080220, 0x10008110};

For mostly historical reasons, we tend to use types like 'u32'
in the kernel instead of the standard uint32_t.

> +/**
> + * dmax_get_base_ctl - Setup the base type of SG elem for this xfer
> + * @xfer: Pointer to transfer structure
> + * @desc: Pointer to DMAX S/G descriptor
> + */
> +inline void dmax_get_base_ctl(struct dmax_desc *desc)
> +{

static inline, please

> +/**
> + * dmax_descriptors_available - get the number of available descriptors
> + * @chan_p: the DMA channel
> + *
> + */
> +static inline int dmax_descriptors_available(struct dmax_chn *chan_p)
> +{
> +	if (chan_p->desc_head >= chan_p->desc_tail)
> +		return chan_p->desc_ring_size -
> +			(chan_p->desc_head - chan_p->desc_tail) - 1;
> +	else
> +		return (chan_p->desc_tail - chan_p->desc_head) -1;
> +}

A nice trick with ring buffers is to make the size a power of two and
then just use the lower bits for indexing. That way, you don't need
to check for wrap-around but can constantly keep incrementing the
index. This function in that case becomes

	return chan_p->desc_tail - chan_p->desc_head - 1;

> +/**
> + * dmax_get_next_desc_index - return the next available descriptor index
> + * @chan_p: the DMA channel
> + *
> + */
> +static inline int dmax_get_next_desc_index(struct dmax_chn *chan_p)
> +{
> +	u32 head = chan_p->desc_head;
> +	chan_p->desc_head++;
> +	chan_p->desc_head = chan_p->desc_head % chan_p->desc_ring_size;
> +	return head;
> +}

This code seems to lack locking around the access. If you use the trick
I described above, you can avoid adding locking by using atomic updates,
which you can't easily do in your algorithm.

> +/**
> + * dmax_seq_plb5 - get the plb5 addr to write the seequnce number via DMA
> + * @chan_p:
> + *
> + */
> +static u64 dmax_seq_plb5(struct dmax_chn *chan_p)
> +{
> +	int chan_num = chan_p->channel_num;
> +
> +	u64 local_d2d_plb5 = local_phys_to_plb5(chan_p->dev->axon_priv,
> +			chan_p->dev->axon_priv->local_d2d_dma_handle);
> +	return local_d2d_plb5 + offsetof(struct axon_d2d,
> +					dma_channel_seq_complete[chan_num]);
> +}

I believe we have discussed this in the past, but I may have forgotten
your arguments. AFAICS, all the conversion between PCI, PLB and IOIF
addresses should be handled by the dma api, if you provide an appropriate
dma_mapping_ops implementation.

> +
> +	if (pse & RE) {
> +		switch (RS_ERROR(pse)) {
> +		case RS_ERR_0:
> +			printk( KERN_ERR "PLB5 Parity Error on address for a write\n");
> +			break;
> +		case RS_ERR_1:
> +			printk( KERN_ERR "PLB5 Parity Error on BE for a write\n");
> +			break;
> +		case RS_ERR_2:
> +			printk( KERN_ERR "PLB5 Write Error targeting reserved space\n");
> +			break;
> +		case RS_ERR_3:
> +			printk( KERN_ERR "PLB5 Write Error request non-single beat\n");
> +			break;
> +		case RS_ERR_4:
> +			printk( KERN_ERR "PLB5 Write Error request with non-contiguous"
> +				"BE or BE crossing a word boundary\n");
> +			break;
> +		case RS_ERR_5:
> +			printk( KERN_ERR "PLB5 Snoopable write request targeting"
> +				"the PLB slave port address map\n");
> +			break;
> +		case RS_ERR_8:
> +			printk( KERN_ERR "PLB5 Parity error on address for a read\n");
> +			break;
> +		case RS_ERR_9:
> +			printk( KERN_ERR "PLB Parity error on BE for a read\n");
> +			break;
> +		case RS_ERR_10:
> +			printk( KERN_ERR "PLB5 Read Error targeting reserved space\n");
> +			break;
> +		case RS_ERR_11:
> +			printk( KERN_ERR "PLB5 Read Error request non-single beat\n");
> +			break;
> +		case RS_ERR_12:
> +			printk( KERN_ERR "PLB5 Read Error request with non-contiguous"
> +				"BE or BE crossing a word boundary\n");
> +			break;
> +		case RS_ERR_13:
> +			printk( KERN_ERR "PLB5 Snoopable read request targeting"
> +				"the PLB slave port address map\n");
> +			break;
> +		default:
> +			pr_info("No PLB5 RS Error indicated\n");
> +		}

Since this is just a simple lookup table, why not write it in a more
compact way, as

	static const char *plb_error_table[13] = {
		[0] = "Parity Error on address for a write",
		[1] = "Parity Error on BE for a write",
		...
	};
	printk(KERN_ERR "PLB5: %s\n", plb_error_table[RS_ERROR(pse)];

> +		rc = 1;
> +	}
> +
> +	return rc;
> +}

Always use negative errno.h constants to indicate an error, or 0 for
success.

> +/**
> + * dmax_channel_timer_pop -  per_channel transfer timer function
> + * @data: the pointer to the channel structure
> + *
> + */
> +static void dmax_channel_timer_pop(unsigned long data)
> +{
> +	struct dmax_chn *chan_p = (struct dmax_chn *)data;
> +	queue_work(chan_p->dev->wq, &chan_p->timer_worker);
> +}
> +
> +
> +/**
> + * dmax_timer_work -  per_channel timer deferred work
> + * @work: pointer to the enqueued work_struct
> + *
> + */
> +static void dmax_timer_work(struct work_struct *work)
> +{
> +	struct dmax_chn *chan_p = container_of(work, struct dmax_chn, timer_worker);
> +	struct axon_d2d *d2d = chan_p->dev->axon_priv->local_d2d;
> +	int chan_num = chan_p->channel_num;
> +	u64 completed;
> +	u64 issued;
> +	unsigned long irq_flags;
> +	int timeout = 0;
> +
> +	spin_lock_irqsave(&chan_p->chan_lock, irq_flags);
> +	dmax_quick_cleanup(chan_p);
> +	completed = d2d->dma_channel_seq_complete[chan_num];
> +	issued = d2d->dma_channel_seq_issued[chan_num];
> +
> +	if (chan_p->completed || (completed == issued) ) {
> +		chan_p->completed = 0;
> +		chan_p->xfer_pop = 0;
> +	} else {
> +		if (issued == chan_p->xfer_pop) {
> +			timeout = 1;
> +		} else {
> +			chan_p->xfer_pop = issued;
> +		}
> +	}
> +
> +	spin_unlock_irqrestore(&chan_p->chan_lock, irq_flags);
> +
> +	if (timeout)
> +		dmax_xfer_timeout(dmax_seq_to_xfer(chan_p,issued));
> +	else {
> +		if (chan_p->available) {
> +			chan_p->timer.expires +=
> +				msecs_to_jiffies(chan_p->timeout_in_ms);
> +			add_timer(&chan_p->timer);
> +		}
> +	}
> +}

You can use queue_delayed_work() to write this in a simpler way, it
will set up a timer itself for calling the work function.

> +/**
> + * dmax_select_chnl - For a given transfer, select the channel to run on
> + * @xfer:
> + *
> + */
> +static inline int dmax_select_chnl(struct dmax *dmax_p, unsigned long flags)
> +{
> +	int chan;
> +	chan = dmax_p->last_chan?0:1;
> +	dmax_p->last_chan = chan;
> +	return chan;
> +}

the flags argument is unused, is that on purpose?

> +/**
> + * xfer_add_interrupt - enable a completion interrupt for the given xfer
> + * @xfer:
> + *
> + */
> +static void xfer_add_interrupt(struct dmax_xfer *xfer)
> +{
> +#ifdef CONFIG_TB_AXON_PCI
> +	/*
> +	 * we set up a dummy descriptor for this case
> +	 * move the source address to the right spot
> +	 */
> +	u32 addr_lo = (xfer->chan_p->plb5_int_addr) & 0xFFFFFFFF;
> +	xfer->last_desc->src_addr_lo = cpu_to_be32(addr_lo);
> +#else
> +	/* just tag the last descriptor with a BIE */
> +	xfer->last_desc->ccw |= cpu_to_be32(BIE);
> +#endif
> +}

For functions that behave in different ways on PCI and on the BIF,
you may need a callback to do the right stuff in the hardware specific
driver.

In this case, I don't understand what the difference is about, can you
explain?

> +/**
> + * dmax_intr_error_parse - determine the type of DMA error from the registers
> + * @dmax_p:
> + * @channel:
> + */
> +int dmax_intr_error_parse(struct dmax *dmax_p, int channel)
> +{
> +	int rc = 0;
> +	u32 temp_reg;
> +
> +	pr_info("dmax_intr[%d]: Got an error IRQ: status = 0x%x!!\n",
> +	    channel, dmax_reg_read(dmax_p, DMAX_SR));
> +	temp_reg = dmax_reg_read(dmax_p, DMAX_SR);
> +	if ( temp_reg & dmax_sr_error(channel) ) {
> +		rc = dmax_mask_sr_err(temp_reg, channel);
> +		pr_info("CHANNEL ERROR = 0x%x\n", rc);
> +		dmax_p->channel[channel].stats.perf.errors++;
> +
> +		switch (rc) {
> +		case WRITE_REQUEST:
> +			pr_info("Write Request Error.\n");
> +			dmax_p->channel[channel].stats.err.write_req++;
> +			break;
> +		case WRITE_DATA:
> +			pr_info("Write Data Error.\n");
> +			dmax_p->channel[channel].stats.err.write_data++;
> +			break;
> +		case READ_REQ_REG:
> +			pr_info("read request error\n");
> +			dmax_p->channel[channel].stats.err.read_req_reg++;
> +			break;
> +		case READ_DATA_REG:
> +			pr_info("read data error.\n");
> +			dmax_p->channel[channel].stats.err.read_data_req++;
> +			break;
> +		case READ_DATA_SG:
> +			pr_info("read data error: S/G or resume fetch.\n");
> +			dmax_p->channel[channel].stats.err.read_data_sg++;
> +			break;
> +		case ALIGN_SRC_DST:
> +			pr_info("address alignment error for source or dest.\n");
> +			dmax_p->channel[channel].stats.err.align++;
> +			break;
> +		case SG_ALIGN:
> +			pr_info("S/G address alignment error.\n");
> +			dmax_p->channel[channel].stats.err.sg_align++;
> +			break;
> +		}

As before, this can be done nicer with a small table of messages, and using
the error code as an index instead of the switch/case statement.

> +static void dmax_link_ring_descriptors(struct dmax *dmax_p,
> +					struct dmax_chn *chan_p,
> +					int chan_num)
> +{
> +	struct dmax_desc *desc;
> +	int num_descs = chan_p->desc_ring_size;
> +	u64 base_plb5_addr = dmax_phys_to_plb5(chan_p->dev, chan_p->desc_da);
> +	u32 base_plb5_addr_high = cpu_to_be32(base_plb5_addr >> 32);
> +	u32 base_plb5_addr_low = base_plb5_addr & 0xFFFFFFFF;

This looks wrong: If you do cpu_to_be32() on the upper half, why
not on the lower half here?

> +/**
> + * Client routines
> + */
> +
> +/**
> + * axon_request_dma - Perform a DMA
> + * @dmax_p: dma engine to use
> + * @req: DMA request structure
> + *
> + */
> +int axon_request_dma( struct dmax *dmax_p, struct dmax_dma_req *req)
> +{
> +	int rc = 0;
> +	struct dmax_xfer *xfer = NULL;
> +	u32 flags = req->flags;
> +
> +	pr_debug("DMA Request for %p of length 0x%lx\n",
> +	    dmax_p, (long unsigned int) req->length);
> +
> +	if (unlikely(!(flags & AXON_DMA_ASYNCH)))
> +		return -EINVAL;
> +
> +	if (unlikely(!(flags & (AXON_DMA_MR | AXON_DMA_NET))))
> +		return -EINVAL;
> +
> +	rc = dmax_validate_lists(dmax_p, req);
> +	if ( unlikely(rc) ) {
> +		return rc;
> +	}
> +
> +	rc = dmax_setup(dmax_p,
> +			req,
> +			&xfer);
> +	if (!rc) {
> +		rc = -EIOCBQUEUED;
> +	} else {
> +		if (rc != -EAGAIN) {
> +			printk( KERN_ERR "dmax_setup() returned errno = %d\n", rc);
> +		}
> +	}
> +
> +	return rc;
> +}
> +
> +/**
> + * axon_apnet_dma - Perform a DMA within the apnet memory area
> + * @dev_num: Device number ( typically 0 - 3 )
> + * @local_buf: Kernel virtual address of local memory
> + * @remote_offset: Offset within remote memory area
> + * @length: Length in bytes of transfer
> + * @direction: Use AXON_APNET_DMATYPE_xxx
> + * @local_status: local memory location of status value to write
> + * @remote_status_offset: local to write status value
> + *
> + */
> +int axon_apnet_dma(int dev_num, struct axon_apnet_dma_cb *cb)
> +{

Why is a DMA for apnet different from the other DMAs? Can you move
this function to the apnet driver and call axon_request_dma() from
there?

> +/**
> + *  @struct  struct dma_sg_list
> + * This structure contains the SG list for a DMA transfer
> + */
> +struct dma_sg_list {
> +	void *start_desc;	/* Starting descriptor */
> +	void *last_desc;	/* Final descriptor */
> +	u64 count;		/* Total bytes to transfer*/
> +	u64 num_descs;
> +
> +	struct axon_dma_list *src;	 /* Map of source addresses*/
> +	struct axon_dma_list *dest;	 /* Map of destination addresses*/
> +};

Why do you define a new sg_list instead of using the one from
linux/scatterlist.h?

> +/**
> + * dmax_find_channel -
> + * @irq:
> + *
> + */
> +static inline int dmax_find_channel(struct dmax *dmax_p, int irq)
> +{
> +	if (irq == dmax_p->irq[0])
> +		return DMA_0;
> +	if (irq == dmax_p->irq[1])
> +		return DMA_1;
> +	if (irq == dmax_p->irq[2])
> +		return DMA_2;
> +	if (irq == dmax_p->irq[3])
> +		return DMA_3;
> +	if (irq == dmax_p->irq[4]) {
> +		return DMA_0;
> +	}
> +	if (irq == dmax_p->irq[5]) {
> +		return DMA_1;
> +	}
> +	if (irq == dmax_p->irq[6]) {
> +		return DMA_2;
> +	}
> +	if (irq == dmax_p->irq[7]) {
> +		return DMA_3;
> +	}
> +
> +	pr_debug("No channel for IRQ: %d\n", irq);
> +	return -1;
> +}

Why do you need this function? You currently pass down the struct dmax
into the interrupt handler, why not pass a channel specific structure
instead?

> +	dmax_p->signature = dmax_signature;
> +	dmax_p->index = dmax_driver.dma_engines;
> +	dmax_p->dev = &(ofdev->dev);
> +	dmax_p->chn_shift = 0x0;
> +	sprintf((char *)&dmax_p->name[0], "dmax%d", dmax_driver.dma_engines);
> +	dmax_p->local_phys_plb5_addr = DMAX_BE_XDR_BASE;

DMAX_BE_XDR_BASE should not be hardcoded here. On Open Firmware, all the
addresses are encoded in the device tree.

> +	dmax_driver.dma_engines++;
> +
> +	tmp = ofdev->node->full_name;
> +	ret = of_address_to_resource(ofdev->node, 0, &r);
> +	if (!ret) {
> +		dmax_p->plb5_base_addr =
> +			(__u8 *)ioremap(r.start, r.end - r.start + 1);

You can use of_iomap() to do both of_address_to_resource and ioremap
in one call. The typecast is both unnecessary and incorrect, as
plb5_base_addr needs to be an __iomem pointer.

> +
> +#define DMAX_0	0
> +#define DMAX_1	1
> +#define DMAX_2	2
> +#define DMAX_3	3
> +#define NUM_DMAX_CHAN	4

These definitions look like they should go into the dmax driver, not a
header file, because they do not define an interface between two components
of the kernel.

> +#define DMAX_DDR0_BASE	0x0000000000000000
> +#define DMAX_DDR1_BASE	0x2000002000000000
> +
> +#define DMAX_BE_XDR_BASE 0x6000006000000000
> +#define DMAX_BE_XDR_BASE_HI 0x60000060

This does not look like an interface at all, but rather like something
that is hardware specific. In case of the PCI driver, it might go into
the driver implementation, while on the cell blade, it needs to come
from the device tree.

> +#define DMAX_IRQ_0	119
> +#define DMAX_IRQ_1	120
> +#define DMAX_IRQ_2	121
> +#define DMAX_IRQ_3	122
> +#define DMAX_IRQ_0_ERR	123
> +#define DMAX_IRQ_1_ERR	124
> +#define DMAX_IRQ_2_ERR	125
> +#define DMAX_IRQ_3_ERR	126
> +#define DMAX_SLAVE_ERR	127
> +#define DMAX_NUM_IRQ		9

These obviously have no place in a header file whatsoever. Interrupt
numbers are specific to the physical interrupt controller and do
not carry a significance in the device driver at all, they get remapped
into virtual interrupt numbers.

> +/*
> + * DMAX Descriptor - MUST by 64 bytes long
> + */
> +
> +struct dmax_desc{
> +	uint32_t ccw;		  /* 0x0 DMA Channel Control Word */
> +	uint32_t count_control;	  /* 0x4 Count and Control */
> +	uint32_t data_stride;	  /* 0x8 Data Stride */
> +	uint32_t reserved;	  /* 0xc */
> +	uint32_t src_addr_hi;	  /* 0x10 Source Address High */
> +	uint32_t src_addr_lo;	  /* 0x14 Source Address Low */
> +	uint32_t dest_addr_hi;	  /* 0x18 Destination Address High */
> +	uint32_t dest_addr_lo;	  /* 0x1c Destination Address Lo */
> +	uint32_t next_sg_addr_hi; /* 0x20 Linked Next Scatter/Gather
> +				     Descriptor Address High */
> +	uint32_t next_sg_addr_lo; /* 0x24 Linked Next Scatter/Gather
> +				     Descriptor Address Lo */
> +	uint32_t reserved2;	  /* 0x28 */
> +	uint32_t reserved3;	  /* 0x2c */
> +	void	 *next;		  /* 0x30 Must be cleared to reserved
> +				     prior to transfer */
> +	uint32_t reserved5;	  /* 0x38 */
> +	uint32_t reserved6;	  /* 0x3c */
> +};

If you build on a 32 bit host, the structure is 60 bytes long, not 64.
Since this is hardware specific, you should have some comment on
it, and use __be32 or __le32 data types to show the endianess.

> +/*
> + * DMA Channel Control Registers (DMA_CRn)
> + */
> +#define DMAX_CR(x)		((0x80*x) + 0x0)
> +
> +#define CE		0x80000000
> +#define BIE		0x40000000
> +#define CIS		0x10000000
> +#define CIM		0x08000000
> +#define PW		0x07000000
> +#define PW_BYTE		0x00000000
> +#define PW_HALF		0x01000000
> +#define PW_WORD		0x02000000
> +#define PW_DOUBLE	0x03000000
> +#define PW_QUAD		0x04000000
> +#define DMAX_PW_SHIFT	24
> +#define DSS		0x00200000
> +#define DSD		0x00100000
> +#define CP		0x00060000
> +#define SSC		0x00002000
> +#define DSC		0x00001000
> +#define BAR		0x00000001

These definitions should go into the driver, like many more of the
definitions below these.

Also, if these were defining an interface between kernel components,
the identifiers should be qualified with a prefix, so they don't
clash with other symbols.

> +#define DESC_BARRIER (CE | BAR)
> +
> +#ifdef CONFIG_TB_AXON_PCI
> +#define LAST_DESC_BARRIER DESC_BARRIER
> +#else
> +#define LAST_DESC_BARRIER (CE | BAR | BIE)
> +#endif

This #ifdef hurts portability, maybe you can express this value
as a member in the device structure.

> +
> +/**
> + * tbtime_t - time base time type
> + */
> +union tbtime_t {
> +	uint64_t tb;		/* full 64 bit portion of TBR */
> +	struct {
> +		uint32_t high;	/* High 32-bit portion of the time base */
> +		uint32_t low;	/* Low 32-bit portion of the time base */
> +	}half;
> +};
> +
> +
> +static inline void tbtime_get(union tbtime_t *tb)
> +{
> +	tb->tb = get_cycles();
> +}

What's the point in this abstraction? I can see no advantage over just
using the well-known get_cycles function directly in your driver.

	Arnd <><