[PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication
Dan Williams
dan.j.williams at intel.com
Thu Dec 18 05:34:19 EST 2008
Hi Yuri,
On Mon, Dec 8, 2008 at 2:55 PM, Yuri Tikhonov <yur at emcraft.com> wrote:
> This adds support for doing asynchronous GF multiplication by adding
> four additional functions to async_tx API:
>
> async_pq() does simultaneous XOR of sources and XOR of sources
> GF-multiplied by given coefficients.
>
> async_pq_zero_sum() checks whether the results of the calculations
> match the given ones.
>
> async_gen_syndrome() does simultaneous XOR and R/S syndrome of sources.
>
> async_syndrome_zero_sum() checks whether the results of the XOR/syndrome
> calculation match the given ones.
>
> The latter two functions just use async_pq() with the appropriate
> coefficients in the asynchronous case, but have significant
> optimizations in the synchronous case.
>
I like this separation of gen_syndrome and generic pq.
[..]
> + /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
> + dma_dest[0] = !blocks[src_cnt] ? 0 :
> + dma_map_page(dma->dev, blocks[src_cnt],
> + offset, len, DMA_BIDIRECTIONAL);
"0" could be a valid dma address on some architectures.
DMA_ERROR_CODE looks like the closest fit for what we are trying to do
here, but that only exists on sparc and powerpc. We could add a
"dest_mask" parameter to device_prep_dma_pq where the mask is 1 =
p-only, 2 = q-only, and 3 = p and q.
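Roughly, and only as an untested sketch (the DMA_PQ_DEST_* names are
just illustrative):

	/* dest_mask encoding: 1 = p-only, 2 = q-only, 3 = p and q */
	enum {
		DMA_PQ_DEST_P = (1 << 0),
		DMA_PQ_DEST_Q = (1 << 1),
	};

	struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
		struct dma_chan *chan, dma_addr_t *dst,
		unsigned char dest_mask, dma_addr_t *src,
		unsigned int src_cnt, unsigned char *scf,
		size_t len, unsigned long flags);

The caller then only maps the destinations selected by the mask and
never needs a sentinel dma address:

	unsigned char dest_mask = 0;

	if (blocks[src_cnt]) {
		dma_dest[0] = dma_map_page(dma->dev, blocks[src_cnt],
					   offset, len, DMA_BIDIRECTIONAL);
		dest_mask |= DMA_PQ_DEST_P;
	}
	if (blocks[src_cnt+1]) {
		dma_dest[1] = dma_map_page(dma->dev, blocks[src_cnt+1],
					   offset, len, DMA_BIDIRECTIONAL);
		dest_mask |= DMA_PQ_DEST_Q;
	}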
> + dma_dest[1] = !blocks[src_cnt+1] ? 0 :
> + dma_map_page(dma->dev, blocks[src_cnt+1],
> + offset, len, DMA_BIDIRECTIONAL);
> +
> + for (i = 0; i < src_cnt; i++)
> + dma_src[i] = dma_map_page(dma->dev, blocks[i],
> + offset, len, DMA_TO_DEVICE);
> +
> + while (src_cnt) {
> + async_flags = flags;
> + pq_src_cnt = min(src_cnt, dma->max_pq);
> + /* if we are submitting additional pqs, leave the chain open,
> + * clear the callback parameters, and leave the destination
> + * buffers mapped
> + */
> + if (src_cnt > pq_src_cnt) {
> + async_flags &= ~ASYNC_TX_ACK;
> + dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
> + _cb_fn = NULL;
> + _cb_param = NULL;
> + } else {
> + _cb_fn = cb_fn;
> + _cb_param = cb_param;
> + }
> + if (_cb_fn)
> + dma_flags |= DMA_PREP_INTERRUPT;
> +
> + /* Since we have clobbered the src_list we are committed
> + * to doing this asynchronously. Drivers force forward
> + * progress in case they can not provide a descriptor
> + */
> + tx = dma->device_prep_dma_pq(chan, dma_dest,
> + &dma_src[src_off], pq_src_cnt,
> + scf_list ? &scf_list[src_off] :
> + NULL,
> + len, dma_flags);
...one nit for readability: can we replace these ternary conditionals
with proper if-else statements? i.e.:

	if (scf_list)
		scf = &scf_list[src_off];
	else
		scf = NULL;

	tx = dma->device_prep_dma_pq(chan, dma_dest,
				     &dma_src[src_off], pq_src_cnt,
				     scf, len, dma_flags);
> + if (unlikely(!tx))
> + async_tx_quiesce(&depend_tx);
> +
> + /* spin wait for the preceding transactions to complete */
> + while (unlikely(!tx)) {
> + dma_async_issue_pending(chan);
> + tx = dma->device_prep_dma_pq(chan, dma_dest,
> + &dma_src[src_off], pq_src_cnt,
> + scf_list ? &scf_list[src_off] : NULL,
> + len, dma_flags);
> + }
> +
> + async_tx_submit(chan, tx, async_flags, depend_tx,
> + _cb_fn, _cb_param);
> +
> + depend_tx = tx;
> + flags |= ASYNC_TX_DEP_ACK;
> +
> + if (src_cnt > pq_src_cnt) {
> + /* drop completed sources */
> + src_cnt -= pq_src_cnt;
> + src_off += pq_src_cnt;
> +
> + /* use the intermediate result as a source; we
> + * clear the DMA_PREP_ZERO_{P,Q} flags, so prep_dma_pq
> + * will include the destination(s) in the calculations
> + */
> + dma_flags = 0;
> + } else
> + break;
> + }
> +
> + return tx;
> +}
> +
> +/**
> + * do_sync_pq - synchronously calculate P and Q
> + */
> +static void
> +do_sync_pq(struct page **blocks, unsigned char *scf, unsigned int offset,
> + int src_cnt, size_t len, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + int i, pos;
> + uint8_t *p, *q, *src;
> +
> + /* set destination addresses */
> + p = blocks[src_cnt] ?
> + (uint8_t *)(page_address(blocks[src_cnt]) + offset) :
> + NULL;
> + q = blocks[src_cnt+1] ?
> + (uint8_t *)(page_address(blocks[src_cnt+1]) + offset) :
> + NULL;
> +
...more ternary conditionals here that could be converted to if-else
> + if (flags & ASYNC_TX_PQ_ZERO_P) {
> + BUG_ON(!p);
> + memset(p, 0, len);
> + }
> +
> + if (flags & ASYNC_TX_PQ_ZERO_Q) {
> + BUG_ON(!q);
> + memset(q, 0, len);
> + }
> +
> + for (i = 0; i < src_cnt; i++) {
> + src = (uint8_t *)(page_address(blocks[i]) + offset);
> + for (pos = 0; pos < len; pos++) {
> + if (p)
> + p[pos] ^= src[pos];
> + if (q)
> + q[pos] ^= raid6_gfmul[scf[i]][src[pos]];
> + }
> + }
> + async_tx_sync_epilog(cb_fn, cb_param);
> +}
> +
> +/**
> + * async_pq - attempt to do XOR and Galois calculations in parallel using
> + * a dma engine.
> + * @blocks: source block array from 0 to (src_cnt-1) with the p destination
> + * at blocks[src_cnt] and q at blocks[src_cnt + 1]. Only one of two
> + * destinations may be present (the other must then be set to NULL).
> + * By default, the result of the calculations is XOR-ed with the initial
> + * content of the destination buffers. Use the ASYNC_TX_PQ_ZERO_x flags
> + * to avoid this.
> + * NOTE: client code must assume the contents of this array are destroyed
> + * @scf: array of source coefficients used in GF-multiplication
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT,
> + * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the operation completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_pq(struct page **blocks, unsigned char *scf,
> + unsigned int offset, int src_cnt, size_t len,
> + enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
> + &blocks[src_cnt], 2,
> + blocks, src_cnt, len);
> + struct dma_device *device = chan ? chan->device : NULL;
> + struct dma_async_tx_descriptor *tx = NULL;
> +
> + if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
> + return NULL;
> +
> + if (device) {
> + /* run pq asynchronously */
> + tx = do_async_pq(chan, blocks, scf, offset, src_cnt,
> + len, flags, depend_tx, cb_fn, cb_param);
> + } else {
> + /* run pq synchronously */
> + if (!blocks[src_cnt+1]) {
> + struct page *pdst = blocks[src_cnt];
> + int i;
> +
> + /* Calculate P-parity only.
> + * In contrast to async_xor(), async_pq() assumes
> + * that the destinations are included in the
> + * calculations, so we should re-arrange the xor
> + * src list to achieve similar behavior.
> + */
> + if (!(flags & ASYNC_TX_PQ_ZERO_P)) {
> + /* If the async_pq() user doesn't set the ZERO flag,
> + * it's assumed that the destination has some
> + * reasonable data to include in the calculations.
> + * The destination must be at position 0, so
> + * shift the sources and put pdst at the
> + * beginning of the list.
> + */
> + for (i = src_cnt - 1; i >= 0; i--)
> + blocks[i+1] = blocks[i];
> + blocks[0] = pdst;
> + src_cnt++;
> + flags |= ASYNC_TX_XOR_DROP_DST;
> + } else {
> + /* If the async_pq() user wants to clear P, then
> + * this will be done automatically in the async
> + * case, and with the help of ZERO_DST in
> + * the sync one.
> + */
> + flags &= ~ASYNC_TX_PQ_ZERO_P;
> + flags |= ASYNC_TX_XOR_ZERO_DST;
> + }
> +
> + return async_xor(pdst, blocks, offset,
> + src_cnt, len, flags, depend_tx,
> + cb_fn, cb_param);
> + }
> +
> + /* wait for any prerequisite operations */
> + async_tx_quiesce(&depend_tx);
> +
> + do_sync_pq(blocks, scf, offset, src_cnt, len, flags,
> + depend_tx, cb_fn, cb_param);
> + }
> +
> + return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_pq);
> +
> +/**
> + * do_sync_gen_syndrome - synchronously calculate P (xor) and Q (Reed-Solomon
> + * code)
> + */
> +static void
> +do_sync_gen_syndrome(struct page **blocks, unsigned int offset,
> + int src_cnt, size_t len, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + int i;
> + void *tsrc[src_cnt+2];
> +
> + for (i = 0; i < src_cnt + 2; i++)
> + tsrc[i] = page_address(blocks[i]) + offset;
> +
> + raid6_call.gen_syndrome(i, len, tsrc);
> +
> + async_tx_sync_epilog(cb_fn, cb_param);
> +}
> +
> +/**
> + * async_gen_syndrome - attempt to generate P (xor) and Q (Reed-Solomon code)
> + * with a dma engine for a given set of blocks. This routine assumes a
> + * field of GF(2^8) with a primitive polynomial of 0x11d and a generator
> + * of {02}.
> + * @blocks: source block array ordered from 0..src_cnt-1 with the P destination
> + * at blocks[src_cnt] and Q at blocks[src_cnt + 1]. Only one of two
> + * destinations may be present (the other must then be set to NULL).
> + * NOTE: client code must assume the contents of this array are destroyed
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages: 2 < src_cnt <= 255
> + * @len: length of blocks in bytes
> + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
> + * @depend_tx: P+Q operation depends on the result of this transaction.
> + * @cb_fn: function to call when P+Q generation completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
> + size_t len, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
> + &blocks[src_cnt], 2,
> + blocks, src_cnt, len);
> + struct dma_device *device = chan ? chan->device : NULL;
> + struct dma_async_tx_descriptor *tx = NULL;
> +
> + BUG_ON(src_cnt > 255 || (!blocks[src_cnt] && !blocks[src_cnt+1]));
> +
> + if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
> + return NULL;
> +
> + /* Synchronous gen_syndrome() doesn't take care of the destinations,
> + * but the asynchronous path treats them as sources; so, when generating
> + * syndromes, explicitly request that the destinations be cleared
> + */
> + if (blocks[src_cnt])
> + flags |= ASYNC_TX_PQ_ZERO_P;
> + if (blocks[src_cnt+1])
> + flags |= ASYNC_TX_PQ_ZERO_Q;
> +
> + if (device) {
> + /* run the xor asynchronously */
> + tx = do_async_pq(chan, blocks, (uint8_t *)raid6_gfexp,
> + offset, src_cnt, len, flags, depend_tx,
> + cb_fn, cb_param);
> + } else {
> + /* run the pq synchronously */
> + /* wait for any prerequisite operations */
> + async_tx_quiesce(&depend_tx);
> +
> + if (!blocks[src_cnt])
> + blocks[src_cnt] = spare_pages[2];
> + if (!blocks[src_cnt+1])
> + blocks[src_cnt+1] = spare_pages[2];
> + do_sync_gen_syndrome(blocks, offset, src_cnt, len, flags,
> + depend_tx, cb_fn, cb_param);
> + }
> +
> + return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_gen_syndrome);
> +
> +/**
> + * async_pq_zero_sum - attempt a PQ parities check with a dma engine.
> + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
> + * src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
> + * Only one of two destinations may be present.
> + * NOTE: client code must assume the contents of this array are destroyed
> + * @scf: coefficients to use in GF-multiplications
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @presult: where to store the result of the P-check, which is 0 if the
> + * P-parity is OK, and non-zero otherwise.
> + * @qresult: where to store the result of the Q-check, which is 0 if the
> + * Q-parity is OK, and non-zero otherwise.
> + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the xor completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_pq_zero_sum(struct page **blocks, unsigned char *scf,
> + unsigned int offset, int src_cnt, size_t len,
> + u32 *presult, u32 *qresult, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + struct dma_chan *chan = async_tx_find_channel(depend_tx,
> + DMA_PQ_ZERO_SUM,
> + &blocks[src_cnt], 2,
> + blocks, src_cnt, len);
> + struct dma_device *device = chan ? chan->device : NULL;
> + struct dma_async_tx_descriptor *tx = NULL;
> +
> + BUG_ON(src_cnt < 2);
> +
> + if (device && src_cnt <= device->max_pq) {
> + dma_addr_t dma_src[src_cnt + 2];
> + enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
> + int i;
> +
> + for (i = 0; i < src_cnt + 2; i++)
> + dma_src[i] = blocks[i] ? dma_map_page(device->dev,
> + blocks[i], offset, len,
> + DMA_TO_DEVICE) : 0;
If we go with the "dest_mask" approach to specifying p and q then we
need to separate them into their own parameter here... although in
this case it would be a "src_mask" to select p or q.
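i.e., something like (again just a sketch):

	struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
		struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
		unsigned char src_mask, unsigned char *scf, size_t len,
		u32 *presult, u32 *qresult, unsigned long flags);

...where src_mask reuses the 1 = p, 2 = q, 3 = p and q encoding to say
which of the two trailing "sources" are valid.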
> +
> + tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
> + scf, len,
> + presult, qresult,
> + dma_flags);
> +
> + if (unlikely(!tx)) {
> + async_tx_quiesce(&depend_tx);
> +
> + while (unlikely(!tx)) {
> + dma_async_issue_pending(chan);
> + tx = device->device_prep_dma_pqzero_sum(chan,
> + dma_src, src_cnt, scf, len,
> + presult, qresult,
> + dma_flags);
> + }
> + }
> +
> + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
> + } else {
> + struct page *pdest = blocks[src_cnt];
> + struct page *qdest = blocks[src_cnt + 1];
> + enum async_tx_flags lflags = flags;
> +
> + lflags &= ~ASYNC_TX_ACK;
> + lflags |= ASYNC_TX_PQ_ZERO_P | ASYNC_TX_PQ_ZERO_Q;
> +
> + spin_lock(&spare_lock);
> + blocks[src_cnt] = spare_pages[0];
> + blocks[src_cnt + 1] = spare_pages[1];
> + tx = async_pq(blocks, scf, offset, src_cnt, len, lflags,
> + depend_tx, NULL, NULL);
> +
> + async_tx_quiesce(&tx);
> +
> + if (presult && pdest)
> + *presult = memcmp(page_address(pdest) + offset,
> + page_address(spare_pages[0]) +
> + offset, len) == 0 ? 0 : 1;
> + if (qresult && qdest)
> + *qresult = memcmp(page_address(qdest) + offset,
> + page_address(spare_pages[1]) +
> + offset, len) == 0 ? 0 : 1;
> + spin_unlock(&spare_lock);
> + }
> +
> + return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_pq_zero_sum);
> +
> +/**
> + * async_syndrome_zero_sum - attempt a P (xor) and Q (Reed-Solomon code)
> + * parities check with a dma engine. This routine assumes a field of
> + * GF(2^8) with a primitive polynomial of 0x11d and a generator of {02}.
> + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
> + * src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
> + * Only one of two destinations may be present.
> + * NOTE: client code must assume the contents of this array are destroyed
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @presult: where to store the result of the P-check: 0 if the P-parity
> + * is OK, and non-zero otherwise.
> + * @qresult: where to store the result of the Q-check: 0 if the Q-parity
> + * is OK, and non-zero otherwise.
> + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the xor completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_syndrome_zero_sum(struct page **blocks, unsigned int offset,
> + int src_cnt, size_t len, u32 *presult, u32 *qresult,
> + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback cb_fn, void *cb_param)
> +{
> + struct dma_chan *chan = async_tx_find_channel(depend_tx,
> + DMA_PQ_ZERO_SUM,
> + &blocks[src_cnt], 2,
> + blocks, src_cnt, len);
> + struct dma_device *device = chan ? chan->device : NULL;
> + struct dma_async_tx_descriptor *tx = NULL;
> +
> + BUG_ON(src_cnt < 2);
> +
> + if (device && src_cnt <= device->max_pq) {
> + dma_addr_t dma_src[src_cnt + 2];
> + enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
> + int i;
> +
> + for (i = 0; i < src_cnt + 2; i++)
> + dma_src[i] = blocks[i] ? dma_map_page(device->dev,
> + blocks[i], offset, len,
> + DMA_TO_DEVICE) : 0;
> +
> + tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
> + (uint8_t *)raid6_gfexp,
> + len, presult, qresult,
> + dma_flags);
> +
> + if (unlikely(!tx)) {
> + async_tx_quiesce(&depend_tx);
> + while (unlikely(!tx)) {
> + dma_async_issue_pending(chan);
> + tx = device->device_prep_dma_pqzero_sum(chan,
> + dma_src, src_cnt,
> + (uint8_t *)raid6_gfexp, len,
> + presult, qresult,
> + dma_flags);
> + }
> + }
> +
> + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
> + } else {
> + struct page *pdest = blocks[src_cnt];
> + struct page *qdest = blocks[src_cnt + 1];
> + enum async_tx_flags lflags = flags;
> +
> + lflags &= ~ASYNC_TX_ACK;
> +
> + spin_lock(&spare_lock);
> + blocks[src_cnt] = spare_pages[0];
> + blocks[src_cnt + 1] = spare_pages[1];
> + tx = async_gen_syndrome(blocks, offset,
> + src_cnt, len, lflags,
> + depend_tx, NULL, NULL);
> + async_tx_quiesce(&tx);
> +
> + if (presult && pdest)
> + *presult = memcmp(page_address(pdest) + offset,
> + page_address(spare_pages[0]) +
> + offset, len) == 0 ? 0 : 1;
> + if (qresult && qdest)
> + *qresult = memcmp(page_address(qdest) + offset,
> + page_address(spare_pages[1]) +
> + offset, len) == 0 ? 0 : 1;
> + spin_unlock(&spare_lock);
> + }
> +
> + return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_syndrome_zero_sum);
> +
> +static int __init async_pq_init(void)
> +{
> + spin_lock_init(&spare_lock);
> +
> + spare_pages[0] = alloc_page(GFP_KERNEL);
> + if (!spare_pages[0])
> + goto abort;
> + spare_pages[1] = alloc_page(GFP_KERNEL);
> + if (!spare_pages[1])
> + goto abort;
> + spare_pages[2] = alloc_page(GFP_KERNEL);
> + if (!spare_pages[2])
> + goto abort;
> + return 0;
> +abort:
> + safe_put_page(spare_pages[2]);
> + safe_put_page(spare_pages[1]);
> + safe_put_page(spare_pages[0]);
> + printk(KERN_ERR "%s: cannot allocate spare!\n", __func__);
> + return -ENOMEM;
> +}
> +
> +static void __exit async_pq_exit(void)
> +{
> + safe_put_page(spare_pages[2]);
> + safe_put_page(spare_pages[1]);
> + safe_put_page(spare_pages[0]);
> +}
> +
> +module_init(async_pq_init);
> +module_exit(async_pq_exit);
> +
> +MODULE_AUTHOR("Yuri Tikhonov <yur at emcraft.com>");
> +MODULE_DESCRIPTION("asynchronous pq/pq-zero-sum api");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
> index 0f50d4c..5d6b639 100644
> --- a/include/linux/async_tx.h
> +++ b/include/linux/async_tx.h
> @@ -42,6 +42,12 @@ struct dma_chan_ref {
> * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where
> * the destination address is not a source. The asynchronous case handles this
> * implicitly, the synchronous case needs to zero the destination block.
> + * @ASYNC_TX_PQ_ZERO_P: this flag must be used for async_pq operations since
> + * there the destination is always also a source (if this flag isn't set,
> + * the result of P after async_pq is xor-ed with the previous content of
> + * the P block).
> + * @ASYNC_TX_PQ_ZERO_Q: this flag must be used for async_pq operations since
> + * there the destination is always also a source (if this flag isn't set,
> + * the result of Q after async_pq is xor-ed with the previous content of
> + * the Q block).
> * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
> * also one of the source addresses. In the synchronous case the destination
> * address is an implied source, whereas the asynchronous case it must be listed
> @@ -50,12 +56,17 @@ struct dma_chan_ref {
> * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
> * dependency chain
> * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining.
> + * @ASYNC_TX_ASYNC_ONLY: if set, then attempt the requested operation only
> + * in asynchronous mode.
> */
> enum async_tx_flags {
> ASYNC_TX_XOR_ZERO_DST = (1 << 0),
> - ASYNC_TX_XOR_DROP_DST = (1 << 1),
> - ASYNC_TX_ACK = (1 << 3),
> - ASYNC_TX_DEP_ACK = (1 << 4),
> + ASYNC_TX_PQ_ZERO_P = (1 << 1),
> + ASYNC_TX_PQ_ZERO_Q = (1 << 2),
> + ASYNC_TX_XOR_DROP_DST = (1 << 3),
> + ASYNC_TX_ACK = (1 << 4),
> + ASYNC_TX_DEP_ACK = (1 << 5),
> + ASYNC_TX_ASYNC_ONLY = (1 << 6),
> };
>
> #ifdef CONFIG_DMA_ENGINE
> @@ -146,5 +157,33 @@ async_trigger_callback(enum async_tx_flags flags,
> struct dma_async_tx_descriptor *depend_tx,
> dma_async_tx_callback cb_fn, void *cb_fn_param);
>
> +struct dma_async_tx_descriptor *
> +async_pqxor(struct page *pdest, struct page *qdest,
> + struct page **src_list, unsigned char *scoef_list,
> + unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback callback, void *callback_param);
> +
...forgot to update the declaration.
In this case async_pq() can be declared static since nothing outside
of async_pq.c calls it.
> +struct dma_async_tx_descriptor *
> +async_gen_syndrome(struct page *pdest, struct page *qdest,
> + struct page **src_list, unsigned int offset, int src_cnt, size_t len,
> + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback callback, void *callback_param);
> +
...forgot to update the declaration.
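i.e., it should match the definition in async_pq.c:

	struct dma_async_tx_descriptor *
	async_gen_syndrome(struct page **blocks, unsigned int offset,
		int src_cnt, size_t len, enum async_tx_flags flags,
		struct dma_async_tx_descriptor *depend_tx,
		dma_async_tx_callback cb_fn, void *cb_param);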
> +struct dma_async_tx_descriptor *
> +async_pqxor_zero_sum(struct page *pdest, struct page *qdest,
> + struct page **src_list, unsigned char *scoef_list,
> + unsigned int offset, int src_cnt, size_t len,
> + u32 *presult, u32 *qresult, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback callback, void *callback_param);
> +
...ditto
> +struct dma_async_tx_descriptor *
> +async_syndrome_zero_sum(struct page *pdest, struct page *qdest,
> + struct page **src_list, unsigned int offset, int src_cnt, size_t len,
> + u32 *presult, u32 *qresult, enum async_tx_flags flags,
> + struct dma_async_tx_descriptor *depend_tx,
> + dma_async_tx_callback callback, void *callback_param);
> +
...ditto again.
> void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
> #endif /* _ASYNC_TX_H_ */
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
> index adb0b08..84525c3 100644
> --- a/include/linux/dmaengine.h
> +++ b/include/linux/dmaengine.h
> @@ -81,7 +81,7 @@ enum dma_status {
> enum dma_transaction_type {
> DMA_MEMCPY,
> DMA_XOR,
> - DMA_PQ_XOR,
> + DMA_PQ,
> DMA_DUAL_XOR,
> DMA_PQ_UPDATE,
> DMA_ZERO_SUM,
> @@ -123,6 +123,8 @@ enum dma_ctrl_flags {
> DMA_CTRL_ACK = (1 << 1),
> DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
> DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
> + DMA_PREP_ZERO_P = (1 << 4),
> + DMA_PREP_ZERO_Q = (1 << 5),
> };
I would rather not add operation-type-specific flags to
dma_ctrl_flags. In this case can we set up a dependency chain with
async_memset()?
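i.e., roughly (an untested sketch against the existing async_memset()
API):

	/* zero the p destination with a chained memset instead of a
	 * DMA_PREP_ZERO_P control flag
	 */
	if (flags & ASYNC_TX_PQ_ZERO_P)
		depend_tx = async_memset(blocks[src_cnt], 0, offset, len,
					 ASYNC_TX_DEP_ACK, depend_tx,
					 NULL, NULL);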
>
> /**
> @@ -299,6 +301,7 @@ struct dma_async_tx_descriptor {
> * @global_node: list_head for global dma_device_list
> * @cap_mask: one or more dma_capability flags
> * @max_xor: maximum number of xor sources, 0 if no capability
> + * @max_pq: maximum number of PQ sources, 0 if no capability
> * @refcount: reference count
> * @done: IO completion struct
> * @dev_id: unique device ID
> @@ -308,7 +311,9 @@ struct dma_async_tx_descriptor {
> * @device_free_chan_resources: release DMA channel's resources
> * @device_prep_dma_memcpy: prepares a memcpy operation
> * @device_prep_dma_xor: prepares a xor operation
> + * @device_prep_dma_pq: prepares a pq operation
> * @device_prep_dma_zero_sum: prepares a zero_sum operation
> + * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation
> * @device_prep_dma_memset: prepares a memset operation
> * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
> * @device_prep_slave_sg: prepares a slave dma operation
> @@ -322,6 +327,7 @@ struct dma_device {
> struct list_head global_node;
> dma_cap_mask_t cap_mask;
> int max_xor;
> + int max_pq;
>
max_xor and max_pq can be changed to unsigned shorts to keep the size
of the struct the same.
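i.e.:

	unsigned short max_xor;
	unsigned short max_pq;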
> struct kref refcount;
> struct completion done;
> @@ -339,9 +345,17 @@ struct dma_device {
> struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
> struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
> unsigned int src_cnt, size_t len, unsigned long flags);
> + struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
> + struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
> + unsigned int src_cnt, unsigned char *scf,
> + size_t len, unsigned long flags);
> struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
> struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
> size_t len, u32 *result, unsigned long flags);
> + struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
> + struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
> + unsigned char *scf,
> + size_t len, u32 *presult, u32 *qresult, unsigned long flags);
I would rather we turn the 'result' parameter into a pointer to flags
where bit 0 is the xor/p result and bit 1 is the q result.
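Something like this, with purely illustrative names:

	enum pq_check_flags {
		PQ_CHECK_P_RESULT = (1 << 0),	/* set on p/xor mismatch */
		PQ_CHECK_Q_RESULT = (1 << 1),	/* set on q mismatch */
	};

	struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
		struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
		unsigned char *scf, size_t len, enum pq_check_flags *result,
		unsigned long flags);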
Thanks,
Dan