These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / drivers / dma / at_xdmac.c
index c89a7ab..02f9aa4 100644 (file)
 #define                AT_XDMAC_CC_WRIP        (0x1 << 23)     /* Write in Progress (read only) */
 #define                        AT_XDMAC_CC_WRIP_DONE           (0x0 << 23)
 #define                        AT_XDMAC_CC_WRIP_IN_PROGRESS    (0x1 << 23)
-#define                AT_XDMAC_CC_PERID(i)    (0x7f & (h) << 24)      /* Channel Peripheral Identifier */
+#define                AT_XDMAC_CC_PERID(i)    (0x7f & (i) << 24)      /* Channel Peripheral Identifier */
 #define AT_XDMAC_CDS_MSP       0x2C    /* Channel Data Stride Memory Set Pattern */
 #define AT_XDMAC_CSUS          0x30    /* Channel Source Microblock Stride */
 #define AT_XDMAC_CDUS          0x34    /* Channel Destination Microblock Stride */
 #define AT_XDMAC_MAX_CHAN      0x20
 #define AT_XDMAC_MAX_CSIZE     16      /* 16 data */
 #define AT_XDMAC_MAX_DWIDTH    8       /* 64 bits */
+#define AT_XDMAC_RESIDUE_MAX_RETRIES   5
 
 #define AT_XDMAC_DMA_BUSWIDTHS\
        (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -235,6 +236,10 @@ struct at_xdmac_lld {
        dma_addr_t      mbr_sa;         /* Source Address Member */
        dma_addr_t      mbr_da;         /* Destination Address Member */
        u32             mbr_cfg;        /* Configuration Register */
+       u32             mbr_bc;         /* Block Control Register */
+       u32             mbr_ds;         /* Data Stride Register */
+       u32             mbr_sus;        /* Source Microblock Stride Register */
+       u32             mbr_dus;        /* Destination Microblock Stride Register */
 };
 
 
@@ -355,16 +360,19 @@ static void at_xdmac_start_xfer(struct at_xdmac_chan *atchan,
         * descriptor view 2 since some fields of the configuration register
         * depend on transfer size and src/dest addresses.
         */
-       if (at_xdmac_chan_is_cyclic(atchan)) {
+       if (at_xdmac_chan_is_cyclic(atchan))
                reg = AT_XDMAC_CNDC_NDVIEW_NDV1;
-               at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg);
-       } else {
-               /*
-                * No need to write AT_XDMAC_CC reg, it will be done when the
-                * descriptor is fecthed.
-                */
+       else if (first->lld.mbr_ubc & AT_XDMAC_MBR_UBC_NDV3)
+               reg = AT_XDMAC_CNDC_NDVIEW_NDV3;
+       else
                reg = AT_XDMAC_CNDC_NDVIEW_NDV2;
-       }
+       /*
+        * Even if the register will be updated from the configuration in the
+        * descriptor when using view 2 or higher, the PROT bit won't be set
+        * properly. This bit can be modified only by using the channel
+        * configuration register.
+        */
+       at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg);
 
        reg |= AT_XDMAC_CNDC_NDDUP
               | AT_XDMAC_CNDC_NDSUP
@@ -448,6 +456,15 @@ static struct at_xdmac_desc *at_xdmac_alloc_desc(struct dma_chan *chan,
        return desc;
 }
 
+void at_xdmac_init_used_desc(struct at_xdmac_desc *desc)
+{
+       memset(&desc->lld, 0, sizeof(desc->lld));
+       INIT_LIST_HEAD(&desc->descs_list);
+       desc->direction = DMA_TRANS_NONE;
+       desc->xfer_size = 0;
+       desc->active_xfer = false;
+}
+
 /* Call must be protected by lock. */
 static struct at_xdmac_desc *at_xdmac_get_desc(struct at_xdmac_chan *atchan)
 {
@@ -459,12 +476,39 @@ static struct at_xdmac_desc *at_xdmac_get_desc(struct at_xdmac_chan *atchan)
                desc = list_first_entry(&atchan->free_descs_list,
                                        struct at_xdmac_desc, desc_node);
                list_del(&desc->desc_node);
-               desc->active_xfer = false;
+               at_xdmac_init_used_desc(desc);
        }
 
        return desc;
 }
 
+static void at_xdmac_queue_desc(struct dma_chan *chan,
+                               struct at_xdmac_desc *prev,
+                               struct at_xdmac_desc *desc)
+{
+       if (!prev || !desc)
+               return;
+
+       prev->lld.mbr_nda = desc->tx_dma_desc.phys;
+       prev->lld.mbr_ubc |= AT_XDMAC_MBR_UBC_NDE;
+
+       dev_dbg(chan2dev(chan), "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
+               __func__, prev, &prev->lld.mbr_nda);
+}
+
+static inline void at_xdmac_increment_block_count(struct dma_chan *chan,
+                                                 struct at_xdmac_desc *desc)
+{
+       if (!desc)
+               return;
+
+       desc->lld.mbr_bc++;
+
+       dev_dbg(chan2dev(chan),
+               "%s: incrementing the block count of the desc 0x%p\n",
+               __func__, desc);
+}
+
 static struct dma_chan *at_xdmac_xlate(struct of_phandle_args *dma_spec,
                                       struct of_dma *of_dma)
 {
@@ -591,12 +635,12 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                       unsigned int sg_len, enum dma_transfer_direction direction,
                       unsigned long flags, void *context)
 {
-       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
-       struct at_xdmac_desc    *first = NULL, *prev = NULL;
-       struct scatterlist      *sg;
-       int                     i;
-       unsigned int            xfer_size = 0;
-       unsigned long           irqflags;
+       struct at_xdmac_chan            *atchan = to_at_xdmac_chan(chan);
+       struct at_xdmac_desc            *first = NULL, *prev = NULL;
+       struct scatterlist              *sg;
+       int                             i;
+       unsigned int                    xfer_size = 0;
+       unsigned long                   irqflags;
        struct dma_async_tx_descriptor  *ret = NULL;
 
        if (!sgl)
@@ -655,7 +699,6 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2                       /* next descriptor view */
                        | AT_XDMAC_MBR_UBC_NDEN                                 /* next descriptor dst parameter update */
                        | AT_XDMAC_MBR_UBC_NSEN                                 /* next descriptor src parameter update */
-                       | (i == sg_len - 1 ? 0 : AT_XDMAC_MBR_UBC_NDE)          /* descriptor fetch */
                        | (len >> fixed_dwidth);                                /* microblock length */
                desc->lld.mbr_cfg = (atchan->cfg & ~AT_XDMAC_CC_DWIDTH_MASK) |
                                    AT_XDMAC_CC_DWIDTH(fixed_dwidth);
@@ -664,12 +707,8 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                         __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc);
 
                /* Chain lld. */
-               if (prev) {
-                       prev->lld.mbr_nda = desc->tx_dma_desc.phys;
-                       dev_dbg(chan2dev(chan),
-                                "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
-                                __func__, prev, &prev->lld.mbr_nda);
-               }
+               if (prev)
+                       at_xdmac_queue_desc(chan, prev, desc);
 
                prev = desc;
                if (!first)
@@ -749,7 +788,6 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
                desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV1
                        | AT_XDMAC_MBR_UBC_NDEN
                        | AT_XDMAC_MBR_UBC_NSEN
-                       | AT_XDMAC_MBR_UBC_NDE
                        | period_len >> at_xdmac_get_dwidth(desc->lld.mbr_cfg);
 
                dev_dbg(chan2dev(chan),
@@ -757,12 +795,8 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
                         __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc);
 
                /* Chain lld. */
-               if (prev) {
-                       prev->lld.mbr_nda = desc->tx_dma_desc.phys;
-                       dev_dbg(chan2dev(chan),
-                                "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
-                                __func__, prev, &prev->lld.mbr_nda);
-               }
+               if (prev)
+                       at_xdmac_queue_desc(chan, prev, desc);
 
                prev = desc;
                if (!first)
@@ -773,10 +807,7 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
                list_add_tail(&desc->desc_node, &first->descs_list);
        }
 
-       prev->lld.mbr_nda = first->tx_dma_desc.phys;
-       dev_dbg(chan2dev(chan),
-               "%s: chain lld: prev=0x%p, mbr_nda=%pad\n",
-               __func__, prev, &prev->lld.mbr_nda);
+       at_xdmac_queue_desc(chan, prev, first);
        first->tx_dma_desc.flags = flags;
        first->xfer_size = buf_len;
        first->direction = direction;
@@ -784,6 +815,217 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
        return &first->tx_dma_desc;
 }
 
+static inline u32 at_xdmac_align_width(struct dma_chan *chan, dma_addr_t addr)
+{
+       u32 width;
+
+       /*
+        * Check address alignment to select the greater data width we
+        * can use.
+        *
+        * Some XDMAC implementations don't provide dword transfer, in
+        * this case selecting dword has the same behavior as
+        * selecting word transfers.
+        */
+       if (!(addr & 7)) {
+               width = AT_XDMAC_CC_DWIDTH_DWORD;
+               dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
+       } else if (!(addr & 3)) {
+               width = AT_XDMAC_CC_DWIDTH_WORD;
+               dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
+       } else if (!(addr & 1)) {
+               width = AT_XDMAC_CC_DWIDTH_HALFWORD;
+               dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
+       } else {
+               width = AT_XDMAC_CC_DWIDTH_BYTE;
+               dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
+       }
+
+       return width;
+}
+
+static struct at_xdmac_desc *
+at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
+                               struct at_xdmac_chan *atchan,
+                               struct at_xdmac_desc *prev,
+                               dma_addr_t src, dma_addr_t dst,
+                               struct dma_interleaved_template *xt,
+                               struct data_chunk *chunk)
+{
+       struct at_xdmac_desc    *desc;
+       u32                     dwidth;
+       unsigned long           flags;
+       size_t                  ublen;
+       /*
+        * WARNING: The channel configuration is set here since there is no
+        * dmaengine_slave_config call in this case. Moreover we don't know the
+        * direction, it involves we can't dynamically set the source and dest
+        * interface so we have to use the same one. Only interface 0 allows EBI
+        * access. Hopefully we can access DDR through both ports (at least on
+        * SAMA5D4x), so we can use the same interface for source and dest,
+        * that solves the fact we don't know the direction.
+        */
+       u32                     chan_cc = AT_XDMAC_CC_DIF(0)
+                                       | AT_XDMAC_CC_SIF(0)
+                                       | AT_XDMAC_CC_MBSIZE_SIXTEEN
+                                       | AT_XDMAC_CC_TYPE_MEM_TRAN;
+
+       dwidth = at_xdmac_align_width(chan, src | dst | chunk->size);
+       if (chunk->size >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) {
+               dev_dbg(chan2dev(chan),
+                       "%s: chunk too big (%d, max size %lu)...\n",
+                       __func__, chunk->size,
+                       AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth);
+               return NULL;
+       }
+
+       if (prev)
+               dev_dbg(chan2dev(chan),
+                       "Adding items at the end of desc 0x%p\n", prev);
+
+       if (xt->src_inc) {
+               if (xt->src_sgl)
+                       chan_cc |=  AT_XDMAC_CC_SAM_UBS_AM;
+               else
+                       chan_cc |=  AT_XDMAC_CC_SAM_INCREMENTED_AM;
+       }
+
+       if (xt->dst_inc) {
+               if (xt->dst_sgl)
+                       chan_cc |=  AT_XDMAC_CC_DAM_UBS_AM;
+               else
+                       chan_cc |=  AT_XDMAC_CC_DAM_INCREMENTED_AM;
+       }
+
+       spin_lock_irqsave(&atchan->lock, flags);
+       desc = at_xdmac_get_desc(atchan);
+       spin_unlock_irqrestore(&atchan->lock, flags);
+       if (!desc) {
+               dev_err(chan2dev(chan), "can't get descriptor\n");
+               return NULL;
+       }
+
+       chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
+
+       ublen = chunk->size >> dwidth;
+
+       desc->lld.mbr_sa = src;
+       desc->lld.mbr_da = dst;
+       desc->lld.mbr_sus = dmaengine_get_src_icg(xt, chunk);
+       desc->lld.mbr_dus = dmaengine_get_dst_icg(xt, chunk);
+
+       desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3
+               | AT_XDMAC_MBR_UBC_NDEN
+               | AT_XDMAC_MBR_UBC_NSEN
+               | ublen;
+       desc->lld.mbr_cfg = chan_cc;
+
+       dev_dbg(chan2dev(chan),
+               "%s: lld: mbr_sa=%pad, mbr_da=%pad, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n",
+               __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da,
+               desc->lld.mbr_ubc, desc->lld.mbr_cfg);
+
+       /* Chain lld. */
+       if (prev)
+               at_xdmac_queue_desc(chan, prev, desc);
+
+       return desc;
+}
+
+static struct dma_async_tx_descriptor *
+at_xdmac_prep_interleaved(struct dma_chan *chan,
+                         struct dma_interleaved_template *xt,
+                         unsigned long flags)
+{
+       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
+       struct at_xdmac_desc    *prev = NULL, *first = NULL;
+       dma_addr_t              dst_addr, src_addr;
+       size_t                  src_skip = 0, dst_skip = 0, len = 0;
+       struct data_chunk       *chunk;
+       int                     i;
+
+       if (!xt || !xt->numf || (xt->dir != DMA_MEM_TO_MEM))
+               return NULL;
+
+       /*
+        * TODO: Handle the case where we have to repeat a chain of
+        * descriptors...
+        */
+       if ((xt->numf > 1) && (xt->frame_size > 1))
+               return NULL;
+
+       dev_dbg(chan2dev(chan), "%s: src=%pad, dest=%pad, numf=%d, frame_size=%d, flags=0x%lx\n",
+               __func__, &xt->src_start, &xt->dst_start,       xt->numf,
+               xt->frame_size, flags);
+
+       src_addr = xt->src_start;
+       dst_addr = xt->dst_start;
+
+       if (xt->numf > 1) {
+               first = at_xdmac_interleaved_queue_desc(chan, atchan,
+                                                       NULL,
+                                                       src_addr, dst_addr,
+                                                       xt, xt->sgl);
+
+               /* Length of the block is (BLEN+1) microblocks. */
+               for (i = 0; i < xt->numf - 1; i++)
+                       at_xdmac_increment_block_count(chan, first);
+
+               dev_dbg(chan2dev(chan), "%s: add desc 0x%p to descs_list 0x%p\n",
+                       __func__, first, first);
+               list_add_tail(&first->desc_node, &first->descs_list);
+       } else {
+               for (i = 0; i < xt->frame_size; i++) {
+                       size_t src_icg = 0, dst_icg = 0;
+                       struct at_xdmac_desc *desc;
+
+                       chunk = xt->sgl + i;
+
+                       dst_icg = dmaengine_get_dst_icg(xt, chunk);
+                       src_icg = dmaengine_get_src_icg(xt, chunk);
+
+                       src_skip = chunk->size + src_icg;
+                       dst_skip = chunk->size + dst_icg;
+
+                       dev_dbg(chan2dev(chan),
+                               "%s: chunk size=%d, src icg=%d, dst icg=%d\n",
+                               __func__, chunk->size, src_icg, dst_icg);
+
+                       desc = at_xdmac_interleaved_queue_desc(chan, atchan,
+                                                              prev,
+                                                              src_addr, dst_addr,
+                                                              xt, chunk);
+                       if (!desc) {
+                               list_splice_init(&first->descs_list,
+                                                &atchan->free_descs_list);
+                               return NULL;
+                       }
+
+                       if (!first)
+                               first = desc;
+
+                       dev_dbg(chan2dev(chan), "%s: add desc 0x%p to descs_list 0x%p\n",
+                               __func__, desc, first);
+                       list_add_tail(&desc->desc_node, &first->descs_list);
+
+                       if (xt->src_sgl)
+                               src_addr += src_skip;
+
+                       if (xt->dst_sgl)
+                               dst_addr += dst_skip;
+
+                       len += chunk->size;
+                       prev = desc;
+               }
+       }
+
+       first->tx_dma_desc.cookie = -EBUSY;
+       first->tx_dma_desc.flags = flags;
+       first->xfer_size = len;
+
+       return &first->tx_dma_desc;
+}
+
 static struct dma_async_tx_descriptor *
 at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                         size_t len, unsigned long flags)
@@ -815,24 +1057,7 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
        if (unlikely(!len))
                return NULL;
 
-       /*
-        * Check address alignment to select the greater data width we can use.
-        * Some XDMAC implementations don't provide dword transfer, in this
-        * case selecting dword has the same behavior as selecting word transfers.
-        */
-       if (!((src_addr | dst_addr) & 7)) {
-               dwidth = AT_XDMAC_CC_DWIDTH_DWORD;
-               dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
-       } else if (!((src_addr | dst_addr)  & 3)) {
-               dwidth = AT_XDMAC_CC_DWIDTH_WORD;
-               dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
-       } else if (!((src_addr | dst_addr) & 1)) {
-               dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD;
-               dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
-       } else {
-               dwidth = AT_XDMAC_CC_DWIDTH_BYTE;
-               dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
-       }
+       dwidth = at_xdmac_align_width(chan, src_addr | dst_addr);
 
        /* Prepare descriptors. */
        while (remaining_size) {
@@ -862,19 +1087,9 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                dev_dbg(chan2dev(chan), "%s: xfer_size=%zu\n", __func__, xfer_size);
 
                /* Check remaining length and change data width if needed. */
-               if (!((src_addr | dst_addr | xfer_size) & 7)) {
-                       dwidth = AT_XDMAC_CC_DWIDTH_DWORD;
-                       dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__);
-               } else if (!((src_addr | dst_addr | xfer_size)  & 3)) {
-                       dwidth = AT_XDMAC_CC_DWIDTH_WORD;
-                       dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__);
-               } else if (!((src_addr | dst_addr | xfer_size) & 1)) {
-                       dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD;
-                       dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__);
-               } else if ((src_addr | dst_addr | xfer_size) & 1) {
-                       dwidth = AT_XDMAC_CC_DWIDTH_BYTE;
-                       dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__);
-               }
+               dwidth = at_xdmac_align_width(chan,
+                                             src_addr | dst_addr | xfer_size);
+               chan_cc &= ~AT_XDMAC_CC_DWIDTH_MASK;
                chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
 
                ublen = xfer_size >> dwidth;
@@ -885,7 +1100,6 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2
                        | AT_XDMAC_MBR_UBC_NDEN
                        | AT_XDMAC_MBR_UBC_NSEN
-                       | (remaining_size ? AT_XDMAC_MBR_UBC_NDE : 0)
                        | ublen;
                desc->lld.mbr_cfg = chan_cc;
 
@@ -894,12 +1108,8 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                         __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc, desc->lld.mbr_cfg);
 
                /* Chain lld. */
-               if (prev) {
-                       prev->lld.mbr_nda = desc->tx_dma_desc.phys;
-                       dev_dbg(chan2dev(chan),
-                                "%s: chain lld: prev=0x%p, mbr_nda=0x%08x\n",
-                                __func__, prev, prev->lld.mbr_nda);
-               }
+               if (prev)
+                       at_xdmac_queue_desc(chan, prev, desc);
 
                prev = desc;
                if (!first)
@@ -916,6 +1126,255 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
        return &first->tx_dma_desc;
 }
 
+static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
+                                                        struct at_xdmac_chan *atchan,
+                                                        dma_addr_t dst_addr,
+                                                        size_t len,
+                                                        int value)
+{
+       struct at_xdmac_desc    *desc;
+       unsigned long           flags;
+       size_t                  ublen;
+       u32                     dwidth;
+       /*
+        * WARNING: The channel configuration is set here since there is no
+        * dmaengine_slave_config call in this case. Moreover we don't know the
+        * direction, it involves we can't dynamically set the source and dest
+        * interface so we have to use the same one. Only interface 0 allows EBI
+        * access. Hopefully we can access DDR through both ports (at least on
+        * SAMA5D4x), so we can use the same interface for source and dest,
+        * that solves the fact we don't know the direction.
+        */
+       u32                     chan_cc = AT_XDMAC_CC_DAM_UBS_AM
+                                       | AT_XDMAC_CC_SAM_INCREMENTED_AM
+                                       | AT_XDMAC_CC_DIF(0)
+                                       | AT_XDMAC_CC_SIF(0)
+                                       | AT_XDMAC_CC_MBSIZE_SIXTEEN
+                                       | AT_XDMAC_CC_MEMSET_HW_MODE
+                                       | AT_XDMAC_CC_TYPE_MEM_TRAN;
+
+       dwidth = at_xdmac_align_width(chan, dst_addr);
+
+       if (len >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) {
+               dev_err(chan2dev(chan),
+                       "%s: Transfer too large, aborting...\n",
+                       __func__);
+               return NULL;
+       }
+
+       spin_lock_irqsave(&atchan->lock, flags);
+       desc = at_xdmac_get_desc(atchan);
+       spin_unlock_irqrestore(&atchan->lock, flags);
+       if (!desc) {
+               dev_err(chan2dev(chan), "can't get descriptor\n");
+               return NULL;
+       }
+
+       chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
+
+       ublen = len >> dwidth;
+
+       desc->lld.mbr_da = dst_addr;
+       desc->lld.mbr_ds = value;
+       desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3
+               | AT_XDMAC_MBR_UBC_NDEN
+               | AT_XDMAC_MBR_UBC_NSEN
+               | ublen;
+       desc->lld.mbr_cfg = chan_cc;
+
+       dev_dbg(chan2dev(chan),
+               "%s: lld: mbr_da=%pad, mbr_ds=%pad, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n",
+               __func__, &desc->lld.mbr_da, &desc->lld.mbr_ds, desc->lld.mbr_ubc,
+               desc->lld.mbr_cfg);
+
+       return desc;
+}
+
+struct dma_async_tx_descriptor *
+at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
+                        size_t len, unsigned long flags)
+{
+       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
+       struct at_xdmac_desc    *desc;
+
+       dev_dbg(chan2dev(chan), "%s: dest=%pad, len=%d, pattern=0x%x, flags=0x%lx\n",
+               __func__, &dest, len, value, flags);
+
+       if (unlikely(!len))
+               return NULL;
+
+       desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value);
+       list_add_tail(&desc->desc_node, &desc->descs_list);
+
+       desc->tx_dma_desc.cookie = -EBUSY;
+       desc->tx_dma_desc.flags = flags;
+       desc->xfer_size = len;
+
+       return &desc->tx_dma_desc;
+}
+
+static struct dma_async_tx_descriptor *
+at_xdmac_prep_dma_memset_sg(struct dma_chan *chan, struct scatterlist *sgl,
+                           unsigned int sg_len, int value,
+                           unsigned long flags)
+{
+       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
+       struct at_xdmac_desc    *desc, *pdesc = NULL,
+                               *ppdesc = NULL, *first = NULL;
+       struct scatterlist      *sg, *psg = NULL, *ppsg = NULL;
+       size_t                  stride = 0, pstride = 0, len = 0;
+       int                     i;
+
+       if (!sgl)
+               return NULL;
+
+       dev_dbg(chan2dev(chan), "%s: sg_len=%d, value=0x%x, flags=0x%lx\n",
+               __func__, sg_len, value, flags);
+
+       /* Prepare descriptors. */
+       for_each_sg(sgl, sg, sg_len, i) {
+               dev_dbg(chan2dev(chan), "%s: dest=%pad, len=%d, pattern=0x%x, flags=0x%lx\n",
+                       __func__, &sg_dma_address(sg), sg_dma_len(sg),
+                       value, flags);
+               desc = at_xdmac_memset_create_desc(chan, atchan,
+                                                  sg_dma_address(sg),
+                                                  sg_dma_len(sg),
+                                                  value);
+               if (!desc && first)
+                       list_splice_init(&first->descs_list,
+                                        &atchan->free_descs_list);
+
+               if (!first)
+                       first = desc;
+
+               /* Update our strides */
+               pstride = stride;
+               if (psg)
+                       stride = sg_dma_address(sg) -
+                               (sg_dma_address(psg) + sg_dma_len(psg));
+
+               /*
+                * The scatterlist API gives us only the address and
+                * length of each elements.
+                *
+                * Unfortunately, we don't have the stride, which we
+                * will need to compute.
+                *
+                * That make us end up in a situation like this one:
+                *    len    stride    len    stride    len
+                * +-------+        +-------+        +-------+
+                * |  N-2  |        |  N-1  |        |   N   |
+                * +-------+        +-------+        +-------+
+                *
+                * We need all these three elements (N-2, N-1 and N)
+                * to actually take the decision on whether we need to
+                * queue N-1 or reuse N-2.
+                *
+                * We will only consider N if it is the last element.
+                */
+               if (ppdesc && pdesc) {
+                       if ((stride == pstride) &&
+                           (sg_dma_len(ppsg) == sg_dma_len(psg))) {
+                               dev_dbg(chan2dev(chan),
+                                       "%s: desc 0x%p can be merged with desc 0x%p\n",
+                                       __func__, pdesc, ppdesc);
+
+                               /*
+                                * Increment the block count of the
+                                * N-2 descriptor
+                                */
+                               at_xdmac_increment_block_count(chan, ppdesc);
+                               ppdesc->lld.mbr_dus = stride;
+
+                               /*
+                                * Put back the N-1 descriptor in the
+                                * free descriptor list
+                                */
+                               list_add_tail(&pdesc->desc_node,
+                                             &atchan->free_descs_list);
+
+                               /*
+                                * Make our N-1 descriptor pointer
+                                * point to the N-2 since they were
+                                * actually merged.
+                                */
+                               pdesc = ppdesc;
+
+                       /*
+                        * Rule out the case where we don't have
+                        * pstride computed yet (our second sg
+                        * element)
+                        *
+                        * We also want to catch the case where there
+                        * would be a negative stride,
+                        */
+                       } else if (pstride ||
+                                  sg_dma_address(sg) < sg_dma_address(psg)) {
+                               /*
+                                * Queue the N-1 descriptor after the
+                                * N-2
+                                */
+                               at_xdmac_queue_desc(chan, ppdesc, pdesc);
+
+                               /*
+                                * Add the N-1 descriptor to the list
+                                * of the descriptors used for this
+                                * transfer
+                                */
+                               list_add_tail(&desc->desc_node,
+                                             &first->descs_list);
+                               dev_dbg(chan2dev(chan),
+                                       "%s: add desc 0x%p to descs_list 0x%p\n",
+                                       __func__, desc, first);
+                       }
+               }
+
+               /*
+                * If we are the last element, just see if we have the
+                * same size than the previous element.
+                *
+                * If so, we can merge it with the previous descriptor
+                * since we don't care about the stride anymore.
+                */
+               if ((i == (sg_len - 1)) &&
+                   sg_dma_len(psg) == sg_dma_len(sg)) {
+                       dev_dbg(chan2dev(chan),
+                               "%s: desc 0x%p can be merged with desc 0x%p\n",
+                               __func__, desc, pdesc);
+
+                       /*
+                        * Increment the block count of the N-1
+                        * descriptor
+                        */
+                       at_xdmac_increment_block_count(chan, pdesc);
+                       pdesc->lld.mbr_dus = stride;
+
+                       /*
+                        * Put back the N descriptor in the free
+                        * descriptor list
+                        */
+                       list_add_tail(&desc->desc_node,
+                                     &atchan->free_descs_list);
+               }
+
+               /* Update our descriptors */
+               ppdesc = pdesc;
+               pdesc = desc;
+
+               /* Update our scatter pointers */
+               ppsg = psg;
+               psg = sg;
+
+               len += sg_dma_len(sg);
+       }
+
+       first->tx_dma_desc.cookie = -EBUSY;
+       first->tx_dma_desc.flags = flags;
+       first->xfer_size = len;
+
+       return &first->tx_dma_desc;
+}
+
 static enum dma_status
 at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                struct dma_tx_state *txstate)
@@ -925,8 +1384,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
        struct at_xdmac_desc    *desc, *_desc;
        struct list_head        *descs_list;
        enum dma_status         ret;
-       int                     residue;
-       u32                     cur_nda, mask, value;
+       int                     residue, retry;
+       u32                     cur_nda, check_nda, cur_ubc, mask, value;
        u8                      dwidth = 0;
        unsigned long           flags;
 
@@ -963,7 +1422,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                        cpu_relax();
        }
 
+       /*
+        * When processing the residue, we need to read two registers but we
+        * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where
+        * we stand in the descriptor list and AT_XDMAC_CUBC is used
+        * to know how many data are remaining for the current descriptor.
+        * Since the dma channel is not paused to not loose data, between the
+        * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of
+        * descriptor.
+        * For that reason, after reading AT_XDMAC_CUBC, we check if we are
+        * still using the same descriptor by reading a second time
+        * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to
+        * read again AT_XDMAC_CUBC.
+        * Memory barriers are used to ensure the read order of the registers.
+        * A max number of retries is set because unlikely it can never ends if
+        * we are transferring a lot of data with small buffers.
+        */
        cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+       rmb();
+       cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+               rmb();
+               check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+
+               if (likely(cur_nda == check_nda))
+                       break;
+
+               cur_nda = check_nda;
+               rmb();
+               cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       }
+
+       if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
+               ret = DMA_ERROR;
+               goto spin_unlock;
+       }
+
        /*
         * Remove size of all microblocks already transferred and the current
         * one. Then add the remaining size to transfer of the current
@@ -976,7 +1470,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
                        break;
        }
-       residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
+       residue += cur_ubc << dwidth;
 
        dma_set_residue(txstate, residue);
 
@@ -1230,6 +1724,7 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
        list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
                at_xdmac_remove_xfer(atchan, desc);
 
+       clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
        clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
        spin_unlock_irqrestore(&atchan->lock, flags);
 
@@ -1362,6 +1857,8 @@ static int atmel_xdmac_resume(struct device *dev)
                atchan = to_at_xdmac_chan(chan);
                at_xdmac_chan_write(atchan, AT_XDMAC_CC, atchan->save_cc);
                if (at_xdmac_chan_is_cyclic(atchan)) {
+                       if (at_xdmac_chan_is_paused(atchan))
+                               at_xdmac_device_resume(chan);
                        at_xdmac_chan_write(atchan, AT_XDMAC_CNDA, atchan->save_cnda);
                        at_xdmac_chan_write(atchan, AT_XDMAC_CNDC, atchan->save_cndc);
                        at_xdmac_chan_write(atchan, AT_XDMAC_CIE, atchan->save_cim);
@@ -1446,7 +1943,10 @@ static int at_xdmac_probe(struct platform_device *pdev)
        }
 
        dma_cap_set(DMA_CYCLIC, atxdmac->dma.cap_mask);
+       dma_cap_set(DMA_INTERLEAVE, atxdmac->dma.cap_mask);
        dma_cap_set(DMA_MEMCPY, atxdmac->dma.cap_mask);
+       dma_cap_set(DMA_MEMSET, atxdmac->dma.cap_mask);
+       dma_cap_set(DMA_MEMSET_SG, atxdmac->dma.cap_mask);
        dma_cap_set(DMA_SLAVE, atxdmac->dma.cap_mask);
        /*
         * Without DMA_PRIVATE the driver is not able to allocate more than
@@ -1459,7 +1959,10 @@ static int at_xdmac_probe(struct platform_device *pdev)
        atxdmac->dma.device_tx_status                   = at_xdmac_tx_status;
        atxdmac->dma.device_issue_pending               = at_xdmac_issue_pending;
        atxdmac->dma.device_prep_dma_cyclic             = at_xdmac_prep_dma_cyclic;
+       atxdmac->dma.device_prep_interleaved_dma        = at_xdmac_prep_interleaved;
        atxdmac->dma.device_prep_dma_memcpy             = at_xdmac_prep_dma_memcpy;
+       atxdmac->dma.device_prep_dma_memset             = at_xdmac_prep_dma_memset;
+       atxdmac->dma.device_prep_dma_memset_sg          = at_xdmac_prep_dma_memset_sg;
        atxdmac->dma.device_prep_slave_sg               = at_xdmac_prep_slave_sg;
        atxdmac->dma.device_config                      = at_xdmac_device_config;
        atxdmac->dma.device_pause                       = at_xdmac_device_pause;