/*
 * Driver for DMW96 Generic DMA Controller
 *
 * Copyright (C) 2011 DSPG Technologies GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/ccu.h>
#include <linux/clk.h>
#include <linux/dma-mapping.h>
#include <linux/dmapool.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

#include <linux/dmw96dma.h>

/*
 * Registers
 *
 * Byte offsets from the ioremap()ed controller base. The per-register notes
 * describe only how this driver uses them; the authoritative definition is
 * the hardware manual (TODO confirm names against it).
 */
#define DMA_REGLEN 0x200	/* size of the mapped register window */

#define CEMSR      0x000	/* polled in cancel until the channel bit clears */
#define CER        0x004	/* written with a channel mask to start channel(s) */
#define CDR        0x008	/* written with a channel mask to stop channel(s) */
#define CTOR       0x00C	/* loaded with the platform supplied timeout */
#define CCIER      0x010	/* irq enable mask, paired with CCICR/CCICLR */
#define CDIER      0x014	/* irq enable mask, paired with CDICR/CDICLR */
#define CCICR      0x018	/* read into irq_td (per descriptor completion) */
#define CDICR      0x01C	/* read into irq_done (whole list completion) */
#define CCICLR     0x020	/* write-to-clear for CCICR bits */
#define CDICLR     0x024	/* write-to-clear for CDICR bits */
#define CRDR       0x028	/* zeroed at probe; not used otherwise */
#define CTOENR     0x02C	/* irq enable mask for timeouts */
#define CTERENR    0x030	/* irq enable mask for transfer errors */
#define CTOCR      0x034	/* read into irq_tmo */
#define CTECR      0x038	/* read into irq_err */
#define CTOCLR     0x03C	/* write-to-clear for CTOCR bits */
#define CTECLR     0x040	/* write-to-clear for CTECR bits */
/* CRCONF1..6: channel -> hw_channel routing, six 5-bit fields per register */
#define CRCONF1    0x044
#define CRCONF2    0x048
#define CRCONF3    0x04C
#define CRCONF4    0x050
#define CRCONF5    0x054
#define CRCONF6    0x058
#define CDPR       0x05C	/* loaded with pdata->desc_port at probe */
#define C0DESCADR  0x060	/* first of 32 descriptor address registers (stride 4) */
#define C0RTC      0x0E0	/* not referenced by this driver */

/*
 * Transfer descriptor mode fields
 *
 * Bits of hw_td.mode. The notes describe how this driver uses each bit; the
 * exact hardware semantics should be confirmed against the manual.
 */
#define TD_DONE     (1ul <<  1)	/* tested to detect a finished descriptor */
#define TD_2D_PORT  (1ul <<  8)	/* set for rotate transfers if pdata->desc_port */
#define TD_CHAIN_CH (1ul << 12)	/* set on the 2nd descriptor of a chained memcpy */
#define TD_HW_PORT  (1ul << 20)	/* selects which port carries the memory buffer */
#define TD_TB_PORT  (1ul << 21)	/* memcpy: temporary buffer is on port B */
#define TD_TMO_EN   (1ul << 28)	/* channel config: timeout enabled */
#define TD_INT_EN   (1ul << 29)	/* request an interrupt for this descriptor */
#define TD_OWNER    (1ul << 30)	/* set before handing a descriptor to the hw */
#define TD_DIR      (1ul << 31)	/* transfer direction */

/*
 * Run the following statement once for every set bit in @bits, lowest bit
 * first, with @i holding the bit number.
 *
 * @bits is consumed: it is shifted right until it becomes zero, so pass a
 * scratch copy. It must be an unsigned lvalue, otherwise the shift may never
 * reach zero.
 *
 * The "if (!...) {} else" tail makes the macro safe inside an unbraced
 * if/else: a following "else" can no longer bind to the macro's own "if".
 * "continue" in the body advances to the next bit.
 */
#define for_each_bit(i, bits) \
	for ((i) = 0; (bits) != 0; (i)++, (bits) >>= 1) \
		if (!((bits) & 1)) {} else

/*
 * Pseudo hardware channel numbers. Real I/O channels are 0..31; these two
 * index the extra slots of dmw96dma_dev.configs[] / priorities[] that hold
 * the settings for image rotation and memory-to-memory copies.
 */
#define ROTATE_HW_CHANNEL  32
#define MEMCPY_HW_CHANNEL  33

/* Cached per hw_channel configuration, see dmw96dma_config_set(). */
struct ch_cfg {
	u32 mode;	/* template for hw_td.mode (TD_* flags and bit fields) */
	u32 base;	/* fixed bus address used on the non-memory port */
};

/* Driver state of the (single) DMA controller instance. */
struct dmw96dma_dev {
	struct platform_device *pdev;

	void __iomem *regs;	/* ioremap()ed register window */
	struct clk   *clk;	/* "hclk", enabled while transfers are in flight */

	struct dma_pool *td_pool;	/* pool of struct hw_td descriptors */
	struct tasklet_struct tasklet_done;	/* bottom half of the "done" irq */
	struct tasklet_struct tasklet_err;	/* bottom half of the "error" irq */
	spinlock_t lock;	/* protects running[], pending[] and the irq_* words */
	struct dmw96dma_list *running[32];	/* active list per channel or NULL */
	struct list_head pending[32];	/* queued lists per channel */

	/* status bits collected by the irq handlers, consumed by the tasklets */
	u32 irq_td;
	u32 irq_done;
	u32 irq_tmo;
	u32 irq_err;

	/* indexed by hw_channel: 0..31, ROTATE_HW_CHANNEL, MEMCPY_HW_CHANNEL */
	struct ch_cfg configs[34];
	u8 priorities[34];	/* hw_channel -> channel; 0xff if unassigned */
};


/*
 * Transfer descriptor as fetched by the controller. Instances are allocated
 * from dmw96dma_dev.td_pool. The field order is presumably dictated by the
 * hardware - do not reorder (TODO confirm against the manual).
 */
struct hw_td {
	u32 port_a;	/* bus address for port A */
	u32 port_b;	/* bus address for port B */
	u32 size;	/* transfer size; packed block counts for rotations */
	u32 mode;	/* TD_* flags and mode bit fields */
	u32 next;	/* bus address of the next descriptor, 0 terminates */
};

/*
 * Software bookkeeping for one transfer. Wraps the hardware descriptor(s)
 * together with the buffers that have to be (un)mapped around the transfer.
 */
struct td {
	struct td *next;	/* next transfer of the list (list head if circular) */
	struct hw_td *hw_td;	/* primary hardware descriptor */
	dma_addr_t hw_td_phys;
	struct hw_td *chain_hw_td;	/* 2nd descriptor of a chained memcpy, or NULL */
	dma_addr_t chain_hw_td_phys;

	/*
	 * Port A buffer. buffer_a is a CPU address that init_td() maps into
	 * dma_buffer_a; if buffer_a is NULL, dma_buffer_a was set by the
	 * creator and is used unchanged.
	 */
	size_t size_a;
	void *buffer_a;
	dma_addr_t dma_buffer_a;
	enum dma_data_direction dir_a;

	/* Port B buffer, same rules as for port A. */
	size_t size_b;
	void *buffer_b;
	dma_addr_t dma_buffer_b;
	enum dma_data_direction dir_b;

	dmw96dma_callback done;	/* optional per transfer completion callback */
	void *context;		/* opaque argument passed to done() */
};

/* A list of transfers which is submitted to the controller as one unit. */
struct dmw96dma_list {
	struct list_head link;	/* entry in dmw96dma_dev.pending[channel] */

	/*
	 * Single bit flags. They are unsigned on purpose: a signed one-bit
	 * field can only represent 0 and -1.
	 */
	unsigned int circular : 1;	/* last transfer links back to the first */
	unsigned int discontinuous : 1;	/* restarted by software after each "done" irq */

	unsigned int hw_channel;
	unsigned int channel; /* != 0 if submitted */

	dmw96dma_callback finish;	/* optional callback when the list ends */
	void *context;			/* opaque argument passed to finish() */

	struct td *head, *tail;
	struct td *cur;		/* first transfer that has not been retired yet */
};

/*
 * The one and only controller instance. Assigned by dmw96dma_probe(); the
 * exported API returns -ENODEV while it is NULL.
 */
static struct dmw96dma_dev *_dmw96dma_dev = NULL;

/*****************************************************************************/


static void init_td(struct dmw96dma_dev *dev, struct td *td)
{
	u32 mode;

	/* update mode */
	mode = td->hw_td->mode;
	mode &= ~TD_DONE;
	mode |=  TD_OWNER;
	td->hw_td->mode = mode;

	/* map CPU buffer(s) */
	if (td->buffer_a)
		td->dma_buffer_a = dma_map_single(&dev->pdev->dev, td->buffer_a,
			td->size_a, td->dir_a);
	if (td->buffer_b)
		td->dma_buffer_b = dma_map_single(&dev->pdev->dev, td->buffer_b,
			td->size_b, td->dir_b);

	/* assign physical addresses to transfer descriptors */
	if (unlikely(td->chain_hw_td)) {
		mode = td->chain_hw_td->mode;
		mode &= ~TD_DONE;
		mode |=  TD_OWNER;
		td->chain_hw_td->mode = mode;

		if (mode & TD_TB_PORT) {
			/* temporary buffer on port B */
			td->hw_td->port_a = td->dma_buffer_a;
			td->chain_hw_td->port_a = td->dma_buffer_b;
		} else {
			/* temporary buffer on port A */
			td->hw_td->port_b = td->dma_buffer_a;
			td->chain_hw_td->port_b = td->dma_buffer_b;
		}
	} else {
		td->hw_td->port_a = td->dma_buffer_a;
		td->hw_td->port_b = td->dma_buffer_b;
	}
}


/*
 * Prepare every transfer of @list and rewind list->cur to the head. The walk
 * stops at the NULL terminator or, for circular lists, when it arrives at the
 * head again.
 */
static void init_td_list(struct dmw96dma_dev *dev, struct dmw96dma_list *list)
{
	struct td *first = list->head;
	struct td *pos = first;

	list->cur = first;
	if (!pos)
		return;

	do {
		init_td(dev, pos);
		pos = pos->next;
	} while (pos && pos != first);
}


/*
 * Complete a single transfer: unmap the buffers that init_td() mapped and
 * invoke the per transfer callback, if any, with @err as status.
 *
 * Called both with dev->lock held (tasklets) and without it (cancel path).
 *
 * NOTE(review): spin_is_locked() only tells that *somebody* holds the lock,
 * not that the caller does. When called unlocked while another CPU holds
 * dev->lock this releases a lock it does not own. A proper fix needs the
 * callers to pass down whether they hold the lock.
 */
static void finish_td(struct dmw96dma_dev *dev, struct td *td, int err)
{
	/* a NULL buffer_x means the bus address was supplied by the creator */
	if (td->buffer_a)
		dma_unmap_single(&dev->pdev->dev, td->dma_buffer_a, td->size_a, td->dir_a);
	if (td->buffer_b)
		dma_unmap_single(&dev->pdev->dev, td->dma_buffer_b, td->size_b, td->dir_b);

	/* Invoke callback if any. We might have to drop the lock at this point
	 * because the callee may submit or cancel new transfers from the
	 * callback.
	 */
	if (td->done) {
		if (spin_is_locked(&dev->lock)) {
			spin_unlock(&dev->lock);
			td->done(err, td->context);
			spin_lock(&dev->lock);
		} else
			td->done(err, td->context);
	}
}


/*
 * Retire all transfers at the front of @list which the hardware has flagged
 * as done. Transfers of circular lists are re-armed right after completion
 * so that they can be used in the next round.
 *
 * list->cur is re-read on every iteration because finish_td() may drop the
 * lock while it runs the completion callback.
 */
static void retire_td_list(struct dmw96dma_dev *dev, struct dmw96dma_list *list)
{
	struct td *done_td;

	for (done_td = list->cur; done_td != NULL; done_td = list->cur) {
		/* stop at the first descriptor the hardware has not finished */
		if (!(done_td->hw_td->mode & TD_DONE))
			return;

		/* advance before the callback gets a chance to run */
		list->cur = done_td->next;
		finish_td(dev, done_td, 0);

		if (list->circular)
			init_td(dev, done_td);
	}
}


/*
 * Complete all transfers of @list that have not been retired yet (starting
 * at list->cur) with status @err and invoke the list's finish callback.
 *
 * The walk ends at the NULL terminator or, for circular lists, when it is
 * back at the starting transfer.
 *
 * NOTE(review): same spin_is_locked() caveat as in finish_td() - the check
 * cannot tell whether the *caller* holds dev->lock.
 */
static void finish_td_list(struct dmw96dma_dev *dev, struct dmw96dma_list *list, int err)
{
	struct td *td, *stop;

	td = stop = list->cur;
	while (td) {
		finish_td(dev, td, err);
		td = td->next;
		if (td == stop)
			break;
	}

	/* drop the lock around the callback, it may submit or cancel lists */
	if (list->finish) {
		if (spin_is_locked(&dev->lock)) {
			spin_unlock(&dev->lock);
			list->finish(err, list->context);
			spin_lock(&dev->lock);
		} else
			list->finish(err, list->context);
	}
}


/*
 * Start @list on @channel, beginning with list->cur.
 *
 * Transfers with a chained descriptor occupy two adjacent channels: the
 * primary descriptor runs on @channel, the chained one on @channel - 1, and
 * both are enabled with a single register write. Callers must therefore
 * guarantee channel >= 1 for such transfers.
 *
 * The enable masks are built with unsigned constants: "1 << 31" and
 * "3 << 30" would overflow a signed int.
 */
static void start_list(struct dmw96dma_dev *dev, struct dmw96dma_list *list,
	int channel)
{
	u32 mask;

	dev->running[channel] = list;

	writel(list->cur->hw_td_phys, dev->regs + C0DESCADR + channel*4);
	if (list->cur->chain_hw_td_phys) {
		writel(list->cur->chain_hw_td_phys,
			dev->regs + C0DESCADR + (channel-1)*4);
		mask = 3U << (channel - 1);
	} else
		mask = 1U << channel;

	writel(mask, dev->regs + CER);
}


/*
 * Dequeue the oldest list waiting for @channel, if there is one, and start
 * it. The caller is responsible for the channel being idle.
 */
static void check_pending(struct dmw96dma_dev *dev, int channel)
{
	struct list_head *queue = &dev->pending[channel];
	struct dmw96dma_list *next;

	if (!list_empty(queue)) {
		next = list_first_entry(queue, struct dmw96dma_list, link);
		list_del(&next->link);
		start_list(dev, next, channel);
	}
}


/*
 * Bottom half of the "done" interrupt.
 *
 * Takes a snapshot of the completion bits collected by dmw96dma_irq_done(),
 * retires finished transfers and completes finished lists. The bit number
 * of each flag is the controller channel number.
 */
static void dmw96dma_tasklet_done(unsigned long data)
{
	struct dmw96dma_dev *dev = (struct dmw96dma_dev *)data;
	unsigned long flags;
	u32 irq_td, irq_done;
	int channel;

	/* fetch and reset the collected flags */
	spin_lock_irqsave(&dev->lock, flags);
	irq_td   = dev->irq_td;   dev->irq_td   = 0;
	irq_done = dev->irq_done; dev->irq_done = 0;
	spin_unlock_irqrestore(&dev->lock, flags);

	/*
	 * ATTENTION: The following call will guarantee that all memory
	 * transactions indicated by the device irq flags have actually hit the
	 * memory. Just process the flags which have been sampled. When reading
	 * the status again any new IRQ flags MUST NOT be processed until
	 * ccu_barrier() is called again!
	 */
	ccu_barrier();

	spin_lock(&dev->lock);

	/* finished some transfer descriptor(s) */
	for_each_bit(channel, irq_td) {
		if (dev->running[channel])
			retire_td_list(dev, dev->running[channel]);
		else
			dev_warn(&dev->pdev->dev, "Ghost TD irq on channel %d\n",
				channel);
	}

	/* finished whole transfer list(s) */
	for_each_bit(channel, irq_done) {
		if (dev->running[channel]) {
			int err;
			struct dmw96dma_list *list = dev->running[channel];

			/*
			 * Discontinuous lists are not linked in hardware
			 * across all transfers: retire what is done and
			 * restart the channel with the remaining part.
			 */
			if (list->discontinuous) {
				retire_td_list(dev, list);
				if (list->cur) {
					start_list(dev, list, channel);
					continue;
				}
			}

			/* channel is free again, start the next queued list */
			dev->running[channel] = NULL;
			check_pending(dev, channel);

			list->channel = 0;
			if (list->circular)
				err = -EPIPE;	/* system too slow! */
			else
				err = 0;

			/* balances the clk_enable() of dmw96dma_submit() */
			clk_disable(dev->clk);
			finish_td_list(dev, list, err);
		} else
			dev_warn(&dev->pdev->dev, "Ghost DONE irq on channel %d\n",
				channel);
	}

	spin_unlock(&dev->lock);
}


/*
 * Bottom half of the "error" interrupt.
 *
 * Takes a snapshot of the timeout and error bits collected by
 * dmw96dma_irq_err() and aborts the list running on every flagged channel.
 * Transfer errors are reported as -EIO, timeouts as -EBUSY.
 */
static void dmw96dma_tasklet_err(unsigned long data)
{
	struct dmw96dma_dev *dev = (struct dmw96dma_dev *)data;
	unsigned long flags;
	u32 irq_err, irq_end;
	int channel;

	/* fetch and reset the collected flags */
	spin_lock_irqsave(&dev->lock, flags);
	irq_end  = dev->irq_tmo;  dev->irq_tmo  = 0;
	irq_err  = dev->irq_err;  dev->irq_err  = 0; irq_end |= irq_err;
	spin_unlock_irqrestore(&dev->lock, flags);

	/*
	 * ATTENTION: The following call will guarantee that all memory
	 * transactions indicated by the device irq flags have actually hit the
	 * memory. Just process the flags which have been sampled. When reading
	 * the status again any new IRQ flags MUST NOT be processed until
	 * ccu_barrier() is called again!
	 */
	ccu_barrier();

	spin_lock(&dev->lock);

	/* irq_end is consumed by the iteration, irq_err stays intact */
	for_each_bit(channel, irq_end) {
		if (dev->running[channel]) {
			int err;
			struct dmw96dma_list *list = dev->running[channel];

			/* channel is free again, start the next queued list */
			dev->running[channel] = NULL;
			check_pending(dev, channel);

			list->channel = 0;
			/* unsigned shift: "1 << 31" overflows a signed int */
			if (irq_err & (1U << channel))
				err = -EIO;
			else
				err = -EBUSY;

			/* balances the clk_enable() of dmw96dma_submit() */
			clk_disable(dev->clk);
			finish_td_list(dev, list, err);
		} else
			dev_warn(&dev->pdev->dev, "Ghost ERROR irq on channel %d\n",
				channel);
	}

	spin_unlock(&dev->lock);
}


static irqreturn_t dmw96dma_irq_done(int irq, void *dev_id)
{
	struct dmw96dma_dev *dev = dev_id;

	dev->irq_td |= readl(dev->regs + CCICR);
	writel(dev->irq_td, dev->regs + CCICLR);
	dev->irq_done |= readl(dev->regs + CDICR);
	writel(dev->irq_done, dev->regs + CDICLR);

	tasklet_schedule(&dev->tasklet_done);

	return IRQ_HANDLED;
}


static irqreturn_t dmw96dma_irq_err(int irq, void *dev_id)
{
	struct dmw96dma_dev *dev = dev_id;

	dev->irq_tmo |= readl(dev->regs + CTOCR);
	writel(dev->irq_tmo, dev->regs + CTOCLR);
	dev->irq_err |= readl(dev->regs + CTECR);
	writel(dev->irq_err, dev->regs + CTECLR);

	tasklet_schedule(&dev->tasklet_err);

	return IRQ_HANDLED;
}


/*
 * Enable the completion, timeout and error interrupts of all channels except
 * the one which executes the chained (2nd) descriptor of memcpy transfers.
 * That is the channel right below the memcpy channel, see start_list().
 *
 * The bit is built with an unsigned constant because "1 << 31" overflows a
 * signed int.
 */
static void dmw96dma_int_enable(struct dmw96dma_dev *dev)
{
	u32 chain_bit = 1U << (dev->priorities[MEMCPY_HW_CHANNEL] - 1);
	u32 mask = ~chain_bit;

	writel(mask, dev->regs + CCIER);
	writel(mask, dev->regs + CDIER);
	writel(mask, dev->regs + CTOENR);
	writel(mask, dev->regs + CTERENR);
}


static void dmw96dma_int_disable(struct dmw96dma_dev *dev)
{
	writel(0, dev->regs + CCIER);
	writel(0, dev->regs + CDIER);
	writel(0, dev->regs + CTOENR);
	writel(0, dev->regs + CTERENR);
}


/*****************************************************************************/


/*
 * Allocate an empty transfer list for a peripheral (I/O) channel.
 *
 * @circular:   non-zero to link the last transfer back to the first one
 * @hw_channel: hardware channel, 0..31
 * @finish:     optional callback invoked when the list ends
 * @context:    opaque argument for @finish
 *
 * Returns the new list, or NULL if @hw_channel is out of range or memory is
 * exhausted. Release with dmw96dma_free_list().
 */
struct dmw96dma_list *dmw96dma_alloc_io_list(int circular,
	unsigned int hw_channel, dmw96dma_callback finish, void *context)
{
	struct dmw96dma_list *list;

	if (hw_channel > 31)
		return NULL;

	list = kzalloc(sizeof(*list), GFP_KERNEL);
	if (list) {
		/*
		 * Normalize to 0/1: the flag is a one-bit field, so an even
		 * non-zero value such as 2 would be stored as "not circular".
		 */
		list->circular   = !!circular;
		list->hw_channel = hw_channel;
		list->finish     = finish;
		list->context    = context;
	}

	return list;
}
EXPORT_SYMBOL(dmw96dma_alloc_io_list);


/*
 * Allocate an empty transfer list for image rotations.
 *
 * @finish:  optional callback invoked when the list ends
 * @context: opaque argument for @finish
 *
 * Returns the new list or NULL if memory is exhausted.
 */
struct dmw96dma_list *dmw96dma_alloc_image_list(dmw96dma_callback finish,
	void *context)
{
	struct dmw96dma_list *img_list = kzalloc(sizeof(*img_list), GFP_KERNEL);

	if (!img_list)
		return NULL;

	img_list->context    = context;
	img_list->finish     = finish;
	img_list->hw_channel = ROTATE_HW_CHANNEL;

	return img_list;
}
EXPORT_SYMBOL(dmw96dma_alloc_image_list);


/*
 * Allocate an empty transfer list for memory-to-memory copies. Such lists
 * are flagged as discontinuous.
 *
 * @finish:  optional callback invoked when the list ends
 * @context: opaque argument for @finish
 *
 * Returns the new list or NULL if memory is exhausted.
 */
struct dmw96dma_list *dmw96dma_alloc_memcpy_list(dmw96dma_callback finish,
	void *context)
{
	struct dmw96dma_list *cpy_list = kzalloc(sizeof(*cpy_list), GFP_KERNEL);

	if (!cpy_list)
		return NULL;

	cpy_list->context       = context;
	cpy_list->finish        = finish;
	cpy_list->hw_channel    = MEMCPY_HW_CHANNEL;
	cpy_list->discontinuous = 1;

	return cpy_list;
}
EXPORT_SYMBOL(dmw96dma_alloc_memcpy_list);


/*
 * Free a transfer list together with all of its transfers.
 *
 * Returns 0 on success (freeing a NULL list is a no-op, like kfree()),
 * -ENODEV if the controller is not available and -EBUSY if the list is
 * currently submitted; cancel it first in that case.
 */
int dmw96dma_free_list(struct dmw96dma_list *list)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	struct td *td, *head;

	if (!dev)
		return -ENODEV;
	if (!list)
		return 0;
	if (list->channel)
		return -EBUSY;

	/* free transfer descriptors */
	td = head = list->head;
	while (td) {
		/* fetch the successor before the node is freed */
		struct td *next = td->next;

		dma_pool_free(dev->td_pool, td->hw_td, td->hw_td_phys);
		if (td->chain_hw_td)
			dma_pool_free(dev->td_pool, td->chain_hw_td,
				td->chain_hw_td_phys);
		kfree(td);

		/* circular lists end where they started */
		td = next;
		if (td == head)
			break;
	}

	/* free the list */
	kfree(list);

	return 0;
}
EXPORT_SYMBOL(dmw96dma_free_list);


/*
 * Append one peripheral transfer to an I/O list.
 *
 * The memory buffer is given either as CPU address (@buf_virt, mapped when
 * the list is submitted) or as bus address (@buf_phys, with @buf_virt being
 * NULL). The other end of the transfer is the base address configured for
 * the hardware channel of the list.
 *
 * Returns 0 on success, -ENODEV without controller, -EINVAL if the list is
 * not an I/O list and -ENOMEM if an allocation failed.
 */
static int dmw96dma_add_io_transfer(struct dmw96dma_list *list, void *buf_virt,
	dma_addr_t buf_phys, size_t length, dmw96dma_callback done,
	void *context)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	struct ch_cfg *chan_cfg;
	struct td *xfer;
	u32 flags;

	if (!dev)
		return -ENODEV;
	if (list->hw_channel > 31)
		return -EINVAL;

	/* allocate the software and the hardware descriptor */
	xfer = kzalloc(sizeof(*xfer), GFP_KERNEL);
	if (!xfer)
		return -ENOMEM;

	xfer->size_a  = length;
	xfer->size_b  = length;
	xfer->done    = done;
	xfer->context = context;
	xfer->hw_td   = dma_pool_alloc(dev->td_pool, GFP_KERNEL,
		&xfer->hw_td_phys);
	if (!xfer->hw_td) {
		kfree(xfer);
		return -ENOMEM;
	}
	memset(xfer->hw_td, 0, sizeof(*xfer->hw_td));

	/* append to the list, in software as well as in hardware */
	if (!list->head) {
		list->head = list->tail = xfer;
	} else {
		list->tail->next = xfer;
		list->tail->hw_td->next = xfer->hw_td_phys;
		list->tail = xfer;
	}

	/* circular lists always wrap around to the first transfer */
	if (list->circular) {
		xfer->next = list->head;
		xfer->hw_td->next = list->head->hw_td_phys;
	}

	/* the channel configuration decides which port gets the buffer */
	chan_cfg = &dev->configs[list->hw_channel];
	flags = chan_cfg->mode;
	if (flags & TD_HW_PORT) {
		xfer->buffer_a  = buf_virt;
		xfer->dma_buffer_a = buf_phys;
		xfer->dma_buffer_b = chan_cfg->base;
		xfer->dir_a = (flags & TD_DIR) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	} else {
		xfer->buffer_b  = buf_virt;
		xfer->dma_buffer_b = buf_phys;
		xfer->dma_buffer_a = chan_cfg->base;
		xfer->dir_b = (flags & TD_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
	}

	/*
	 * Circular lists need an interrupt per transfer even without a
	 * callback, otherwise the descriptor would never be re-armed for the
	 * next round.
	 */
	if (done || list->circular)
		flags |= TD_INT_EN;
	xfer->hw_td->mode = flags;
	xfer->hw_td->size = length;

	return 0;
}


/*
 * Append a peripheral transfer whose memory buffer is given as CPU address.
 * The buffer is DMA-mapped when the list is submitted. See
 * dmw96dma_add_io_transfer() for the return values.
 */
int dmw96dma_add_io_transfer_buf(struct dmw96dma_list *list, void *buffer,
	size_t length, dmw96dma_callback done, void *context)
{
	int ret;

	/* no bus address yet, it is filled in by the mapping */
	ret = dmw96dma_add_io_transfer(list, buffer, 0, length, done, context);

	return ret;
}
EXPORT_SYMBOL(dmw96dma_add_io_transfer_buf);


/*
 * Append a peripheral transfer whose memory buffer is given as bus address.
 * The driver does not map or unmap such a buffer. See
 * dmw96dma_add_io_transfer() for the return values.
 */
int dmw96dma_add_io_transfer_coherent(struct dmw96dma_list *list, dma_addr_t buffer,
	size_t length, dmw96dma_callback done, void *context)
{
	int ret;

	/* a NULL CPU address tells the core to use the bus address as is */
	ret = dmw96dma_add_io_transfer(list, NULL, buffer, length, done, context);

	return ret;
}
EXPORT_SYMBOL(dmw96dma_add_io_transfer_coherent);


/*
 * Append an image rotation to an image list.
 *
 * @list:     list created by dmw96dma_alloc_image_list()
 * @src:      source image
 * @dst:      destination image, must have the same format as @src
 * @src_rect: part of the source to rotate, NULL for the whole image
 * @dst_rect: part of the destination to write, NULL for the whole image
 * @degree:   rotation; for everything but 180 degrees the rectangles must
 *            have swapped dimensions
 * @done:     optional completion callback
 * @context:  opaque argument for @done
 *
 * The images and the rectangles (position, width and height) must be
 * aligned to the block size of the format, and none of the resulting block
 * and skip counts may exceed 255.
 *
 * Returns 0 on success, -ENODEV without controller, -EINVAL for a wrong
 * list type or invalid geometry and -ENOMEM if an allocation failed.
 */
int dmw96dma_add_rotate(struct dmw96dma_list *list,
	struct dmw96dma_image *src,
	struct dmw96dma_image *dst,
	struct dmw96dma_rect *src_rect,
	struct dmw96dma_rect *dst_rect,
	enum dmw96dma_rotate degree,
	dmw96dma_callback done,
	void *context)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	unsigned int mask, shift, skip, skip_src, skip_dst, v_blocks, h_blocks;
	struct dmw96dma_rect full_src_rect, full_dst_rect;
	struct td *td;
	u32 mode;

	if (!dev)
		return -ENODEV;
	if (list->hw_channel != ROTATE_HW_CHANNEL)
		return -EINVAL;

	/* a missing rectangle selects the whole image */
	if (!src_rect) {
		full_src_rect.x = full_src_rect.y = 0;
		full_src_rect.width = src->width;
		full_src_rect.height = src->height;
		src_rect = &full_src_rect;
	}
	if (!dst_rect) {
		full_dst_rect.x = full_dst_rect.y = 0;
		full_dst_rect.width = dst->width;
		full_dst_rect.height = dst->height;
		dst_rect = &full_dst_rect;
	}

	/* check src<>dst dimensions/format */
	if (src->format != dst->format)
		return -EINVAL;
	if (degree == DMW96DMA_ROTATE_180_CW) {
		if (src_rect->width != dst_rect->width)
			return -EINVAL;
		if (src_rect->height != dst_rect->height)
			return -EINVAL;
	} else {
		if (src_rect->width != dst_rect->height)
			return -EINVAL;
		if (src_rect->height != dst_rect->width)
			return -EINVAL;
	}

	/* check for block boundaries */
	mask = 63 >> dst->format;
	if ((src->width & mask) || (src->height & mask))
		return -EINVAL;
	if ((dst->width & mask) || (dst->height & mask))
		return -EINVAL;
	if ((src_rect->x & mask) || (src_rect->y & mask))
		return -EINVAL;
	/* width AND height have to be aligned (height was never checked) */
	if ((src_rect->width & mask) || (src_rect->height & mask))
		return -EINVAL;
	if ((dst_rect->x & mask) || (dst_rect->y & mask))
		return -EINVAL;
	if ((dst_rect->width & mask) || (dst_rect->height & mask))
		return -EINVAL;

	/* check sizes: each count has to fit into 8 bits of hw_td.size */
	shift = 6 - dst->format;
	h_blocks = src_rect->width >> shift;
	v_blocks = src_rect->height >> shift;
	skip_src = (src->width - src_rect->width) >> shift;
	skip_dst = (dst->width - dst_rect->width) >> shift;
	if ((h_blocks > 255) || (v_blocks > 255) || (skip_src > 255) || (skip_dst > 255))
		return -EINVAL;

	/* basic allocations */
	td = kzalloc(sizeof(struct td), GFP_KERNEL);
	if (!td)
		return -ENOMEM;

	td->hw_td = dma_pool_alloc(dev->td_pool, GFP_KERNEL, &td->hw_td_phys);
	if (!td->hw_td) {
		kfree(td);
		return -ENOMEM;
	}
	memset(td->hw_td, 0, sizeof(*td->hw_td));

	/* fill the hardware descriptor */
	td->done = done;
	td->context = context;

	/* source on port A, starting at the top left pixel of the rectangle */
	skip = (src_rect->x + src->width * src_rect->y) << src->format;
	if (src->coherent) {
		td->dma_buffer_a = src->data.coherent + skip;
	} else {
		td->buffer_a = src->data.buf + skip;
		td->size_a = (src->width * src_rect->height) << src->format;
		td->dir_a = DMA_TO_DEVICE;
	}

	/* destination on port B */
	skip = (dst_rect->x + dst->width * dst_rect->y) << dst->format;
	if (dst->coherent) {
		td->dma_buffer_b = dst->data.coherent + skip;
	} else {
		td->buffer_b = dst->data.buf + skip;
		td->size_b = (dst->width * dst_rect->height) << dst->format;
		td->dir_b = DMA_FROM_DEVICE;
	}

	mode = dev->configs[ROTATE_HW_CHANNEL].mode;
	mode |= degree << 10;
	mode |= src->format << 18;
	mode |= DMW96DMA_MODE_2D_ROTATE << 25;
	if (done)
		mode |= TD_INT_EN;
	td->hw_td->mode = mode;
	td->hw_td->size = (skip_dst << 24) | (v_blocks << 16) | (skip_src << 8) | h_blocks;

	/* chain it */
	if (list->head) {
		list->tail->next = td;
		list->tail->hw_td->next = td->hw_td_phys;
		list->tail = td;
	} else {
		list->head = list->tail = td;
	}

	return 0;
}
EXPORT_SYMBOL(dmw96dma_add_rotate);


static int dmw96dma_add_memcpy(struct dmw96dma_list *list,
	void *src_virt, dma_addr_t src_phys,
	void *dst_virt, dma_addr_t dst_phys,
	size_t length, dmw96dma_callback done, void *context)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	struct td *td = NULL;
	u32 mode;
	dma_addr_t base;

	if (!dev)
		return -ENODEV;
	if (list->hw_channel != MEMCPY_HW_CHANNEL)
		return -EINVAL;

	mode = dev->configs[MEMCPY_HW_CHANNEL].mode;
	base = dev->configs[MEMCPY_HW_CHANNEL].base;

	/* first full 256 byte blocks with chained block transfer */
	if (length & ~0xfful) {
		td = kzalloc(sizeof(struct td), GFP_KERNEL);
		td->hw_td = dma_pool_alloc(dev->td_pool, GFP_KERNEL,
			&td->hw_td_phys);
		memset(td->hw_td, 0, sizeof(*td->hw_td));
		td->chain_hw_td = dma_pool_alloc(dev->td_pool, GFP_KERNEL,
			&td->chain_hw_td_phys);
		memset(td->chain_hw_td, 0, sizeof(*td->chain_hw_td));

		/*
		 * Install two chained descriptors. The first descriptor is
		 * copying from the internal memory to the destination and the
		 * 2nd vice versa. This is needed to get the correct "done"
		 * interrupt as the 2nd descriptor does not generate an irq.
		 */
		td->size_a = length & ~0xfful;
		td->buffer_a = dst_virt;
		td->dma_buffer_a = dst_phys;
		td->dir_a = DMA_FROM_DEVICE;
		td->size_b = length & ~0xfful;
		td->buffer_b = src_virt;
		td->dma_buffer_b = src_phys;
		td->dir_b = DMA_TO_DEVICE;

		td->hw_td->size = length & ~0xfful;
		td->chain_hw_td->size = length & ~0xfful;
		if (mode & TD_TB_PORT) {
			td->hw_td->port_b = base;
			td->hw_td->mode   = mode | TD_DIR
				| (DMW96DMA_MODE_CHAINED_BLOCK << 25);
			td->chain_hw_td->port_b = base;
			td->chain_hw_td->mode   = mode | TD_CHAIN_CH
				| (DMW96DMA_MODE_CHAINED_BLOCK << 25);
		} else {
			td->hw_td->port_a = base;
			td->hw_td->mode   = mode
				| (DMW96DMA_MODE_CHAINED_BLOCK << 25);
			td->chain_hw_td->port_a = base;
			td->chain_hw_td->mode   = mode | TD_CHAIN_CH | TD_DIR
				| (DMW96DMA_MODE_CHAINED_BLOCK << 25);
		}

		if (src_virt)
			src_virt += length & ~0xfful;
		if (src_phys)
			src_phys += length & ~0xfful;
		if (dst_virt)
			dst_virt += length & ~0xfful;
		if (dst_phys)
			dst_phys += length & ~0xfful;
	}

	/* any remaining rest... */
	if (length & 0xfful) {
		struct td *rtd1, *rtd2;
		unsigned int burst;

		rtd1 = kzalloc(sizeof(struct td), GFP_KERNEL);
		rtd1->hw_td = dma_pool_alloc(dev->td_pool, GFP_KERNEL,
			&rtd1->hw_td_phys);
		memset(rtd1->hw_td, 0, sizeof(*rtd1->hw_td));
		rtd2 = kzalloc(sizeof(struct td), GFP_KERNEL);
		rtd2->hw_td = dma_pool_alloc(dev->td_pool, GFP_KERNEL,
			&rtd2->hw_td_phys);
		memset(rtd2->hw_td, 0, sizeof(*rtd2->hw_td));

		/* Quick'n dirty burst length calculation. Could be optimized. */
		burst = 127;
		while (length & burst)
			burst >>= 1;
		burst >>= 2; /* length in 32bit words */

		/*
		 * We use single block transfers to move the data first to the
		 * intermediate buffer and then to the destination. They can be
		 * chained.
		 */
		if (mode & TD_TB_PORT) {
			/* src (A) -> temporal buffer (B) */
			rtd1->size_a = length & 0xfful;
			rtd1->buffer_a = src_virt;
			rtd1->dma_buffer_a = src_phys;
			rtd1->dir_a = DMA_TO_DEVICE;
			rtd1->dma_buffer_b = base;
			rtd1->hw_td->size = length & 0xfful;
			rtd1->hw_td->mode = (burst << 13)
				| (DMW96DMA_MODE_SINGLE_BLOCK << 25);

			/* dst (A) <- temporal buffer (B) */
			rtd2->size_a = length & 0xfful;
			rtd2->buffer_a = dst_virt;
			rtd2->dma_buffer_a = dst_phys;
			rtd2->dir_a = DMA_FROM_DEVICE;
			rtd2->dma_buffer_b = base;
			rtd2->hw_td->size = length & 0xfful;
			rtd2->hw_td->mode = (burst << 13) | TD_DIR
				| (DMW96DMA_MODE_SINGLE_BLOCK << 25);
		} else {
			/* temporal buffer (A) <- src (B) */
			rtd1->dma_buffer_a = base;
			rtd1->size_b = length & 0xfful;
			rtd1->buffer_b = src_virt;
			rtd1->dma_buffer_b = src_phys;
			rtd1->dir_b = DMA_TO_DEVICE;
			rtd1->hw_td->size = length & 0xfful;
			rtd1->hw_td->mode = (burst << 13) | TD_DIR
				| (DMW96DMA_MODE_SINGLE_BLOCK << 25);

			/* temporal buffer (A) -> dst (B) */
			rtd2->dma_buffer_a = base;
			rtd2->size_b = length & 0xfful;
			rtd2->buffer_b = dst_virt;
			rtd2->dma_buffer_b = dst_phys;
			rtd2->dir_b = DMA_FROM_DEVICE;
			rtd2->hw_td->size = length & 0xfful;
			rtd2->hw_td->mode = (burst << 13)
				| (DMW96DMA_MODE_SINGLE_BLOCK << 25);
		}

		/* connect td's */
		rtd1->next = rtd2;
		rtd1->hw_td->next = rtd2->hw_td_phys;

		/* maybe attach to whole block transfer */
		if (td)
			td->next = rtd1;
		else
			td = rtd1;
	}

	/* chain it */
	if (list->head)
		list->tail->next = td;
	else
		list->head = td;

	/* find tail */
	while (td->next) td = td->next;
	list->tail = td;

	/* callback? */
	if (done) {
		td->done = done;
		td->context = context;
		td->hw_td->mode |= TD_INT_EN;
	}

	return 0;
}


/*
 * Append a memory copy whose source and destination are given as CPU
 * addresses. Both buffers are DMA-mapped when the list is submitted. See
 * dmw96dma_add_memcpy() for the return values.
 */
int dmw96dma_add_memcpy_buf(struct dmw96dma_list *list, void *src, void *dst,
	size_t length, dmw96dma_callback done, void *context)
{
	int ret;

	/* no bus addresses yet, they are filled in by the mapping */
	ret = dmw96dma_add_memcpy(list, src, 0, dst, 0, length, done, context);

	return ret;
}
EXPORT_SYMBOL(dmw96dma_add_memcpy_buf);


/*
 * Append a memory copy whose source and destination are given as bus
 * addresses. The driver does not map or unmap such buffers. See
 * dmw96dma_add_memcpy() for the return values.
 */
int dmw96dma_add_memcpy_coherent(struct dmw96dma_list *list, dma_addr_t src,
	dma_addr_t dst, size_t length, dmw96dma_callback done, void *context)
{
	int ret;

	/* NULL CPU addresses tell the core to use the bus addresses as is */
	ret = dmw96dma_add_memcpy(list, NULL, src, NULL, dst, length, done,
		context);

	return ret;
}
EXPORT_SYMBOL(dmw96dma_add_memcpy_coherent);


int dmw96dma_submit(struct dmw96dma_list *list)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	unsigned int channel;

	if (!dev)
		return -ENODEV;
	if (list->channel)
		return -EBUSY;

	clk_enable(dev->clk);

	init_td_list(dev, list);
	channel = dev->priorities[list->hw_channel];
	list->channel = channel + 1;

	spin_lock_bh(&dev->lock);

	list_add_tail(&list->link, &dev->pending[channel]);
	if (!dev->running[channel])
		check_pending(dev, channel);

	spin_unlock_bh(&dev->lock);

	return 0;
}
EXPORT_SYMBOL(dmw96dma_submit);


int dmw96dma_cancel(struct dmw96dma_list *list)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	unsigned int channel;

	if (!dev)
		return -ENODEV;

	spin_lock_bh(&dev->lock);

	if (!list->channel) {
		spin_unlock_bh(&dev->lock);
		return 0;
	}
	channel = list->channel - 1;

	/* currently running? */
	if (dev->running[channel] == list) {
		/* stop engine */
		dev->running[channel] = NULL;
		writel(1 << channel, dev->regs + CDR);
		while (readl(dev->regs + CEMSR) & (1 << channel))
			cpu_relax();

		/* start any pending requests */
		check_pending(dev, channel);
	} else {
		/* just remove from pending list */
		list_del(&list->link);
	}
	list->channel = 0;

	spin_unlock_bh(&dev->lock);

	/* invoke callbacks */
	clk_disable(dev->clk);
	finish_td_list(dev, list, -EINTR);

	return 0;
}
EXPORT_SYMBOL(dmw96dma_cancel);


/*
 * Read back the configuration of an I/O hardware channel by decoding the
 * cached mode word into @out.
 *
 * Returns 0 on success, or -ENODEV if the controller is not available or
 * @hw_channel is not in 0..31.
 */
int dmw96dma_config_get(unsigned int hw_channel, struct dmw96dma_ch_config *out)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	u32 word;

	if (!dev || hw_channel > 31)
		return -ENODEV;

	word = dev->configs[hw_channel].mode;

	out->base = dev->configs[hw_channel].base;

	/* multi-bit fields */
	out->consecutive     = (word >>  2) & 0x1f;
	out->transfer_length = (word >> 13) & 0x1f;
	out->width           = (word >> 18) & 0x03;
	out->mode            = (word >> 25) & 0x07;

	/* single-bit flags */
	out->hw_port         = (word & TD_HW_PORT) ? 1 : 0;
	out->timeout         = (word & TD_TMO_EN) ? 1 : 0;
	out->direction       = (word & TD_DIR) ? 1 : 0;

	return 0;
}
EXPORT_SYMBOL(dmw96dma_config_get);


/*
 * Configure an I/O hardware channel. The settings are encoded into a mode
 * word which serves as template for all transfers added to lists of this
 * channel afterwards; existing transfers are not changed.
 *
 * Returns 0 on success, -ENODEV if the controller is not available or
 * @hw_channel is not in 0..31, and -EINVAL if a field exceeds its width in
 * the mode word (consecutive/transfer_length: 5 bits, width: 2 bits,
 * mode: 3 bits).
 */
int dmw96dma_config_set(unsigned int hw_channel, struct dmw96dma_ch_config *in)
{
	struct dmw96dma_dev *dev = _dmw96dma_dev;
	struct ch_cfg *cfg;
	u32 mode = 0;

	if (!dev)
		return -ENODEV;
	if (hw_channel > 31)
		return -ENODEV;
	if (in->consecutive > 31 || in->transfer_length > 31)
		return -EINVAL;
	/*
	 * Oversized values would spill into the neighbouring bits: width
	 * into TD_HW_PORT/TD_TB_PORT, mode into TD_TMO_EN and above.
	 */
	if (in->width > 3 || in->mode > 7)
		return -EINVAL;

	mode |= (u32)in->consecutive     << 2;
	mode |= (u32)in->transfer_length << 13;
	mode |= (u32)in->width           << 18;
	mode |= (u32)in->mode            << 25;
	if (in->hw_port)
		mode |= TD_HW_PORT;
	if (in->timeout)
		mode |= TD_TMO_EN;
	if (in->direction)
		mode |= TD_DIR;

	cfg = dev->configs + hw_channel;
	cfg->mode     = mode;
	cfg->base     = in->base;

	return 0;
}
EXPORT_SYMBOL(dmw96dma_config_set);


/*****************************************************************************/


/*
 * Bind the driver to the controller: take over the channel setup from the
 * platform data, claim the register window, clock and interrupts, reset the
 * hardware and program the channel -> hw_channel routing.
 *
 * The clock is enabled only for the duration of the hardware setup; at run
 * time it is enabled per submitted list.
 *
 * Returns 0 on success or a negative errno. Everything acquired so far is
 * released on failure.
 */
static int __init dmw96dma_probe(struct platform_device *pdev)
{
	struct dmw96dma_platform_data *pdata;
	struct dmw96dma_dev           *dev;
	struct resource               *io;
	int irq_done, irq_err;
	int err = -EINVAL;
	int i, j;

	pdata = pdev->dev.platform_data;
	if (!pdata)
		return -EINVAL;

	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!io)
		return -EINVAL;

	irq_done = platform_get_irq(pdev, 0);
	if (irq_done < 0)
		return irq_done;

	irq_err = platform_get_irq(pdev, 1);
	if (irq_err < 0)
		return irq_err;

	dev = kzalloc(sizeof(struct dmw96dma_dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->pdev = pdev;
	platform_set_drvdata(pdev, dev);
	/* has to be set this early, dmw96dma_config_set() below relies on it */
	_dmw96dma_dev = dev;

	/*
	 * Initialize device structure
	 */
	memset(dev->priorities, 0xff, sizeof(dev->priorities)); /* unassigned */
	for (i=0; i<ARRAY_SIZE(dev->pending); i++)
		INIT_LIST_HEAD(&dev->pending[i]);

	/* only 32 I/O channels exist, more would overflow priorities[] */
	if (pdata->num_channels > 32) {
		dev_err(&pdev->dev, "Too many channels configured\n");
		goto err_kfree;
	}

	for (i=0; i<pdata->num_channels; i++) {
		if (pdata->ch_priorities[i] > 31)
			goto err_kfree;
		dev->priorities[i] = pdata->ch_priorities[i];
		dmw96dma_config_set(i, pdata->ch_configs + i);
	}

	/*
	 * The memcpy channel is shifted up by one because chained transfers
	 * additionally occupy the channel right below, see start_list().
	 */
	dev->priorities[ROTATE_HW_CHANNEL] = pdata->rotate_priority;
	dev->configs[ROTATE_HW_CHANNEL].mode = pdata->desc_port ? TD_2D_PORT : 0;
	dev->priorities[MEMCPY_HW_CHANNEL] = pdata->memcpy_priority + 1;
	dev->configs[MEMCPY_HW_CHANNEL].mode = (31 << 13) /* 32 word bursts */
		| (pdata->desc_port ? 0 : TD_TB_PORT);
	dev->configs[MEMCPY_HW_CHANNEL].base = pdata->memcpy_base;

	/* every assigned channel may be used by one hw_channel only */
	for (i=0; i<ARRAY_SIZE(dev->priorities)-1; i++) {
		if (dev->priorities[i] == 0xff)
			continue;

		for (j=i+1; j<ARRAY_SIZE(dev->priorities); j++)
			if (dev->priorities[i] == dev->priorities[j]) {
				dev_err(&pdev->dev,
					"Same priorities on channels %d and %d\n",
					i, j);
				goto err_kfree;
			}
	}

	/*
	 * Allocate resources
	 */
	if (!request_mem_region(io->start, DMA_REGLEN, pdev->dev.driver->name)) {
		err = -EBUSY;
		goto err_kfree;
	}

	dev->regs = ioremap_nocache(io->start, DMA_REGLEN);
	if (!dev->regs) {
		err = -ENOMEM;
		goto err_release_r;
	}

	dev->clk = clk_get(&pdev->dev, "hclk");
	if (IS_ERR(dev->clk)) {
		err = PTR_ERR(dev->clk);
		goto err_clk;
	}
	clk_enable(dev->clk);

	/* Initialize registers: mask and acknowledge all interrupts */
	writel(0, dev->regs + CRDR);
	writel(0, dev->regs + CCIER);
	writel(0, dev->regs + CDIER);
	writel(0, dev->regs + CTOENR);
	writel(0, dev->regs + CTERENR);

	writel(pdata->timeout,   dev->regs + CTOR);
	writel(pdata->desc_port, dev->regs + CDPR);
	writel(0xfffffffful, dev->regs + CCICLR);
	writel(0xfffffffful, dev->regs + CDICLR);
	writel(0xfffffffful, dev->regs + CTOCLR);
	writel(0xfffffffful, dev->regs + CTECLR);

	/* stop all channels */
	writel(0xfffffffful, dev->regs + CDR);

	for (i = 0; i < 6; i++)
		writel(0x0, dev->regs + CRCONF1 + i*4);
	for (i = 0; i < 32; i++)
		writel(0x0, dev->regs + C0DESCADR + i*4);

	/*
	 * configure channel --> hw_channel mappings: each CRCONF register
	 * holds six fields of five bits
	 */
	for (i = 0; i < 32; i++) {
		unsigned int val, channel = dev->priorities[i];
		if (channel > 31)
			continue;

		val = readl(dev->regs + CRCONF1 + channel / 6 * 4);
		val &= ~(0x1f << (channel % 6 * 5));
		val |= i << (channel % 6 * 5);
		writel(val, dev->regs + CRCONF1 + channel / 6 * 4);
	}

	spin_lock_init(&dev->lock);

	/* Initialize IRQs */
	tasklet_init(&dev->tasklet_done, dmw96dma_tasklet_done, (unsigned long)dev);
	tasklet_init(&dev->tasklet_err, dmw96dma_tasklet_err, (unsigned long)dev);

	err = request_irq(irq_done, dmw96dma_irq_done, IRQF_DISABLED,
		"dmw96_gdmac_done", dev);
	if (err)
		goto err_irq1;
	err = request_irq(irq_err, dmw96dma_irq_err, IRQF_DISABLED,
		"dmw96_gdmac_err", dev);
	if (err)
		goto err_irq2;

	dev->td_pool = dma_pool_create("dmw96dma_desc_pool", &pdev->dev,
		sizeof(struct hw_td), 32, 0);
	if (!dev->td_pool) {
		err = -ENOMEM;
		goto err_dma;
	}

	/* setup complete: unmask the interrupts and idle the clock */
	dmw96dma_int_enable(dev);
	clk_disable(dev->clk);

	printk(KERN_INFO "%s: DMW96 Generic DMA Controller\n",
			dev_name(&pdev->dev));

	return 0;

err_dma:
	free_irq(irq_err, dev);
err_irq2:
	free_irq(irq_done, dev);
err_irq1:
	tasklet_kill(&dev->tasklet_err);
	tasklet_kill(&dev->tasklet_done);
	clk_disable(dev->clk);
	clk_put(dev->clk);
err_clk:
	iounmap(dev->regs);
	dev->regs = NULL;
err_release_r:
	release_mem_region(io->start, DMA_REGLEN);
err_kfree:
	/* do not leave pointers to the freed structure behind */
	platform_set_drvdata(pdev, NULL);
	_dmw96dma_dev = NULL;
	kfree(dev);
	return err;
}

/*
 * Unbind the driver: hide the device from the exported API, mask all
 * interrupts and release everything dmw96dma_probe() acquired.
 *
 * NOTE(review): lists that are still submitted are not cancelled here;
 * users are expected to be gone before the driver is removed.
 */
static int __exit dmw96dma_remove(struct platform_device *pdev)
{
	struct dmw96dma_dev *dev = platform_get_drvdata(pdev);
	struct resource *io;

	/* the exported API returns -ENODEV from now on */
	_dmw96dma_dev = NULL;

	/*
	 * probe() leaves the clock disabled, so it has to be switched on for
	 * the register accesses. The enable/disable pair also keeps the
	 * clock's reference count balanced.
	 */
	clk_enable(dev->clk);
	dmw96dma_int_disable(dev);
	clk_disable(dev->clk);

	free_irq(platform_get_irq(pdev, 1), dev);
	free_irq(platform_get_irq(pdev, 0), dev);
	tasklet_kill(&dev->tasklet_err);
	tasklet_kill(&dev->tasklet_done);

	clk_put(dev->clk);

	iounmap(dev->regs);

	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	release_mem_region(io->start, DMA_REGLEN);

	dma_pool_destroy(dev->td_pool);

	platform_set_drvdata(pdev, NULL);
	kfree(dev);

	return 0;
}

/*
 * Platform driver glue. There is no .probe member on purpose: the probe
 * function is handed to platform_driver_probe() in dmw96dma_init().
 */
static struct platform_driver dmw96dma_driver = {
	.remove       = __exit_p(dmw96dma_remove),
	.driver = {
		.name  = "dmw96dma",
		.owner = THIS_MODULE,
	},
};

/*
 * Register the driver and probe the device in one go. Runs as subsys
 * initcall.
 */
static int __init dmw96dma_init(void)
{
	int ret;

	ret = platform_driver_probe(&dmw96dma_driver, dmw96dma_probe);

	return ret;
}
subsys_initcall(dmw96dma_init);

/* Module unload: unregister the platform driver. */
static void __exit dmw96dma_exit(void)
{
	struct platform_driver *drv = &dmw96dma_driver;

	platform_driver_unregister(drv);
}
module_exit(dmw96dma_exit);

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("DMW96 DMA Controller driver");
MODULE_AUTHOR("Jan Kloetzke <jan.kloetzke@dspg.com>");

