summaryrefslogtreecommitdiff
path: root/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c
diff options
context:
space:
mode:
Diffstat (limited to 'Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c')
-rwxr-xr-xSource/DirectFB/gfxdrivers/davinci/davinci_c64x.c2053
1 files changed, 2053 insertions, 0 deletions
diff --git a/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c b/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c
new file mode 100755
index 0000000..431ffdd
--- /dev/null
+++ b/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c
@@ -0,0 +1,2053 @@
+/*
+ TI Davinci driver - C64X+ DSP Library
+
+ (c) Copyright 2008 directfb.org
+ (c) Copyright 2007 Telio AG
+
+ Written by Denis Oliver Kropp <dok@directfb.org> and
+ Olaf Dreesen <olaf@directfb.org>.
+
+ All rights reserved.
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ version 2 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+*/
+
+//#define DIRECT_ENABLE_DEBUG
+
+#include <config.h>
+
+#include <asm/types.h>
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <directfb_util.h>
+
+#include <direct/clock.h>
+#include <direct/debug.h>
+#include <direct/log.h>
+#include <direct/messages.h>
+#include <direct/util.h>
+
+#include "davinci_c64x.h"
+
+
+/**********************************************************************************************************************/
+
+/* Device node names of the C64x+ driver (their use is outside this part of the file). */
+#define C64X_DEVICE "/dev/c64x"
+#define C64X_DEVICE0 "/dev/c64x0"
+/* Size of one c64xTaskControl rounded by direct_page_align() - presumably the length of the queue mapping, TODO confirm. */
+#define C64X_QLEN direct_page_align( sizeof(c64xTaskControl) )
+/* 0x2000000 bytes (32 MiB) rounded by direct_page_align() - presumably the length of the memory mapping, TODO confirm. */
+#define C64X_MLEN direct_page_align( 0x2000000 )
+
+/*
+ * Report a queue problem via D_PERROR together with a snapshot of the low
+ * priority queue: the entry at the DSP index, the entry at the ARM index and
+ * the entry just before the ARM index.
+ *
+ * For each entry the index, bits 2..15 of c64x_function and the name of the
+ * state held in its two lowest bits (looked up in state_names) are printed.
+ */
+__attribute__((noinline))
+static void
+davinci_c64x_queue_error( DavinciC64x *c64x, const char *msg )
+{
+     c64xTaskControl *control  = c64x->ctl;
+     uint32_t         dsp_idx  = control->QL_dsp;
+     uint32_t         arm_idx  = control->QL_arm;
+     /* Entry before the ARM index, wrapping around at the queue boundary. */
+     uint32_t         prev_idx = (arm_idx - 1) & C64X_QUEUE_MASK;
+     c64xTask        *at_dsp   = &c64x->QueueL[dsp_idx];
+     c64xTask        *at_arm   = &c64x->QueueL[arm_idx];
+     c64xTask        *at_prev  = &c64x->QueueL[prev_idx];
+
+     D_PERROR( "Davinci/C64X+: %s [DSP %d / %d (%s), ARM %d / %d (%s) <- %d / %d (%s)]\n",
+               msg,
+               dsp_idx,
+               (at_dsp->c64x_function >> 2) & 0x3fff,
+               state_names[at_dsp->c64x_function & 3],
+               arm_idx,
+               (at_arm->c64x_function >> 2) & 0x3fff,
+               state_names[at_arm->c64x_function & 3],
+               prev_idx,
+               (at_prev->c64x_function >> 2) & 0x3fff,
+               state_names[at_prev->c64x_function & 3] );
+}
+
+/*
+
+1. Idle Case
+
+ ARM ARM
+ DSP DSP
+ | . . . . . . . . | | . . . . . . . . | free = length-1
+
+
+2. Busy Case (ARM after)
+
+ ARM ARM
+ DSP DSP
+ | o . . . . . . . | | . o o . . . . . | free = length-1 - arm + dsp
+
+
+3. Busy Case (ARM before)
+
+ ARM ARM
+ DSP DSP
+ | . . . . . o o o | | o o . . . . . o | free = dsp - arm - 1
+
+
+4. Full Case (ARM after)
+
+ ARM
+ DSP
+ | o o o o o o o . | free = 0
+
+
+5. Full Case (ARM before)
+
+ ARM ARM
+ DSP DSP
+ | o o o o o . o o | | . o o o o o o o | free = 0
+
+*/
+
+/*
+ * Copy the tasks collected in 'tasks' into the low priority ring (c64x->QueueL)
+ * and publish them by advancing ctl->QL_arm.
+ *
+ * Tasks are written in chunks limited by the free space computed from the ARM
+ * and DSP indices (see the diagram above). While the ring is full the function
+ * sleeps 7000 us per attempt; once the retry counter (23, re-armed after every
+ * successful chunk) is used up it logs "Emit Timeout!" and returns DFB_TIMEOUT.
+ * On that path tasks->num_tasks is left untouched, even though earlier chunks
+ * may already have been published.
+ *
+ * If 'flags' contains C64X_TEF_RESET, tasks->num_tasks is set to 0 after all
+ * tasks were emitted. Returns DFB_OK on success.
+ */
+DFBResult
+davinci_c64x_emit_tasks( DavinciC64x *c64x,
+ DavinciC64xTasks *tasks,
+ DavinciC64xEmitFlags flags )
+{
+ c64xTaskControl *ctl = c64x->ctl;
+ /* Local copy of the ARM (write) index; written back after each chunk. */
+ uint32_t arm = ctl->QL_arm;
+ unsigned int emitted = 0;
+ unsigned int timeout = 23;
+
+ D_MAGIC_ASSERT( tasks, DavinciC64xTasks );
+
+ while (emitted < tasks->num_tasks) {
+ /* Re-read the DSP index on every iteration. */
+ uint32_t dsp = ctl->QL_dsp;
+ int free;
+
+ /* One slot always stays unused, so arm == dsp means "empty". */
+ if (arm == dsp)
+ free = C64X_QUEUE_LENGTH - 1;
+ else if (arm > dsp)
+ free = C64X_QUEUE_LENGTH - 1 - arm + dsp;
+ else
+ free = dsp - arm - 1;
+
+ if (free) {
+ /* 'emit' is the chunk size, 'copy' the part that fits before the end of the ring. */
+ int emit = MIN( free, tasks->num_tasks - emitted );
+ int copy = MIN( emit, C64X_QUEUE_LENGTH - arm );
+
+ memcpy( (void*) &c64x->QueueL[arm], (void*) &tasks->tasks[emitted], sizeof(c64xTask) * copy );
+
+ if (copy < emit) {
+ /* Wrap around: the remainder goes to the start of the ring. */
+ memcpy( (void*) &c64x->QueueL[0], (void*) &tasks->tasks[emitted+copy], sizeof(c64xTask) * (emit - copy) );
+
+ arm = (emit - copy);
+ }
+ else
+ arm = (arm + copy) & C64X_QUEUE_MASK;
+
+ /* Barrier between writing the task data and publishing the new index. */
+ mb();
+
+ ctl->QL_arm = arm;
+
+ mb();
+
+ emitted += emit;
+
+ timeout = 23;
+ }
+ else {
+ /* Ring is full: give up after the retry counter reaches zero. */
+ if (!timeout--) {
+ davinci_c64x_queue_error( c64x, "Emit Timeout!" );
+ return DFB_TIMEOUT;
+ }
+
+ usleep( 7000 );
+ }
+ }
+
+ if (flags & C64X_TEF_RESET)
+ tasks->num_tasks = 0;
+
+ return DFB_OK;
+}
+
+/*
+ * Prepare a task list with room for 'size' tasks.
+ *
+ * Allocates the task array with D_MALLOC, stores the capacity in max_tasks,
+ * starts with num_tasks = 0 and sets the structure's magic.
+ *
+ * Returns DFB_OK, or the result of D_OOM() if the allocation failed (the
+ * magic is not set in that case).
+ */
+DFBResult
+davinci_c64x_tasks_init( DavinciC64xTasks *tasks,
+                         unsigned int      size )
+{
+     tasks->tasks = D_MALLOC( sizeof(c64xTask) * size );
+
+     if (tasks->tasks == NULL)
+          return D_OOM();
+
+     tasks->num_tasks = 0;
+     tasks->max_tasks = size;
+
+     D_MAGIC_SET( tasks, DavinciC64xTasks );
+
+     return DFB_OK;
+}
+
+/*
+ * Release a task list set up by davinci_c64x_tasks_init().
+ *
+ * Asserts the magic and a non-NULL task array, frees the array with D_FREE,
+ * resets the pointer to NULL and clears the magic. Always returns DFB_OK.
+ */
+DFBResult
+davinci_c64x_tasks_destroy( DavinciC64xTasks *tasks )
+{
+     D_MAGIC_ASSERT( tasks, DavinciC64xTasks );
+     D_ASSERT( tasks->tasks != NULL );
+
+     /* Free the array and drop the stale pointer right away. */
+     D_FREE( (void*) tasks->tasks );
+     tasks->tasks = NULL;
+
+     D_MAGIC_CLEAR( tasks );
+
+     return DFB_OK;
+}
+
+/*
+ * Block until the low priority queue is drained, i.e. until ctl->QL_dsp has
+ * caught up with ctl->QL_arm.
+ *
+ * As long as the indices differ, a task carrying only C64X_FLAG_TODO and
+ * C64X_FLAG_INTERRUPT is submitted and C64X_IOCTL_WAIT_LOW is issued on the
+ * device. A failing ioctl is logged with the queue state and its errno is
+ * returned translated by errno2result().
+ *
+ * Returns DFB_OK once both indices are equal.
+ */
+DFBResult
+davinci_c64x_wait_low( DavinciC64x *c64x )
+{
+     c64xTaskControl *ctl = c64x->ctl;
+
+     for (;;) {
+          c64xTask  *marker;
+          c64xTask  *pending;
+          DFBResult  result;
+
+          if (ctl->QL_dsp == ctl->QL_arm)
+               break;
+
+          /* Queue a task that only carries the TODO and INTERRUPT flags. */
+          marker = c64x_get_task( c64x );
+
+          marker->c64x_function = C64X_FLAG_TODO | C64X_FLAG_INTERRUPT;
+
+          c64x_submit_task( c64x, marker );
+
+          if (ioctl( c64x->fd, C64X_IOCTL_WAIT_LOW ) == 0)
+               continue;
+
+          pending = &c64x->QueueL[ctl->QL_dsp];
+
+          /* Translate errno before logging. */
+          result = errno2result( errno );
+
+          D_PERROR( "Davinci/C64X+: C64X_IOCTL_WAIT_LOW failed! [DSP %d / %d (%s), ARM %d / %d (%s)]\n",
+                    ctl->QL_dsp,
+                    (pending->c64x_function >> 2) & 0x3fff,
+                    state_names[pending->c64x_function & 3],
+                    ctl->QL_arm,
+                    (marker->c64x_function >> 2) & 0x3fff,
+                    state_names[marker->c64x_function & 3] );
+
+          return result;
+     }
+
+     return DFB_OK;
+}
+
+/**********************************************************************************************************************/
+/* Benchmarking or Testing */
+/**********************************************************************************************************************/
+
+/*
+ * BRINTF: printf-style output used by the test and benchmark routines below.
+ * The active branch forwards to direct_log_printf() with a NULL first
+ * argument; the disabled branch would use plain printf().
+ */
+#if 1
+#define BRINTF(x...) do { direct_log_printf( NULL, x ); } while (0)
+#else
+#define BRINTF(x...) printf( x )
+#endif
+
+/*
+ * Measure memcpy()/memset() throughput on the memory at 'ptr'.
+ *
+ *   name    label printed in the result line
+ *   ptr     memory under test
+ *   length  bytes per call; values above sizeof(buf) (256) are clamped,
+ *           values <= 0 make the function return without measuring
+ *   copy    true: memcpy(), false: memset()
+ *   from    with copy: true copies from 'ptr' into the local buffer,
+ *           false copies the local buffer to 'ptr'
+ *
+ * The loop count is chosen so that about 0x2000000 bytes are processed in
+ * total. The result is reported via D_INFO as bytes per clock unit of
+ * direct_clock_get_abs_micros(), labelled "MB/sec".
+ */
+static void
+bench_mem( const char *name,
+           void       *ptr,
+           int         length,
+           bool        copy,
+           bool        from )
+{
+     int       i, num;
+     long long t1, t2, dt, total;
+     /* Zero-filled so that the "memcpy() to" case never reads indeterminate bytes. */
+     char      buf[0x100] = { 0 };
+
+     /*
+      * A length of 0 would divide by zero below; a negative one used to be
+      * clamped to sizeof(buf) only by accident of the unsigned comparison.
+      */
+     if (length <= 0)
+          return;
+
+     if (length > (int) sizeof(buf))
+          length = sizeof(buf);
+
+     num = 0x2000000 / length;
+
+     t1 = direct_clock_get_abs_micros();
+
+     if (copy) {
+          if (from) {
+               for (i=0; i<num; i++)
+                    memcpy( buf, ptr, length );
+          }
+          else {
+               for (i=0; i<num; i++)
+                    memcpy( ptr, buf, length );
+          }
+     }
+     else {
+          for (i=0; i<num; i++)
+               memset( ptr, 0, length );
+     }
+
+     t2 = direct_clock_get_abs_micros();
+
+     dt = t2 - t1;
+
+     /* Avoid dividing by zero if the clock did not advance. */
+     if (dt < 1)
+          dt = 1;
+
+     total = (long long) i * length;
+
+     D_INFO( "Davinci/C64X: MEMORY BENCHMARK on %-7s - %-15s %lld.%03lld MB/sec\n",
+             name, copy ? from ? "memcpy() from" : "memcpy() to" : "memset()",
+             total / dt, (total * 1000 / dt) % 1000 );
+}
+
+
+/* insert idct code for testing here */
+
+
+/*
+ * Pack one DVA block word: 'val' in the upper 16 bits, 'index' (6 bits) in
+ * bits 1..6 and the end-of-block flag in bit 0.
+ *
+ * The shift is done on uint32_t because left-shifting a negative 'val'
+ * (negative coefficients are used below) is undefined behaviour on a signed
+ * int. The result is converted back to int so the expression keeps its type.
+ */
+#define DVA_BLOCK_WORD( val, index, EOB ) ((int)(((uint32_t)(val) << 16) | (((index)&0x3f) << 1) | ((EOB) ? 1 : 0)))
+
+/*
+ * Test routine for davinci_c64x_load_block() and davinci_c64x_dva_idct().
+ *
+ * Writes a sequence of DVA block words to c64x->mem + 0x01100000, dumps them,
+ * runs load_block on that buffer and copies the result to
+ * c64x->mem + 0x01000000 for printing. In the second (enabled) part the same
+ * words are expanded into plain coefficient blocks at c64x->mem + 0x01200000
+ * and run through davinci_c64x_dva_idct() six times; the results are printed
+ * as 8x8 blocks. The last part (reference IDCT) is compiled out.
+ *
+ * 'dct_type_interlaced' selects 0x7f instead of 0x3f as the last argument of
+ * load_block. The code presumably relies on c64x->mem + X mapping the same
+ * memory as DAVINCI_C64X_MEM + X - TODO confirm.
+ */
+static inline void
+test_load_block( DavinciC64x *c64x, bool dct_type_interlaced )
+{
+ int i;
+ int num = 0;
+ short *dst = c64x->mem + 0x01000000;
+ int *src = c64x->mem + 0x01100000;
+
+
+#if 0
+ src[num++] = DVA_BLOCK_WORD( 100, 0, 1 );
+ src[num++] = DVA_BLOCK_WORD( 200, 0, 0 );
+ src[num++] = DVA_BLOCK_WORD( 210, 1, 0 );
+ src[num++] = DVA_BLOCK_WORD( 220, 2, 1 );
+ src[num++] = DVA_BLOCK_WORD( 300, 0, 1 );
+ src[num++] = DVA_BLOCK_WORD( 400, 0, 0 );
+ src[num++] = DVA_BLOCK_WORD( 410, 1, 1 );
+ src[num++] = DVA_BLOCK_WORD( 500, 0, 0 );
+ src[num++] = DVA_BLOCK_WORD( 510, 63, 1 );
+ src[num++] = DVA_BLOCK_WORD( 600, 63, 1 );
+#else
+ /* Six blocks, each terminated by a word with the EOB bit set (34 words in total). */
+ src[num++] = DVA_BLOCK_WORD(136, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(-12, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(7, 16, 0);
+ src[num++] = DVA_BLOCK_WORD(-2, 24, 1);
+
+ src[num++] = DVA_BLOCK_WORD(136, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(-12, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(7, 16, 0);
+ src[num++] = DVA_BLOCK_WORD(-2, 24, 1);
+
+
+ src[num++] = DVA_BLOCK_WORD(1076, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(-204, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(-168, 16, 0);
+ src[num++] = DVA_BLOCK_WORD(-129, 24, 0);
+ src[num++] = DVA_BLOCK_WORD(-100, 32, 0);
+ src[num++] = DVA_BLOCK_WORD(-40, 40, 0);
+ src[num++] = DVA_BLOCK_WORD(-14, 48, 1);
+#if 1
+ src[num++] = DVA_BLOCK_WORD(1068, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(2, 1, 0);
+ src[num++] = DVA_BLOCK_WORD(-202, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(-168, 16, 0);
+ src[num++] = DVA_BLOCK_WORD(-2, 9, 0);
+ src[num++] = DVA_BLOCK_WORD(-129, 24, 0);
+ src[num++] = DVA_BLOCK_WORD(-97, 32, 0);
+ src[num++] = DVA_BLOCK_WORD(-40, 40, 0);
+ src[num++] = DVA_BLOCK_WORD(-13, 48, 1);
+#else
+ src[num++] = DVA_BLOCK_WORD(1068, 0, 0);
+// src[num++] = DVA_BLOCK_WORD(2, 1, 0);
+ src[num++] = DVA_BLOCK_WORD(-202, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(-1, 16, 0);
+// src[num++] = DVA_BLOCK_WORD(-2, 9, 0);
+ src[num++] = DVA_BLOCK_WORD(-1, 24, 0);
+ src[num++] = DVA_BLOCK_WORD(-97, 32, 1);
+// src[num++] = DVA_BLOCK_WORD(-40, 40, 0);
+// src[num++] = DVA_BLOCK_WORD(-13, 48, 1);
+#endif
+
+ src[num++] = DVA_BLOCK_WORD(1048, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(-26, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(4, 16, 0);
+ src[num++] = DVA_BLOCK_WORD(5, 24, 0);
+ src[num++] = DVA_BLOCK_WORD(-4, 32, 1);
+
+ src[num++] = DVA_BLOCK_WORD(996, 0, 0);
+ src[num++] = DVA_BLOCK_WORD(24, 8, 0);
+ src[num++] = DVA_BLOCK_WORD(-2, 24, 0);
+ src[num++] = DVA_BLOCK_WORD(3, 32, 0);
+ src[num++] = DVA_BLOCK_WORD(-4, 48, 1);
+#endif
+
+ BRINTF("\n");
+ BRINTF("\n\n.======================== Testing load_block (dct_type_interlaced: %s) ========================.\n",
+ dct_type_interlaced ? "yes" : "no");
+ BRINTF("\n");
+ BRINTF( "SOURCE (DVABlockWords)\n" );
+ BRINTF("\n");
+
+ /* Dump every word as raw hex and as (value, index, EOB). */
+ for (i=0; i<num; i++)
+ BRINTF("0x%08x (%d, %d, %d)\n", (u32)src[i], src[i] >> 16, (src[i] >> 1) & 0x3f, src[i] & 1);
+
+ BRINTF("\n\n");
+
+
+ /* Fill the output area with 0x55 before the run. */
+ memset( dst, 0x55, 0x100000 );
+
+
+ // test routine
+ /* NOTE(review): the count passed here is the literal 10 (the size of the disabled data set above), not num - confirm whether that is intended. */
+ davinci_c64x_load_block( c64x, DAVINCI_C64X_MEM+0x01100000, 10, dct_type_interlaced ? 0x7f : 0x3f );
+
+ // copy idct buffer to memory where we can read it
+ davinci_c64x_blit_16( c64x, DAVINCI_C64X_MEM+0x01000000, 0, 0xf065c0, 0, 16 * 24, 1 );
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+
+ BRINTF( "-> IDCT BUFFER (16x16 + [ 8x8 8x8 ] shorts)\n" );
+ BRINTF("\n");
+
+ /* 16 values per row, an extra blank line after every 256 values. */
+ for (i=0; i<16*24; i++) {
+ BRINTF("%5d ", dst[i] );
+ if ((i&15)==15) {
+ BRINTF("\n");
+ }
+ if ((i&255)==255) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+
+#if 1
+ s16 *blocks = c64x->mem + 0x01200000;
+ int offset = 0;
+
+ memset( blocks, 0, 1024 );
+
+ /* Expand the block words into plain coefficient blocks of 64 shorts; EOB advances to the next block. */
+ for (i=0; i<num; i++) {
+ blocks[offset + ((src[i] >> 1) & 0x3f)] = src[i] >> 16;
+
+ if (src[i] & 1)
+ offset += 64;
+ }
+
+ memset( dst, 0x55, 0x100000 );
+
+ // test routine
+ /* One call per block; source and destination advance by 128 bytes (64 shorts) each. */
+ for (i=0; i<6; i++)
+ davinci_c64x_dva_idct( c64x, DAVINCI_C64X_MEM+0x01000000 + i*128, 16, DAVINCI_C64X_MEM+0x01200000 + i*128 );
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+
+ BRINTF( "-> SINGLE IDCT (59) BLOCKS (6x 8x8 shorts)\n" );
+ BRINTF("\n");
+
+ for (i=0; i<6*64; i++) {
+ BRINTF("%5d ", dst[i] );
+ if ((i&7)==7) {
+ BRINTF("\n");
+ }
+ if ((i&63)==63) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+#endif
+
+#if 0
+// s16 blocks[384];
+// int offset = 0;
+ offset = 0;
+
+ memset( blocks, 0, 1024 );
+
+ for (i=0; i<num; i++) {
+ blocks[offset + ((src[i] >> 1) & 0x3f)] = src[i] >> 16;
+
+ if (src[i] & 1) {
+ int n;
+
+ for (n = 0; n < 8; n++)
+ idct_row (blocks + offset + 8 * n);
+
+ for (n = 0; n < 8; n++)
+ idct_col (blocks + offset + n);
+
+ offset += 64;
+ }
+ }
+
+ BRINTF( "-> REFERENCE IDCT BLOCKS (6x 8x8 shorts)\n" );
+ BRINTF("\n");
+
+ for (i=0; i<6*64; i++) {
+ BRINTF("%5d ", blocks[i] );
+ if ((i&7)==7) {
+ BRINTF("\n");
+ }
+ if ((i&63)==63) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+#endif
+}
+
+/*
+ * Benchmark of the C64X_DEZIGZAG DSP function.
+ *
+ * Stores the values 0..63 as shorts at c64x->mem + 0x1000000 (and prints
+ * them), submits 0x200000 C64X_DEZIGZAG tasks, waits for the low priority
+ * queue to drain, prints the 64 shorts found at c64x->mem + 0x1200000 and
+ * reports the number of calls per second via D_INFO.
+ */
+static inline void
+bench_dezigzag( DavinciC64x *c64x )
+{
+ int i, num;
+ long long t1, t2, dt, total;
+ //int length = 0x10000;
+
+ num = 0x200000;// / length;
+
+ short *p = c64x->mem + 0x1000000;
+
+ for (i=0; i<64; i++) {
+ p[i] = i;
+ BRINTF("%3d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_DEZIGZAG | C64X_FLAG_TODO;
+
+ /* arg[0]: address at offset 0x1200000 (dumped below), arg[1]: address at offset 0x1000000 (filled above). */
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000;
+ task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x000000;
+ //task->c64x_arg[2] = length/4;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ p = c64x->mem + 0x1200000;
+ for (i=0; i<64; i++) {
+ BRINTF("%3d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ dt = t2 - t1;
+ total = num;// * length;
+
+ /* calls * 1000000 / elapsed clock units */
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "de_zigzag()", total * 1000000ULL / dt );
+}
+
+/* Number of pixels that bench_blend_argb() initializes and dumps. */
+#define DUMP_PIXELS 1
+
+/*
+ * Test of the C64X_BLEND_32 DSP function.
+ *
+ * Initializes DUMP_PIXELS source pixels at c64x->mem + 0x1000000 and
+ * destination pixels at c64x->mem + 0x1200000 (pixel 0 gets fixed values),
+ * prints both, submits one C64X_BLEND_32 task with 'sub' placed at bit 16 of
+ * the function word, waits for completion and prints the destination again.
+ * The elapsed time is reported as calls per second via D_INFO.
+ */
+static inline void
+bench_blend_argb( DavinciC64x *c64x, int sub )
+{
+ int i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//0x20000;
+
+ u32 *src = c64x->mem + 0x1000000;
+ u32 *dst = c64x->mem + 0x1200000;
+
+ BRINTF( "\nTESTING BLEND_32 SUB %d\n", sub );
+
+ BRINTF( "\nSOURCE " );
+
+ for (i=0; i<DUMP_PIXELS; i++) {
+ src[i] = (i << 26) | ((i & 0x30) << 20) | (i * 0x010204 + 3);
+
+ /* Pixel 0 is overridden with a fixed test value. */
+ if (!i)
+ src[i] = 0xc0c08001;
+
+ BRINTF("%02x %02x %02x %02x ", src[i] >> 24, (src[i] >> 16) & 0xff, (src[i] >> 8) & 0xff, src[i] & 0xff);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF( "\nDESTINATION " );
+
+ for (i=0; i<DUMP_PIXELS; i++) {
+ dst[i] = i * 0x04040404;
+
+ if (!i)
+ dst[i] = 0xe0e0e0e0;
+
+ BRINTF("%02x %02x %02x %02x ", dst[i] >> 24, (dst[i] >> 16) & 0xff, (dst[i] >> 8) & 0xff, dst[i] & 0xff);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_BLEND_32 | C64X_FLAG_TODO | (sub << 16);
+
+ /* arg[0]/arg[2] are the destination/source addresses set up above; the other values are presumably pitches (32), size (8x8) and an alpha of 0x80 - TODO confirm against the DSP interface. */
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000;
+ task->c64x_arg[1] = 32;
+ task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000)+0x000000;
+ task->c64x_arg[3] = 32;
+ task->c64x_arg[4] = 8;
+ task->c64x_arg[5] = 8;
+ task->c64x_arg[6] = 0x80;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ BRINTF( "\n\nDESTINATION (AFTER) " );
+
+ for (i=0; i<DUMP_PIXELS; i++) {
+ BRINTF("%02x %02x %02x %02x ", dst[i] >> 24, (dst[i] >> 16) & 0xff, (dst[i] >> 8) & 0xff, dst[i] & 0xff);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "blend_32(8x8)", total * 1000000ULL / dt );
+}
+
+/*
+ * Test of DSP function number 21 (submitted as (21 << 2) | C64X_FLAG_TODO).
+ *
+ * Fills 20 rows of 40 bytes (row pitch 1440) at c64x->mem + 0x1200000 with a
+ * counting pattern, presets the output area at c64x->mem + 0x1000000 with
+ * 0xAA, submits one task whose source address is offset by 'yoff' rows and
+ * 'xoff' * 2 bytes, waits and dumps 27 rows of 32 output bytes (the last 9
+ * under the heading "UV:").
+ *
+ * 'interleave' is only printed in the banner; it is not passed to the task.
+ */
+static inline void
+bench_fetch_uyvy( DavinciC64x *c64x, bool interleave, int xoff, int yoff ) {
+ int i, x, y, num=1;
+ long long t1, t2, dt, total;
+ u8 *yuv = c64x->mem + 0x1000000;
+ u8 *src = c64x->mem + 0x1200000;
+
+ BRINTF("\n\n\n.======================== Testing fetch_uyvy (inter %d, xoff %d, yoff %d) ========================.\n\n",
+ interleave, xoff, yoff);
+
+ for (y=0; y<20; y++) {
+ for (x=0; x<40; x++) {
+ /* Stored into a u8, so only the low 8 bits of val are kept. */
+ int val = (y*40)+x;
+ src[y*1440 + x] = val;
+ BRINTF("%02x ", val&0xff);
+ }
+ BRINTF("\n");
+ }
+ BRINTF("\n");
+
+ memset( yuv, 0xAA, 0x100000 );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x000000;
+ task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x200000 + yoff*1440 + xoff * 2;
+ task->c64x_arg[2] = 1440;
+
+ task->c64x_function = (21 << 2) | C64X_FLAG_TODO;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ BRINTF( "\n\nDESTINATION\n\nY:\n" );
+ for (y=0;y<27;y++) {
+ if (y==18) BRINTF("\nUV:\n");
+ for (x=0;x<32;x++) {
+ BRINTF("%02x ",yuv[y*32+x]);
+ }
+ BRINTF("\n");
+ }
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO("\n\nDavinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "blend_fetch_uyvy(16x16)", total * 1000000ULL / dt );
+}
+#if 0
+static inline void
+bench_fetch_uyvy( DavinciC64x *c64x, bool interleave, int xoff, int yoff )
+{
+ int i, x, y, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//0x20000;
+
+ u8 *yuv = c64x->mem + 0x1000000;
+ u8 *src = c64x->mem + 0x1200000;
+
+ BRINTF("\n");
+ BRINTF("\n\n.======================== Testing fetch_uyvy (inter %d, xoff %d, yoff %d) ========================.\n",
+ interleave, xoff, yoff);
+ BRINTF("\n");
+ BRINTF( "SOURCE (20x20)\n" );
+
+ for (y=0; y<20; y++) {
+ for (x=0; x<40; x++) {
+ int val = (x & 1) ? (x * 4 + y*0x10) : (x/4 + 0x40 + (x&2) * 0x10 + y*0x08);
+
+ src[y*1440 + x] = val;
+
+ BRINTF("%02x ", val&0xff);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ memset( yuv, 0x55, 0x100000 );
+
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = (19 << 2) | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x000000;
+ task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x200000 + yoff*1440 + xoff * 2;
+ task->c64x_arg[2] = 1440;
+ task->c64x_arg[3] = 16;
+ task->c64x_arg[4] = interleave ? 1 : 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+
+ BRINTF( "\n\nDESTINATION (17x18 / [9x9 9x9])\n" );
+
+ for (y=0; y<18; y++) {
+ for (x=0; x<17; x++) {
+ BRINTF("%02x ", yuv[y*32 + x]);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<9; y++) {
+ for (x=0; x<9; x++) {
+ BRINTF("%02x ", yuv[y*32 + x + 32*18]);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<9; y++) {
+ for (x=0; x<9; x++) {
+ BRINTF("%02x ", yuv[y*32 + x + 32*18+16]);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ BRINTF("\n\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "blend_fetch_uyvy(16x16)", total * 1000000ULL / dt );
+}
+#endif
+
+#if 0
+static inline void
+bench_put_idct( DavinciC64x *c64x, int dct_type )
+{
+ int i, num;
+ long long t1, t2, dt, total;
+ //int length = 0x10000;
+
+ num = 0x10000;// / length;
+
+ u8 *dst = c64x->mem + 0x01000000;
+ int *src = c64x->mem + 0x01200000;
+
+ src[0] = DVA_BLOCK_WORD( 100, 0, 1 );
+ src[1] = DVA_BLOCK_WORD( 200, 0, 0 );
+ src[2] = DVA_BLOCK_WORD( 210, 1, 0 );
+ src[3] = DVA_BLOCK_WORD( 220, 2, 1 );
+ src[4] = DVA_BLOCK_WORD( 300, 0, 1 );
+ src[5] = DVA_BLOCK_WORD( 400, 0, 0 );
+ src[6] = DVA_BLOCK_WORD( 410, 1, 1 );
+ src[7] = DVA_BLOCK_WORD( 500, 0, 0 );
+ src[8] = DVA_BLOCK_WORD( 510, 63, 1 );
+ src[9] = DVA_BLOCK_WORD( 600, 63, 1 );
+
+ BRINTF("\n");
+ BRINTF("\n\n.======================== Testing put_idct (%d) ========================.\n", dct_type);
+ BRINTF("\n");
+
+ memset( dst, 0x55, 0x100000 );
+
+ for (i=0; i<10; i++) {
+ BRINTF("0x%08x (%d, %d, %d)\n", (u32)src[i], src[i] >> 16, (src[i] >> 1) & 0x3f, src[i] & 1);
+ }
+
+ BRINTF("\n");
+
+ t1 = direct_clock_get_abs_micros();
+
+ {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_LOAD_BLOCK | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1200000;
+ task->c64x_arg[1] = 10;
+ task->c64x_arg[2] = 0x3f;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_blit_16( c64x, (DAVINCI_C64X_MEM+0x01000000), 0, 0xf06180, 0, 384, 1 );
+ davinci_c64x_blit_16( c64x, (DAVINCI_C64X_MEM+0x01100000), 0, 0xf06480, 0, 384/2, 1 );
+
+ davinci_c64x_put_uyvy_16x16( c64x, (DAVINCI_C64X_MEM+0x01300000), 32, 0xf06180, 0 );
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+
+ for (i=0; i<384; i++) {
+ BRINTF("%5d ", dst[i] );
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ if (i%64==63) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+
+
+ for (i=0; i<384; i++) {
+ BRINTF("%3d ", duv[i] );
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ if (i%64==63) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n\n");
+
+ for (i=0; i<16*16*2; i++) {
+ BRINTF("%02x ", duy[i]);
+
+ if (i%32==31) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ dt = t2 - t1;
+ total = num;// * length;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "block_load()", total * 1000000ULL / dt );
+}
+#endif
+
+/*
+ * Test of the C64X_PUT_MC_UYVY_16x16 DSP function.
+ *
+ * Builds a 16x16 block followed by two 8x8 blocks (stored side by side with a
+ * row pitch of 16) at c64x->mem + 0x1200000, copies them to C64X_MC_BUFFER_Y
+ * with davinci_c64x_blit_32(), then submits one PUT_MC task writing to
+ * DAVINCI_C64X_MEM + 0x1000000 with a pitch of 1440. Afterwards 16 rows of
+ * 32 bytes are dumped from c64x->mem + 0x1000000.
+ *
+ * 'interleave' is passed to the task as 1 or 0 in arg[2].
+ */
+static inline void
+bench_put_mc( DavinciC64x *c64x, bool interleave )
+{
+ int x, y, i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//720/16*576/16;
+
+ u8 *dst = c64x->mem + 0x1000000;
+ u8 *src = c64x->mem + 0x1200000;
+
+ BRINTF("\n");
+ BRINTF("\n\n.======================== Testing put_mc (%d) ========================.\n", interleave);
+ BRINTF("\n");
+ BRINTF("SOURCE (16x16 / [8x8 8x8]\n");
+
+ /* 16x16 block */
+ for (y=0; y<16; y++) {
+ for (x=0; x<16; x++) {
+ u8 val = (x << 4) + y;
+ src[y*16 + x] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ /* first 8x8 block, placed after the 16x16 block */
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ /* second 8x8 block, 8 bytes to the right of the first */
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16 + 8] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ memset( dst, 0x55, 0x100000 );
+
+ /* Copy the prepared source to C64X_MC_BUFFER_Y before timing starts. */
+ davinci_c64x_blit_32( c64x, C64X_MC_BUFFER_Y, 16, DAVINCI_C64X_MEM+0x1200000, 16, 4, 24 );
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_PUT_MC_UYVY_16x16 | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000;
+ task->c64x_arg[1] = 1440;
+ task->c64x_arg[2] = interleave ? 1 : 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+
+ BRINTF("\n");
+ BRINTF("DESTINATION (16x16 UYVY)\n");
+
+ for (y=0; y<16; y++) {
+ for (x=0; x<32; x++)
+ BRINTF("%02x ", dst[y*1440 + x]);
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "put_mc_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Test of the C64X_PUT_SUM_UYVY_16x16 DSP function.
+ *
+ * Submits a C64X_LOAD_BLOCK task for six single-value DVA block words stored
+ * at c64x->mem + 0x1100000, builds the same 16x16 / 8x8 / 8x8 source pattern
+ * as bench_put_mc() at c64x->mem + 0x1200000 and copies it to
+ * C64X_MC_BUFFER_Y. It then submits one PUT_SUM task writing to
+ * DAVINCI_C64X_MEM + 0x1000000 with a pitch of 1440 and dumps 16 rows of
+ * 32 bytes from c64x->mem + 0x1000000.
+ *
+ * 'interleave' is passed to the task as 1 or 0 in arg[2].
+ */
+static inline void
+bench_put_sum( DavinciC64x *c64x, bool interleave )
+{
+ int x, y, i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//720/16*576/16;
+
+ u8 *dst = c64x->mem + 0x1000000;
+ u8 *src = c64x->mem + 0x1200000;
+ u32 *words = c64x->mem + 0x1100000;
+
+ BRINTF("\n");
+ BRINTF("\n\n.======================== Testing put_sum (%d) ========================.\n", interleave);
+ BRINTF("\n");
+ BRINTF("WORDS (6x IDCT with one value)\n");
+
+ /* Six blocks with a single coefficient at index 0, each flagged EOB. */
+ words[0] = DVA_BLOCK_WORD( 0, 0, 1 );
+ words[1] = DVA_BLOCK_WORD( 50, 0, 1 );
+ words[2] = DVA_BLOCK_WORD( 100, 0, 1 );
+ words[3] = DVA_BLOCK_WORD( 150, 0, 1 );
+ words[4] = DVA_BLOCK_WORD( 200, 0, 1 );
+ words[5] = DVA_BLOCK_WORD( 250, 0, 1 );
+
+ BRINTF("\n");
+ BRINTF("\n");
+
+ memset( dst, 0x55, 0x100000 );
+
+ for (i=0; i<6; i++) {
+ BRINTF("0x%08x (%d, %d, %d)\n", (u32)words[i], words[i] >> 16, (words[i] >> 1) & 0x3f, words[i] & 1);
+ }
+
+ {
+ /* Queue the block load; it is not waited for separately here. */
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_LOAD_BLOCK | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1100000;
+ task->c64x_arg[1] = 6;
+ task->c64x_arg[2] = 0x3f;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ BRINTF("\n");
+ BRINTF("SOURCE (16x16 / [8x8 8x8]\n");
+
+ for (y=0; y<16; y++) {
+ for (x=0; x<16; x++) {
+ u8 val = (x << 4) + y;
+ src[y*16 + x] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16 + 8] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ /* Second fill of the same area with the same value (already done above). */
+ memset( dst, 0x55, 0x100000 );
+
+ davinci_c64x_blit_32( c64x, C64X_MC_BUFFER_Y, 16, DAVINCI_C64X_MEM+0x1200000, 16, 4, 24 );
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_PUT_SUM_UYVY_16x16 | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000;
+ task->c64x_arg[1] = 1440;
+ task->c64x_arg[2] = interleave ? 1 : 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+
+ BRINTF("\n");
+ BRINTF("DESTINATION (16x16 UYVY)\n");
+
+ for (y=0; y<16; y++) {
+ for (x=0; x<32; x++)
+ BRINTF("%02x ", dst[y*1440 + x]);
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "put_sum_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Test of DSP function number 57 (submitted as (57 << 2) | C64X_FLAG_TODO).
+ *
+ * Builds a 16x16 block followed by two 8x8 blocks (row pitch 16) at
+ * c64x->mem + 0x1200000, presets the output at c64x->mem + 0x1000000 with
+ * 0x55, submits one task with destination, source and the value 16 as
+ * arguments, waits and dumps the output in the same 16x16 / 8x8 / 8x8 layout.
+ */
+static inline void
+bench_sat_mc( DavinciC64x *c64x )
+{
+ int x, y, i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//720/16*576/16;
+
+ u8 *dst = c64x->mem + 0x1000000;
+ u8 *src = c64x->mem + 0x1200000;
+
+ BRINTF("\n\n.======================== Testing sat_mc ========================.\n");
+ BRINTF("\n");
+ BRINTF("SOURCE (16x16 / [8x8 8x8]\n");
+
+ /* 16x16 block */
+ for (y=0; y<16; y++) {
+ for (x=0; x<16; x++) {
+ u8 val = (x << 4) + y;
+ src[y*16 + x] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ /* first 8x8 block */
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ /* second 8x8 block, 8 bytes to the right of the first */
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++) {
+ u8 val = (x << 4) + y*2;
+ src[y*16 + x + 16*16 + 8] = val;
+ BRINTF("%02x ", val);
+ }
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ memset( dst, 0x55, 0x100000 );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ c64xTask *task = c64x_get_task( c64x );
+
+ task->c64x_function = (57 << 2) | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000;
+ task->c64x_arg[1] = DAVINCI_C64X_MEM+0x1200000;
+ task->c64x_arg[2] = 16;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+
+ BRINTF("\n");
+ BRINTF("DESTINATION (16x16 / [8x8 8x8]\n");
+
+ for (y=0; y<16; y++) {
+ for (x=0; x<16; x++)
+ BRINTF("%02x ", dst[y*16 + x]);
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++)
+ BRINTF("%02x ", dst[y*16 + x + 16*16]);
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n");
+
+ for (y=0; y<8; y++) {
+ for (x=0; x<8; x++)
+ BRINTF("%02x ", dst[y*16 + x + 16*16 + 8]);
+
+ BRINTF("\n");
+ }
+
+ BRINTF("\n\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "sat_mc_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Benchmark of the C64X_PUT_UYVY_16x16 DSP function over a full 720x576
+ * frame's worth of macroblocks (720/16 * 576/16 tasks).
+ *
+ * Writes 256 "Y", 64 "U" and 64 "V" input bytes at c64x->mem + 0x1000000
+ * (values i-128 and i-32, truncated to u8), presets the output at
+ * c64x->mem + 0x1200000 with 0x55, then for every macroblock calls
+ * davinci_c64x_dva_begin_frame() and submits one PUT_UYVY task. After the
+ * queue is drained the first 16 rows of 32 output bytes are dumped (row
+ * pitch 720*2).
+ *
+ * 'progressive' selects 0x100 instead of 0 as the last begin_frame argument.
+ */
+static inline void
+bench_uyvy_1( DavinciC64x *c64x, bool progressive )
+{
+ c64xTask *task;
+ int i, num;
+ long long t1, t2, dt, total;
+
+ num = 720/16*576/16;
+
+ u8 *u = c64x->mem + 0x1200000;
+ u8 *p = c64x->mem + 0x1000000;
+
+ BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n",
+ progressive ? "progressive" : "interlaced");
+
+ for (i=0; i<256; i++) {
+ p[i] = i - 128;
+ BRINTF("Y%-3d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[256+i] = i-32;
+ BRINTF("U%-3d ", p[256+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[320+i] = i-32;
+ BRINTF("V%-3d ", p[320+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ /* Dump all 384 input bytes once more as one list. */
+ for (i=0; i<384; i++) {
+ BRINTF("%4d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ memset( u, 0x55, 720*576*2 );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ /* Each iteration targets an address advanced by 16*16*2 bytes. */
+ davinci_c64x_dva_begin_frame( c64x, 720 * 2, (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2, 0, 0, progressive ? 0x100 : 0 );
+
+ task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2;
+ task->c64x_arg[1] = 720 * 2;
+ task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000);
+ task->c64x_arg[3] = 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ BRINTF("\n");
+
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ for (i=0; i<16*16*2; i++) {
+ BRINTF("%02x ", u[i/32*720*2 + i%32]);
+
+ if (i%32==31) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "put_uyvy_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Single-call test of the C64X_PUT_UYVY_16x16 DSP function using the start
+ * of the shared memory.
+ *
+ * Writes 256 "Y" bytes (i/8), 64 "U" bytes (i/8 + 128) and 64 "V" bytes
+ * (i/8 + 240) at c64x->mem + 0, presets the output at c64x->mem + 0x0200000
+ * with 0x55, calls davinci_c64x_dva_begin_frame(), submits one PUT_UYVY task
+ * and dumps the first 16 rows of 32 output bytes (row pitch 720*2).
+ *
+ * 'progressive' selects 0x100 instead of 0 as the last begin_frame argument.
+ */
+static inline void
+bench_uyvy_2( DavinciC64x *c64x, bool progressive )
+{
+ c64xTask *task;
+ int i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//720/16*576/16;
+
+ u8 *u = c64x->mem + 0x0200000;
+ u8 *p = c64x->mem + 0x0000000;
+
+ BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n",
+ progressive ? "progressive" : "interlaced");
+
+ for (i=0; i<256; i++) {
+ p[i] = i/8;
+ BRINTF("Y%-3d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[256+i] = i/8 + 128;
+ BRINTF("U%-3d ", p[256+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[320+i] = i/8 + 240;
+ BRINTF("V%-3d ", p[320+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ for (i=0; i<384; i++) {
+ BRINTF("%4d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ memset( u, 0x55, 720*576*2 );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ /* NOTE(review): begin_frame is given the address at offset 0x01200000, while the task below (and the dump) use offset 0x200000 - confirm whether this difference is intended. */
+ davinci_c64x_dva_begin_frame( c64x, 720 * 2, (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2, 0, 0, progressive ? 0x100 : 0 );
+
+ task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = DAVINCI_C64X_MEM+0x200000+i*16*16*2;
+ task->c64x_arg[1] = 720 * 2;
+ task->c64x_arg[2] = DAVINCI_C64X_MEM;
+ task->c64x_arg[3] = 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ BRINTF("\n");
+
+ davinci_c64x_write_back_all( c64x );
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ for (i=0; i<16*16*2; i++) {
+ BRINTF("%02x ", u[i/32*720*2 + i%32]);
+
+ if (i%32==31) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "put_uyvy_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Single-call test of the C64X_PUT_UYVY_16x16 DSP function with a column
+ * pattern as input.
+ *
+ * Writes 256 "Y" bytes (i%8), 64 "U" bytes (i%8 + 128) and 64 "V" bytes
+ * (i%8 + 240) at c64x->mem + 0x1000000, presets the output at
+ * c64x->mem + 0x1200000 with 0x55, calls davinci_c64x_dva_begin_frame(),
+ * submits one PUT_UYVY task and dumps the first 16 rows of 32 output bytes
+ * (row pitch 720*2).
+ *
+ * 'progressive' selects 0x100 instead of 0 as the last begin_frame argument.
+ */
+static inline void
+bench_uyvy_3( DavinciC64x *c64x, bool progressive )
+{
+ c64xTask *task;
+ int i, num;
+ long long t1, t2, dt, total;
+
+ num = 1;//720/16*576/16;
+
+ u8 *u = c64x->mem + 0x1200000;
+ u8 *p = c64x->mem + 0x1000000;
+
+ BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n",
+ progressive ? "progressive" : "interlaced");
+
+ for (i=0; i<256; i++) {
+ p[i] = i%8;
+ BRINTF("Y%-3d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[256+i] = i%8 + 128;
+ BRINTF("U%-3d ", p[256+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ for (i=0; i<64; i++) {
+ p[320+i] = i%8 + 240;
+ BRINTF("V%-3d ", p[320+i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ for (i=0; i<384; i++) {
+ BRINTF("%4d ", p[i]);
+ if (i%8==7) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ memset( u, 0x55, 720*576*2 );
+
+ t1 = direct_clock_get_abs_micros();
+
+ for (i=0; i<num; i++) {
+ davinci_c64x_dva_begin_frame( c64x, 720 * 2, (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2, 0, 0, progressive ? 0x100 : 0 );
+
+ task = c64x_get_task( c64x );
+
+ task->c64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO;
+
+ task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2;
+ task->c64x_arg[1] = 720 * 2;
+ task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000);
+ task->c64x_arg[3] = 0;
+
+ c64x_submit_task( c64x, task );
+ }
+
+ BRINTF("\n");
+
+ /* Unlike bench_uyvy_2(), no davinci_c64x_write_back_all() is issued before waiting. */
+ davinci_c64x_wait_low( c64x );
+
+ t2 = direct_clock_get_abs_micros();
+
+ for (i=0; i<16*16*2; i++) {
+ BRINTF("%02x ", u[i/32*720*2 + i%32]);
+
+ if (i%32==31) {
+ BRINTF("\n");
+ }
+ }
+
+ BRINTF("\n");
+
+ dt = t2 - t1;
+ total = num;
+
+ D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+ "put_uyvy_16x16()", total * 1000000ULL / dt );
+}
+
+/*
+ * Debug test/benchmark for a DSP block function selected by number.
+ *
+ * c64x  DSP interface the task is queued on
+ * func  DSP function number, shifted left by two and combined with C64X_FLAG_TODO
+ * w, h  block size used for the generated patterns and the dumps; h is also passed to the DSP
+ * avg   not used by this function
+ * name  label printed in the headline and in the result line
+ *
+ * Source reference, destination reference and result live at offsets 0x1000000, 0x1100000
+ * and 0x1200000 of the mapped DSP memory and are all addressed with a row stride of 32 bytes.
+ */
+static inline void
+bench_mc( DavinciC64x *c64x, int func, int w, int h, bool avg, const char *name )
+{
+     enum { STRIDE = 32 };
+
+     int       n, col, row, calls;
+     long long start, stop, elapsed, count;
+
+     calls = 0x1;//0000;
+
+     u8 *result = c64x->mem + 0x1200000;
+     u8 *ref_d  = c64x->mem + 0x1100000;
+     u8 *ref_s  = c64x->mem + 0x1000000;
+
+     BRINTF("\n\n.============ Testing %s ============.\n", name);
+     BRINTF("\n");
+     BRINTF("SRC REF\n");
+
+     /* The source pattern is one row and one column larger than the block itself. */
+     for (row = 0; row < h + 1; row++) {
+          for (col = 0; col < w + 1; col++) {
+               u8 *cell = &ref_s[col + row * STRIDE];
+
+               *cell = col * row;
+               BRINTF("%-3d ", *cell);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+     BRINTF("DST REF\n");
+
+     for (row = 0; row < h; row++) {
+          for (col = 0; col < w; col++) {
+               u8 *cell = &ref_d[col + row * STRIDE];
+
+               *cell = w * h - 1 - col * row;
+               BRINTF("%-3d ", *cell);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+
+     /* Preset the result area with a counting pattern (truncated to eight bits). */
+     for (n = 0; n < 0x100000; n++) {
+          result[n] = n;
+     }
+
+
+     start = direct_clock_get_abs_micros();
+
+     for (n = 0; n < calls; n++) {
+          c64xTask *job = c64x_get_task( c64x );
+
+          job->c64x_function = (func << 2) | C64X_FLAG_TODO;
+
+          job->c64x_arg[0] = DAVINCI_C64X_MEM + 0x01200000;
+          job->c64x_arg[1] = STRIDE;
+          job->c64x_arg[2] = DAVINCI_C64X_MEM + 0x01000000;
+          job->c64x_arg[3] = DAVINCI_C64X_MEM + 0x01100000;
+          job->c64x_arg[4] = STRIDE;
+          job->c64x_arg[5] = h;
+
+          c64x_submit_task( c64x, job );
+     }
+
+     davinci_c64x_wait_low( c64x );
+
+     stop = direct_clock_get_abs_micros();
+
+     BRINTF("-> DST\n");
+
+     for (row = 0; row < h; row++) {
+          for (col = 0; col < w; col++) {
+               BRINTF("%-3d ", result[col + row * STRIDE]);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+     elapsed = stop - start;
+     count   = calls;
+
+     BRINTF( "BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+             name, count * 1000000ULL / elapsed );
+}
+
+/*
+ * Debug test for DSP function number 63: queues one task with 'nom' and 'den' as its first
+ * two arguments, waits via davinci_c64x_wait_low() and prints both inputs together with the
+ * value found in the task's c64x_return field (all in hexadecimal).
+ */
+static inline void
+bench_div( DavinciC64x *c64x, u32 nom, u32 den )
+{
+     c64xTask *job;
+
+     job = c64x_get_task( c64x );
+
+     BRINTF("\n\n.============ Testing div ============.\n");
+     BRINTF("\n");
+
+     /* Function word first, then the operands - same order as the task is filled elsewhere. */
+     job->c64x_function = (63 << 2) | C64X_FLAG_TODO;
+     job->c64x_arg[0]   = nom;
+     job->c64x_arg[1]   = den;
+
+     c64x_submit_task( c64x, job );
+     davinci_c64x_wait_low( c64x );
+
+     BRINTF("%x / %x = %x\n\n\n", nom, den, job->c64x_return);
+}
+
+/*
+ * Debug test/benchmark for the C64X_DITHER_ARGB DSP function.
+ *
+ * Generates an 8x17 ARGB test image at offset 0x1000000 of the mapped DSP memory (row stride
+ * 32 pixels, last row all 0xffffffff), presets the two output areas at offsets 0x1200000 and
+ * 0x1100000 with 0x55, queues the task 0x10000 times, waits via davinci_c64x_wait_low() and
+ * dumps both outputs followed by the measured calls per second.
+ */
+static inline void
+bench_dither_argb( DavinciC64x *c64x )
+{
+     int       i, x, y, num, w = 8, h = 17;
+     long long t1, t2, dt, total;
+
+     num = 0x10000;
+
+     u16 *dr  = c64x->mem + 0x1200000;
+     u8  *da  = c64x->mem + 0x1100000;
+     u32 *src = c64x->mem + 0x1000000;
+
+     BRINTF("\n\n.======================== Testing dither_argb ========================.\n");
+     BRINTF("\n");
+     BRINTF("SOURCE ARGB\n");
+
+     for (y=0; y<h-1; y++) {
+          for (x=0; x<w; x++) {
+               /*
+                * Unsigned constants on purpose: 0x10101010 * y exceeds INT_MAX for y >= 8, which
+                * is undefined behaviour in signed arithmetic. Unsigned math wraps modulo 2^32 and
+                * yields the intended 32 bit pattern.
+                */
+               src[x+y*32] = 0x10101010u * y + 0x888888u * x;
+               BRINTF("%08x ", src[x+y*32]);
+          }
+          BRINTF("\n");
+     }
+
+     /* Last row: fully opaque white. */
+     for (x=0; x<w; x++) {
+          src[x+(h-1)*32] = 0xffffffff;
+          BRINTF("%08x ", src[x+(h-1)*32]);
+     }
+
+     BRINTF("\n");
+     BRINTF("\n");
+
+     memset( dr, 0x55, 0x100000 );
+     memset( da, 0x55, 0x100000 );
+
+
+     t1 = direct_clock_get_abs_micros();
+
+     for (i=0; i<num; i++) {
+          c64xTask *task = c64x_get_task( c64x );
+
+          task->c64x_function = C64X_DITHER_ARGB | C64X_FLAG_TODO;
+
+          /*
+           * NOTE(review): arg[0]/arg[1]/arg[3] are the DSP-side addresses of the areas accessed
+           * through dr/da/src above; 64 and 128 match the 32 pixel row stride of the u16 and
+           * u32 buffers in bytes - confirm against the DSP function's argument layout.
+           */
+          task->c64x_arg[0] = DAVINCI_C64X_MEM + 0x01200000;
+          task->c64x_arg[1] = DAVINCI_C64X_MEM + 0x01100000;
+          task->c64x_arg[2] = 64;
+          task->c64x_arg[3] = DAVINCI_C64X_MEM + 0x01000000;
+          task->c64x_arg[4] = 128;
+          task->c64x_arg[5] = w;
+          task->c64x_arg[6] = h;
+
+          c64x_submit_task( c64x, task );
+     }
+
+     davinci_c64x_wait_low( c64x );
+
+     t2 = direct_clock_get_abs_micros();
+
+     BRINTF("-> DST RGB\n");
+
+     for (y=0; y<h; y++) {
+          for (x=0; x<w; x++) {
+               BRINTF(" %04x ", dr[x+y*32]);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+     BRINTF("-> DST ALPHA\n");
+
+     /* Two 4 bit values per byte: even x is printed from the high nibble, odd x from the low one. */
+     for (y=0; y<h; y++) {
+          for (x=0; x<w; x++) {
+               if (x&1)
+                    BRINTF(" %x ", da[x/2+y*64] & 0xF);
+               else
+                    BRINTF(" %x ", da[x/2+y*64] >> 4);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+     dt    = t2 - t1;
+     total = num;
+
+     BRINTF( "BENCHMARK on DSP - %-15s %lld Calls/sec\n",
+             "dither_argb", total * 1000000ULL / dt );
+}
+
+
+
+
+/**********************************************************************************************************************/
+/*** 32 bit scaler ****************************************************************************************************/
+/**********************************************************************************************************************/
+
+/*
+ * Additional parameters handed to a stretch function through its last argument.
+ *
+ * NOTE(review): the field meanings are only suggested by their names (clipping region, color
+ * table, protected value, source color key); the code that reads them is in the included
+ * stretch template - confirm there.
+ */
+typedef struct {
+ DFBRegion clip;
+ const void *colors;
+ ulong protect;
+ ulong key;
+} StretchCtx;
+
+/*
+ * Signature of a stretch function: destination buffer and pitch, source buffer and pitch,
+ * source size (width, height), destination size (dst_width, dst_height) and a StretchCtx.
+ */
+typedef void (*StretchHVx)( void *dst,
+ int dpitch,
+ const void *src,
+ int spitch,
+ int width,
+ int height,
+ int dst_width,
+ int dst_height,
+ const StretchCtx *ctx );
+
+/*
+ * Indices into the 'up' and 'down' arrays of StretchFunctionTable.
+ * STRETCH_SRCKEY_PROTECT (3) equals STRETCH_SRCKEY (1) | STRETCH_PROTECT (2);
+ * STRETCH_NUM is the number of variants and the size of those arrays.
+ */
+#define STRETCH_NONE 0
+#define STRETCH_SRCKEY 1
+#define STRETCH_PROTECT 2
+#define STRETCH_SRCKEY_PROTECT 3
+#define STRETCH_NUM 4
+
+/*
+ * Table of stretch functions: one entry per pixel format index, each holding STRETCH_NUM
+ * functions for the 'up' case and STRETCH_NUM functions for the 'down' case.
+ */
+typedef struct {
+ struct {
+ StretchHVx up[STRETCH_NUM];
+ StretchHVx down[STRETCH_NUM];
+ } f[DFB_NUM_PIXELFORMATS];
+} StretchFunctionTable;
+
+
+/*
+ * Instantiate the generic 32 bit stretch template for DSPF_ARGB.
+ *
+ * The macros below parameterize <gfx/generic/stretch_up_down_32.h> and are undefined again
+ * right after the include. TABLE_NAME names the resulting table 'stretch_32', FUNC_NAME
+ * builds the identifiers of the generated functions.
+ */
+#define DST_FORMAT DSPF_ARGB
+#define TABLE_NAME stretch_32
+#define FUNC_NAME(UPDOWN,K,P,F) stretch_32_ ## UPDOWN ## _ ## K ## P ## _ ## F
+#define SHIFT_R8 8
+#define SHIFT_L8 8
+#define X_00FF00FF 0x00ff00ff
+#define X_FF00FF00 0xff00ff00
+#define MASK_RGB 0x00ffffff
+#define HAS_ALPHA
+
+#include <gfx/generic/stretch_up_down_32.h>
+
+/* Clean up the template parameters so they do not leak into the rest of the file. */
+#undef DST_FORMAT
+#undef TABLE_NAME
+#undef FUNC_NAME
+#undef SHIFT_R8
+#undef SHIFT_L8
+#undef X_00FF00FF
+#undef X_FF00FF00
+#undef MASK_RGB
+#undef HAS_ALPHA
+
+
+/*
+ * Debug test/benchmark comparing the DSP 32 bit stretch functions with the CPU implementation
+ * from the 'stretch_32' table.
+ *
+ * sw, sh  source size in pixels
+ * dw, dh  destination size in pixels
+ *
+ * The "down" variant is used on both sides when the destination is smaller than the source in
+ * both dimensions, the "up" variant otherwise. Source and DSP result live at offsets 0x1000000
+ * and 0x1200000 of the mapped DSP memory, the CPU result in a local array of dw * dh pixels.
+ * Source image, DSP result and CPU result are printed, followed by the calls per second
+ * measured for the DSP.
+ */
+static inline void
+bench_stretch_32( DavinciC64x *c64x, int sw, int sh, int dw, int dh )
+{
+     int       i, x, y, num;
+     long long t1, t2, dt, total;
+     bool      down = (dw < sw) && (dh < sh);
+
+     /* Buffer dimensions; the disabled variant pads them instead of using the image size as is. */
+#if 0
+     int SW = (sw + 5) & ~3;
+     int SH = (sh + 5) & ~3;
+     int DW = (dw + 5) & ~3;
+     int DH = (dh + 5) & ~3;
+#else
+     int SW = sw;
+     int SH = sh;
+     int DW = dw;
+     int DH = dh;
+#endif
+
+     num = 1;//0x10000;
+
+     u32  cpu[DW * DH];
+     u32 *dst = c64x->mem + 0x1200000;
+     u32 *src = c64x->mem + 0x1000000;
+
+     memset( src, 0x55, 0x100000 );
+
+     /* Test pattern: 0xffffffff * x in unsigned 32 bit arithmetic, identical for every row. */
+     for (y=0; y<sh; y++) {
+          for (x=0; x<sw; x++) {
+               src[x + y*SW] = 0xffffffff * x;// 0x10010203 * x + 0x04202020 * (y + 1);
+          }
+     }
+
+
+     BRINTF("\n\n.======================== Testing stretch_32( %dx%d -> %dx%d ) ========================.\n", sw, sh, dw, dh);
+     BRINTF("\n");
+     BRINTF("SOURCE IMAGE (%dx%d) [%dx%d]\n", sw, sh, SW, SH);
+
+     for (y=0; y<SH; y++) {
+          for (x=0; x<SW; x++) {
+               BRINTF("%08x ", src[x + y*SW]);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+     BRINTF("\n");
+
+     memset( dst, 0x55, 0x100000 );
+     memset( cpu, 0x55, sizeof(cpu) );
+
+
+     t1 = direct_clock_get_abs_micros();
+
+     for (i=0; i<num; i++) {
+          c64xTask *task = c64x_get_task( c64x );
+
+          task->c64x_function = (down ?
+                                 C64X_STRETCH_32_down :
+                                 C64X_STRETCH_32_up ) | C64X_FLAG_TODO;
+
+          /* Buffer addresses, then pairs of 16 bit values packed into one argument each. */
+          task->c64x_arg[0] = DAVINCI_C64X_MEM + 0x1200000;
+          task->c64x_arg[1] = DAVINCI_C64X_MEM + 0x1000000;
+          task->c64x_arg[2] = (DW * 4) | ((SW * 4) << 16);
+          task->c64x_arg[3] = dh | (dw << 16);
+          task->c64x_arg[4] = sh | (sw << 16);
+          task->c64x_arg[5] = (dw - 1) | ((dh - 1) << 16);
+          task->c64x_arg[6] = 0 | (0 << 16);
+
+          c64x_submit_task( c64x, task );
+     }
+
+     davinci_c64x_write_back_all( c64x );
+
+     davinci_c64x_wait_low( c64x );
+
+     t2 = direct_clock_get_abs_micros();
+
+
+     BRINTF("-> DSP RESULT (%dx%d) [%dx%d]\n", dw, dh, DW, DH);
+
+     for (y=0; y<DH; y++) {
+          for (x=0; x<DW; x++) {
+               BRINTF("%08x ", dst[x + y*DW]);
+          }
+          BRINTF("\n");
+     }
+
+     BRINTF("\n");
+
+
+     /* Run the matching CPU implementation on the same source for comparison. */
+     {
+          StretchHVx func = (down ?
+                             stretch_32.f[DFB_PIXELFORMAT_INDEX(DSPF_ARGB)].down[STRETCH_NONE] :
+                             stretch_32.f[DFB_PIXELFORMAT_INDEX(DSPF_ARGB)].up[STRETCH_NONE]);
+          StretchCtx ctx  = { .clip = DFB_REGION_INIT_FROM_RECTANGLE_VALS( 0, 0, dw, dh ) };
+
+          func( cpu, DW * 4, src, SW * 4, sw, sh, dw, dh, &ctx );
+
+          BRINTF("-> CPU RESULT (%dx%d) [%dx%d]\n", dw, dh, DW, DH);
+
+          for (y=0; y<DH; y++) {
+               for (x=0; x<DW; x++) {
+                    BRINTF("%08x ", cpu[x + y*DW]);
+               }
+               BRINTF("\n");
+          }
+
+          BRINTF("\n");
+     }
+
+     dt    = t2 - t1;
+     total = num;
+
+     /* Report the variant that was actually benchmarked instead of always claiming "up". */
+     BRINTF( "BENCHMARK on DSP - %s %lld Calls/sec\n",
+             down ? "stretch_32_down" : "stretch_32_up", total * 1000000ULL / dt );
+}
+
+/*
+ * Runs bench_mem() three times on the given memory area, once for each of the flag
+ * combinations (false, false), (true, false) and (true, true), in that order.
+ */
+static inline void
+run_benchmarks( const char *name,
+                void       *ptr,
+                int         length )
+{
+     static const bool flags[][2] = {
+          { false, false },
+          { true,  false },
+          { true,  true  }
+     };
+     unsigned int      run;
+
+     for (run = 0; run < sizeof(flags) / sizeof(flags[0]); run++)
+          bench_mem( name, ptr, length, flags[run][0], flags[run][1] );
+}
+
+/**********************************************************************************************************************/
+/* Public Functions */
+/**********************************************************************************************************************/
+
+/*
+ * Opens the C64x+ DSP device and maps its queue control area and its memory.
+ *
+ * The device is opened via direct_try_open() using C64X_DEVICE / C64X_DEVICE0. C64X_QLEN bytes
+ * at file offset 0 are mapped as control area, C64X_MLEN bytes at file offset C64X_QLEN as
+ * memory. On success the file descriptor and both mappings are stored in 'c64x', QueueL is
+ * set to offset 0x01e00000 within the memory mapping and the magic of 'c64x' is set.
+ *
+ * If the environment variable C64X_TEST is set, the enabled self tests are run afterwards.
+ *
+ * Returns DFB_OK on success, DFB_IO if the device could not be opened, or the result derived
+ * from errno if one of the mappings failed (nothing stays open or mapped in that case).
+ */
+DFBResult
+davinci_c64x_open( DavinciC64x *c64x )
+{
+     DFBResult  ret;
+     int        fd;
+     void      *map_m;
+     void      *map_q = NULL;
+
+     /* Best effort creation of the device node; the result is deliberately ignored. */
+     mknod( C64X_DEVICE, 0666 | S_IFCHR, makedev( 400, 0 ) );
+
+     fd = direct_try_open( C64X_DEVICE, C64X_DEVICE0, O_RDWR, true );
+     if (fd < 0)
+          return DFB_IO;
+
+     map_q = mmap( NULL, C64X_QLEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
+     if (map_q == MAP_FAILED) {
+          /* Convert errno before anything else gets a chance to overwrite it. */
+          ret = errno2result( errno );
+          D_PERROR( "Davinci/C64X: Mapping %lu bytes at %lu via '%s' failed!\n", C64X_QLEN, 0UL, C64X_DEVICE );
+          goto error;
+     }
+
+//   run_benchmarks( "Queue", map_q, C64X_QLEN );
+
+     map_m = mmap( NULL, C64X_MLEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, C64X_QLEN );
+     if (map_m == MAP_FAILED) {
+          ret = errno2result( errno );
+          D_PERROR( "Davinci/C64X: Mapping %lu bytes at %lu via '%s' failed!\n", C64X_MLEN, C64X_QLEN, C64X_DEVICE );
+          goto error;
+     }
+
+//   run_benchmarks( "Memory", map_m, C64X_MLEN );
+
+     c64x->fd     = fd;
+     c64x->ctl    = map_q;
+     c64x->mem    = map_m;
+     c64x->QueueL = map_m + 0x01e00000;
+
+     D_INFO( "Davinci/C64X: Low ARM %d / DSP %d, High ARM %d / DSP %d\n",
+             c64x->ctl->QL_arm, c64x->ctl->QL_dsp, c64x->ctl->QH_arm, c64x->ctl->QH_dsp );
+
+     D_MAGIC_SET( c64x, DavinciC64x );
+
+     /* Optional self tests, selected at compile time by the #if switches below. */
+     if (getenv("C64X_TEST")) {
+//        test_load_block( c64x, false );
+
+//        test_load_block( c64x, true );
+
+//        bench_dither_argb( c64x );
+
+#if 0
+          bench_uyvy_1( c64x, true );
+          bench_uyvy_1( c64x, false );
+          bench_uyvy_2( c64x, true );
+          bench_uyvy_2( c64x, false );
+          bench_uyvy_3( c64x, true );
+          bench_uyvy_3( c64x, false );
+#endif
+
+#if 0
+          bench_blend_argb( c64x, 0 );
+          bench_blend_argb( c64x, 1 );
+          bench_blend_argb( c64x, 2 );
+          bench_blend_argb( c64x, 3 );
+#endif
+
+#if 0
+          bench_stretch_32( c64x, 2, 1, 16, 1 );
+          bench_stretch_32( c64x, 2, 2, 16, 2 );
+
+          bench_stretch_32( c64x, 2, 1, 3, 1 );
+          bench_stretch_32( c64x, 4, 1, 6, 1 );
+
+          bench_stretch_32( c64x, 3, 1, 2, 1 );
+          bench_stretch_32( c64x, 6, 1, 4, 1 );
+#endif
+
+#if 1
+          bench_fetch_uyvy( c64x, false, 0, 0 );
+          bench_fetch_uyvy( c64x, false, 1, 0 );
+          bench_fetch_uyvy( c64x, false, 0, 1 );
+          bench_fetch_uyvy( c64x, false, 1, 1 );
+          bench_fetch_uyvy( c64x, true, 0, 0 );
+          bench_fetch_uyvy( c64x, true, 1, 0 );
+          bench_fetch_uyvy( c64x, true, 0, 1 );
+          bench_fetch_uyvy( c64x, true, 1, 1 );
+#endif
+
+#if 0
+          bench_put_mc( c64x, false );
+          bench_put_mc( c64x, true );
+
+          bench_put_sum( c64x, false );
+          bench_put_sum( c64x, true );
+
+          bench_sat_mc( c64x );
+#endif
+
+#if 0
+          bench_mc( c64x, 32, 8, 8, false, "mc_put_o_8" );
+          bench_mc( c64x, 33, 8, 8, false, "mc_put_x_8" );
+          bench_mc( c64x, 34, 8, 8, false, "mc_put_y_8" );
+          bench_mc( c64x, 35, 8, 8, false, "mc_put_xy_8" );
+          bench_mc( c64x, 36, 16, 16, false, "mc_put_o_16" );
+          bench_mc( c64x, 37, 16, 16, false, "mc_put_x_16" );
+          bench_mc( c64x, 38, 16, 16, false, "mc_put_y_16" );
+          bench_mc( c64x, 39, 16, 16, false, "mc_put_xy_16" );
+#endif
+
+#if 0
+          bench_mc( c64x, 40, 8, 8, true, "mc_avg_o_8" );
+          bench_mc( c64x, 41, 8, 8, true, "mc_avg_x_8" );
+          bench_mc( c64x, 42, 8, 8, true, "mc_avg_y_8" );
+          bench_mc( c64x, 43, 8, 8, true, "mc_avg_xy_8" );
+          bench_mc( c64x, 44, 16, 16, true, "mc_avg_o_16" );
+          bench_mc( c64x, 45, 16, 16, true, "mc_avg_x_16" );
+          bench_mc( c64x, 46, 16, 16, true, "mc_avg_y_16" );
+          bench_mc( c64x, 47, 16, 16, true, "mc_avg_xy_16" );
+#endif
+
+#if 0
+          bench_div( c64x, 1, 3 );
+          bench_div( c64x, 1000, 333 );
+          bench_div( c64x, 1000, 334 );
+          bench_div( c64x, 6666, 2222 );
+          bench_div( c64x, 1234, 1234 );
+          bench_div( c64x, 4000, 0 );
+          bench_div( c64x, 5000, 0 );
+          bench_div( c64x, 10000, 3 );
+          bench_div( c64x, 14, 3 );
+          bench_div( c64x, 0x10000, 0x1000 );
+          bench_div( c64x, 0x1000, 0x100 );
+          bench_div( c64x, 0x100000, 2 );
+#endif
+     }
+
+     return DFB_OK;
+
+
+error:
+     /*
+      * Unmap the control area only if its mapping really exists: after a failed first mmap()
+      * map_q holds MAP_FAILED, which is not NULL and must not be handed to munmap().
+      */
+     if (map_q != NULL && map_q != MAP_FAILED)
+          munmap( map_q, C64X_QLEN );
+
+     close( fd );
+
+     return ret;
+}
+
+/*
+ * Releases what davinci_c64x_open() stored in 'c64x': unmaps the memory area (C64X_MLEN
+ * bytes) and the control area (C64X_QLEN bytes), closes the file descriptor and clears the
+ * magic. Always returns DFB_OK.
+ */
+DFBResult
+davinci_c64x_close( DavinciC64x *c64x )
+{
+     D_MAGIC_ASSERT( c64x, DavinciC64x );
+
+     void *mem_area = (void*) c64x->mem;
+     void *ctl_area = (void*) c64x->ctl;
+
+     /* Same order as before: memory first, then the control area, then the descriptor. */
+     munmap( mem_area, C64X_MLEN );
+     munmap( ctl_area, C64X_QLEN );
+
+     close( c64x->fd );
+
+     D_MAGIC_CLEAR( c64x );
+
+     return DFB_OK;
+}
+