From 7fe60435bce6595a9c58a9bfd8244d74b5320e96 Mon Sep 17 00:00:00 2001 From: Benjamin Franzke Date: Tue, 15 Jan 2013 08:46:13 +0100 Subject: Import DirectFB141_2k11R3_beta5 --- Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c | 2053 +++++++++++++++++++++ 1 file changed, 2053 insertions(+) create mode 100755 Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c (limited to 'Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c') diff --git a/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c b/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c new file mode 100755 index 0000000..431ffdd --- /dev/null +++ b/Source/DirectFB/gfxdrivers/davinci/davinci_c64x.c @@ -0,0 +1,2053 @@ +/* + TI Davinci driver - C64X+ DSP Library + + (c) Copyright 2008 directfb.org + (c) Copyright 2007 Telio AG + + Written by Denis Oliver Kropp and + Olaf Dreesen . + + All rights reserved. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + version 2 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. 
+*/ + +//#define DIRECT_ENABLE_DEBUG + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include "davinci_c64x.h" + + +/**********************************************************************************************************************/ + +#define C64X_DEVICE "/dev/c64x" +#define C64X_DEVICE0 "/dev/c64x0" +#define C64X_QLEN direct_page_align( sizeof(c64xTaskControl) ) +#define C64X_MLEN direct_page_align( 0x2000000 ) + +__attribute__((noinline)) +static void +davinci_c64x_queue_error( DavinciC64x *c64x, const char *msg ) +{ + c64xTaskControl *ctl = c64x->ctl; + uint32_t dsp = ctl->QL_dsp; + uint32_t arm = ctl->QL_arm; + uint32_t armp = (arm-1) & C64X_QUEUE_MASK; + c64xTask *dsp_task = &c64x->QueueL[dsp]; + c64xTask *arm_task = &c64x->QueueL[arm]; + c64xTask *armp_task = &c64x->QueueL[armp]; + + D_PERROR( "Davinci/C64X+: %s [DSP %d / %d (%s), ARM %d / %d (%s) <- %d / %d (%s)]\n", + msg, + dsp, + (dsp_task->c64x_function >> 2) & 0x3fff, + state_names[dsp_task->c64x_function & 3], + arm, + (arm_task->c64x_function >> 2) & 0x3fff, + state_names[arm_task->c64x_function & 3], + armp, + (armp_task->c64x_function >> 2) & 0x3fff, + state_names[armp_task->c64x_function & 3] ); +} + +/* + +1. Idle Case + + ARM ARM + DSP DSP + | . . . . . . . . | | . . . . . . . . | free = length-1 + + +2. Busy Case (ARM after) + + ARM ARM + DSP DSP + | o . . . . . . . | | . o o . . . . . | free = length-1 - arm + dsp + + +3. Busy Case (ARM before) + + ARM ARM + DSP DSP + | . . . . . o o o | | o o . . . . . o | free = dsp - arm - 1 + + +4. Full Case (ARM after) + + ARM + DSP + | o o o o o o o . | free = 0 + + +5. Full Case (ARM before) + + ARM ARM + DSP DSP + | o o o o o . o o | | . 
o o o o o o o | free = 0 + +*/ + +DFBResult +davinci_c64x_emit_tasks( DavinciC64x *c64x, + DavinciC64xTasks *tasks, + DavinciC64xEmitFlags flags ) +{ + c64xTaskControl *ctl = c64x->ctl; + uint32_t arm = ctl->QL_arm; + unsigned int emitted = 0; + unsigned int timeout = 23; + + D_MAGIC_ASSERT( tasks, DavinciC64xTasks ); + + while (emitted < tasks->num_tasks) { + uint32_t dsp = ctl->QL_dsp; + int free; + + if (arm == dsp) + free = C64X_QUEUE_LENGTH - 1; + else if (arm > dsp) + free = C64X_QUEUE_LENGTH - 1 - arm + dsp; + else + free = dsp - arm - 1; + + if (free) { + int emit = MIN( free, tasks->num_tasks - emitted ); + int copy = MIN( emit, C64X_QUEUE_LENGTH - arm ); + + memcpy( (void*) &c64x->QueueL[arm], (void*) &tasks->tasks[emitted], sizeof(c64xTask) * copy ); + + if (copy < emit) { + memcpy( (void*) &c64x->QueueL[0], (void*) &tasks->tasks[emitted+copy], sizeof(c64xTask) * (emit - copy) ); + + arm = (emit - copy); + } + else + arm = (arm + copy) & C64X_QUEUE_MASK; + + mb(); + + ctl->QL_arm = arm; + + mb(); + + emitted += emit; + + timeout = 23; + } + else { + if (!timeout--) { + davinci_c64x_queue_error( c64x, "Emit Timeout!" 
); + return DFB_TIMEOUT; + } + + usleep( 7000 ); + } + } + + if (flags & C64X_TEF_RESET) + tasks->num_tasks = 0; + + return DFB_OK; +} + +DFBResult +davinci_c64x_tasks_init( DavinciC64xTasks *tasks, + unsigned int size ) +{ + tasks->tasks = D_MALLOC( sizeof(c64xTask) * size ); + if (!tasks->tasks) + return D_OOM(); + + tasks->max_tasks = size; + tasks->num_tasks = 0; + + D_MAGIC_SET( tasks, DavinciC64xTasks ); + + return DFB_OK; +} + +DFBResult +davinci_c64x_tasks_destroy( DavinciC64xTasks *tasks ) +{ + D_MAGIC_ASSERT( tasks, DavinciC64xTasks ); + D_ASSERT( tasks->tasks != NULL ); + + D_FREE( (void*) tasks->tasks ); + + tasks->tasks = NULL; + + D_MAGIC_CLEAR( tasks ); + + return DFB_OK; +} + +DFBResult +davinci_c64x_wait_low( DavinciC64x *c64x ) +{ + DFBResult ret; + c64xTaskControl *ctl = c64x->ctl; + + while (ctl->QL_dsp != ctl->QL_arm) { + c64xTask *task = c64x_get_task( c64x ); + + task->c64x_function = C64X_FLAG_TODO | C64X_FLAG_INTERRUPT; + + c64x_submit_task( c64x, task ); + + if (ioctl( c64x->fd, C64X_IOCTL_WAIT_LOW )) { + c64xTask *dsp_task = &c64x->QueueL[ctl->QL_dsp]; + + ret = errno2result( errno ); + D_PERROR( "Davinci/C64X+: C64X_IOCTL_WAIT_LOW failed! [DSP %d / %d (%s), ARM %d / %d (%s)]\n", + ctl->QL_dsp, + (dsp_task->c64x_function >> 2) & 0x3fff, + state_names[dsp_task->c64x_function & 3], + ctl->QL_arm, + (task->c64x_function >> 2) & 0x3fff, + state_names[task->c64x_function & 3] ); + return ret; + } + } + + return DFB_OK; +} + +/**********************************************************************************************************************/ +/* Benchmarking or Testing */ +/**********************************************************************************************************************/ + +#if 1 +#define BRINTF(x...) do { direct_log_printf( NULL, x ); } while (0) +#else +#define BRINTF(x...) 
printf( x ) +#endif + +static void +bench_mem( const char *name, + void *ptr, + int length, + bool copy, + bool from ) +{ + int i, num; + long long t1, t2, dt, total; + char buf[0x100]; + + if (length > sizeof(buf)) + length = sizeof(buf); + + num = 0x2000000 / length; + + t1 = direct_clock_get_abs_micros(); + + if (copy) { + if (from) + for (i=0; imem + 0x01000000; + int *src = c64x->mem + 0x01100000; + + +#if 0 + src[num++] = DVA_BLOCK_WORD( 100, 0, 1 ); + src[num++] = DVA_BLOCK_WORD( 200, 0, 0 ); + src[num++] = DVA_BLOCK_WORD( 210, 1, 0 ); + src[num++] = DVA_BLOCK_WORD( 220, 2, 1 ); + src[num++] = DVA_BLOCK_WORD( 300, 0, 1 ); + src[num++] = DVA_BLOCK_WORD( 400, 0, 0 ); + src[num++] = DVA_BLOCK_WORD( 410, 1, 1 ); + src[num++] = DVA_BLOCK_WORD( 500, 0, 0 ); + src[num++] = DVA_BLOCK_WORD( 510, 63, 1 ); + src[num++] = DVA_BLOCK_WORD( 600, 63, 1 ); +#else + src[num++] = DVA_BLOCK_WORD(136, 0, 0); + src[num++] = DVA_BLOCK_WORD(-12, 8, 0); + src[num++] = DVA_BLOCK_WORD(7, 16, 0); + src[num++] = DVA_BLOCK_WORD(-2, 24, 1); + + src[num++] = DVA_BLOCK_WORD(136, 0, 0); + src[num++] = DVA_BLOCK_WORD(-12, 8, 0); + src[num++] = DVA_BLOCK_WORD(7, 16, 0); + src[num++] = DVA_BLOCK_WORD(-2, 24, 1); + + + src[num++] = DVA_BLOCK_WORD(1076, 0, 0); + src[num++] = DVA_BLOCK_WORD(-204, 8, 0); + src[num++] = DVA_BLOCK_WORD(-168, 16, 0); + src[num++] = DVA_BLOCK_WORD(-129, 24, 0); + src[num++] = DVA_BLOCK_WORD(-100, 32, 0); + src[num++] = DVA_BLOCK_WORD(-40, 40, 0); + src[num++] = DVA_BLOCK_WORD(-14, 48, 1); +#if 1 + src[num++] = DVA_BLOCK_WORD(1068, 0, 0); + src[num++] = DVA_BLOCK_WORD(2, 1, 0); + src[num++] = DVA_BLOCK_WORD(-202, 8, 0); + src[num++] = DVA_BLOCK_WORD(-168, 16, 0); + src[num++] = DVA_BLOCK_WORD(-2, 9, 0); + src[num++] = DVA_BLOCK_WORD(-129, 24, 0); + src[num++] = DVA_BLOCK_WORD(-97, 32, 0); + src[num++] = DVA_BLOCK_WORD(-40, 40, 0); + src[num++] = DVA_BLOCK_WORD(-13, 48, 1); +#else + src[num++] = DVA_BLOCK_WORD(1068, 0, 0); +// src[num++] = DVA_BLOCK_WORD(2, 1, 0); + 
src[num++] = DVA_BLOCK_WORD(-202, 8, 0); + src[num++] = DVA_BLOCK_WORD(-1, 16, 0); +// src[num++] = DVA_BLOCK_WORD(-2, 9, 0); + src[num++] = DVA_BLOCK_WORD(-1, 24, 0); + src[num++] = DVA_BLOCK_WORD(-97, 32, 1); +// src[num++] = DVA_BLOCK_WORD(-40, 40, 0); +// src[num++] = DVA_BLOCK_WORD(-13, 48, 1); +#endif + + src[num++] = DVA_BLOCK_WORD(1048, 0, 0); + src[num++] = DVA_BLOCK_WORD(-26, 8, 0); + src[num++] = DVA_BLOCK_WORD(4, 16, 0); + src[num++] = DVA_BLOCK_WORD(5, 24, 0); + src[num++] = DVA_BLOCK_WORD(-4, 32, 1); + + src[num++] = DVA_BLOCK_WORD(996, 0, 0); + src[num++] = DVA_BLOCK_WORD(24, 8, 0); + src[num++] = DVA_BLOCK_WORD(-2, 24, 0); + src[num++] = DVA_BLOCK_WORD(3, 32, 0); + src[num++] = DVA_BLOCK_WORD(-4, 48, 1); +#endif + + BRINTF("\n"); + BRINTF("\n\n.======================== Testing load_block (dct_type_interlaced: %s) ========================.\n", + dct_type_interlaced ? "yes" : "no"); + BRINTF("\n"); + BRINTF( "SOURCE (DVABlockWords)\n" ); + BRINTF("\n"); + + for (i=0; i> 16, (src[i] >> 1) & 0x3f, src[i] & 1); + + BRINTF("\n\n"); + + + memset( dst, 0x55, 0x100000 ); + + + // test routine + davinci_c64x_load_block( c64x, DAVINCI_C64X_MEM+0x01100000, 10, dct_type_interlaced ? 
0x7f : 0x3f ); + + // copy idct buffer to memory where we can read it + davinci_c64x_blit_16( c64x, DAVINCI_C64X_MEM+0x01000000, 0, 0xf065c0, 0, 16 * 24, 1 ); + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + + BRINTF( "-> IDCT BUFFER (16x16 + [ 8x8 8x8 ] shorts)\n" ); + BRINTF("\n"); + + for (i=0; i<16*24; i++) { + BRINTF("%5d ", dst[i] ); + if ((i&15)==15) { + BRINTF("\n"); + } + if ((i&255)==255) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); + +#if 1 + s16 *blocks = c64x->mem + 0x01200000; + int offset = 0; + + memset( blocks, 0, 1024 ); + + for (i=0; i> 1) & 0x3f)] = src[i] >> 16; + + if (src[i] & 1) + offset += 64; + } + + memset( dst, 0x55, 0x100000 ); + + // test routine + for (i=0; i<6; i++) + davinci_c64x_dva_idct( c64x, DAVINCI_C64X_MEM+0x01000000 + i*128, 16, DAVINCI_C64X_MEM+0x01200000 + i*128 ); + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + + BRINTF( "-> SINGLE IDCT (59) BLOCKS (6x 8x8 shorts)\n" ); + BRINTF("\n"); + + for (i=0; i<6*64; i++) { + BRINTF("%5d ", dst[i] ); + if ((i&7)==7) { + BRINTF("\n"); + } + if ((i&63)==63) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); +#endif + +#if 0 +// s16 blocks[384]; +// int offset = 0; + offset = 0; + + memset( blocks, 0, 1024 ); + + for (i=0; i> 1) & 0x3f)] = src[i] >> 16; + + if (src[i] & 1) { + int n; + + for (n = 0; n < 8; n++) + idct_row (blocks + offset + 8 * n); + + for (n = 0; n < 8; n++) + idct_col (blocks + offset + n); + + offset += 64; + } + } + + BRINTF( "-> REFERENCE IDCT BLOCKS (6x 8x8 shorts)\n" ); + BRINTF("\n"); + + for (i=0; i<6*64; i++) { + BRINTF("%5d ", blocks[i] ); + if ((i&7)==7) { + BRINTF("\n"); + } + if ((i&63)==63) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); +#endif +} + +static inline void +bench_dezigzag( DavinciC64x *c64x ) +{ + int i, num; + long long t1, t2, dt, total; + //int length = 0x10000; + + num = 0x200000;// / length; + + short *p = 
c64x->mem + 0x1000000; + + for (i=0; i<64; i++) { + p[i] = i; + BRINTF("%3d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_DEZIGZAG | C64X_FLAG_TODO; + + task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000; + task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x000000; + //task->c64x_arg[2] = length/4; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + p = c64x->mem + 0x1200000; + for (i=0; i<64; i++) { + BRINTF("%3d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + dt = t2 - t1; + total = num;// * length; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "de_zigzag()", total * 1000000ULL / dt ); +} + +#define DUMP_PIXELS 1 + +static inline void +bench_blend_argb( DavinciC64x *c64x, int sub ) +{ + int i, num; + long long t1, t2, dt, total; + + num = 1;//0x20000; + + u32 *src = c64x->mem + 0x1000000; + u32 *dst = c64x->mem + 0x1200000; + + BRINTF( "\nTESTING BLEND_32 SUB %d\n", sub ); + + BRINTF( "\nSOURCE " ); + + for (i=0; i> 24, (src[i] >> 16) & 0xff, (src[i] >> 8) & 0xff, src[i] & 0xff); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF( "\nDESTINATION " ); + + for (i=0; i> 24, (dst[i] >> 16) & 0xff, (dst[i] >> 8) & 0xff, dst[i] & 0xff); + if (i%8==7) { + BRINTF("\n"); + } + } + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_BLEND_32 | C64X_FLAG_TODO | (sub << 16); + + task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000; + task->c64x_arg[1] = 32; + task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000)+0x000000; + task->c64x_arg[3] = 32; + task->c64x_arg[4] = 8; + task->c64x_arg[5] = 8; + task->c64x_arg[6] = 0x80; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + BRINTF( "\n\nDESTINATION (AFTER) " ); + + for (i=0; i> 24, (dst[i] >> 16) & 0xff, (dst[i] >> 8) & 0xff, dst[i] & 0xff); + if 
(i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "blend_32(8x8)", total * 1000000ULL / dt ); +} + +static inline void +bench_fetch_uyvy( DavinciC64x *c64x, bool interleave, int xoff, int yoff ) { + int i, x, y, num=1; + long long t1, t2, dt, total; + u8 *yuv = c64x->mem + 0x1000000; + u8 *src = c64x->mem + 0x1200000; + + BRINTF("\n\n\n.======================== Testing fetch_uyvy (inter %d, xoff %d, yoff %d) ========================.\n\n", + interleave, xoff, yoff); + + for (y=0; y<20; y++) { + for (x=0; x<40; x++) { + int val = (y*40)+x; + src[y*1440 + x] = val; + BRINTF("%02x ", val&0xff); + } + BRINTF("\n"); + } + BRINTF("\n"); + + memset( yuv, 0xAA, 0x100000 ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x000000; + task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x200000 + yoff*1440 + xoff * 2; + task->c64x_arg[2] = 1440; + + task->c64x_function = (21 << 2) | C64X_FLAG_TODO; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + BRINTF( "\n\nDESTINATION\n\nY:\n" ); + for (y=0;y<27;y++) { + if (y==18) BRINTF("\nUV:\n"); + for (x=0;x<32;x++) { + BRINTF("%02x ",yuv[y*32+x]); + } + BRINTF("\n"); + } + + dt = t2 - t1; + total = num; + + D_INFO("\n\nDavinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "blend_fetch_uyvy(16x16)", total * 1000000ULL / dt ); +} +#if 0 +static inline void +bench_fetch_uyvy( DavinciC64x *c64x, bool interleave, int xoff, int yoff ) +{ + int i, x, y, num; + long long t1, t2, dt, total; + + num = 1;//0x20000; + + u8 *yuv = c64x->mem + 0x1000000; + u8 *src = c64x->mem + 0x1200000; + + BRINTF("\n"); + BRINTF("\n\n.======================== Testing fetch_uyvy (inter %d, xoff %d, yoff %d) ========================.\n", + interleave, 
xoff, yoff); + BRINTF("\n"); + BRINTF( "SOURCE (20x20)\n" ); + + for (y=0; y<20; y++) { + for (x=0; x<40; x++) { + int val = (x & 1) ? (x * 4 + y*0x10) : (x/4 + 0x40 + (x&2) * 0x10 + y*0x08); + + src[y*1440 + x] = val; + + BRINTF("%02x ", val&0xff); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + memset( yuv, 0x55, 0x100000 ); + + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = (19 << 2) | C64X_FLAG_TODO; + + task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x000000; + task->c64x_arg[1] = (DAVINCI_C64X_MEM+0x01000000)+0x200000 + yoff*1440 + xoff * 2; + task->c64x_arg[2] = 1440; + task->c64x_arg[3] = 16; + task->c64x_arg[4] = interleave ? 1 : 0; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + BRINTF( "\n\nDESTINATION (17x18 / [9x9 9x9])\n" ); + + for (y=0; y<18; y++) { + for (x=0; x<17; x++) { + BRINTF("%02x ", yuv[y*32 + x]); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<9; y++) { + for (x=0; x<9; x++) { + BRINTF("%02x ", yuv[y*32 + x + 32*18]); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<9; y++) { + for (x=0; x<9; x++) { + BRINTF("%02x ", yuv[y*32 + x + 32*18+16]); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + BRINTF("\n\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "blend_fetch_uyvy(16x16)", total * 1000000ULL / dt ); +} +#endif + +#if 0 +static inline void +bench_put_idct( DavinciC64x *c64x, int dct_type ) +{ + int i, num; + long long t1, t2, dt, total; + //int length = 0x10000; + + num = 0x10000;// / length; + + u8 *dst = c64x->mem + 0x01000000; + int *src = c64x->mem + 0x01200000; + + src[0] = DVA_BLOCK_WORD( 100, 0, 1 ); + src[1] = DVA_BLOCK_WORD( 200, 0, 0 ); + src[2] = DVA_BLOCK_WORD( 210, 1, 0 ); + src[3] = DVA_BLOCK_WORD( 220, 2, 1 ); + src[4] = DVA_BLOCK_WORD( 300, 0, 1 ); + src[5] = DVA_BLOCK_WORD( 400, 0, 0 ); + src[6] = 
DVA_BLOCK_WORD( 410, 1, 1 ); + src[7] = DVA_BLOCK_WORD( 500, 0, 0 ); + src[8] = DVA_BLOCK_WORD( 510, 63, 1 ); + src[9] = DVA_BLOCK_WORD( 600, 63, 1 ); + + BRINTF("\n"); + BRINTF("\n\n.======================== Testing put_idct (%d) ========================.\n", dct_type); + BRINTF("\n"); + + memset( dst, 0x55, 0x100000 ); + + for (i=0; i<10; i++) { + BRINTF("0x%08x (%d, %d, %d)\n", (u32)src[i], src[i] >> 16, (src[i] >> 1) & 0x3f, src[i] & 1); + } + + BRINTF("\n"); + + t1 = direct_clock_get_abs_micros(); + + { + c64xTask *task = c64x_get_task( c64x ); + + task->c64x_function = C64X_LOAD_BLOCK | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1200000; + task->c64x_arg[1] = 10; + task->c64x_arg[2] = 0x3f; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_blit_16( c64x, (DAVINCI_C64X_MEM+0x01000000), 0, 0xf06180, 0, 384, 1 ); + davinci_c64x_blit_16( c64x, (DAVINCI_C64X_MEM+0x01100000), 0, 0xf06480, 0, 384/2, 1 ); + + davinci_c64x_put_uyvy_16x16( c64x, (DAVINCI_C64X_MEM+0x01300000), 32, 0xf06180, 0 ); + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + for (i=0; i<384; i++) { + BRINTF("%5d ", dst[i] ); + if (i%8==7) { + BRINTF("\n"); + } + if (i%64==63) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); + + + for (i=0; i<384; i++) { + BRINTF("%3d ", duv[i] ); + if (i%8==7) { + BRINTF("\n"); + } + if (i%64==63) { + BRINTF("\n"); + } + } + + BRINTF("\n\n"); + + for (i=0; i<16*16*2; i++) { + BRINTF("%02x ", duy[i]); + + if (i%32==31) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + dt = t2 - t1; + total = num;// * length; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "block_load()", total * 1000000ULL / dt ); +} +#endif + +static inline void +bench_put_mc( DavinciC64x *c64x, bool interleave ) +{ + int x, y, i, num; + long long t1, t2, dt, total; + + num = 1;//720/16*576/16; + + u8 *dst = c64x->mem + 0x1000000; + u8 *src = c64x->mem + 0x1200000; + + BRINTF("\n"); + 
BRINTF("\n\n.======================== Testing put_mc (%d) ========================.\n", interleave); + BRINTF("\n"); + BRINTF("SOURCE (16x16 / [8x8 8x8]\n"); + + for (y=0; y<16; y++) { + for (x=0; x<16; x++) { + u8 val = (x << 4) + y; + src[y*16 + x] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16 + 8] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + memset( dst, 0x55, 0x100000 ); + + davinci_c64x_blit_32( c64x, C64X_MC_BUFFER_Y, 16, DAVINCI_C64X_MEM+0x1200000, 16, 4, 24 ); + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_PUT_MC_UYVY_16x16 | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000; + task->c64x_arg[1] = 1440; + task->c64x_arg[2] = interleave ? 
1 : 0; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + BRINTF("\n"); + BRINTF("DESTINATION (16x16 UYVY)\n"); + + for (y=0; y<16; y++) { + for (x=0; x<32; x++) + BRINTF("%02x ", dst[y*1440 + x]); + + BRINTF("\n"); + } + + BRINTF("\n\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "put_mc_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_put_sum( DavinciC64x *c64x, bool interleave ) +{ + int x, y, i, num; + long long t1, t2, dt, total; + + num = 1;//720/16*576/16; + + u8 *dst = c64x->mem + 0x1000000; + u8 *src = c64x->mem + 0x1200000; + u32 *words = c64x->mem + 0x1100000; + + BRINTF("\n"); + BRINTF("\n\n.======================== Testing put_sum (%d) ========================.\n", interleave); + BRINTF("\n"); + BRINTF("WORDS (6x IDCT with one value)\n"); + + words[0] = DVA_BLOCK_WORD( 0, 0, 1 ); + words[1] = DVA_BLOCK_WORD( 50, 0, 1 ); + words[2] = DVA_BLOCK_WORD( 100, 0, 1 ); + words[3] = DVA_BLOCK_WORD( 150, 0, 1 ); + words[4] = DVA_BLOCK_WORD( 200, 0, 1 ); + words[5] = DVA_BLOCK_WORD( 250, 0, 1 ); + + BRINTF("\n"); + BRINTF("\n"); + + memset( dst, 0x55, 0x100000 ); + + for (i=0; i<6; i++) { + BRINTF("0x%08x (%d, %d, %d)\n", (u32)words[i], words[i] >> 16, (words[i] >> 1) & 0x3f, words[i] & 1); + } + + { + c64xTask *task = c64x_get_task( c64x ); + + task->c64x_function = C64X_LOAD_BLOCK | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1100000; + task->c64x_arg[1] = 6; + task->c64x_arg[2] = 0x3f; + + c64x_submit_task( c64x, task ); + } + + BRINTF("\n"); + BRINTF("SOURCE (16x16 / [8x8 8x8]\n"); + + for (y=0; y<16; y++) { + for (x=0; x<16; x++) { + u8 val = (x << 4) + y; + src[y*16 + x] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16] = val; + 
BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16 + 8] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + memset( dst, 0x55, 0x100000 ); + + davinci_c64x_blit_32( c64x, C64X_MC_BUFFER_Y, 16, DAVINCI_C64X_MEM+0x1200000, 16, 4, 24 ); + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_PUT_SUM_UYVY_16x16 | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000; + task->c64x_arg[1] = 1440; + task->c64x_arg[2] = interleave ? 1 : 0; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + BRINTF("\n"); + BRINTF("DESTINATION (16x16 UYVY)\n"); + + for (y=0; y<16; y++) { + for (x=0; x<32; x++) + BRINTF("%02x ", dst[y*1440 + x]); + + BRINTF("\n"); + } + + BRINTF("\n\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "put_sum_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_sat_mc( DavinciC64x *c64x ) +{ + int x, y, i, num; + long long t1, t2, dt, total; + + num = 1;//720/16*576/16; + + u8 *dst = c64x->mem + 0x1000000; + u8 *src = c64x->mem + 0x1200000; + + BRINTF("\n\n.======================== Testing sat_mc ========================.\n"); + BRINTF("\n"); + BRINTF("SOURCE (16x16 / [8x8 8x8]\n"); + + for (y=0; y<16; y++) { + for (x=0; x<16; x++) { + u8 val = (x << 4) + y; + src[y*16 + x] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16] = val; + BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) { + u8 val = (x << 4) + y*2; + src[y*16 + x + 16*16 + 8] = val; + 
BRINTF("%02x ", val); + } + + BRINTF("\n"); + } + + BRINTF("\n"); + + memset( dst, 0x55, 0x100000 ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = (57 << 2) | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x1000000; + task->c64x_arg[1] = DAVINCI_C64X_MEM+0x1200000; + task->c64x_arg[2] = 16; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + BRINTF("\n"); + BRINTF("DESTINATION (16x16 / [8x8 8x8]\n"); + + for (y=0; y<16; y++) { + for (x=0; x<16; x++) + BRINTF("%02x ", dst[y*16 + x]); + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) + BRINTF("%02x ", dst[y*16 + x + 16*16]); + + BRINTF("\n"); + } + + BRINTF("\n"); + + for (y=0; y<8; y++) { + for (x=0; x<8; x++) + BRINTF("%02x ", dst[y*16 + x + 16*16 + 8]); + + BRINTF("\n"); + } + + BRINTF("\n\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "sat_mc_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_uyvy_1( DavinciC64x *c64x, bool progressive ) +{ + c64xTask *task; + int i, num; + long long t1, t2, dt, total; + + num = 720/16*576/16; + + u8 *u = c64x->mem + 0x1200000; + u8 *p = c64x->mem + 0x1000000; + + BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n", + progressive ? 
"progressive" : "interlaced"); + + for (i=0; i<256; i++) { + p[i] = i - 128; + BRINTF("Y%-3d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[256+i] = i-32; + BRINTF("U%-3d ", p[256+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[320+i] = i-32; + BRINTF("V%-3d ", p[320+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + for (i=0; i<384; i++) { + BRINTF("%4d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + memset( u, 0x55, 720*576*2 ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO; + + task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2; + task->c64x_arg[1] = 720 * 2; + task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000); + task->c64x_arg[3] = 0; + + c64x_submit_task( c64x, task ); + } + + BRINTF("\n"); + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + for (i=0; i<16*16*2; i++) { + BRINTF("%02x ", u[i/32*720*2 + i%32]); + + if (i%32==31) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "put_uyvy_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_uyvy_2( DavinciC64x *c64x, bool progressive ) +{ + c64xTask *task; + int i, num; + long long t1, t2, dt, total; + + num = 1;//720/16*576/16; + + u8 *u = c64x->mem + 0x0200000; + u8 *p = c64x->mem + 0x0000000; + + BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n", + progressive ? 
"progressive" : "interlaced"); + + for (i=0; i<256; i++) { + p[i] = i/8; + BRINTF("Y%-3d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[256+i] = i/8 + 128; + BRINTF("U%-3d ", p[256+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[320+i] = i/8 + 240; + BRINTF("V%-3d ", p[320+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + for (i=0; i<384; i++) { + BRINTF("%4d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + memset( u, 0x55, 720*576*2 ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM+0x200000+i*16*16*2; + task->c64x_arg[1] = 720 * 2; + task->c64x_arg[2] = DAVINCI_C64X_MEM; + task->c64x_arg[3] = 0; + + c64x_submit_task( c64x, task ); + } + + BRINTF("\n"); + + davinci_c64x_write_back_all( c64x ); + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + for (i=0; i<16*16*2; i++) { + BRINTF("%02x ", u[i/32*720*2 + i%32]); + + if (i%32==31) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "put_uyvy_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_uyvy_3( DavinciC64x *c64x, bool progressive ) +{ + c64xTask *task; + int i, num; + long long t1, t2, dt, total; + + num = 1;//720/16*576/16; + + u8 *u = c64x->mem + 0x1200000; + u8 *p = c64x->mem + 0x1000000; + + BRINTF("\n\n\n.======================== Testing put_uyvy (%s) ========================.\n\n", + progressive ? 
"progressive" : "interlaced"); + + for (i=0; i<256; i++) { + p[i] = i%8; + BRINTF("Y%-3d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[256+i] = i%8 + 128; + BRINTF("U%-3d ", p[256+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + for (i=0; i<64; i++) { + p[320+i] = i%8 + 240; + BRINTF("V%-3d ", p[320+i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + for (i=0; i<384; i++) { + BRINTF("%4d ", p[i]); + if (i%8==7) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + memset( u, 0x55, 720*576*2 ); + + t1 = direct_clock_get_abs_micros(); + + for (i=0; ic64x_function = C64X_PUT_UYVY_16x16 | C64X_FLAG_TODO; + + task->c64x_arg[0] = (DAVINCI_C64X_MEM+0x01000000)+0x200000+i*16*16*2; + task->c64x_arg[1] = 720 * 2; + task->c64x_arg[2] = (DAVINCI_C64X_MEM+0x01000000); + task->c64x_arg[3] = 0; + + c64x_submit_task( c64x, task ); + } + + BRINTF("\n"); + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + for (i=0; i<16*16*2; i++) { + BRINTF("%02x ", u[i/32*720*2 + i%32]); + + if (i%32==31) { + BRINTF("\n"); + } + } + + BRINTF("\n"); + + dt = t2 - t1; + total = num; + + D_INFO( "Davinci/C64X: BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "put_uyvy_16x16()", total * 1000000ULL / dt ); +} + +static inline void +bench_mc( DavinciC64x *c64x, int func, int w, int h, bool avg, const char *name ) +{ + int i, x, y, num; + long long t1, t2, dt, total; + + num = 0x1;//0000; + + u8 *dst = c64x->mem + 0x1200000; + u8 *dsr = c64x->mem + 0x1100000; + u8 *src = c64x->mem + 0x1000000; + + BRINTF("\n\n.============ Testing %s ============.\n", name); + BRINTF("\n"); + BRINTF("SRC REF\n"); + + for (y=0; yc64x_function = (func << 2) | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM + 0x01200000; + task->c64x_arg[1] = 32; + task->c64x_arg[2] = DAVINCI_C64X_MEM + 0x01000000; + task->c64x_arg[3] = DAVINCI_C64X_MEM + 0x01100000; + task->c64x_arg[4] = 32; + task->c64x_arg[5] = h; + + c64x_submit_task( c64x, task ); + } + + 
davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + BRINTF("-> DST\n"); + + for (y=0; yc64x_function = (63 << 2) | C64X_FLAG_TODO; + + task->c64x_arg[0] = nom; + task->c64x_arg[1] = den; + + c64x_submit_task( c64x, task ); + + davinci_c64x_wait_low( c64x ); + + BRINTF("%x / %x = %x\n\n\n", nom, den, task->c64x_return); +} + +static inline void +bench_dither_argb( DavinciC64x *c64x ) +{ + int i, x, y, num, w = 8, h = 17; + long long t1, t2, dt, total; + + num = 0x10000; + + u16 *dr = c64x->mem + 0x1200000; + u8 *da = c64x->mem + 0x1100000; + u32 *src = c64x->mem + 0x1000000; + + BRINTF("\n\n.======================== Testing dither_argb ========================.\n"); + BRINTF("\n"); + BRINTF("SOURCE ARGB\n"); + + for (y=0; yc64x_function = C64X_DITHER_ARGB | C64X_FLAG_TODO; + + task->c64x_arg[0] = DAVINCI_C64X_MEM + 0x01200000; + task->c64x_arg[1] = DAVINCI_C64X_MEM + 0x01100000; + task->c64x_arg[2] = 64; + task->c64x_arg[3] = DAVINCI_C64X_MEM + 0x01000000; + task->c64x_arg[4] = 128; + task->c64x_arg[5] = w; + task->c64x_arg[6] = h; + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + BRINTF("-> DST RGB\n"); + + for (y=0; y DST ALPHA\n"); + + for (y=0; y> 4); + } + BRINTF("\n"); + } + + BRINTF("\n"); + + dt = t2 - t1; + total = num; + + BRINTF( "BENCHMARK on DSP - %-15s %lld Calls/sec\n", + "dither_argb", total * 1000000ULL / dt ); +} + + + + +/**********************************************************************************************************************/ +/*** 32 bit scaler ****************************************************************************************************/ +/**********************************************************************************************************************/ + +typedef struct { + DFBRegion clip; + const void *colors; + ulong protect; + ulong key; +} StretchCtx; + +typedef void (*StretchHVx)( void *dst, + int dpitch, + const void *src, + 
int spitch, + int width, + int height, + int dst_width, + int dst_height, + const StretchCtx *ctx ); + +/* StretchHVx, whose parameter list ends above, is the signature of the routines stored in StretchFunctionTable; the StretchCtx it receives by const pointer carries a clip region, a colors pointer and the protect and key values. The STRETCH_ constants below index the up[] and down[] arrays: SRCKEY_PROTECT (3) equals SRCKEY (1) combined with PROTECT (2), and STRETCH_NUM is the number of variants. */ #define STRETCH_NONE 0 +#define STRETCH_SRCKEY 1 +#define STRETCH_PROTECT 2 +#define STRETCH_SRCKEY_PROTECT 3 +#define STRETCH_NUM 4 + +/* Table of stretch routines: f[] has DFB_NUM_PIXELFORMATS entries, each holding one up and one down routine per STRETCH_ variant. */ typedef struct { + struct { + StretchHVx up[STRETCH_NUM]; + StretchHVx down[STRETCH_NUM]; + } f[DFB_NUM_PIXELFORMATS]; +} StretchFunctionTable; + + +/* Parameter macros consumed by the header included below and undefined again right after it. FUNC_NAME pastes its arguments into names of the form stretch_32_UPDOWN_KP_F. NOTE(review): the file name after the include directive is missing in this text. */ #define DST_FORMAT DSPF_ARGB +#define TABLE_NAME stretch_32 +#define FUNC_NAME(UPDOWN,K,P,F) stretch_32_ ## UPDOWN ## _ ## K ## P ## _ ## F +#define SHIFT_R8 8 +#define SHIFT_L8 8 +#define X_00FF00FF 0x00ff00ff +#define X_FF00FF00 0xff00ff00 +#define MASK_RGB 0x00ffffff +#define HAS_ALPHA + +#include + +#undef DST_FORMAT +#undef TABLE_NAME +#undef FUNC_NAME +#undef SHIFT_R8 +#undef SHIFT_L8 +#undef X_00FF00FF +#undef X_FF00FF00 +#undef MASK_RGB +#undef HAS_ALPHA + + +/* Self-test and benchmark for the 32 bit stretch on the DSP, scaling an sw x sh source to a dw x dh destination. down is true only when both destination dimensions are smaller than the source ones; it selects C64X_STRETCH_32_down instead of C64X_STRETCH_32_up. With the padded variant disabled by the preprocessor, SW, SH, DW and DH simply equal sw, sh, dw and dh. dst and src point at offsets 0x1200000 and 0x1000000 of c64x->mem, and the first 0x100000 bytes at src are filled with 0x55. NOTE(review): cpu[] is an automatic variable length array of DW * DH elements, and parts of the loops below are missing in this text. */ static inline void +bench_stretch_32( DavinciC64x *c64x, int sw, int sh, int dw, int dh ) +{ + int i, x, y, num; + long long t1, t2, dt, total; + bool down = (dw < sw) && (dh < sh); + +#if 0 + int SW = (sw + 5) & ~3; + int SH = (sh + 5) & ~3; + int DW = (dw + 5) & ~3; + int DH = (dh + 5) & ~3; +#else + int SW = sw; + int SH = sh; + int DW = dw; + int DH = dh; +#endif + + num = 1;//0x10000; + + u32 cpu[DW * DH]; + u32 *dst = c64x->mem + 0x1200000; + u32 *src = c64x->mem + 0x1000000; + + memset( src, 0x55, 0x100000 ); + + for (y=0; y %dx%d ) ========================.\n", sw, sh, dw, dh); + BRINTF("\n"); + BRINTF("SOURCE IMAGE (%dx%d) [%dx%d]\n", sw, sh, SW, SH); + + for (y=0; yc64x_function = (down ? 
+ C64X_STRETCH_32_down : + C64X_STRETCH_32_up ) | C64X_FLAG_TODO; + + /* Task arguments: [0] and [1] are DAVINCI_C64X_MEM plus the offsets used for dst and src, [2] holds DW * 4 in the low and SW * 4 in the high 16 bits (presumably the two pitches in bytes, confirm against the DSP side), [3] is dh | dw << 16, [4] is sh | sw << 16, [5] is (dw - 1) | (dh - 1) << 16 and [6] is zero. */ task->c64x_arg[0] = DAVINCI_C64X_MEM + 0x1200000; + task->c64x_arg[1] = DAVINCI_C64X_MEM + 0x1000000; + task->c64x_arg[2] = (DW * 4) | ((SW * 4) << 16); + task->c64x_arg[3] = dh | (dw << 16); + task->c64x_arg[4] = sh | (sw << 16); + task->c64x_arg[5] = (dw - 1) | ((dh - 1) << 16); + task->c64x_arg[6] = 0 | (0 << 16); + + c64x_submit_task( c64x, task ); + } + + davinci_c64x_write_back_all( c64x ); + + davinci_c64x_wait_low( c64x ); + + t2 = direct_clock_get_abs_micros(); + + + BRINTF("-> DSP RESULT (%dx%d) [%dx%d]\n", dw, dh, DW, DH); + + /* NOTE(review): text is missing in the two loop headers that follow. Not visible here are the bodies of the result dumps, the end of this benchmark and the start of the next function, which appears to be the device open routine. What remains of that routine stores fd, map_q and map_m in the handle, places QueueL at map_m + 0x01e00000, logs QL_arm, QL_dsp, QH_arm and QH_dsp and, only when the C64X_TEST environment variable is set, runs the benchmark groups that are enabled by the preprocessor. */ for (y=0; y CPU RESULT (%dx%d) [%dx%d]\n", dw, dh, DW, DH); + + for (y=0; yfd = fd; + c64x->ctl = map_q; + c64x->mem = map_m; + c64x->QueueL = map_m + 0x01e00000; + + D_INFO( "Davinci/C64X: Low ARM %d / DSP %d, High ARM %d / DSP %d\n", + c64x->ctl->QL_arm, c64x->ctl->QL_dsp, c64x->ctl->QH_arm, c64x->ctl->QH_dsp ); + + D_MAGIC_SET( c64x, DavinciC64x ); + +if (getenv("C64X_TEST")) { +// test_load_block( c64x, false ); + +// test_load_block( c64x, true ); + +// bench_dither_argb( c64x ); + +#if 0 + bench_uyvy_1( c64x, true ); + bench_uyvy_1( c64x, false ); + bench_uyvy_2( c64x, true ); + bench_uyvy_2( c64x, false ); + bench_uyvy_3( c64x, true ); + bench_uyvy_3( c64x, false ); +#endif + +#if 0 + bench_blend_argb( c64x, 0 ); + bench_blend_argb( c64x, 1 ); + bench_blend_argb( c64x, 2 ); + bench_blend_argb( c64x, 3 ); +#endif + +#if 0 + bench_stretch_32( c64x, 2, 1, 16, 1 ); + bench_stretch_32( c64x, 2, 2, 16, 2 ); + + bench_stretch_32( c64x, 2, 1, 3, 1 ); + bench_stretch_32( c64x, 4, 1, 6, 1 ); + + bench_stretch_32( c64x, 3, 1, 2, 1 ); + bench_stretch_32( c64x, 6, 1, 4, 1 ); +#endif + +#if 1 + bench_fetch_uyvy( c64x, false, 0, 0 ); + bench_fetch_uyvy( c64x, false, 1, 0 ); + bench_fetch_uyvy( c64x, false, 0, 1 ); + bench_fetch_uyvy( c64x, false, 1, 1 ); + bench_fetch_uyvy( c64x, true, 0, 0 ); + bench_fetch_uyvy( c64x, true, 1, 0 ); + bench_fetch_uyvy( c64x, true, 0, 1 
); + bench_fetch_uyvy( c64x, true, 1, 1 ); +#endif + +#if 0 + bench_put_mc( c64x, false ); + bench_put_mc( c64x, true ); + + bench_put_sum( c64x, false ); + bench_put_sum( c64x, true ); + + bench_sat_mc( c64x ); +#endif + +#if 0 + bench_mc( c64x, 32, 8, 8, false, "mc_put_o_8" ); + bench_mc( c64x, 33, 8, 8, false, "mc_put_x_8" ); + bench_mc( c64x, 34, 8, 8, false, "mc_put_y_8" ); + bench_mc( c64x, 35, 8, 8, false, "mc_put_xy_8" ); + bench_mc( c64x, 36, 16, 16, false, "mc_put_o_16" ); + bench_mc( c64x, 37, 16, 16, false, "mc_put_x_16" ); + bench_mc( c64x, 38, 16, 16, false, "mc_put_y_16" ); + bench_mc( c64x, 39, 16, 16, false, "mc_put_xy_16" ); +#endif + +#if 0 + bench_mc( c64x, 40, 8, 8, true, "mc_avg_o_8" ); + bench_mc( c64x, 41, 8, 8, true, "mc_avg_x_8" ); + bench_mc( c64x, 42, 8, 8, true, "mc_avg_y_8" ); + bench_mc( c64x, 43, 8, 8, true, "mc_avg_xy_8" ); + bench_mc( c64x, 44, 16, 16, true, "mc_avg_o_16" ); + bench_mc( c64x, 45, 16, 16, true, "mc_avg_x_16" ); + bench_mc( c64x, 46, 16, 16, true, "mc_avg_y_16" ); + bench_mc( c64x, 47, 16, 16, true, "mc_avg_xy_16" ); +#endif + +#if 0 + bench_div( c64x, 1, 3 ); + bench_div( c64x, 1000, 333 ); + bench_div( c64x, 1000, 334 ); + bench_div( c64x, 6666, 2222 ); + bench_div( c64x, 1234, 1234 ); + bench_div( c64x, 4000, 0 ); + bench_div( c64x, 5000, 0 ); + bench_div( c64x, 10000, 3 ); + bench_div( c64x, 14, 3 ); + bench_div( c64x, 0x10000, 0x1000 ); + bench_div( c64x, 0x1000, 0x100 ); + bench_div( c64x, 0x100000, 2 ); +#endif +} + + return DFB_OK; + + +/* Failure path: unmap the control area if map_q is set, close the device descriptor and hand back ret. NOTE(review): the mapping calls are not visible in this text; confirm that map_q is reset to NULL when mmap() fails, because MAP_FAILED is not NULL and would pass this test. */ error: + if (map_q) + munmap( map_q, C64X_QLEN ); + + close( fd ); + + return ret; +} + +/* Releases a DavinciC64x handle: asserts its magic, unmaps the C64X_MLEN byte window at c64x->mem and the C64X_QLEN byte control area at c64x->ctl, closes c64x->fd and clears the magic. The results of munmap() and close() are ignored, so the function always returns DFB_OK. */ DFBResult +davinci_c64x_close( DavinciC64x *c64x ) +{ + D_MAGIC_ASSERT( c64x, DavinciC64x ); + + munmap( (void*) c64x->mem, C64X_MLEN ); + munmap( (void*) c64x->ctl, C64X_QLEN ); + + close( c64x->fd ); + + D_MAGIC_CLEAR( c64x ); + + return DFB_OK; +} + -- cgit