/*
 * _sobel_fipr_store_queue2 -- SH-4 routine that walks an image in groups of
 * 8 pixels and writes one 32-bit result per pixel through the store queues.
 *
 * Arguments as used by the visible code (C calling convention registers):
 *   r4 = input data address        (copied to r13)
 *   r5 = output address            (used to program QACR0/QACR1 and to
 *                                   derive the store queue address in r8)
 *   r6 = temporary buffer address  (copied to r0)
 * NOTE(review): the C prototype itself is not visible here -- confirm the
 * argument types against the caller.
 *
 * The per-pixel work lives in "unpack_pixel.s" and "sobel_fipr_inner2.s",
 * which are textually included and are NOT visible in this file.  Their
 * register usage is assumed to follow the register table below -- TODO
 * confirm before changing any register assignment.
 *
 * Callee-saved registers pushed in __setup and popped in _return:
 *   r8-r13, fr12-fr15.
 *
 * Floating-point constants built in __setup:
 *   fv8  = (fr8,  fr9,  fr10, fr11) = ( 1.0,  2.0,  1.0, 0.0)
 *   fv12 = (fr12, fr13, fr14, fr15) = (-1.0, -2.0, -1.0, 0.0)
 */

/* fv0 fv4 fv8 fv12 */
        .global _sobel_fipr_store_queue2
_sobel_fipr_store_queue2:

/* Register roles for the whole routine: */
/* r0: var (buffer address) */
/* r1: (temporary) */
/* r2: (temporary) */
/* r3: const 640 * 4 */
/* r4: const 642 * 4 */
/* r5: const 1280 * 4 */
/* r6: const 1281 * 4 */
/* r7: const 1282 * 4 */
/* r8: var (output address / store queue) */
/* r9: (temporary) */
/* r10: var (x loop counter) */
/* r11: var (y loop counter) */
/* r12: var (prefetch address: input address + 1280 * 4) */
/* r13: var (input address) */
/* r14: - (not touched by the visible code) */

__setup:
        /* save callee-saved integer registers; _return pops in reverse order */
        mov.l r8,@-r15
        mov.l r9,@-r15
        mov.l r10,@-r15
        mov.l r11,@-r15
        mov.l r12,@-r15
        mov.l r13,@-r15
        /* save callee-saved floating-point registers */
        fmov.s fr12,@-r15
        fmov.s fr13,@-r15
        fmov.s fr14,@-r15
        fmov.s fr15,@-r15

        /* build fv8 = (1, 2, 1, 0) and fv12 = (-1, -2, -1, 0) */
        fldi1 fr8 /* 1.0 */
        fldi1 fr9 /* 1.0 for now; doubled to 2.0 by the fadd below */
        fldi1 fr10 /* 1.0 */
        fldi0 fr11 /* 0.0 */
        fadd fr9,fr9 /* fr9 = 1.0 + 1.0 = 2.0 */
        fldi1 fr12 /* 1.0, negated below */
        fmov fr9,fr13 /* 2.0, negated below */
        fldi1 fr14 /* 1.0, negated below */
        fldi0 fr15 /* 0.0 */
        fneg fr12 /* -1.0 */
        fneg fr13 /* -2.0 */
        fneg fr14 /* -1.0 */

        /* constants */
        /*
         * Load the float stored at _const_100f into fr0, then copy the
         * fr0/fr1 pair into xd0.  The two fschg instructions toggle the
         * FPSCR transfer-size bit on and back off so that the pair move is
         * legal and the mode is unchanged afterwards.
         * NOTE(review): fr1 is not written by the visible code before this
         * copy, so only the fr0 half carries a defined value here.
         */
        mova _const_100f,r0 /* use r0 as temporary */
        fmov.s @r0,fr0
        fschg
        fmov dr0,xd0
        fschg

        /* set qacr0 */
        /*
         * r0 = (r5 >> 24) & 0b11100, i.e. address bits [28:26] of the output
         * address placed in bits [4:2]; the same value is written to the
         * register at _const_qacr0 and to the one 4 bytes above it.
         */
        mov r5,r0 /* r5: C argument */
        shlr16 r0 /* use r0 as temporary */
        mov.l _const_qacr0,r9 /* use r9 as temporary */
        shlr8 r0 /* together with shlr16: shift right by 24 */
        and #28,r0 /* 0b11100 */
        mov.l r0,@r9
        mov.l r0,@(4,r9) /* qacr1 */

        /* translate r8 to store queue address; keep bits [25:6] */
        mov r5,r8 /* r5: C argument */
        mov.l _const_store_queue_mask,r0 /* use r0 as temporary */
        and r0,r8
        mov.l _const_store_queue,r9 /* use r9 as temporary */
        or r9,r8 /* 0xe0000000 | (r5 & 0x03ffffc0) */

        /* save C input argument */
        /* must happen before the offset loads below, which overwrite r4-r6 */
        mov r4,r13 /* r4 saved as r13 (input data) */
        mov r6,r0 /* r6 saved as r0 (temporary buffer) */

        /* offsets */
        /* byte offsets (pixel count * 4) loaded from the literal pool below */
        mov.w _const_640,r3
        mov.w _const_642,r4
        mov.w _const_1280,r5
        mov.w _const_1281,r6
        mov.w _const_1282,r7

        /* jump over the literal pool; the nop fills the branch delay slot */
        bra _prime_pixels_loop_init
        nop

        /* literal pool for the PC-relative loads in __setup */
        .align 4
/*
 * NOTE(review): the label says "100f" but the stored value is 3900 --
 * confirm which of the two is intended.
 */
_const_100f: .float 3900
_const_store_queue: .long 0xe0000000
_const_store_queue_mask: .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */
_const_qacr0: .long 0xff000038
_const_640: .short (640 * 4)
_const_642: .short (642 * 4)
_const_1280: .short (1280 * 4)
_const_1281: .short (1281 * 4)
_const_1282: .short (1282 * 4)

        /* use r10 as temporary to load the first 1280 pixels; 8 pixels per loop iteration */
        .align 4
_prime_pixels_loop_init:
        mov #80,r10 /* 1280 / 8 */
        shll r10 /* r10 = 80 * 2 = 160 iterations */
        /*
         * NOTE(review): r12 is initialised from r0 (buffer address) although
         * the register table describes it as a prefetch address derived from
         * the input address -- confirm against unpack_pixel.s.
         */
        mov r0,r12
_prime_pixels_loop:
        /* presumably handles 8 pixels per inclusion -- body not visible here */
        .include "unpack_pixel.s"
        /* dt sets T when r10 reaches zero: leave the loop then, else repeat */
        dt r10
        bt _loop_init
        bra _prime_pixels_loop
        nop /* branch delay slot */

        .align 4
_loop_init:
        /* skip first row */
        add r3,r0 /* r3: const (640 * 4) */
        add r3,r8 /* advance the store queue address by the same amount */
        mov.w _const_height,r11 /* 478 */
        bra _loop
        mov #80,r10 /* 640 / 8 */ /* executes in the delay slot of bra */
_const_height: .short 478

/*
 * Main loop: r11 counts rows (478), r10 counts groups of 8 pixels per row
 * (80).  Each group fills 32 bytes at r8 and then issues pref @r8.
 */
_loop:
_loop_width:
        /* prefetch at r8 + 1280 */
        /*
         * NOTE(review): no prefetch instruction is visible at this point; it
         * may be inside unpack_pixel.s -- confirm.
         */
        /* process the next 8 pixels */
        .include "unpack_pixel.s"
        /* each sobel_fipr_inner2.s inclusion is expected to leave its result in r9 */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@r8 /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(4,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(8,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(12,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(16,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(20,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(24,r8) /* save result in the store queue */
        .include "sobel_fipr_inner2.s"
        mov.l r9,@(28,r8) /* save result in the store queue */

        /* send the store queue */
        pref @r8
        add #32,r8 /* next 32-byte group (8 results * 4 bytes) */
        /* dt sets T when r10 reaches zero: row finished then, else next group */
        dt r10
        bt _row_decrement
        bra _loop_width
        nop /* branch delay slot */
        /* end of _loop_width */

_row_decrement:
        /* row decrement */
        dt r11
        bt _return /* all rows done */
        bra _loop
        mov #80,r10 /* 640 / 8 */ /* delay slot: reload the x counter */

        /* restore registers */
        /* pops mirror the pushes in __setup, in reverse order */
_return:
        fmov.s @r15+,fr15
        fmov.s @r15+,fr14
        fmov.s @r15+,fr13
        fmov.s @r15+,fr12
        mov.l @r15+,r13
        mov.l @r15+,r12
        mov.l @r15+,r11
        mov.l @r15+,r10
        mov.l @r15+,r9
        mov.l @r15+,r8
        rts
        nop /* delay slot of rts */

/* NOTE(review): not referenced by any visible code in this file. */
_const_638_b: .short 638