|
| Listing 3
A more parallel implementation of the C loop
|
1 ; using delay slots and duplicate execution units of the device
2 ; 10 cycles per iteration
3
4 L1: LDW .D2 *B++,B5 ;load B[i] into B5, then post-increment pointer B
5 || LDW .D1 *A++,A4 ;in parallel (||) on the other data unit: load A[i] into A4, post-increment pointer A
6
7 NOP 2 ; idle 2 cycles while both loads are still in flight
8 SUB .L2 i,1,i ;i = i - 1 (loop counter); issued while the loads are still completing
9 [i] B .S1 L1 ; if i != 0, goto L1 -- issued early; the MPYSP, NOP 3 and STW below fill the branch delay slots
10
11 MPYSP .M1X B5,A4,A4 ; A4 = A4 * B5 (single-precision multiply; X = B5 is read over the cross path)
12 NOP 3 ; idle 3 cycles until the product is available in A4
13
14 STW .D1 A4,*C++ ;store A4 into C[i], then post-increment pointer C; last instruction before the branch takes effect
|
Back
|
|
|
|
|