Может быть, что-то вроде этого:
# Transpose a 16x8 bit matrix (16 bytes at `matrix`, one byte per row)
# into an 8x16 bit matrix (8 halfwords at `transposed`).
#
# Output halfword k collects bit k of every input byte: byte 0 lands in
# bit 15 of the halfword and byte 15 in bit 0.
#
# Register use (numeric names, as in the rest of the listing):
#   $10..$25  input bytes 0..15, zero-extended by lbu
#   $2        current bit number k, runs 7 down to 0
#   $9        single-bit mask, 1 << k
#   $8        scratch: the bit extracted from one input byte (0 or 1)
#   $4        output halfword being assembled
#   $3        byte offset of halfword k inside `transposed` (2 * k)
# $26/$27 ($k0/$k1) are deliberately not used: they are reserved for the
# kernel / exception handler.
#
# NOTE(review): nothing terminates the program after the loop in this
# listing; add whatever exit sequence the target simulator expects.

        # Load the whole input matrix into registers once.
        lbu     $10, matrix
        lbu     $11, matrix+1
        lbu     $12, matrix+2
        lbu     $13, matrix+3
        lbu     $14, matrix+4
        lbu     $15, matrix+5
        lbu     $16, matrix+6
        lbu     $17, matrix+7
        lbu     $18, matrix+8
        lbu     $19, matrix+9
        lbu     $20, matrix+10
        lbu     $21, matrix+11
        lbu     $22, matrix+12
        lbu     $23, matrix+13
        lbu     $24, matrix+14
        lbu     $25, matrix+15

        addiu   $2, $0, 8               # k = 8, decremented to 7 on loop entry
        addiu   $9, $0, 256             # mask = 1 << 8, shifted to 0x80 on entry
loop:
        addiu   $2, $2, -1              # k = k - 1
        srl     $9, $9, 1               # mask = 1 << k
        addu    $4, $0, $0              # start a fresh output halfword

        and     $8, $10, $9             # byte 0: isolate bit k ...
        srlv    $8, $8, $2              # ... and move it down to bit 0
        or      $4, $4, $8

        and     $8, $11, $9             # byte 1
        srlv    $8, $8, $2
        sll     $4, $4, 1               # make room, then append the new bit
        or      $4, $4, $8

        and     $8, $12, $9             # byte 2
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $13, $9             # byte 3
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $14, $9             # byte 4
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $15, $9             # byte 5
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $16, $9             # byte 6
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $17, $9             # byte 7
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $18, $9             # byte 8
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $19, $9             # byte 9
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $20, $9             # byte 10
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $21, $9             # byte 11
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $22, $9             # byte 12
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $23, $9             # byte 13
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $24, $9             # byte 14
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        and     $8, $25, $9             # byte 15
        srlv    $8, $8, $2
        sll     $4, $4, 1
        or      $4, $4, $8

        sll     $3, $2, 1               # halfword index k -> byte offset 2*k
        sh      $4, transposed($3)      # transposed[k] = assembled halfword

        # Continue only while k > 0, giving exactly 8 passes (k = 7..0).
        # Branching on k >= 0 would add a pass with k = -1 that stores a
        # halfword at transposed - 2, outside the output buffer.
        bgtz    $2, loop
        nop                             # branch delay slot
# Input matrix: 16 rows of 8 bits each, one byte per row, placed at 0x2000.
.data 0x2000
matrix:
        .byte 0x80, 0x80, 0x40, 0x40, 0x20, 0x20, 0x10, 0x10
        .byte 0x08, 0x08, 0x04, 0x04, 0x02, 0x02, 0x01, 0x01
# Output buffer: 8 halfwords, zero-initialised, placed at 0x3000.
# The transpose loop stores one halfword here per pass.
.data 0x3000
transposed:
        .half 0, 0, 0, 0, 0, 0, 0, 0
Код считывает входную матрицу, а затем выполняет цикл 8 раз (по одному разу для каждой строки транспонированной матрицы).