The following function illustrates a technique for
loop unrolling that, depending on what you want to unroll,
can be faster than Duff's Device, and is certainly less obscure.
//
// byte2short: store each character of byteArr, converted to short,
// into the element of shortArr with the same index.
//
// The copy is unrolled by hand: the bulk of the data is handled in
// groups of eight assignments per loop pass, and a plain one-at-a-time
// loop then finishes whatever is left over (at most seven elements).
//
// shortArr - destination array; numEls elements are written.
// byteArr  - source array; numEls elements are read.
// numEls   - number of array elements to copy; when it is zero or
//            negative neither loop runs and nothing is copied.
//
void byte2short (short* shortArr, char* byteArr, long numEls)
{
    // A full group of eight starting at 'pos' fits only while
    // pos < numEls - 7.
    const long groupLimit = numEls - 7;
    long pos = 0;

    // Unrolled part: eight elements per pass.
    while (pos < groupLimit)
    {
        short* dst = shortArr + pos;
        char* src = byteArr + pos;

        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
        dst[4] = src[4];
        dst[5] = src[5];
        dst[6] = src[6];
        dst[7] = src[7];

        pos += 8;
    }

    // Tail: remaining elements that do not fill a complete group.
    while (pos < numEls)
    {
        shortArr[pos] = byteArr[pos];
        pos++;
    }
}