C中最快的解交织操作?

C中最快的解交织操作?,第1张

我有一个指向混合字节数组 mixed 的指针,其中交错存放着两个不同数组 array1 和 array2 的字节。假设 mixed 的内容如下:

a1b2c3d4…

我需要对这些字节进行解交织,得到 array1 = abcd… 和 array2 = 1234…。mixed 的长度是事先已知的,array1 和 array2 的长度相等,均为 mixed 长度的一半(mixedLength / 2)。

这是我当前的实现(array1 和 array2 已经分配):


int i,j;int mixedLength_2 = mixedLength / 2;for (i = 0,j = 0; i < mixedLength_2; i++,j += 2){    array1[i] = mixed[j];    array2[i] = mixed[j+1];}

这避免了任何昂贵的乘法或除法运算,但运行速度仍然不够快。我希望有类似 memcpy 的函数,它可以接受一个索引步长,并利用底层的块复制操作来加速这个过程。有没有比我现在这个更快的实现?


目标平台是 iOS 和 Mac 上的 Objective-C。速度对 iOS 设备来说更为重要,因此即使是只针对 iOS 的解决方案,也比没有要好。



#include <stddef.h>
#include <stdint.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#   include <arm_neon.h>
#endif

/**
 * Interleave two byte streams into one: dstAB = a0 b0 a1 b1 a2 b2 ...
 *
 * The bulk of the data is processed with NEON when available (16 pairs, then
 * 8 pairs per step); whatever remains is handled by a plain byte loop, so the
 * length does not have to be a multiple of the vector width. No pointer is
 * reinterpreted as a wider integer type, so the buffers need no particular
 * alignment.
 *
 * @param srcA        first source stream; dstablength / 2 bytes are read
 * @param srcB        second source stream; dstablength / 2 bytes are read
 * @param dstAB       destination; 2 * (dstablength / 2) bytes are written
 * @param dstablength length of dstAB in bytes; if odd, the final byte of
 *                    dstAB is left untouched
 */
void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstablength)
{
    size_t pairs = dstablength / 2;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* 16 pairs (32 output bytes) per iteration; vst2q_u8 interleaves on store. */
    for (; pairs >= 16; pairs -= 16) {
        uint8x16x2_t ab;
        ab.val[0] = vld1q_u8(srcA);
        ab.val[1] = vld1q_u8(srcB);
        vst2q_u8(dstAB, ab);
        srcA += 16;
        srcB += 16;
        dstAB += 32;
    }
    /* At most one 8-pair (16 output bytes) step can remain. */
    if (pairs >= 8) {
        uint8x8x2_t ab;
        ab.val[0] = vld1_u8(srcA);
        ab.val[1] = vld1_u8(srcB);
        vst2_u8(dstAB, ab);
        srcA += 8;
        srcB += 8;
        dstAB += 16;
        pairs -= 8;
    }
#endif

    /* Scalar tail (or the whole buffer when NEON is unavailable). */
    for (size_t i = 0; i < pairs; i++) {
        dstAB[2 * i]     = srcA[i];
        dstAB[2 * i + 1] = srcB[i];
    }
}

/**
 * Split an interleaved byte stream a0 b0 a1 b1 ... into its two components.
 *
 * Mirrors interleave(): NEON handles 16 and then 8 pairs per step when
 * available, and a byte loop finishes the remainder, so any length is
 * accepted and no alignment is required of the buffers.
 *
 * @param srcAB       interleaved source; 2 * (srcABLength / 2) bytes are read
 * @param dstA        receives bytes at even offsets; srcABLength / 2 bytes written
 * @param dstB        receives bytes at odd offsets; srcABLength / 2 bytes written
 * @param srcABLength length of srcAB in bytes; if odd, the final byte of
 *                    srcAB is ignored
 */
void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
    size_t pairs = srcABLength / 2;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* 16 pairs (32 input bytes) per iteration; vld2q_u8 de-interleaves on load. */
    for (; pairs >= 16; pairs -= 16) {
        const uint8x16x2_t ab = vld2q_u8(srcAB);
        vst1q_u8(dstA, ab.val[0]);
        vst1q_u8(dstB, ab.val[1]);
        srcAB += 32;
        dstA += 16;
        dstB += 16;
    }
    /* At most one 8-pair (16 input bytes) step can remain. */
    if (pairs >= 8) {
        const uint8x8x2_t ab = vld2_u8(srcAB);
        vst1_u8(dstA, ab.val[0]);
        vst1_u8(dstB, ab.val[1]);
        srcAB += 16;
        dstA += 8;
        dstB += 8;
        pairs -= 8;
    }
#endif

    /* Scalar tail (or the whole buffer when NEON is unavailable). */
    for (size_t i = 0; i < pairs; i++) {
        dstA[i] = srcAB[2 * i];
        dstB[i] = srcAB[2 * i + 1];
    }
}
解决方法:我一时想不起有哪个库函数可以对双通道字节数据进行解交织。不过,值得向苹果提交一份错误报告(bug report)来请求这样的功能。

在此期间,使用 NEON 或 SSE 内在函数(intrinsics)对这样的函数进行向量化很容易。具体来说,在 ARM 上,您需要用 vld1q_u8 从源数组中各加载一个向量,用 vuzpq_u8 对它们进行解交织,再用 vst1q_u8 存储结果向量。下面是一个粗略的草图,我没有测试过,甚至没有尝试编译,但它应该能说明大致思路。更复杂的实现当然也是可能的(特别是,NEON 可以用单条指令加载/存储两个 16 字节寄存器,而编译器对这段代码未必会这样做;另外,根据缓冲区的长度,适当的流水线化和/或循环展开可能会有好处):

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#   include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

/**
 * Split the interleaved buffer `mixed` (a0 b0 a1 b1 ...) into two arrays.
 *
 * @param mixed       interleaved source; 2 * (mixedLength / 2) bytes are read
 * @param array1      receives the bytes at even offsets of `mixed`
 * @param array2      receives the bytes at odd offsets of `mixed`
 * @param mixedLength length of `mixed` in bytes; if odd, the final byte is ignored
 */
void deinterleave(const uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Consume whole 32-byte groups with NEON; the remainder is left for the
     * scalar loop below. */
    size_t vectors = mixedLength / 32;
    mixedLength %= 32;
    while (vectors-- > 0) {
        const uint8x16_t src0 = vld1q_u8(mixed);
        const uint8x16_t src1 = vld1q_u8(mixed + 16);
        /* vuzpq_u8 gathers the even-indexed bytes of src0:src1 into val[0]
         * and the odd-indexed bytes into val[1]. */
        const uint8x16x2_t dst = vuzpq_u8(src0, src1);
        vst1q_u8(array1, dst.val[0]);
        vst1q_u8(array2, dst.val[1]);
        mixed += 32;
        array1 += 16;
        array2 += 16;
    }
#endif
    /* Scalar tail (or the whole buffer when NEON is unavailable). */
    for (size_t i = 0; i < mixedLength / 2; ++i) {
        array1[i] = mixed[2 * i];
        array2[i] = mixed[2 * i + 1];
    }
}

以上是内存溢出为你收集整理的“C中最快的解交织操作?”的全部内容,希望文章能够帮你解决“C中最快的解交织操作?”所遇到的程序开发问题。



原文地址: https://outofmemory.cn/langs/1250878.html

打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-07
下一篇 2022-06-07



