到目前为止,我已经找到了以下方法:
// Computes the number of leading zero bits.// Here,avx_word is of type _m256i.if (!_mm256_testz_si256(avx_word,avx_word)) { uint64_t word = _mm256_extract_epi64(avx_word,0); if (word > 0) return (__builtin_clzll(word)); word = _mm256_extract_epi64(avx_word,1); if (word > 0) return (__builtin_clzll(word) + 64); word = _mm256_extract_epi64(avx_word,2); if (word > 0) return (__builtin_clzll(word) + 128); word = _mm256_extract_epi64(avx_word,3); return (__builtin_clzll(word) + 192);} else return 256; // word is entirely zero
但是,我发现在256位寄存器中找出确切的非零字是相当笨拙的.
有人知道是否有更优雅(或更快)的方法吗?
正如附加信息:
我实际上想要计算由逻辑AND创建的任意长向量的第一个设置位的索引,并且我将标准64位 *** 作的性能与SSE和AVX-2代码进行比较.
这是我的整个测试代码:
#include <stdio.h>#include <stdlib.h>#include <immintrin.h>#include <stdint.h>#include <assert.h>#include <time.h>#include <sys/time.h>#include <stdalign.h>#define ALL 0xFFFFFFFF#define NONE 0x0#define BV_SHIFTBITS ((size_t) 6)#define BV_MOD_WORD ((size_t) 63)#define BV_ONE ((uint64_t) 1)#define BV_ZERO ((uint64_t) 0)#define BV_WORDSIZE ((uint64_t) 64)uint64_t*Vector_new( size_t num_bits) { assert ((num_bits % 256) == 0); size_t num_words = num_bits >> BV_SHIFTBITS; size_t mod = num_bits & BV_MOD_WORD; if (mod > 0) assert (0); uint64_t* words; posix_memalign((voID**) &(words),32,sizeof(uint64_t) * num_words); for (size_t i = 0; i < num_words; ++i) words[i] = 0; return words;}voIDVector_set( uint64_t* vector,size_t pos) { const size_t word_index = pos >> BV_SHIFTBITS; const size_t offset = pos & BV_MOD_WORD; vector[word_index] |= (BV_ONE << (BV_MOD_WORD - offset));}size_tVector_and_first_bit( uint64_t** vectors,const size_t num_vectors,const size_t num_words) { for (size_t i = 0; i < num_words; ++i) { uint64_t word = vectors[0][i]; for (size_t j = 1; j < num_vectors; ++j) word &= vectors[j][i]; if (word > 0) return (1 + i * BV_WORDSIZE + __builtin_clzll(word)); } return 0;}size_tVector_and_first_bit_256( uint64_t** vectors,const size_t num_avx_words) { for (size_t i = 0; i < num_avx_words; ++i) { const size_t addr_offset = i << 2; __m256i avx_word = _mm256_load_si256( (__m256i const*) (vectors[0] + addr_offset)); // AND the AVX words for (size_t j = 1; j < num_vectors; ++j) { avx_word = _mm256_and_si256( avx_word,_mm256_load_si256((__m256i const*) (vectors[j] + addr_offset)) ); } // test whether resulting AVX word is not zero if (!_mm256_testz_si256(avx_word,avx_word)) { uint64_t word = _mm256_extract_epi64(avx_word,0); const size_t shift = i << 8; if (word > 0) return (1 + shift + __builtin_clzll(word)); word = _mm256_extract_epi64(avx_word,1); if (word > 0) return (1 + shift + __builtin_clzll(word) + 64); word = _mm256_extract_epi64(avx_word,2); if (word > 0) return (1 + shift + __builtin_clzll(word) + 128); word = _mm256_extract_epi64(avx_word,3); return (1 + shift + __builtin_clzll(word) + 192); } } return 0;}size_tVector_and_first_bit_128( uint64_t** vectors,const size_t num_avx_words) { for (size_t i = 0; i < num_avx_words; ++i) { const size_t addr_offset = i << 1; __m128i avx_word = _mm_load_si128( (__m128i const*) (vectors[0] + addr_offset)); // AND the AVX words for (size_t j = 1; j < num_vectors; ++j) { avx_word = _mm_and_si128( avx_word,_mm_load_si128((__m128i const*) (vectors[j] + addr_offset)) ); } // test whether resulting AVX word is not zero if (!_mm_test_all_zeros(avx_word,avx_word)) { uint64_t word = _mm_extract_epi64(avx_word,0); if (word > 0) return (1 + (i << 7) + __builtin_clzll(word)); word = _mm_extract_epi64(avx_word,1); return (1 + (i << 7) + __builtin_clzll(word) + 64); } } return 0;}uint64_t*make_random_vector( const size_t num_bits,const size_t propability) { uint64_t* vector = Vector_new(num_bits); for (size_t i = 0; i < num_bits; ++i) { const int x = rand() % 10; if (x >= (int) propability) Vector_set(vector,i); } return vector;}size_tmillis( const struct timeval* end,const struct timeval* start) { struct timeval e = *end; struct timeval s = *start; return (1000 * (e.tv_sec - s.tv_sec) + (e.tv_usec - s.tv_usec) / 1000);}intmain( int argc,char** argv) { if (argc != 6) printf("fuck %s\n",argv[0]); srand(time(NulL)); const size_t num_vectors = atoi(argv[1]); const size_t size = atoi(argv[2]); const size_t num_iterations = atoi(argv[3]); const size_t num_dimensions = atoi(argv[4]); const size_t propability = atoi(argv[5]); const size_t num_words = size / 64; const size_t num_sse_words = num_words / 2; const size_t num_avx_words = num_words / 4; assert(num_vectors > 0); assert(size > 0); assert(num_iterations > 0); assert(num_dimensions > 0); struct timeval t1; gettimeofday(&t1,NulL); uint64_t*** vectors = (uint64_t***) malloc(sizeof(uint64_t**) * num_vectors); for (size_t j = 0; j < num_vectors; ++j) { vectors[j] = (uint64_t**) malloc(sizeof(uint64_t*) * num_dimensions); for (size_t i = 0; i < num_dimensions; ++i) vectors[j][i] = make_random_vector(size,propability); } struct timeval t2; gettimeofday(&t2,NulL); printf("Creation: %zu ms\n",millis(&t2,&t1)); size_t* results_64 = (size_t*) malloc(sizeof(size_t) * num_vectors); size_t* results_128 = (size_t*) malloc(sizeof(size_t) * num_vectors); size_t* results_256 = (size_t*) malloc(sizeof(size_t) * num_vectors); gettimeofday(&t1,NulL); for (size_t j = 0; j < num_iterations; ++j) for (size_t i = 0; i < num_vectors; ++i) results_64[i] = Vector_and_first_bit(vectors[i],num_dimensions,num_words); gettimeofday(&t2,NulL); const size_t millis_64 = millis(&t2,&t1); printf("64 : %zu ms\n",millis_64); gettimeofday(&t1,NulL); for (size_t j = 0; j < num_iterations; ++j) for (size_t i = 0; i < num_vectors; ++i) results_128[i] = Vector_and_first_bit_128(vectors[i],num_sse_words); gettimeofday(&t2,NulL); const size_t millis_128 = millis(&t2,&t1); const double factor_128 = (double) millis_64 / (double) millis_128; printf("128 : %zu ms (factor: %.2f)\n",millis_128,factor_128); gettimeofday(&t1,NulL); for (size_t j = 0; j < num_iterations; ++j) for (size_t i = 0; i < num_vectors; ++i) results_256[i] = Vector_and_first_bit_256(vectors[i],num_avx_words); gettimeofday(&t2,NulL); const size_t millis_256 = millis(&t2,&t1); const double factor_256 = (double) millis_64 / (double) millis_256; printf("256 : %zu ms (factor: %.2f)\n",millis_256,factor_256); for (size_t i = 0; i < num_vectors; ++i) { if (results_64[i] != results_256[i]) printf("ERROR: %zu (64) != %zu (256) with i = %zu\n",results_64[i],results_256[i],i); if (results_64[i] != results_128[i]) printf("ERROR: %zu (64) != %zu (128) with i = %zu\n",results_128[i],i); } free(results_64); free(results_128); free(results_256); for (size_t j = 0; j < num_vectors; ++j) { for (size_t i = 0; i < num_dimensions; ++i) free(vectors[j][i]); free(vectors[j]); } free(vectors); return 0;}
编译:
gcc -o main main.c -O3 -Wall -Wextra -pedantic-errors -Werror -march=native -std=c99 -fno-tree-vectorize
执行:
./main 1000 8192 50000 5 9
参数意味着:1000个测试用例,长度为8192位的向量,50000,测试重复(最后两个参数是小调整).
在我的机器上进行上述调用的示例输出:
Creation: 363 ms64 : 15000 ms128 : 10070 ms (factor: 1.49)256 : 6784 ms (factor: 2.21)解决方法 如果您的输入值均匀分布,则几乎所有时间最高设置位都将位于向量的前64位(1 ^ 2 ^ 64).在这种情况下的分支将非常好地预测. @Nejc’s answer is good for that case.
但是lzcnt是解决方案的一部分的许多问题具有均匀分布的输出(或类似的),因此无分支版本具有优势.不是严格一致的,而是最高设置位通常不是最高64位的任何地方.
Wim在比较位图上使用lzcnt来寻找合适的元素是一种非常好的方法.
但是,使用存储/重新加载的向量的运行时变量索引可能比shuffle更好.存储转发延迟很低(Skylake可能需要5到7个周期),并且延迟与索引生成并行(比较/ movemask / lzcnt).在知道索引之后,movd / vpermd / movd交叉的shuffle策略需要5个周期,以将正确的元素放入整数寄存器中. (见http://agner.org/optimize/)
我认为这个版本应该是Haswell / Skylake(和RyZen)的更好的延迟,以及更好的吞吐量. (在RyZen上vpermd非常慢,所以它应该非常好)负载的地址计算应该具有与存储转发相似的延迟,所以它是一个折腾,其中一个实际上是关键路径.
将堆栈对齐32以避免32字节存储上的高速缓存行拆分需要额外的指令,因此如果它可以内联到多次使用它的函数中,或者对于其他一些__m256i已经需要那么多对齐,那么这是最好的.
#include <stdint.h>#include <immintrin.h>#ifndef _MSC_VER#include <stdalign.h> //MSVC is missing this?#else#include <intrin.h>#pragma intrinsic(_BitScanReverse) // https://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx suggests this#endif// undefined result for mask=0,like BSRuint32_t bsr_nonzero(uint32_t mask){// on Intel,bsr has a minor advantage for the first step// for AMD,BSR is slow so you should use 31-LZCNT. //return 31 - _lzcnt_u32(mask); // Intel's docs say there should be a _bit_scan_reverse(x),maybe try that with ICC #ifdef _MSC_VER unsigned long tmp; _BitScanReverse(&tmp,mask); return tmp; #else return 31 - __builtin_clz(mask); #endif}
而有趣的部分:
int mm256_lzcnt_si256(__m256i vec){ __m256i nonzero_elem = _mm256_cmpeq_epi8(vec,_mm256_setzero_si256()); unsigned mask = ~_mm256_movemask_epi8(nonzero_elem); if (mask == 0) return 256; // if this is rare,branching is probably good. alignas(32) // gcc chooses to align elems anyway,with its clunky code uint8_t elems[32]; _mm256_storeu_si256((__m256i*)elems,vec);// unsigned lz_msk = _lzcnt_u32(mask);// unsigned IDx = 31 - lz_msk; // can use bsr to get the 31-x,because mask is kNown to be non-zero.// This takes the 31-x latency off the critical path,in parallel with final lzcnt unsigned IDx = bsr_nonzero(mask); unsigned lz_msk = 31 - IDx; unsigned highest_nonzero_byte = elems[IDx]; return lz_msk * 8 + _lzcnt_u32(highest_nonzero_byte) - 24; // lzcnt(byte)-24,because we don't want to count the leading 24 bits of padding.}
On Godbolt with gcc7.3 -O3 -march=haswell
,我们得到这样的asm来计算ymm1到esi.
vpxor xmm0,xmm0,xmm0 mov esi,256 vpcmpeqd ymm0,ymm1,ymm0 vpmovmskb eax,ymm0 xor eax,-1 # ~mask and set flags,unlike NOT je .L35 bsr eax,eax vmovdqa YMMWORD PTR [rbp-48],ymm1 # note no dependency on anything earlIEr; OoO exec can run it early mov ecx,31 mov edx,eax # this is redundant,gcc should just use rax later. But it's zero-latency on HSW/SKL and RyZen. sub ecx,eax movzx edx,BYTE PTR [rbp-48+rdx] # has to wait for the index in edx lzcnt edx,edx lea esi,[rdx-24+rcx*8] # lzcnt(byte) + lzcnt(vectormask) * 8.L35:
为了找到最高的非零元素(31-lzcnt(~movemask)),我们使用bsr
直接获取位(以及字节)索引,并从关键路径中减去.只要我们将掩码分支为零,这是安全的. (无分支版本需要初始化寄存器以避免越界索引).
在AMD cpu上,bsr明显慢于lzcnt.在Intel cpu上,除了output-dependency details中的微小变化外,它们的性能相同.
输入为零的bsr使目标寄存器保持不变,但GCC没有提供利用它的方法. (英特尔仅将其记录为未定义的输出,但AMD记录了Intel / AMD cpu在目标寄存器中产生旧值的实际行为).
如果输入为零,则bsr设置ZF,而不是像大多数指令那样基于输出. (这和输出依赖性可能是它在AMD上运行缓慢的原因.)BSR标志上的分支并不比使用xor eax设置的ZF上的分支特别好,-1来反转掩码,这就是gcc所做的.无论如何,英特尔做document a _BitScanReverse(&idx,mask)
intrinsic返回bool,但gcc不支持它(甚至不支持x86intrin.h). GNU C内置函数不会返回一个布尔值来让你使用标志结果,但是如果检查输入C变量是非零的话,gcc可能会使用bsr的标志输出来生成智能asm.
使用DWord(uint32_t)数组和vmovmskps会让第二个lzcnt使用内存源 *** 作数,而不是需要一个movzx来对一个字节进行零扩展.但是lzcnt在Skylake之前对Intel cpu有错误的依赖性,因此编译器可能倾向于单独加载并使用lzcnt相同,无论如何都要使用相同的解决方法. (我没有检查.)
Wim的版本需要lz_msk-24,因为高位24位始终为零且具有8位掩码.但是32位掩码填充了32位寄存器.
这个版本具有8位元素和32位掩码是相反的:我们需要lzcnt所选字节,不包括寄存器中的24个前导零位.所以我们的-24移动到一个不同的位置,而不是索引数组的关键路径的一部分.
gcc选择将其作为单个3分量LEA(reg reg * scale-const)的一部分来执行,这对于吞吐量非常有用,但是在最终的lzcnt之后将其置于关键路径上. (它不是免费的,因为3组件LEA在英特尔cpu上具有额外的延迟与reg reg *规模.见Agner Fog’s instruction tables).
乘以8可以作为lea的一部分,但是乘以32将需要移位(或者被折叠成两个单独的LEA).
Intel’s optimization manual说(表2-24)即使SandybrIDge也可以从256位存储转发到单字节加载而没有问题,所以我认为它在AVX2 cpu上很好,就像转发到32位负载那样4字节 – 商店的对齐块.
总结以上是内存溢出为你收集整理的计算__m256i字中的前导零全部内容,希望文章能够帮你解决计算__m256i字中的前导零所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)