今天在分析leveldb(google的bigtable的核心组件之一)代码时看到了如下的一段代码:
leveldb\util\coding.cc
char* EncodeVarint32( char* dst, uint32_t v) { // Operate on characters as unsigneds unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); static const int B = 128; if (v < ( 1<< 7)) { *(ptr++) = v; } else if (v < ( 1<< 14)) { *(ptr++) = v | B; *(ptr++) = v>> 7; } else if (v < ( 1<< 21)) { *(ptr++) = v | B; *(ptr++) = (v>> 7) | B; *(ptr++) = v>> 14; } else if (v < ( 1<< 28)) { *(ptr++) = v | B; *(ptr++) = (v>> 7) | B; *(ptr++) = (v>> 14) | B; *(ptr++) = v>> 21; } else { *(ptr++) = v | B; *(ptr++) = (v>> 7) | B; *(ptr++) = (v>> 14) | B; *(ptr++) = (v>> 21) | B; *(ptr++) = v>> 28; } return reinterpret_cast< char*>(ptr); }
目标很明确,就是减少int数的内存存储长度,也就是当int小于128时,以一个字节存储,大于128时按数字的大小逐步向上加一个字节,这样最长的整数用二进制表示时最大只有5个字节长度表示就基本上够用了。这还有个名词叫varint,即可变长度整数。无独有偶,在之前我看过云风的开源库pbc时,也看过类似的代码,如下:
https://github.com/cloudwu/pbc/blob/master/src/varint.c
_pbcV_encode32(uint32_t number, uint8_t buffer[ 10]) { if (number < 0x80) { buffer[ 0] = (uint8_t) number ; return 1; } buffer[ 0] = (uint8_t) (number | 0x80 ); if (number < 0x4000) { buffer[ 1] = (uint8_t) (number >> 7 ); return 2; } buffer[ 1] = (uint8_t) ((number >> 7) | 0x80 ); if (number < 0x200000) { buffer[ 2] = (uint8_t) (number >> 14); return 3; } buffer[ 2] = (uint8_t) ((number >> 14) | 0x80 ); if (number < 0x10000000) { buffer[ 3] = (uint8_t) (number >> 21); return 4; } buffer[ 3] = (uint8_t) ((number >> 21) | 0x80 ); buffer[ 4] = (uint8_t) (number >> 28); return 5; }