建议:
* Prefer static linking and position-dependent code (as opposed to PIC, position-independent code).
* Prefer 64-bit code and 32-bit data.
* Prefer array indexing to pointers (this one seems to reverse every ten years).
* Prefer regular memory access patterns.
* Minimize control flow.
* Avoid data dependencies.
算术操作的开销顺序(从低到高):
* comparisons
* (u)int add, subtract, bitops, shift
* floating point add, sub (separate unit!)
* indexed array access (caveat: cache effects)
* (u)int32 mul
* FP mul
* FP division, remainder
* (u)int division, remainder
uint32_tdigits10(uint64_t v){ if (v < P01) return1; if (v < P02) return2; if (v < P03) return3; if (v < P12) { if (v < P08) { if (v < P06) { if (v < P04) return4; return5 + (v >= P05); } return7 + (v >= P07); } if (v < P10) { return9 + (v >= P09); } return11 + (v >= P11); } return12 + digits10(v / P12); }
unsignedu64ToAsciiTable(uint64_t value, char* dst){ staticconstchar digits[201] = "0001020304050607080910111213141516171819" "2021222324252627282930313233343536373839" "4041424344454647484950515253545556575859" "6061626364656667686970717273747576777879" "8081828384858687888990919293949596979899"; uint32_tconst length = digits10(value); uint32_t next = length - 1; while (value >= 100) { autoconst i = (value % 100) * 2; value /= 100; dst[next] = digits[i + 1]; dst[next - 1] = digits[i]; next -= 2; } // Handle last 1-2 digits if (value < 10) { dst[next] = '0' + uint32_t(value); } else { auto i = uint32_t(value) * 2; dst[next] = digits[i + 1]; dst[next - 1] = digits[i]; } return length; }