@@ -53,24 +53,40 @@
  * fasthash as implemented here has two interfaces:
  *
  * 1) Standalone functions, e.g. fasthash32() for a single value with a
- * known length.
+ * known length. These return the same hash code as the original, at
+ * least on little-endian machines.
  *
  * 2) Incremental interface. This can be used for incorporating multiple
- * inputs. The standalone functions use this internally, so see fasthash64()
- * for an an example of how this works.
- *
- * The incremental interface is especially useful if any of the inputs
- * are NUL-terminated C strings, since the length is not needed ahead
- * of time. This avoids needing to call strlen(). This case is optimized
- * in fasthash_accum_cstring() :
+ * inputs. First, initialize the hash state (here with a zero seed):
  *
  * fasthash_state hs;
  * fasthash_init(&hs, 0);
- * len = fasthash_accum_cstring(&hs, *str);
+ *
+ * If the inputs are of types that can be trivially cast to uint64, it's
+ * sufficient to do:
+ *
+ * hs.accum = value1;
+ * fasthash_combine(&hs);
+ * hs.accum = value2;
+ * fasthash_combine(&hs);
  * ...
- * return fasthash_final32(&hs, len);
  *
- * The length is computed on-the-fly. Experimentation has found that
+ * For longer or variable-length input, fasthash_accum() is a more
+ * flexible, but more verbose, method. The standalone functions use this
+ * internally, so see fasthash64() for an example of this.
+ *
+ * After all inputs have been mixed in, finalize the hash:
+ *
+ * hashcode = fasthash_final32(&hs, 0);
+ *
+ * The incremental interface allows an optimization for NUL-terminated
+ * C strings:
+ *
+ * len = fasthash_accum_cstring(&hs, str);
+ * hashcode = fasthash_final32(&hs, len);
+ *
+ * By handling the terminator on-the-fly, we can avoid needing a strlen()
+ * call to tell us how many bytes to hash. Experimentation has found that
  * SMHasher fails unless we incorporate the length, so it is passed to
  * the finalizer as a tweak.
  */
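
To make the usage pattern described in the new comment concrete, here is a rough
sketch of a caller (illustrative only, not part of the patch): the function and
parameter names are made up, and it assumes the declarations from the header being
patched here (common/hashfn_unstable.h).

	#include "postgres.h"
	#include "common/hashfn_unstable.h"

	/* illustrative: mix two fixed-width values and a C string into one hash code */
	static inline uint32
	hash_two_values_and_name(uint64 id, uint64 flags, const char *name)
	{
		fasthash_state hs;
		size_t		len;

		/* initialize with a zero seed, as in the comment above */
		fasthash_init(&hs, 0);

		/* inputs that fit in uint64 can be assigned and combined directly */
		hs.accum = id;
		fasthash_combine(&hs);
		hs.accum = flags;
		fasthash_combine(&hs);

		/* the string's length is discovered while hashing, so no strlen() call */
		len = fasthash_accum_cstring(&hs, name);

		/* pass the accumulated length to the finalizer as the tweak */
		return fasthash_final32(&hs, len);
	}
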
@@ -204,26 +220,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 {
 	const char *const start = str;
 	int			remainder;
-	uint64		zero_bytes_le;
+	uint64		zero_byte_low;
 
 	Assert(PointerIsAligned(start, uint64));
+
+	/*
+	 * For every chunk of input, check for zero bytes before mixing into the
+	 * hash. The chunk with zeros must contain the NUL terminator. We arrange
+	 * so that zero_byte_low tells us not only that a zero exists, but also
+	 * where it is, so we can hash the remainder of the string.
+	 *
+	 * The haszero64 calculation will set bits corresponding to the lowest
+	 * byte where a zero exists, so that suffices for little-endian machines.
+	 * For big-endian machines, we would need bits set for the highest zero
+	 * byte in the chunk, since the trailing junk past the terminator could
+	 * contain additional zeros. haszero64 does not give us that, so we
+	 * byteswap the chunk first.
+	 */
 	for (;;)
 	{
 		uint64		chunk = *(uint64 *) str;
 
-		/*
-		 * With little-endian representation, we can use this calculation,
-		 * which sets bits in the first byte in the result word that
-		 * corresponds to a zero byte in the original word. The rest of the
-		 * bytes are indeterminate, so cannot be used on big-endian machines
-		 * without either swapping or a bytewise check.
-		 */
 #ifdef WORDS_BIGENDIAN
-		zero_bytes_le = haszero64(pg_bswap64(chunk));
+		zero_byte_low = haszero64(pg_bswap64(chunk));
 #else
-		zero_bytes_le = haszero64(chunk);
+		zero_byte_low = haszero64(chunk);
 #endif
-		if (zero_bytes_le)
+		if (zero_byte_low)
 			break;
 
 		hs->accum = chunk;
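
For reference, the haszero64() calculation the new comment leans on is the classic
SWAR zero-byte test. The following standalone sketch (not part of the patch;
haszero64 is reproduced here in what is assumed to be its usual form) shows why only
the lowest 0x80 byte of the result can be trusted once junk follows the terminator,
and hence why big-endian builds byteswap the chunk first.

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	/* classic "does this word contain a zero byte?" bit trick */
	#define haszero64(v) \
		(((v) - UINT64_C(0x0101010101010101)) & ~(v) & UINT64_C(0x8080808080808080))

	int
	main(void)
	{
		/* "abc", its NUL terminator, then trailing junk starting with 0x01 */
		char		buf[8] = {'a', 'b', 'c', '\0', 1, 'y', 'z', 'w'};
		uint64_t	chunk;

		memcpy(&chunk, buf, sizeof(chunk));

		/*
		 * On a little-endian machine this prints 0000008080000000. The 0x80
		 * in byte 3 marks the terminator; the false 0x80 in byte 4 (whose
		 * value is 1, not 0) shows that bytes above the lowest zero byte are
		 * indeterminate, which is why only the lowest set bit is used.
		 */
		printf("%016llx\n", (unsigned long long) haszero64(chunk));
		return 0;
	}
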
@@ -232,12 +255,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	}
 
 	/*
-	 * For the last word, only use bytes up to the NUL for the hash. Bytes
-	 * with set bits will be 0x80, so calculate the first occurrence of a zero
-	 * byte within the input word by counting the number of trailing (because
-	 * little-endian) zeros and dividing the result by 8.
+	 * The byte corresponding to the NUL will be 0x80, so the rightmost bit
+	 * position will be in the range 7, 15, ..., 63. Turn this into byte
+	 * position by dividing by 8.
 	 */
-	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE;
+	remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;
 	fasthash_accum(hs, str, remainder);
 	str += remainder;
 
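
And to spell out the arithmetic in the replacement comment above (illustrative only;
__builtin_ctzll stands in for pg_rightmost_one_pos64 and 8 for BITS_PER_BYTE so the
sketch compiles standalone): a terminator found in byte 3 puts the rightmost set bit
of the result word at bit 31, and 31 / 8 = 3 bytes of the final chunk are left to
hash.

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* the value produced in the previous sketch: NUL found in byte 3 */
		uint64_t	zero_byte_low = UINT64_C(0x0000008080000000);

		/* position of the rightmost set bit, as pg_rightmost_one_pos64 would give */
		int			rightmost = __builtin_ctzll(zero_byte_low);
		int			remainder = rightmost / 8;

		assert(rightmost == 31);	/* NUL positions map to bits 7, 15, ..., 63 */
		assert(remainder == 3);		/* hash only "abc", not the terminator or junk */
		return 0;
	}
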