mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[Support] Optimize SHA1 implementation
* Add inline to the helper functions because gcc-9 won't inline all of them without the hint. I've avoided `__attribute__((always_inline))` because gcc and clang will inline without it, and omitting it improves compatibility. * Replace the byte-by-byte copy in update() with endian::read32be() since perf reports that, before this patch, 1/2 of the time is spent copying into the buffer. When lld uses --build-id=sha1 it spends 30-45% of CPU in SHA1 depending on the binary (not wall-time since it is parallel). This patch speeds up SHA1 by a factor of 2 on clang-8 and 3 on gcc-6. This leads to a >10% improvement in overall linking time. lld-speed-test benchmarks were run on an Intel i9-9900k with Turbo disabled on CPU 0 compiled with clang-9. Stats were recorded with `perf stat -r 5`. All inputs are using `--build-id=sha1`. | Input | Before (seconds) | After (seconds) | | --- | --- | --- | | chrome | 2.14 | 1.82 (-15%) | | chrome-icf | 2.56 | 2.29 (-10%) | | clang | 0.65 | 0.53 (-18%) | | clang-fsds | 0.69 | 0.58 (-16%) | | clang-gdb-index | 21.71 | 19.3 (-11%) | | gold | 0.42 | 0.34 (-19%) | | gold-fsds | 0.431 | 0.355 (-17%) | | linux-kernel | 0.625 | 0.575 (-8%) | | llvm-as | 0.045 | 0.039 (-14%) | | llvm-as-fsds | 0.035 | 0.039 (-11%) | | mozilla | 11.3 | 9.8 (-13%) | | mozilla-gc | 11.84 | 10.36 (-12%) | | mozilla-O0 | 8.2 | 5.84 (-28%) | | scylla | 5.59 | 4.52 (-19%) | Reviewed By: ruiu, MaskRay Differential Revision: https://reviews.llvm.org/D69295
This commit is contained in:
parent
35f403d954
commit
639a0c16d7
@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
#include "llvm/Support/SHA1.h"
|
#include "llvm/Support/SHA1.h"
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
#include "llvm/ADT/ArrayRef.h"
|
||||||
|
#include "llvm/Support/Endian.h"
|
||||||
#include "llvm/Support/Host.h"
|
#include "llvm/Support/Host.h"
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
|
|
||||||
@ -26,45 +27,45 @@ using namespace llvm;
|
|||||||
#define SHA_BIG_ENDIAN
|
#define SHA_BIG_ENDIAN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static uint32_t rol(uint32_t Number, int Bits) {
|
static inline uint32_t rol(uint32_t Number, int Bits) {
|
||||||
return (Number << Bits) | (Number >> (32 - Bits));
|
return (Number << Bits) | (Number >> (32 - Bits));
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t blk0(uint32_t *Buf, int I) { return Buf[I]; }
|
static inline uint32_t blk0(uint32_t *Buf, int I) { return Buf[I]; }
|
||||||
|
|
||||||
static uint32_t blk(uint32_t *Buf, int I) {
|
static inline uint32_t blk(uint32_t *Buf, int I) {
|
||||||
Buf[I & 15] = rol(Buf[(I + 13) & 15] ^ Buf[(I + 8) & 15] ^ Buf[(I + 2) & 15] ^
|
Buf[I & 15] = rol(Buf[(I + 13) & 15] ^ Buf[(I + 8) & 15] ^ Buf[(I + 2) & 15] ^
|
||||||
Buf[I & 15],
|
Buf[I & 15],
|
||||||
1);
|
1);
|
||||||
return Buf[I & 15];
|
return Buf[I & 15];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E,
|
static inline void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
|
||||||
int I, uint32_t *Buf) {
|
uint32_t &E, int I, uint32_t *Buf) {
|
||||||
E += ((B & (C ^ D)) ^ D) + blk0(Buf, I) + 0x5A827999 + rol(A, 5);
|
E += ((B & (C ^ D)) ^ D) + blk0(Buf, I) + 0x5A827999 + rol(A, 5);
|
||||||
B = rol(B, 30);
|
B = rol(B, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E,
|
static inline void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
|
||||||
int I, uint32_t *Buf) {
|
uint32_t &E, int I, uint32_t *Buf) {
|
||||||
E += ((B & (C ^ D)) ^ D) + blk(Buf, I) + 0x5A827999 + rol(A, 5);
|
E += ((B & (C ^ D)) ^ D) + blk(Buf, I) + 0x5A827999 + rol(A, 5);
|
||||||
B = rol(B, 30);
|
B = rol(B, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E,
|
static inline void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
|
||||||
int I, uint32_t *Buf) {
|
uint32_t &E, int I, uint32_t *Buf) {
|
||||||
E += (B ^ C ^ D) + blk(Buf, I) + 0x6ED9EBA1 + rol(A, 5);
|
E += (B ^ C ^ D) + blk(Buf, I) + 0x6ED9EBA1 + rol(A, 5);
|
||||||
B = rol(B, 30);
|
B = rol(B, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E,
|
static inline void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
|
||||||
int I, uint32_t *Buf) {
|
uint32_t &E, int I, uint32_t *Buf) {
|
||||||
E += (((B | C) & D) | (B & C)) + blk(Buf, I) + 0x8F1BBCDC + rol(A, 5);
|
E += (((B | C) & D) | (B & C)) + blk(Buf, I) + 0x8F1BBCDC + rol(A, 5);
|
||||||
B = rol(B, 30);
|
B = rol(B, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void r4(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E,
|
static inline void r4(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
|
||||||
int I, uint32_t *Buf) {
|
uint32_t &E, int I, uint32_t *Buf) {
|
||||||
E += (B ^ C ^ D) + blk(Buf, I) + 0xCA62C1D6 + rol(A, 5);
|
E += (B ^ C ^ D) + blk(Buf, I) + 0xCA62C1D6 + rol(A, 5);
|
||||||
B = rol(B, 30);
|
B = rol(B, 30);
|
||||||
}
|
}
|
||||||
@ -210,8 +211,31 @@ void SHA1::writebyte(uint8_t Data) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void SHA1::update(ArrayRef<uint8_t> Data) {
|
void SHA1::update(ArrayRef<uint8_t> Data) {
|
||||||
for (auto &C : Data)
|
InternalState.ByteCount += Data.size();
|
||||||
writebyte(C);
|
|
||||||
|
// Finish the current block.
|
||||||
|
if (InternalState.BufferOffset > 0) {
|
||||||
|
const size_t Remainder = std::min<size_t>(
|
||||||
|
Data.size(), BLOCK_LENGTH - InternalState.BufferOffset);
|
||||||
|
for (size_t I = 0; I < Remainder; ++I)
|
||||||
|
addUncounted(Data[I]);
|
||||||
|
Data = Data.drop_front(Remainder);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fast buffer filling for large inputs.
|
||||||
|
while (Data.size() >= BLOCK_LENGTH) {
|
||||||
|
assert(InternalState.BufferOffset == 0);
|
||||||
|
assert(BLOCK_LENGTH % 4 == 0);
|
||||||
|
constexpr size_t BLOCK_LENGTH_32 = BLOCK_LENGTH / 4;
|
||||||
|
for (size_t I = 0; I < BLOCK_LENGTH_32; ++I)
|
||||||
|
InternalState.Buffer.L[I] = support::endian::read32be(&Data[I * 4]);
|
||||||
|
hashBlock();
|
||||||
|
Data = Data.drop_front(BLOCK_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finish the remainder.
|
||||||
|
for (uint8_t C : Data)
|
||||||
|
addUncounted(C);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SHA1::pad() {
|
void SHA1::pad() {
|
||||||
|
@ -43,6 +43,22 @@ TEST(sha1_hash_test, Basic) {
|
|||||||
ASSERT_EQ("2EF7BDE608CE5404E97D5F042F95F89F1C232871", Hash);
|
ASSERT_EQ("2EF7BDE608CE5404E97D5F042F95F89F1C232871", Hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(sha1_hash_test, Update) {
|
||||||
|
SHA1 sha1;
|
||||||
|
std::string Input = "123456789012345678901234567890";
|
||||||
|
ASSERT_EQ(Input.size(), 30UL);
|
||||||
|
// 3 short updates.
|
||||||
|
sha1.update(Input);
|
||||||
|
sha1.update(Input);
|
||||||
|
sha1.update(Input);
|
||||||
|
// Long update that gets into the optimized loop with prefix/suffix.
|
||||||
|
sha1.update(Input + Input + Input + Input);
|
||||||
|
// 18 bytes buffered now.
|
||||||
|
|
||||||
|
std::string Hash = toHex(sha1.final());
|
||||||
|
ASSERT_EQ("3E4A614101AD84985AB0FE54DC12A6D71551E5AE", Hash);
|
||||||
|
}
|
||||||
|
|
||||||
// Check that getting the intermediate hash in the middle of the stream does
|
// Check that getting the intermediate hash in the middle of the stream does
|
||||||
// not invalidate the final result.
|
// not invalidate the final result.
|
||||||
TEST(raw_sha1_ostreamTest, Intermediate) {
|
TEST(raw_sha1_ostreamTest, Intermediate) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user