diff --git a/3rdparty/7z/src/7z.h b/3rdparty/7z/src/7z.h index 82813c298b..969523cd3e 100644 --- a/3rdparty/7z/src/7z.h +++ b/3rdparty/7z/src/7z.h @@ -1,5 +1,5 @@ /* 7z.h -- 7z interface -2017-04-03 : Igor Pavlov : Public domain */ +2018-07-02 : Igor Pavlov : Public domain */ #ifndef __7Z_H #define __7Z_H @@ -91,6 +91,8 @@ typedef struct UInt64 *CoderUnpackSizes; // for all coders in all folders Byte *CodersData; + + UInt64 RangeLimit; } CSzAr; UInt64 SzAr_GetFolderUnpackSize(const CSzAr *p, UInt32 folderIndex); diff --git a/3rdparty/7z/src/7zArcIn.c b/3rdparty/7z/src/7zArcIn.c index 68cc12ff45..7ccc721012 100644 --- a/3rdparty/7z/src/7zArcIn.c +++ b/3rdparty/7z/src/7zArcIn.c @@ -1,5 +1,5 @@ /* 7zArcIn.c -- 7z Input functions -2018-12-31 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -75,7 +75,7 @@ static SRes SzBitUi32s_Alloc(CSzBitUi32s *p, size_t num, ISzAllocPtr alloc) return SZ_OK; } -void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) +static void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; @@ -83,7 +83,7 @@ void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) #define SzBitUi64s_Init(p) { (p)->Defs = NULL; (p)->Vals = NULL; } -void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) +static void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; @@ -105,6 +105,8 @@ static void SzAr_Init(CSzAr *p) p->CoderUnpackSizes = NULL; p->CodersData = NULL; + + p->RangeLimit = 0; } static void SzAr_Free(CSzAr *p, ISzAllocPtr alloc) @@ -502,7 +504,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) return SZ_ERROR_ARCHIVE; if (propsSize >= 0x80) return SZ_ERROR_UNSUPPORTED; - coder->PropsOffset = sd->Data - dataStart; + coder->PropsOffset = (size_t)(sd->Data - dataStart); coder->PropsSize = (Byte)propsSize; sd->Data += (size_t)propsSize; sd->Size -= (size_t)propsSize; @@ -677,7 +679,7 @@ static SRes ReadUnpackInfo(CSzAr *p, { UInt32 numCoders, ci, numInStreams = 0; - p->FoCodersOffsets[fo] = sd.Data - startBufPtr; + p->FoCodersOffsets[fo] = (size_t)(sd.Data - startBufPtr); RINOK(SzReadNumber32(&sd, &numCoders)); if (numCoders == 0 || numCoders > k_Scan_NumCoders_MAX) @@ -797,7 +799,7 @@ static SRes ReadUnpackInfo(CSzAr *p, p->FoToCoderUnpackSizes[fo] = numCodersOutStreams; { - size_t dataSize = sd.Data - startBufPtr; + const size_t dataSize = (size_t)(sd.Data - startBufPtr); p->FoStartPackStreamIndex[fo] = packStreamIndex; p->FoCodersOffsets[fo] = dataSize; MY_ALLOC_ZE_AND_CPY(p->CodersData, dataSize, startBufPtr, alloc); @@ -885,7 +887,7 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (numStreams != 1 || !SzBitWithVals_Check(&p->FolderCRCs, i)) numSubDigests += numStreams; } - ssi->sdNumSubStreams.Size = sd->Data - ssi->sdNumSubStreams.Data; + ssi->sdNumSubStreams.Size = (size_t)(sd->Data - ssi->sdNumSubStreams.Data); continue; } if (type == k7zIdCRC || type == k7zIdSize || type == k7zIdEnd) @@ -907,7 +909,7 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) { ssi->sdSizes.Data = sd->Data; RINOK(SkipNumbers(sd, numUnpackSizesInData)); - ssi->sdSizes.Size = sd->Data - ssi->sdSizes.Data; + ssi->sdSizes.Size = (size_t)(sd->Data - ssi->sdSizes.Data); RINOK(ReadID(sd, &type)); } @@ -919,7 +921,7 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) { ssi->sdCRCs.Data = sd->Data; RINOK(SkipBitUi32s(sd, numSubDigests)); - ssi->sdCRCs.Size = sd->Data - ssi->sdCRCs.Data; + ssi->sdCRCs.Size = (size_t)(sd->Data - ssi->sdCRCs.Data); } else { @@ -947,7 +949,11 @@ static SRes SzReadStreamsInfo(CSzAr *p, if (type == k7zIdPackInfo) { RINOK(ReadNumber(sd, dataOffset)); + if (*dataOffset > p->RangeLimit) + return SZ_ERROR_ARCHIVE; RINOK(ReadPackInfo(p, sd, alloc)); + if (p->PackPositions[p->NumPackStreams] > p->RangeLimit - *dataOffset) + return SZ_ERROR_ARCHIVE; RINOK(ReadID(sd, &type)); } if (type == k7zIdUnpackInfo) @@ -1028,12 +1034,12 @@ static SRes SzReadFileNames(const Byte *data, size_t size, UInt32 numFiles, size return SZ_ERROR_ARCHIVE; for (p = data + pos; #ifdef _WIN32 - *(const UInt16 *)p != 0 + *(const UInt16 *)(const void *)p != 0 #else p[0] != 0 || p[1] != 0 #endif ; p += 2); - pos = p - data + 2; + pos = (size_t)(p - data) + 2; *offsets++ = (pos >> 1); } while (--numFiles); @@ -1133,6 +1139,8 @@ static SRes SzReadHeader2( SRes res; SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, sd, tempBufs, NUM_ADDITIONAL_STREAMS_MAX, p->startPosAfterHeader, &tempAr, allocTemp); *numTempBufs = tempAr.NumFolders; @@ -1526,11 +1534,13 @@ static SRes SzArEx_Open2( nextHeaderSize = GetUi64(header + 20); nextHeaderCRC = GetUi32(header + 28); - p->startPosAfterHeader = startArcPos + k7zStartHeaderSize; + p->startPosAfterHeader = (UInt64)startArcPos + k7zStartHeaderSize; if (CrcCalc(header + 12, 20) != GetUi32(header + 8)) return SZ_ERROR_CRC; + p->db.RangeLimit = nextHeaderOffset; + nextHeaderSizeT = (size_t)nextHeaderSize; if (nextHeaderSizeT != nextHeaderSize) return SZ_ERROR_MEM; @@ -1543,13 +1553,13 @@ static SRes SzArEx_Open2( { Int64 pos = 0; RINOK(ILookInStream_Seek(inStream, &pos, SZ_SEEK_END)); - if ((UInt64)pos < startArcPos + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) + if ((UInt64)pos < (UInt64)startArcPos + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) return SZ_ERROR_INPUT_EOF; } - RINOK(LookInStream_SeekTo(inStream, startArcPos + k7zStartHeaderSize + nextHeaderOffset)); + RINOK(LookInStream_SeekTo(inStream, (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset)); if (!Buf_Create(&buf, nextHeaderSizeT, allocTemp)) return SZ_ERROR_MEM; @@ -1575,6 +1585,8 @@ static SRes SzArEx_Open2( Buf_Init(&tempBuf); SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, &sd, &tempBuf, 1, p->startPosAfterHeader, &tempAr, allocTemp); SzAr_Free(&tempAr, allocTemp); diff --git a/3rdparty/7z/src/7zCrc.c b/3rdparty/7z/src/7zCrc.c index 40ab75952a..c7ec353d60 100644 --- a/3rdparty/7z/src/7zCrc.c +++ b/3rdparty/7z/src/7zCrc.c @@ -1,5 +1,5 @@ /* 7zCrc.c -- CRC32 init -2017-06-06 : Igor Pavlov : Public domain */ +2021-04-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -26,8 +26,20 @@ typedef UInt32 (MY_FAST_CALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); +extern CRC_FUNC g_CrcUpdateT4; +CRC_FUNC g_CrcUpdateT4; +extern CRC_FUNC g_CrcUpdateT8; +CRC_FUNC g_CrcUpdateT8; +extern +CRC_FUNC g_CrcUpdateT0_32; +CRC_FUNC g_CrcUpdateT0_32; +extern +CRC_FUNC g_CrcUpdateT0_64; +CRC_FUNC g_CrcUpdateT0_64; +extern +CRC_FUNC g_CrcUpdate; CRC_FUNC g_CrcUpdate; UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; @@ -44,6 +56,7 @@ UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size) #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table); UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) { const Byte *p = (const Byte *)data; @@ -53,6 +66,166 @@ UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const U return v; } + +/* ---------- hardware CRC ---------- */ + +#ifdef MY_CPU_LE + +#if defined(MY_CPU_ARM_OR_ARM64) + +// #pragma message("ARM*") + + #if defined(_MSC_VER) + #if defined(MY_CPU_ARM64) + #if (_MSC_VER >= 1910) + #define USE_ARM64_CRC + #endif + #endif + #elif (defined(__clang__) && (__clang_major__ >= 3)) \ + || (defined(__GNUC__) && (__GNUC__ > 4)) + #if !defined(__ARM_FEATURE_CRC32) + #define __ARM_FEATURE_CRC32 1 + #if (!defined(__clang__) || (__clang_major__ > 3)) // fix these numbers + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) + #endif + #endif + #if defined(__ARM_FEATURE_CRC32) + #define USE_ARM64_CRC + #include + #endif + #endif + +#else + +// no hardware CRC + +// #define USE_CRC_EMU + +#ifdef USE_CRC_EMU + +#pragma message("ARM64 CRC emulation") + +MY_FORCE_INLINE +UInt32 __crc32b(UInt32 v, UInt32 data) +{ + const UInt32 *table = g_CrcTable; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); + return v; +} + +MY_FORCE_INLINE +UInt32 __crc32w(UInt32 v, UInt32 data) +{ + const UInt32 *table = g_CrcTable; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + return v; +} + +MY_FORCE_INLINE +UInt32 __crc32d(UInt32 v, UInt64 data) +{ + const UInt32 *table = g_CrcTable; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; + return v; +} + +#endif // USE_CRC_EMU + +#endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE) + + + +#if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) + +#define T0_32_UNROLL_BYTES (4 * 4) +#define T0_64_UNROLL_BYTES (4 * 8) + +#ifndef ATTRIB_CRC +#define ATTRIB_CRC +#endif +// #pragma message("USE ARM HW CRC") + +ATTRIB_CRC +UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table); +ATTRIB_CRC +UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table) +{ + const Byte *p = (const Byte *)data; + UNUSED_VAR(table); + + for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--) + v = __crc32b(v, *p++); + + if (size >= T0_32_UNROLL_BYTES) + { + const Byte *lim = p + size; + size &= (T0_32_UNROLL_BYTES - 1); + lim -= size; + do + { + v = __crc32w(v, *(const UInt32 *)(const void *)(p)); + v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; + v = __crc32w(v, *(const UInt32 *)(const void *)(p)); + v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; + } + while (p != lim); + } + + for (; size != 0; size--) + v = __crc32b(v, *p++); + + return v; +} + +ATTRIB_CRC +UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table); +ATTRIB_CRC +UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table) +{ + const Byte *p = (const Byte *)data; + UNUSED_VAR(table); + + for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--) + v = __crc32b(v, *p++); + + if (size >= T0_64_UNROLL_BYTES) + { + const Byte *lim = p + size; + size &= (T0_64_UNROLL_BYTES - 1); + lim -= size; + do + { + v = __crc32d(v, *(const UInt64 *)(const void *)(p)); + v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; + v = __crc32d(v, *(const UInt64 *)(const void *)(p)); + v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; + } + while (p != lim); + } + + for (; size != 0; size--) + v = __crc32b(v, *p++); + + return v; +} + +#endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) + +#endif // MY_CPU_LE + + + + void MY_FAST_CALL CrcGenerateTable() { UInt32 i; @@ -123,6 +296,27 @@ void MY_FAST_CALL CrcGenerateTable() } } #endif + #endif + #ifdef MY_CPU_LE + #ifdef USE_ARM64_CRC + if (CPU_IsSupported_CRC32()) + { + g_CrcUpdateT0_32 = CrcUpdateT0_32; + g_CrcUpdateT0_64 = CrcUpdateT0_64; + g_CrcUpdate = + #if defined(MY_CPU_ARM) + CrcUpdateT0_32; + #else + CrcUpdateT0_64; + #endif + } + #endif + + #ifdef USE_CRC_EMU + g_CrcUpdateT0_32 = CrcUpdateT0_32; + g_CrcUpdateT0_64 = CrcUpdateT0_64; + g_CrcUpdate = CrcUpdateT0_64; + #endif #endif } diff --git a/3rdparty/7z/src/7zCrcOpt.c b/3rdparty/7z/src/7zCrcOpt.c index 2ee0de8459..efaa7ab9dc 100644 --- a/3rdparty/7z/src/7zCrcOpt.c +++ b/3rdparty/7z/src/7zCrcOpt.c @@ -1,5 +1,5 @@ /* 7zCrcOpt.c -- CRC32 calculation -2017-04-03 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -9,6 +9,7 @@ #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) { const Byte *p = (const Byte *)data; @@ -16,7 +17,7 @@ UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const U v = CRC_UPDATE_BYTE_2(v, *p); for (; size >= 4; size -= 4, p += 4) { - v ^= *(const UInt32 *)p; + v ^= *(const UInt32 *)(const void *)p; v = (table + 0x300)[((v ) & 0xFF)] ^ (table + 0x200)[((v >> 8) & 0xFF)] @@ -28,6 +29,7 @@ UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const U return v; } +UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) { const Byte *p = (const Byte *)data; @@ -36,13 +38,13 @@ UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const U for (; size >= 8; size -= 8, p += 8) { UInt32 d; - v ^= *(const UInt32 *)p; + v ^= *(const UInt32 *)(const void *)p; v = (table + 0x700)[((v ) & 0xFF)] ^ (table + 0x600)[((v >> 8) & 0xFF)] ^ (table + 0x500)[((v >> 16) & 0xFF)] ^ (table + 0x400)[((v >> 24))]; - d = *((const UInt32 *)p + 1); + d = *((const UInt32 *)(const void *)p + 1); v ^= (table + 0x300)[((d ) & 0xFF)] ^ (table + 0x200)[((d >> 8) & 0xFF)] @@ -72,7 +74,7 @@ UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, co v = CRC_UPDATE_BYTE_2_BE(v, *p); for (; size >= 4; size -= 4, p += 4) { - v ^= *(const UInt32 *)p; + v ^= *(const UInt32 *)(const void *)p; v = (table + 0x000)[((v ) & 0xFF)] ^ (table + 0x100)[((v >> 8) & 0xFF)] @@ -94,13 +96,13 @@ UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, co for (; size >= 8; size -= 8, p += 8) { UInt32 d; - v ^= *(const UInt32 *)p; + v ^= *(const UInt32 *)(const void *)p; v = (table + 0x400)[((v ) & 0xFF)] ^ (table + 0x500)[((v >> 8) & 0xFF)] ^ (table + 0x600)[((v >> 16) & 0xFF)] ^ (table + 0x700)[((v >> 24))]; - d = *((const UInt32 *)p + 1); + d = *((const UInt32 *)(const void *)p + 1); v ^= (table + 0x000)[((d ) & 0xFF)] ^ (table + 0x100)[((d >> 8) & 0xFF)] diff --git a/3rdparty/7z/src/7zDec.c b/3rdparty/7z/src/7zDec.c index 2a7b09030e..83e37d166b 100644 --- a/3rdparty/7z/src/7zDec.c +++ b/3rdparty/7z/src/7zDec.c @@ -1,5 +1,5 @@ /* 7zDec.c -- Decoding from 7z folder -2019-02-02 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -21,17 +21,20 @@ #endif #define k_Copy 0 -#define k_Delta 3 +#ifndef _7Z_NO_METHOD_LZMA2 #define k_LZMA2 0x21 +#endif #define k_LZMA 0x30101 -#define k_BCJ 0x3030103 #define k_BCJ2 0x303011B +#ifndef _7Z_NO_METHODS_FILTERS +#define k_Delta 3 +#define k_BCJ 0x3030103 #define k_PPC 0x3030205 #define k_IA64 0x3030401 #define k_ARM 0x3030501 #define k_ARMT 0x3030701 #define k_SPARC 0x3030805 - +#endif #ifdef _7ZIP_PPMD_SUPPPORT @@ -56,7 +59,7 @@ static Byte ReadByte(const IByteIn *pp) return *p->cur++; if (p->res == SZ_OK) { - size_t size = p->cur - p->begin; + size_t size = (size_t)(p->cur - p->begin); p->processed += size; p->res = ILookInStream_Skip(p->inStream, size); size = (1 << 25); @@ -101,28 +104,32 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c Ppmd7_Init(&ppmd, order); } { - CPpmd7z_RangeDec rc; - Ppmd7z_RangeDec_CreateVTable(&rc); - rc.Stream = &s.vt; - if (!Ppmd7z_RangeDec_Init(&rc)) + ppmd.rc.dec.Stream = &s.vt; + if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec)) res = SZ_ERROR_DATA; - else if (s.extra) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else + else if (!s.extra) { - SizeT i; - for (i = 0; i < outSize; i++) + Byte *buf = outBuffer; + const Byte *lim = buf + outSize; + for (; buf != lim; buf++) { - int sym = Ppmd7_DecodeSymbol(&ppmd, &rc.vt); + int sym = Ppmd7z_DecodeSymbol(&ppmd); if (s.extra || sym < 0) break; - outBuffer[i] = (Byte)sym; + *buf = (Byte)sym; } - if (i != outSize) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else if (s.processed + (s.cur - s.begin) != inSize || !Ppmd7z_RangeDec_IsFinishedOK(&rc)) + if (buf != lim) res = SZ_ERROR_DATA; + else if (!Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) + { + /* if (Ppmd7z_DecodeSymbol(&ppmd) != PPMD7_SYM_END || !Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) */ + res = SZ_ERROR_DATA; + } } + if (s.extra) + res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); + else if (s.processed + (size_t)(s.cur - s.begin) != inSize) + res = SZ_ERROR_DATA; } Ppmd7_Free(&ppmd, allocMain); return res; @@ -365,7 +372,9 @@ static SRes CheckSupportedFolder(const CSzFolder *f) return SZ_ERROR_UNSUPPORTED; } +#ifndef _7Z_NO_METHODS_FILTERS #define CASE_BRA_CONV(isa) case k_ ## isa: isa ## _Convert(outBuffer, outSize, 0, 0); break; +#endif static SRes SzFolder_Decode2(const CSzFolder *folder, const Byte *propsData, diff --git a/3rdparty/7z/src/7zFile.c b/3rdparty/7z/src/7zFile.c index e486901e30..900125d52c 100644 --- a/3rdparty/7z/src/7zFile.c +++ b/3rdparty/7z/src/7zFile.c @@ -1,5 +1,5 @@ /* 7zFile.c -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2021-04-29 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,9 +7,19 @@ #ifndef USE_WINDOWS_FILE -#ifndef UNDER_CE -#include -#endif + #include + + #ifndef USE_FOPEN + #include + #include + #ifdef _WIN32 + #include + typedef int ssize_t; + typedef int off_t; + #else + #include + #endif + #endif #else @@ -23,30 +33,36 @@ And message can be "Network connection was lost" */ -#define kChunkSizeMax (1 << 22) - #endif +#define kChunkSizeMax (1 << 22) + void File_Construct(CSzFile *p) { #ifdef USE_WINDOWS_FILE p->handle = INVALID_HANDLE_VALUE; - #else + #elif defined(USE_FOPEN) p->file = NULL; + #else + p->fd = -1; #endif } #if !defined(UNDER_CE) || !defined(USE_WINDOWS_FILE) + static WRes File_Open(CSzFile *p, const char *name, int writeMode) { #ifdef USE_WINDOWS_FILE + p->handle = CreateFileA(name, writeMode ? GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ, NULL, writeMode ? CREATE_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); return (p->handle != INVALID_HANDLE_VALUE) ? 0 : GetLastError(); - #else + + #elif defined(USE_FOPEN) + p->file = fopen(name, writeMode ? "wb+" : "rb"); return (p->file != 0) ? 0 : #ifdef UNDER_CE @@ -54,13 +70,34 @@ static WRes File_Open(CSzFile *p, const char *name, int writeMode) #else errno; #endif + + #else + + int flags = (writeMode ? (O_CREAT | O_EXCL | O_WRONLY) : O_RDONLY); + #ifdef O_BINARY + flags |= O_BINARY; + #endif + p->fd = open(name, flags, 0666); + return (p->fd != -1) ? 0 : errno; + #endif } WRes InFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 0); } -WRes OutFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 1); } + +WRes OutFile_Open(CSzFile *p, const char *name) +{ + #if defined(USE_WINDOWS_FILE) || defined(USE_FOPEN) + return File_Open(p, name, 1); + #else + p->fd = creat(name, 0666); + return (p->fd != -1) ? 0 : errno; + #endif +} + #endif + #ifdef USE_WINDOWS_FILE static WRes File_OpenW(CSzFile *p, const WCHAR *name, int writeMode) { @@ -78,74 +115,124 @@ WRes OutFile_OpenW(CSzFile *p, const WCHAR *name) { return File_OpenW(p, name, 1 WRes File_Close(CSzFile *p) { #ifdef USE_WINDOWS_FILE + if (p->handle != INVALID_HANDLE_VALUE) { if (!CloseHandle(p->handle)) return GetLastError(); p->handle = INVALID_HANDLE_VALUE; } - #else + + #elif defined(USE_FOPEN) + if (p->file != NULL) { int res = fclose(p->file); if (res != 0) + { + if (res == EOF) + return errno; return res; + } p->file = NULL; } + + #else + + if (p->fd != -1) + { + if (close(p->fd) != 0) + return errno; + p->fd = -1; + } + #endif + return 0; } + WRes File_Read(CSzFile *p, void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); + const BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); data = (void *)((Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) return GetLastError(); + // debug : we can break here for partial reading mode + if (processed == 0) + break; + } + while (originalSize > 0); + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fread(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + // debug : we can break here for partial reading mode if (processed == 0) break; } while (originalSize > 0); - return 0; #else - - *size = fread(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const ssize_t processed = read(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + // debug : we can break here for partial reading mode + // break; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Write(CSzFile *p, const void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); - data = (void *)((Byte *)data + processed); + const BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); + data = (const void *)((const Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) @@ -154,26 +241,52 @@ WRes File_Write(CSzFile *p, const void *data, size_t *size) break; } while (originalSize > 0); - return 0; + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fwrite(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + if (processed == 0) + break; + } + while (originalSize > 0); #else - *size = fwrite(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const ssize_t processed = write(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin) { #ifdef USE_WINDOWS_FILE - LARGE_INTEGER value; DWORD moveMethod; - value.LowPart = (DWORD)*pos; - value.HighPart = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ + UInt32 low = (UInt32)*pos; + LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ switch (origin) { case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break; @@ -181,34 +294,52 @@ WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin) case SZ_SEEK_END: moveMethod = FILE_END; break; default: return ERROR_INVALID_PARAMETER; } - value.LowPart = SetFilePointer(p->handle, value.LowPart, &value.HighPart, moveMethod); - if (value.LowPart == 0xFFFFFFFF) + low = SetFilePointer(p->handle, (LONG)low, &high, moveMethod); + if (low == (UInt32)0xFFFFFFFF) { WRes res = GetLastError(); if (res != NO_ERROR) return res; } - *pos = ((Int64)value.HighPart << 32) | value.LowPart; + *pos = ((Int64)high << 32) | low; return 0; #else - int moveMethod; - int res; + int moveMethod; // = origin; + switch (origin) { case SZ_SEEK_SET: moveMethod = SEEK_SET; break; case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break; case SZ_SEEK_END: moveMethod = SEEK_END; break; - default: return 1; + default: return EINVAL; } - res = fseek(p->file, (long)*pos, moveMethod); - *pos = ftell(p->file); - return res; - #endif + #if defined(USE_FOPEN) + { + int res = fseek(p->file, (long)*pos, moveMethod); + if (res == -1) + return errno; + *pos = ftell(p->file); + if (*pos == -1) + return errno; + return 0; + } + #else + { + off_t res = lseek(p->fd, (off_t)*pos, moveMethod); + if (res == -1) + return errno; + *pos = res; + return 0; + } + + #endif // USE_FOPEN + #endif // USE_WINDOWS_FILE } + WRes File_GetLength(CSzFile *p, UInt64 *length) { #ifdef USE_WINDOWS_FILE @@ -224,13 +355,31 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) *length = (((UInt64)sizeHigh) << 32) + sizeLow; return 0; - #else + #elif defined(USE_FOPEN) long pos = ftell(p->file); int res = fseek(p->file, 0, SEEK_END); *length = ftell(p->file); fseek(p->file, pos, SEEK_SET); return res; + + #else + + off_t pos; + *length = 0; + pos = lseek(p->fd, 0, SEEK_CUR); + if (pos != -1) + { + const off_t len2 = lseek(p->fd, 0, SEEK_END); + const off_t res2 = lseek(p->fd, pos, SEEK_SET); + if (len2 != -1) + { + *length = (UInt64)len2; + if (res2 != -1) + return 0; + } + } + return errno; #endif } @@ -241,7 +390,9 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) static SRes FileSeqInStream_Read(const ISeqInStream *pp, void *buf, size_t *size) { CFileSeqInStream *p = CONTAINER_FROM_VTBL(pp, CFileSeqInStream, vt); - return File_Read(&p->file, buf, size) == 0 ? SZ_OK : SZ_ERROR_READ; + WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } void FileSeqInStream_CreateVTable(CFileSeqInStream *p) @@ -255,13 +406,17 @@ void FileSeqInStream_CreateVTable(CFileSeqInStream *p) static SRes FileInStream_Read(const ISeekInStream *pp, void *buf, size_t *size) { CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return (File_Read(&p->file, buf, size) == 0) ? SZ_OK : SZ_ERROR_READ; + WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } static SRes FileInStream_Seek(const ISeekInStream *pp, Int64 *pos, ESzSeek origin) { CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return File_Seek(&p->file, pos, origin); + WRes wres = File_Seek(&p->file, pos, origin); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } void FileInStream_CreateVTable(CFileInStream *p) @@ -276,7 +431,8 @@ void FileInStream_CreateVTable(CFileInStream *p) static size_t FileOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) { CFileOutStream *p = CONTAINER_FROM_VTBL(pp, CFileOutStream, vt); - File_Write(&p->file, data, &size); + WRes wres = File_Write(&p->file, data, &size); + p->wres = wres; return size; } diff --git a/3rdparty/7z/src/7zFile.h b/3rdparty/7z/src/7zFile.h index 7e263bea1b..c7a30fc2b1 100644 --- a/3rdparty/7z/src/7zFile.h +++ b/3rdparty/7z/src/7zFile.h @@ -1,17 +1,20 @@ /* 7zFile.h -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2021-02-15 : Igor Pavlov : Public domain */ #ifndef __7Z_FILE_H #define __7Z_FILE_H #ifdef _WIN32 #define USE_WINDOWS_FILE +// #include #endif #ifdef USE_WINDOWS_FILE #include #else -#include +// note: USE_FOPEN mode is limited to 32-bit file size +// #define USE_FOPEN +// #include #endif #include "7zTypes.h" @@ -24,8 +27,10 @@ typedef struct { #ifdef USE_WINDOWS_FILE HANDLE handle; - #else + #elif defined(USE_FOPEN) FILE *file; + #else + int fd; #endif } CSzFile; @@ -56,6 +61,7 @@ typedef struct { ISeqInStream vt; CSzFile file; + WRes wres; } CFileSeqInStream; void FileSeqInStream_CreateVTable(CFileSeqInStream *p); @@ -65,6 +71,7 @@ typedef struct { ISeekInStream vt; CSzFile file; + WRes wres; } CFileInStream; void FileInStream_CreateVTable(CFileInStream *p); @@ -74,6 +81,7 @@ typedef struct { ISeqOutStream vt; CSzFile file; + WRes wres; } CFileOutStream; void FileOutStream_CreateVTable(CFileOutStream *p); diff --git a/3rdparty/7z/src/7zStream.c b/3rdparty/7z/src/7zStream.c index 579741fadc..4b472a41d5 100644 --- a/3rdparty/7z/src/7zStream.c +++ b/3rdparty/7z/src/7zStream.c @@ -1,5 +1,5 @@ /* 7zStream.c -- 7z Stream functions -2017-04-03 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -37,7 +37,7 @@ SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf) SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset) { - Int64 t = offset; + Int64 t = (Int64)offset; return ILookInStream_Seek(stream, &t, SZ_SEEK_SET); } diff --git a/3rdparty/7z/src/7zTypes.h b/3rdparty/7z/src/7zTypes.h index 593f5aa259..afe241258c 100644 --- a/3rdparty/7z/src/7zTypes.h +++ b/3rdparty/7z/src/7zTypes.h @@ -1,11 +1,13 @@ /* 7zTypes.h -- Basic types -2018-08-04 : Igor Pavlov : Public domain */ +2021-12-25 : Igor Pavlov : Public domain */ #ifndef __7Z_TYPES_H #define __7Z_TYPES_H #ifdef _WIN32 /* #include */ +#else +#include #endif #include @@ -43,18 +45,116 @@ EXTERN_C_BEGIN typedef int SRes; +#ifdef _MSC_VER + #if _MSC_VER > 1200 + #define MY_ALIGN(n) __declspec(align(n)) + #else + #define MY_ALIGN(n) + #endif +#else + #define MY_ALIGN(n) __attribute__ ((aligned(n))) +#endif + + #ifdef _WIN32 /* typedef DWORD WRes; */ typedef unsigned WRes; #define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x) -#else +// #define MY_HRES_ERROR__INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR) +#else // _WIN32 + +// #define ENV_HAVE_LSTAT typedef int WRes; -#define MY__FACILITY_WIN32 7 -#define MY__FACILITY__WRes MY__FACILITY_WIN32 -#define MY_SRes_HRESULT_FROM_WRes(x) ((HRESULT)(x) <= 0 ? ((HRESULT)(x)) : ((HRESULT) (((x) & 0x0000FFFF) | (MY__FACILITY__WRes << 16) | 0x80000000))) + +// (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT +#define MY__FACILITY_ERRNO 0x800 +#define MY__FACILITY_WIN32 7 +#define MY__FACILITY__WRes MY__FACILITY_ERRNO + +#define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \ + ( (HRESULT)(x) & 0x0000FFFF) \ + | (MY__FACILITY__WRes << 16) \ + | (HRESULT)0x80000000 )) + +#define MY_SRes_HRESULT_FROM_WRes(x) \ + ((HRESULT)(x) <= 0 ? ((HRESULT)(x)) : MY_HRESULT_FROM_errno_CONST_ERROR(x)) + +// we call macro HRESULT_FROM_WIN32 for system errors (WRes) that are (errno) +#define HRESULT_FROM_WIN32(x) MY_SRes_HRESULT_FROM_WRes(x) + +/* +#define ERROR_FILE_NOT_FOUND 2L +#define ERROR_ACCESS_DENIED 5L +#define ERROR_NO_MORE_FILES 18L +#define ERROR_LOCK_VIOLATION 33L +#define ERROR_FILE_EXISTS 80L +#define ERROR_DISK_FULL 112L +#define ERROR_NEGATIVE_SEEK 131L +#define ERROR_ALREADY_EXISTS 183L +#define ERROR_DIRECTORY 267L +#define ERROR_TOO_MANY_POSTS 298L + +#define ERROR_INTERNAL_ERROR 1359L +#define ERROR_INVALID_REPARSE_DATA 4392L +#define ERROR_REPARSE_TAG_INVALID 4393L +#define ERROR_REPARSE_TAG_MISMATCH 4394L +*/ + +// we use errno equivalents for some WIN32 errors: + +#define ERROR_INVALID_PARAMETER EINVAL +#define ERROR_INVALID_FUNCTION EINVAL +#define ERROR_ALREADY_EXISTS EEXIST +#define ERROR_FILE_EXISTS EEXIST +#define ERROR_PATH_NOT_FOUND ENOENT +#define ERROR_FILE_NOT_FOUND ENOENT +#define ERROR_DISK_FULL ENOSPC +// #define ERROR_INVALID_HANDLE EBADF + +// we use FACILITY_WIN32 for errors that has no errno equivalent +// Too many posts were made to a semaphore. +#define ERROR_TOO_MANY_POSTS ((HRESULT)0x8007012AL) +#define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L) +#define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L) + +// if (MY__FACILITY__WRes != FACILITY_WIN32), +// we use FACILITY_WIN32 for COM errors: +#define E_OUTOFMEMORY ((HRESULT)0x8007000EL) +#define E_INVALIDARG ((HRESULT)0x80070057L) +#define MY__E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L) + +/* +// we can use FACILITY_ERRNO for some COM errors, that have errno equivalents: +#define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM) +#define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +#define MY__E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +*/ + +// gcc / clang : (sizeof(long) == sizeof(void*)) in 32/64 bits +typedef long INT_PTR; +typedef unsigned long UINT_PTR; + +#define TEXT(quote) quote + +#define FILE_ATTRIBUTE_READONLY 0x0001 +#define FILE_ATTRIBUTE_HIDDEN 0x0002 +#define FILE_ATTRIBUTE_SYSTEM 0x0004 +#define FILE_ATTRIBUTE_DIRECTORY 0x0010 +#define FILE_ATTRIBUTE_ARCHIVE 0x0020 +#define FILE_ATTRIBUTE_DEVICE 0x0040 +#define FILE_ATTRIBUTE_NORMAL 0x0080 +#define FILE_ATTRIBUTE_TEMPORARY 0x0100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x0200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x0400 +#define FILE_ATTRIBUTE_COMPRESSED 0x0800 +#define FILE_ATTRIBUTE_OFFLINE 0x1000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x2000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x4000 + +#define FILE_ATTRIBUTE_UNIX_EXTENSION 0x8000 /* trick for Unix */ #endif @@ -63,6 +163,10 @@ typedef int WRes; #define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } #endif +#ifndef RINOK_WRes +#define RINOK_WRes(x) { WRes __result__ = (x); if (__result__ != 0) return __result__; } +#endif + typedef unsigned char Byte; typedef short Int16; typedef unsigned short UInt16; @@ -75,6 +179,40 @@ typedef int Int32; typedef unsigned int UInt32; #endif + +#ifndef _WIN32 + +typedef int INT; +typedef Int32 INT32; +typedef unsigned int UINT; +typedef UInt32 UINT32; +typedef INT32 LONG; // LONG, ULONG and DWORD must be 32-bit for _WIN32 compatibility +typedef UINT32 ULONG; + +#undef DWORD +typedef UINT32 DWORD; + +#define VOID void + +#define HRESULT LONG + +typedef void *LPVOID; +// typedef void VOID; +// typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR; +// gcc / clang on Unix : sizeof(long==sizeof(void*) in 32 or 64 bits) +typedef long INT_PTR; +typedef unsigned long UINT_PTR; +typedef long LONG_PTR; +typedef unsigned long DWORD_PTR; + +typedef size_t SIZE_T; + +#endif // _WIN32 + + +#define MY_HRES_ERROR__INTERNAL_ERROR ((HRESULT)0x8007054FL) + + #ifdef _SZ_NO_INT_64 /* define _SZ_NO_INT_64, if your compiler doesn't support 64-bit integers. @@ -128,25 +266,37 @@ typedef int BoolInt; #define MY_CDECL __cdecl #define MY_FAST_CALL __fastcall -#else +#else // _MSC_VER -#define MY_NO_INLINE -#define MY_FORCE_INLINE -#define MY_CDECL -#define MY_FAST_CALL - -/* inline keyword : for C++ / C99 */ - -/* GCC, clang: */ -/* -#if defined (__GNUC__) && (__GNUC__ >= 4) -#define MY_FORCE_INLINE __attribute__((always_inline)) +#if (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4)) \ + || defined(__INTEL_COMPILER) \ + || defined(__xlC__) #define MY_NO_INLINE __attribute__((noinline)) +// #define MY_FORCE_INLINE __attribute__((always_inline)) inline +#else +#define MY_NO_INLINE #endif -*/ +#define MY_FORCE_INLINE + + +#define MY_CDECL + +#if defined(_M_IX86) \ + || defined(__i386__) +// #define MY_FAST_CALL __attribute__((fastcall)) +// #define MY_FAST_CALL __attribute__((cdecl)) +#define MY_FAST_CALL +#elif defined(MY_CPU_AMD64) +// #define MY_FAST_CALL __attribute__((ms_abi)) +#define MY_FAST_CALL +#else +#define MY_FAST_CALL #endif +#endif // _MSC_VER + /* The following interfaces use first parameter as pointer to structure */ @@ -335,12 +485,11 @@ struct ISzAlloc GCC 4.8.1 : classes with non-public variable members" */ -#define MY_container_of(ptr, type, m) ((type *)((char *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m))) - +#define MY_container_of(ptr, type, m) ((type *)(void *)((char *)(void *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m))) #endif -#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(ptr)) +#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr)) /* #define CONTAINER_FROM_VTBL(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) @@ -353,6 +502,7 @@ struct ISzAlloc */ +#define MY_memset_0_ARRAY(a) memset((a), 0, sizeof(a)) #ifdef _WIN32 diff --git a/3rdparty/7z/src/7zVersion.h b/3rdparty/7z/src/7zVersion.h index 0074c64be9..08618695d5 100644 --- a/3rdparty/7z/src/7zVersion.h +++ b/3rdparty/7z/src/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 19 -#define MY_VER_MINOR 00 +#define MY_VER_MAJOR 21 +#define MY_VER_MINOR 07 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "19.00" +#define MY_VERSION_NUMBERS "21.07" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2019-02-21" +#define MY_DATE "2021-12-26" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2018 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2021 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/3rdparty/7z/src/Aes.c b/3rdparty/7z/src/Aes.c index 8f7d50ea24..9ad66c5c8d 100644 --- a/3rdparty/7z/src/Aes.c +++ b/3rdparty/7z/src/Aes.c @@ -1,10 +1,17 @@ /* Aes.c -- AES encryption / decryption -2017-01-24 : Igor Pavlov : Public domain */ +2021-05-13 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "Aes.h" #include "CpuArch.h" +#include "Aes.h" + +AES_CODE_FUNC g_AesCbc_Decode; +#ifndef _SFX +AES_CODE_FUNC g_AesCbc_Encode; +AES_CODE_FUNC g_AesCtr_Code; +UInt32 g_Aes_SupportedFunctions_Flags; +#endif static UInt32 T[256 * 4]; static const Byte Sbox[256] = { @@ -25,23 +32,10 @@ static const Byte Sbox[256] = { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); - -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); - -AES_CODE_FUNC g_AesCbc_Encode; -AES_CODE_FUNC g_AesCbc_Decode; -AES_CODE_FUNC g_AesCtr_Code; static UInt32 D[256 * 4]; static Byte InvS[256]; -static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; - #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) #define Ui32(a0, a1, a2, a3) ((UInt32)(a0) | ((UInt32)(a1) << 8) | ((UInt32)(a2) << 16) | ((UInt32)(a3) << 24)) @@ -57,6 +51,36 @@ static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0 #define DD(x) (D + (x << 8)) +// #define _SHOW_AES_STATUS + +#ifdef MY_CPU_X86_OR_AMD64 + #define USE_HW_AES +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_AES + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 6) // fix that check + #define USE_HW_AES + #endif + #elif defined(_MSC_VER) + #if _MSC_VER >= 1910 + #define USE_HW_AES + #endif + #endif +#endif + +#ifdef USE_HW_AES +#ifdef _SHOW_AES_STATUS +#include +#define _PRF(x) x +#else +#define _PRF(x) +#endif +#endif + + void AesGenTables(void) { unsigned i; @@ -90,18 +114,48 @@ void AesGenTables(void) } } - g_AesCbc_Encode = AesCbc_Encode; - g_AesCbc_Decode = AesCbc_Decode; - g_AesCtr_Code = AesCtr_Code; - - #ifdef MY_CPU_X86_OR_AMD64 - if (CPU_Is_Aes_Supported()) { - g_AesCbc_Encode = AesCbc_Encode_Intel; - g_AesCbc_Decode = AesCbc_Decode_Intel; - g_AesCtr_Code = AesCtr_Code_Intel; + AES_CODE_FUNC d = AesCbc_Decode; + #ifndef _SFX + AES_CODE_FUNC e = AesCbc_Encode; + AES_CODE_FUNC c = AesCtr_Code; + UInt32 flags = 0; + #endif + + #ifdef USE_HW_AES + if (CPU_IsSupported_AES()) + { + // #pragma message ("AES HW") + _PRF(printf("\n===AES HW\n")); + d = AesCbc_Decode_HW; + + #ifndef _SFX + e = AesCbc_Encode_HW; + c = AesCtr_Code_HW; + flags = k_Aes_SupportedFunctions_HW; + #endif + + #ifdef MY_CPU_X86_OR_AMD64 + if (CPU_IsSupported_VAES_AVX2()) + { + _PRF(printf("\n===vaes avx2\n")); + d = AesCbc_Decode_HW_256; + #ifndef _SFX + c = AesCtr_Code_HW_256; + flags |= k_Aes_SupportedFunctions_HW_256; + #endif + } + #endif } #endif + + g_AesCbc_Decode = d; + #ifndef _SFX + g_AesCbc_Encode = e; + g_AesCtr_Code = c; + g_Aes_SupportedFunctions_Flags = flags; + #endif + } } @@ -142,8 +196,11 @@ void AesGenTables(void) void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) { - unsigned i, wSize; - wSize = keySize + 28; + unsigned i, m; + const UInt32 *wLim; + UInt32 t; + UInt32 rcon = 1; + keySize /= 4; w[0] = ((UInt32)keySize / 2) + 3; w += 4; @@ -151,16 +208,26 @@ void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) for (i = 0; i < keySize; i++, key += 4) w[i] = GetUi32(key); - for (; i < wSize; i++) + t = w[(size_t)keySize - 1]; + wLim = w + (size_t)keySize * 3 + 28; + m = 0; + do { - UInt32 t = w[(size_t)i - 1]; - unsigned rem = i % keySize; - if (rem == 0) - t = Ui32(Sbox[gb1(t)] ^ Rcon[i / keySize], Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); - else if (keySize > 6 && rem == 4) + if (m == 0) + { + t = Ui32(Sbox[gb1(t)] ^ rcon, Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); + rcon <<= 1; + if (rcon & 0x100) + rcon = 0x1b; + m = keySize; + } + else if (m == 4 && keySize > 6) t = Ui32(Sbox[gb0(t)], Sbox[gb1(t)], Sbox[gb2(t)], Sbox[gb3(t)]); - w[i] = w[i - keySize] ^ t; + m--; + t ^= w[0]; + w[keySize] = t; } + while (++w != wLim); } void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) @@ -184,6 +251,7 @@ void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) src and dest are pointers to 4 UInt32 words. src and dest can point to same block */ +// MY_FORCE_INLINE static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -207,6 +275,7 @@ static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) FT4(0); FT4(1); FT4(2); FT4(3); } +MY_FORCE_INLINE static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -294,12 +363,12 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) UInt32 t = temp[i]; #ifdef MY_CPU_LE_UNALIGN - *((UInt32 *)data) ^= t; + *((UInt32 *)(void *)data) ^= t; #else - data[0] ^= (t & 0xFF); - data[1] ^= ((t >> 8) & 0xFF); - data[2] ^= ((t >> 16) & 0xFF); - data[3] ^= ((t >> 24)); + data[0] = (Byte)(data[0] ^ (t & 0xFF)); + data[1] = (Byte)(data[1] ^ ((t >> 8) & 0xFF)); + data[2] = (Byte)(data[2] ^ ((t >> 16) & 0xFF)); + data[3] = (Byte)(data[3] ^ ((t >> 24))); #endif } } diff --git a/3rdparty/7z/src/Aes.h b/3rdparty/7z/src/Aes.h index 381e979d1b..602e25ea24 100644 --- a/3rdparty/7z/src/Aes.h +++ b/3rdparty/7z/src/Aes.h @@ -1,5 +1,5 @@ /* Aes.h -- AES encryption / decryption -2013-01-18 : Igor Pavlov : Public domain */ +2018-04-28 : Igor Pavlov : Public domain */ #ifndef __AES_H #define __AES_H @@ -26,12 +26,34 @@ void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize) /* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */ void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */ + /* data - 16-byte aligned pointer to data */ /* numBlocks - the number of 16-byte blocks in data array */ typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); -extern AES_CODE_FUNC g_AesCbc_Encode; + extern AES_CODE_FUNC g_AesCbc_Decode; +#ifndef _SFX +extern AES_CODE_FUNC g_AesCbc_Encode; extern AES_CODE_FUNC g_AesCtr_Code; +#define k_Aes_SupportedFunctions_HW (1 << 2) +#define k_Aes_SupportedFunctions_HW_256 (1 << 3) +extern UInt32 g_Aes_SupportedFunctions_Flags; +#endif + + +#define DECLARE__AES_CODE_FUNC(funcName) \ + void MY_FAST_CALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks); + +DECLARE__AES_CODE_FUNC (AesCbc_Encode) +DECLARE__AES_CODE_FUNC (AesCbc_Decode) +DECLARE__AES_CODE_FUNC (AesCtr_Code) + +DECLARE__AES_CODE_FUNC (AesCbc_Encode_HW) +DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW) +DECLARE__AES_CODE_FUNC (AesCtr_Code_HW) + +DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW_256) +DECLARE__AES_CODE_FUNC (AesCtr_Code_HW_256) EXTERN_C_END diff --git a/3rdparty/7z/src/AesOpt.c b/3rdparty/7z/src/AesOpt.c index 0e7f49a1b0..1bdc9a882f 100644 --- a/3rdparty/7z/src/AesOpt.c +++ b/3rdparty/7z/src/AesOpt.c @@ -1,184 +1,776 @@ -/* AesOpt.c -- Intel's AES -2017-06-08 : Igor Pavlov : Public domain */ +/* AesOpt.c -- AES optimized code for x86 AES hardware instructions +2021-04-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 -#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) -#define USE_INTEL_AES + + #if defined(__clang__) + #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8) + #define USE_INTEL_AES + #define ATTRIB_AES __attribute__((__target__("aes"))) + #if (__clang_major__ >= 8) + #define USE_INTEL_VAES + #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2"))) + #endif + #endif + #elif defined(__GNUC__) + #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) + #define USE_INTEL_AES + #ifndef __AES__ + #define ATTRIB_AES __attribute__((__target__("aes"))) + #endif + #if (__GNUC__ >= 8) + #define USE_INTEL_VAES + #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2"))) + #endif + #endif + #elif defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) + #define USE_INTEL_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_INTEL_VAES + #endif + #endif + #elif defined(_MSC_VER) + #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) + #define USE_INTEL_AES + #if (_MSC_VER >= 1910) + #define USE_INTEL_VAES + #endif + #endif + #endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES #endif +#ifndef ATTRIB_VAES + #define ATTRIB_VAES #endif + #ifdef USE_INTEL_AES #include -void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#ifndef USE_INTEL_VAES +#define AES_TYPE_keys __m128i +#define AES_TYPE_data __m128i +#endif + +#define AES_FUNC_START(name) \ + void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks) + +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src); + +#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src); +#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src); + + +AES_FUNC_START2 (AesCbc_Encode_HW) { __m128i m = *p; + const __m128i k0 = p[2]; + const __m128i k1 = p[3]; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; for (; numBlocks != 0; numBlocks--, data++) { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - const __m128i *w = p + 3; - m = _mm_xor_si128(m, *data); - m = _mm_xor_si128(m, p[2]); + UInt32 r = numRounds2; + const __m128i *w = p + 4; + __m128i temp = *data; + MM_XOR (temp, k0); + MM_XOR (m, temp); + MM_OP_m (_mm_aesenc_si128, k1); do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]); + MM_OP_m (_mm_aesenc_si128, w[1]); w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); + while (--r); + MM_OP_m (_mm_aesenclast_si128, w[0]); *data = m; } *p = m; } -#define NUM_WAYS 3 -#define AES_OP_W(op, n) { \ - const __m128i t = w[n]; \ - m0 = op(m0, t); \ - m1 = op(m1, t); \ - m2 = op(m2, t); \ - } +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1); +#define WOP_3(op) WOP_2 (op) op (m2, 2); +#define WOP_4(op) WOP_3 (op) op (m3, 3); +#ifdef MY_CPU_AMD64 +#define WOP_5(op) WOP_4 (op) op (m4, 4); +#define WOP_6(op) WOP_5 (op) op (m5, 5); +#define WOP_7(op) WOP_6 (op) op (m6, 6); +#define WOP_8(op) WOP_7 (op) op (m7, 7); +#endif +/* +#define WOP_9(op) WOP_8 (op) op (m8, 8); +#define WOP_10(op) WOP_9 (op) op (m9, 9); +#define WOP_11(op) WOP_10(op) op (m10, 10); +#define WOP_12(op) WOP_11(op) op (m11, 11); +#define WOP_13(op) WOP_12(op) op (m12, 12); +#define WOP_14(op) WOP_13(op) op (m13, 13); +*/ -#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n) -#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n) -#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n) -#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n) +#ifdef MY_CPU_AMD64 + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 +#else + #define NUM_WAYS 4 + #define WOP_M1 WOP_4 +#endif -void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#define WOP(op) op (m0, 0); WOP_M1(op) + + +#define DECLARE_VAR(reg, ii) __m128i reg +#define LOAD_data( reg, ii) reg = data[ii]; +#define STORE_data( reg, ii) data[ii] = reg; +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); +#endif + +#define AVX__DECLARE_VAR(reg, ii) __m256i reg +#define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; +#define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; +#define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])); + +#define MM_OP_key(op, reg) MM_OP(op, reg, key); + +#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) +#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg) +#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg) +#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) +#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) + + +#define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) +#define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) +#define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) +#define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) +#define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) + +#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr; +#define CTR_END( reg, ii) MM_XOR (data[ii], reg); + +#define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key); +#define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg); + +#define WOP_KEY(op, n) { \ + const __m128i key = w[n]; \ + WOP(op); } + +#define AVX__WOP_KEY(op, n) { \ + const __m256i key = w[n]; \ + WOP(op); } + + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + +#define NUM_AES_KEYS_MAX 15 + +#define WIDE_LOOP_START_AVX(OP) \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS * 2) \ + { __m256i keys[NUM_AES_KEYS_MAX]; \ + UInt32 ii; \ + OP \ + for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ + dataEnd -= NUM_WAYS * 2; do { \ + + +#define WIDE_LOOP_END_AVX(OP) \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ + } \ + +/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, + MSVC still can insert vzeroupper instruction. */ + + +AES_FUNC_START2 (AesCbc_Decode_HW) { __m128i iv = *p; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i *dataEnd; + p += 2; + + WIDE_LOOP_START { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m0, m1, m2; - { - const __m128i t = w[2]; - m0 = _mm_xor_si128(t, data[0]); - m1 = _mm_xor_si128(t, data[1]); - m2 = _mm_xor_si128(t, data[2]); - } - numRounds2--; + const __m128i *w = wStart; + + WOP (DECLARE_VAR) + WOP (LOAD_data); + WOP_KEY (AES_XOR, 1) + do { - AES_DEC(1) - AES_DEC(0) - w -= 2; + WOP_KEY (AES_DEC, 0) + w--; } - while (--numRounds2 != 0); - AES_DEC(1) - AES_DEC_LAST(0) + while (w != p); + WOP_KEY (AES_DEC_LAST, 0) - { - __m128i t; - t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t; - t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t; - t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t; - } + MM_XOR (m0, iv); + WOP_M1 (XOR_data_M1) + iv = data[NUM_WAYS - 1]; + WOP (STORE_data); } - for (; numBlocks != 0; numBlocks--, data++) + WIDE_LOOP_END + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m = _mm_xor_si128(w[2], *data); - numRounds2--; + const __m128i *w = wStart - 1; + __m128i m = _mm_xor_si128 (w[2], *data); do { - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdec_si128(m, w[0]); + MM_OP_m (_mm_aesdec_si128, w[1]); + MM_OP_m (_mm_aesdec_si128, w[0]); w -= 2; } - while (--numRounds2 != 0); - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdeclast_si128(m, w[0]); + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]); + MM_OP_m (_mm_aesdeclast_si128, w[0]); - m = _mm_xor_si128(m, iv); + MM_XOR (m, iv); iv = *data; *data = m; } - *p = iv; + + p[-2] = iv; } -void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks) + +AES_FUNC_START2 (AesCtr_Code_HW) { __m128i ctr = *p; - __m128i one; - one.m128i_u64[0] = 1; - one.m128i_u64[1] = 0; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const __m128i *dataEnd; + __m128i one = _mm_cvtsi32_si128(1); + + p += 2; + + WIDE_LOOP_START { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; const __m128i *w = p; - __m128i m0, m1, m2; - { - const __m128i t = w[2]; - ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t); - } - w += 3; + UInt32 r = numRoundsMinus2; + WOP (DECLARE_VAR) + WOP (CTR_START); + WOP_KEY (AES_XOR, 0) + w += 1; do { - AES_ENC(0) - AES_ENC(1) - w += 2; + WOP_KEY (AES_ENC, 0) + w += 1; } - while (--numRounds2 != 0); - AES_ENC(0) - AES_ENC_LAST(1) - data[0] = _mm_xor_si128(data[0], m0); - data[1] = _mm_xor_si128(data[1], m1); - data[2] = _mm_xor_si128(data[2], m2); + while (--r); + WOP_KEY (AES_ENC_LAST, 0) + + WOP (CTR_END); } - for (; numBlocks != 0; numBlocks--, data++) + WIDE_LOOP_END + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; const __m128i *w = p; __m128i m; - ctr = _mm_add_epi64(ctr, one); - m = _mm_xor_si128(ctr, p[2]); - w += 3; + MM_OP (_mm_add_epi64, ctr, one); + m = _mm_xor_si128 (ctr, p[0]); + w += 1; do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]); + MM_OP_m (_mm_aesenc_si128, w[1]); w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); - *data = _mm_xor_si128(*data, m); + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]); + MM_OP_m (_mm_aesenclast_si128, w[1]); + MM_XOR (*data, m); } - *p = ctr; + + p[-2] = ctr; } -#else -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks) -{ - AesCbc_Encode(p, data, numBlocks); -} - -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks) -{ - AesCbc_Decode(p, data, numBlocks); -} - -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks) -{ - AesCtr_Code(p, data, numBlocks); -} +#ifdef USE_INTEL_VAES +#if defined(__clang__) && defined(_MSC_VER) +#define __SSE4_2__ +#define __AES__ +#define __AVX__ +#define __AVX2__ +#define __VAES__ +#define __AVX512F__ +#define __AVX512VL__ #endif + +#include + +#define VAES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_VAES \ +AES_FUNC_START (name) + +VAES_FUNC_START2 (AesCbc_Decode_HW_256) +{ + __m128i iv = *p; + const __m128i *dataEnd; + UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + p += 2; + + WIDE_LOOP_START_AVX(;) + { + const __m256i *w = keys + numRounds - 2; + + WOP (AVX__DECLARE_VAR) + WOP (AVX__LOAD_data); + AVX__WOP_KEY (AVX__AES_XOR, 1) + + do + { + AVX__WOP_KEY (AVX__AES_DEC, 0) + w--; + } + while (w != keys); + AVX__WOP_KEY (AVX__AES_DEC_LAST, 0) + + AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])); + WOP_M1 (AVX__XOR_data_M1) + iv = data[NUM_WAYS * 2 - 1]; + WOP (AVX__STORE_data); + } + WIDE_LOOP_END_AVX(;) + + SINGLE_LOOP + { + const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; + __m128i m = _mm_xor_si128 (w[2], *data); + do + { + MM_OP_m (_mm_aesdec_si128, w[1]); + MM_OP_m (_mm_aesdec_si128, w[0]); + w -= 2; + } + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]); + MM_OP_m (_mm_aesdeclast_si128, w[0]); + + MM_XOR (m, iv); + iv = *data; + *data = m; + } + + p[-2] = iv; +} + + +/* +SSE2: _mm_cvtsi32_si128 : movd +AVX: _mm256_setr_m128i : vinsertf128 +AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm + _mm256_extracti128_si256 : vextracti128 + _mm256_broadcastsi128_si256 : vbroadcasti128 +*/ + +#define AVX__CTR_LOOP_START \ + ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ + two = _mm256_setr_m128i(one, one); \ + two = _mm256_add_epi64(two, two); \ + +// two = _mm256_setr_epi64x(2, 0, 2, 0); + +#define AVX__CTR_LOOP_ENC \ + ctr = _mm256_extracti128_si256 (ctr2, 1); \ + +VAES_FUNC_START2 (AesCtr_Code_HW_256) +{ + __m128i ctr = *p; + UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const __m128i *dataEnd; + __m128i one = _mm_cvtsi32_si128(1); + __m256i ctr2, two; + p += 2; + + WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START) + { + const __m256i *w = keys; + UInt32 r = numRounds - 2; + WOP (AVX__DECLARE_VAR) + AVX__WOP_KEY (AVX__CTR_START, 0); + + w += 1; + do + { + AVX__WOP_KEY (AVX__AES_ENC, 0) + w += 1; + } + while (--r); + AVX__WOP_KEY (AVX__AES_ENC_LAST, 0) + + WOP (AVX__CTR_END); + } + WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC) + + SINGLE_LOOP + { + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; + const __m128i *w = p; + __m128i m; + MM_OP (_mm_add_epi64, ctr, one); + m = _mm_xor_si128 (ctr, p[0]); + w += 1; + do + { + MM_OP_m (_mm_aesenc_si128, w[0]); + MM_OP_m (_mm_aesenc_si128, w[1]); + w += 2; + } + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]); + MM_OP_m (_mm_aesenclast_si128, w[1]); + MM_XOR (*data, m); + } + + p[-2] = ctr; +} + +#endif // USE_INTEL_VAES + +#else // USE_INTEL_AES + +/* no USE_INTEL_AES */ + +#pragma message("AES HW_SW stub was used") + +#define AES_TYPE_keys UInt32 +#define AES_TYPE_data Byte + +#define AES_FUNC_START(name) \ + void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \ + +#define AES_COMPAT_STUB(name) \ + AES_FUNC_START(name); \ + AES_FUNC_START(name ## _HW) \ + { name(p, data, numBlocks); } + +AES_COMPAT_STUB (AesCbc_Encode) +AES_COMPAT_STUB (AesCbc_Decode) +AES_COMPAT_STUB (AesCtr_Code) + +#endif // USE_INTEL_AES + + +#ifndef USE_INTEL_VAES + +#pragma message("VAES HW_SW stub was used") + +#define VAES_COMPAT_STUB(name) \ + void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ + void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ + { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } + +VAES_COMPAT_STUB (AesCbc_Decode_HW) +VAES_COMPAT_STUB (AesCtr_Code_HW) + +#endif // ! USE_INTEL_VAES + + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_AES + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 6) // fix that check + #define USE_HW_AES + #endif + #elif defined(_MSC_VER) + #if _MSC_VER >= 1910 + #define USE_HW_AES + #endif + #endif + +#ifdef USE_HW_AES + +// #pragma message("=== AES HW === ") + +#if defined(__clang__) || defined(__GNUC__) + #ifdef MY_CPU_ARM64 + #define ATTRIB_AES __attribute__((__target__("+crypto"))) + #else + #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + +#if defined(_MSC_VER) && defined(MY_CPU_ARM64) +#include +#else +#include +#endif + +typedef uint8x16_t v128; + +#define AES_FUNC_START(name) \ + void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks) + +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src); +#define MM_OP1_m(op) m = op(m); + +#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src); +#define MM_XOR_m( src) MM_XOR(m, src); + +#define AES_E_m(k) MM_OP_m (vaeseq_u8, k); +#define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8); + + +AES_FUNC_START2 (AesCbc_Encode_HW) +{ + v128 m = *p; + const v128 k0 = p[2]; + const v128 k1 = p[3]; + const v128 k2 = p[4]; + const v128 k3 = p[5]; + const v128 k4 = p[6]; + const v128 k5 = p[7]; + const v128 k6 = p[8]; + const v128 k7 = p[9]; + const v128 k8 = p[10]; + const v128 k9 = p[11]; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + ((size_t)numRounds2 * 2); + const v128 k_z1 = w[1]; + const v128 k_z0 = w[2]; + for (; numBlocks != 0; numBlocks--, data++) + { + MM_XOR_m (*data); + AES_E_MC_m (k0) + AES_E_MC_m (k1) + AES_E_MC_m (k2) + AES_E_MC_m (k3) + AES_E_MC_m (k4) + AES_E_MC_m (k5) + AES_E_MC_m (k6) + AES_E_MC_m (k7) + AES_E_MC_m (k8) + if (numRounds2 >= 6) + { + AES_E_MC_m (k9) + AES_E_MC_m (p[12]) + if (numRounds2 != 6) + { + AES_E_MC_m (p[13]) + AES_E_MC_m (p[14]) + } + } + AES_E_m (k_z1); + MM_XOR_m (k_z0); + *data = m; + } + *p = m; +} + + +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1); +#define WOP_3(op) WOP_2 (op) op (m2, 2); +#define WOP_4(op) WOP_3 (op) op (m3, 3); +#define WOP_5(op) WOP_4 (op) op (m4, 4); +#define WOP_6(op) WOP_5 (op) op (m5, 5); +#define WOP_7(op) WOP_6 (op) op (m6, 6); +#define WOP_8(op) WOP_7 (op) op (m7, 7); + + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 + +#define WOP(op) op (m0, 0); WOP_M1(op) + +#define DECLARE_VAR(reg, ii) v128 reg +#define LOAD_data( reg, ii) reg = data[ii]; +#define STORE_data( reg, ii) data[ii] = reg; +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); +#endif + +#define MM_OP_key(op, reg) MM_OP (op, reg, key); + +#define AES_D_m(k) MM_OP_m (vaesdq_u8, k); +#define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8); + +#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) +#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) +#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) + +#define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg) +#define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg) + +#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr); +#define CTR_END( reg, ii) MM_XOR (data[ii], reg); + +#define WOP_KEY(op, n) { \ + const v128 key = w[n]; \ + WOP(op); } + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + +AES_FUNC_START2 (AesCbc_Decode_HW) +{ + v128 iv = *p; + const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 *dataEnd; + p += 2; + + WIDE_LOOP_START + { + const v128 *w = wStart; + WOP (DECLARE_VAR) + WOP (LOAD_data); + WOP_KEY (AES_D_IMC, 2) + do + { + WOP_KEY (AES_D_IMC, 1) + WOP_KEY (AES_D_IMC, 0) + w -= 2; + } + while (w != p); + WOP_KEY (AES_D, 1) + WOP_KEY (AES_XOR, 0) + MM_XOR (m0, iv); + WOP_M1 (XOR_data_M1) + iv = data[NUM_WAYS - 1]; + WOP (STORE_data); + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = wStart; + v128 m = *data; + AES_D_IMC_m (w[2]) + do + { + AES_D_IMC_m (w[1]); + AES_D_IMC_m (w[0]); + w -= 2; + } + while (w != p); + AES_D_m (w[1]); + MM_XOR_m (w[0]); + MM_XOR_m (iv); + iv = *data; + *data = m; + } + + p[-2] = iv; +} + + +AES_FUNC_START2 (AesCtr_Code_HW) +{ + uint64x2_t ctr = vreinterpretq_u64_u8(*p); + const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 *dataEnd; + uint64x2_t one = vdupq_n_u64(0); + one = vsetq_lane_u64(1, one, 0); + p += 2; + + WIDE_LOOP_START + { + const v128 *w = p; + WOP (DECLARE_VAR) + WOP (CTR_START); + do + { + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E_MC, 1) + w += 2; + } + while (w != wEnd); + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E, 1) + WOP_KEY (AES_XOR, 2) + WOP (CTR_END); + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = p; + v128 m; + CTR_START (m, 0); + do + { + AES_E_MC_m (w[0]); + AES_E_MC_m (w[1]); + w += 2; + } + while (w != wEnd); + AES_E_MC_m (w[0]); + AES_E_m (w[1]); + MM_XOR_m (w[2]); + CTR_END (m, 0); + } + + p[-2] = vreinterpretq_u8_u64(ctr); +} + +#endif // USE_HW_AES + +#endif // MY_CPU_ARM_OR_ARM64 diff --git a/3rdparty/7z/src/Alloc.c b/3rdparty/7z/src/Alloc.c index 30b499e5ff..142a1ea221 100644 --- a/3rdparty/7z/src/Alloc.c +++ b/3rdparty/7z/src/Alloc.c @@ -1,12 +1,12 @@ /* Alloc.c -- Memory allocation functions -2018-04-27 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #include "Precomp.h" #include #ifdef _WIN32 -#include +#include #endif #include @@ -122,7 +122,6 @@ static void PrintAddr(void *p) #define Print(s) #define PrintLn() #define PrintHex(v, align) -#define PrintDec(v, align) #define PrintAddr(p) #endif @@ -133,10 +132,11 @@ void *MyAlloc(size_t size) { if (size == 0) return NULL; + PRINT_ALLOC("Alloc ", g_allocCount, size, NULL); #ifdef _SZ_ALLOC_DEBUG { void *p = malloc(size); - PRINT_ALLOC("Alloc ", g_allocCount, size, p); + // PRINT_ALLOC("Alloc ", g_allocCount, size, p); return p; } #else @@ -172,14 +172,20 @@ void MidFree(void *address) VirtualFree(address, 0, MEM_RELEASE); } -#ifndef MEM_LARGE_PAGES -#undef _7ZIP_LARGE_PAGES +#ifdef _7ZIP_LARGE_PAGES + +#ifdef MEM_LARGE_PAGES + #define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES +#else + #define MY__MEM_LARGE_PAGES 0x20000000 #endif -#ifdef _7ZIP_LARGE_PAGES +extern +SIZE_T g_LargePageSize; SIZE_T g_LargePageSize = 0; -typedef SIZE_T (WINAPI *GetLargePageMinimumP)(); -#endif +typedef SIZE_T (WINAPI *GetLargePageMinimumP)(VOID); + +#endif // _7ZIP_LARGE_PAGES void SetLargePageSize() { @@ -214,7 +220,7 @@ void *BigAlloc(size_t size) size2 = (size + ps) & ~ps; if (size2 >= size) { - void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); + void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); if (res) return res; } @@ -241,14 +247,14 @@ static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); } const ISzAlloc g_Alloc = { SzAlloc, SzFree }; +#ifdef _WIN32 static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); } static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); } -const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; - static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); } static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); BigFree(address); } +const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; - +#endif /* uintptr_t : C99 (optional) @@ -280,13 +286,15 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; */ #define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) -#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) - -#if (_POSIX_C_SOURCE >= 200112L) && !defined(_WIN32) +#if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) #define USE_posix_memalign #endif +#ifndef USE_posix_memalign +#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) +#endif + /* This posix_memalign() is for test purposes only. We also need special Free() function instead of free(), diff --git a/3rdparty/7z/src/Alloc.h b/3rdparty/7z/src/Alloc.h index 3d796e5eee..59de107601 100644 --- a/3rdparty/7z/src/Alloc.h +++ b/3rdparty/7z/src/Alloc.h @@ -1,5 +1,5 @@ /* Alloc.h -- Memory allocation functions -2018-02-19 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #ifndef __COMMON_ALLOC_H #define __COMMON_ALLOC_H @@ -13,7 +13,7 @@ void MyFree(void *address); #ifdef _WIN32 -void SetLargePageSize(); +void SetLargePageSize(void); void *MidAlloc(size_t size); void MidFree(void *address); @@ -30,8 +30,15 @@ void BigFree(void *address); #endif extern const ISzAlloc g_Alloc; + +#ifdef _WIN32 extern const ISzAlloc g_BigAlloc; extern const ISzAlloc g_MidAlloc; +#else +#define g_BigAlloc g_AlignedAlloc +#define g_MidAlloc g_AlignedAlloc +#endif + extern const ISzAlloc g_AlignedAlloc; diff --git a/3rdparty/7z/src/Bcj2.c b/3rdparty/7z/src/Bcj2.c index da93985cf6..c1772f2344 100644 --- a/3rdparty/7z/src/Bcj2.c +++ b/3rdparty/7z/src/Bcj2.c @@ -1,5 +1,5 @@ /* Bcj2.c -- BCJ2 Decoder (Converter for x86 code) -2018-04-28 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -123,7 +123,7 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) const Byte *src = p->bufs[BCJ2_STREAM_MAIN]; const Byte *srcLim; Byte *dest; - SizeT num = p->lims[BCJ2_STREAM_MAIN] - src; + SizeT num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src); if (num == 0) { @@ -134,7 +134,7 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) dest = p->dest; if (num > (SizeT)(p->destLim - dest)) { - num = p->destLim - dest; + num = (SizeT)(p->destLim - dest); if (num == 0) { p->state = BCJ2_DEC_STATE_ORIG; @@ -168,7 +168,7 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) break; } - num = src - p->bufs[BCJ2_STREAM_MAIN]; + num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]); if (src == srcLim) { @@ -228,7 +228,7 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) p->ip += 4; val -= p->ip; dest = p->dest; - rem = p->destLim - dest; + rem = (SizeT)(p->destLim - dest); if (rem < 4) { diff --git a/3rdparty/7z/src/Bcj2Enc.c b/3rdparty/7z/src/Bcj2Enc.c index 7a02ecde25..71ac5091d5 100644 --- a/3rdparty/7z/src/Bcj2Enc.c +++ b/3rdparty/7z/src/Bcj2Enc.c @@ -1,5 +1,5 @@ /* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code) -2019-02-02 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -104,7 +104,7 @@ static void Bcj2Enc_Encode_2(CBcj2Enc *p) const Byte *src = p->src; const Byte *srcLim; Byte *dest; - SizeT num = p->srcLim - src; + SizeT num = (SizeT)(p->srcLim - src); if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) { @@ -118,7 +118,7 @@ static void Bcj2Enc_Encode_2(CBcj2Enc *p) dest = p->bufs[BCJ2_STREAM_MAIN]; if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest)) { - num = p->lims[BCJ2_STREAM_MAIN] - dest; + num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); if (num == 0) { p->state = BCJ2_STREAM_MAIN; @@ -152,7 +152,7 @@ static void Bcj2Enc_Encode_2(CBcj2Enc *p) break; } - num = src - p->src; + num = (SizeT)(src - p->src); if (src == srcLim) { diff --git a/3rdparty/7z/src/Bra.c b/3rdparty/7z/src/Bra.c index cbdcb290df..cdefa4d2e9 100644 --- a/3rdparty/7z/src/Bra.c +++ b/3rdparty/7z/src/Bra.c @@ -1,5 +1,5 @@ /* Bra.c -- Converters for RISC code -2017-04-04 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -22,7 +22,7 @@ SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) for (;;) { if (p >= lim) - return p - data; + return (SizeT)(p - data); p += 4; if (p[-1] == 0xEB) break; @@ -43,7 +43,7 @@ SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) for (;;) { if (p >= lim) - return p - data; + return (SizeT)(p - data); p += 4; if (p[-1] == 0xEB) break; @@ -78,7 +78,7 @@ SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) { UInt32 b3; if (p > lim) - return p - data; + return (SizeT)(p - data); b1 = p[1]; b3 = p[3]; p += 2; @@ -113,7 +113,7 @@ SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) { UInt32 b3; if (p > lim) - return p - data; + return (SizeT)(p - data); b1 = p[1]; b3 = p[3]; p += 2; @@ -162,7 +162,7 @@ SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) for (;;) { if (p >= lim) - return p - data; + return (SizeT)(p - data); p += 4; /* if ((v & 0xFC000003) == 0x48000001) */ if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) @@ -196,7 +196,7 @@ SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) for (;;) { if (p >= lim) - return p - data; + return (SizeT)(p - data); /* v = GetBe32(p); p += 4; diff --git a/3rdparty/7z/src/Bra86.c b/3rdparty/7z/src/Bra86.c index a6463c63ba..d857dac677 100644 --- a/3rdparty/7z/src/Bra86.c +++ b/3rdparty/7z/src/Bra86.c @@ -1,5 +1,5 @@ /* Bra86.c -- Converter for x86 code (BCJ) -2017-04-03 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -25,7 +25,7 @@ SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding break; { - SizeT d = (SizeT)(p - data - pos); + SizeT d = (SizeT)(p - data) - pos; pos = (SizeT)(p - data); if (p >= limit) { diff --git a/3rdparty/7z/src/Compiler.h b/3rdparty/7z/src/Compiler.h index c788648cd2..eba374298d 100644 --- a/3rdparty/7z/src/Compiler.h +++ b/3rdparty/7z/src/Compiler.h @@ -1,9 +1,13 @@ /* Compiler.h -2017-04-03 : Igor Pavlov : Public domain */ +2021-01-05 : Igor Pavlov : Public domain */ #ifndef __7Z_COMPILER_H #define __7Z_COMPILER_H + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wunused-private-field" + #endif + #ifdef _MSC_VER #ifdef UNDER_CE @@ -25,6 +29,12 @@ #pragma warning(disable : 4786) // identifier was truncated to '255' characters in the debug information #endif + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wdeprecated-declarations" + #pragma clang diagnostic ignored "-Wmicrosoft-exception-spec" + // #pragma clang diagnostic ignored "-Wreserved-id-macro" + #endif + #endif #define UNUSED_VAR(x) (void)x; diff --git a/3rdparty/7z/src/CpuArch.c b/3rdparty/7z/src/CpuArch.c index ff1890e7fe..a0e93e8b08 100644 --- a/3rdparty/7z/src/CpuArch.c +++ b/3rdparty/7z/src/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2018-02-18: Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -55,6 +55,47 @@ static UInt32 CheckFlag(UInt32 flag) #define CHECK_CPUID_IS_SUPPORTED #endif +#ifndef USE_ASM + #ifdef _MSC_VER + #if _MSC_VER >= 1600 + #define MY__cpuidex __cpuidex + #else + +/* + __cpuid (function == 4) requires subfunction number in ECX. + MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction. + __cpuid() in new MSVC clears ECX. + __cpuid() in old MSVC (14.00) doesn't clear ECX + We still can use __cpuid for low (function) values that don't require ECX, + but __cpuid() in old MSVC will be incorrect for some function values: (function == 4). + So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, + where ECX value is first parameter for FAST_CALL / NO_INLINE function, + So the caller of MY__cpuidex_HACK() sets ECX as subFunction, and + old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. + + DON'T remove MY_NO_INLINE and MY_FAST_CALL for MY__cpuidex_HACK() !!! +*/ + +static +MY_NO_INLINE +void MY_FAST_CALL MY__cpuidex_HACK(UInt32 subFunction, int *CPUInfo, UInt32 function) +{ + UNUSED_VAR(subFunction); + __cpuid(CPUInfo, function); +} + + #define MY__cpuidex(info, func, func2) MY__cpuidex_HACK(func2, info, func) + #pragma message("======== MY__cpuidex_HACK WAS USED ========") + #endif + #else + #define MY__cpuidex(info, func, func2) __cpuid(info, func) + #pragma message("======== (INCORRECT ?) cpuid WAS USED ========") + #endif +#endif + + + + void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d) { #ifdef USE_ASM @@ -99,18 +140,20 @@ void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d) #endif "=c" (*c) , "=d" (*d) - : "0" (function)) ; + : "0" (function), "c"(0) ) ; #endif #else int CPUInfo[4]; - __cpuid(CPUInfo, function); - *a = CPUInfo[0]; - *b = CPUInfo[1]; - *c = CPUInfo[2]; - *d = CPUInfo[3]; + + MY__cpuidex(CPUInfo, (int)function, 0); + + *a = (UInt32)CPUInfo[0]; + *b = (UInt32)CPUInfo[1]; + *c = (UInt32)CPUInfo[2]; + *d = (UInt32)CPUInfo[3]; #endif } @@ -174,7 +217,7 @@ BoolInt CPU_Is_InOrder() } #if !defined(MY_CPU_AMD64) && defined(_WIN32) -#include +#include static BoolInt CPU_Sys_Is_SSE_Supported() { OSVERSIONINFO vi; @@ -188,13 +231,101 @@ static BoolInt CPU_Sys_Is_SSE_Supported() #define CHECK_SYS_SSE_SUPPORT #endif -BoolInt CPU_Is_Aes_Supported() + +static UInt32 X86_CPUID_ECX_Get_Flags() +{ + Cx86cpuid p; + CHECK_SYS_SSE_SUPPORT + if (!x86cpuid_CheckAndRead(&p)) + return 0; + return p.c; +} + +BoolInt CPU_IsSupported_AES() +{ + return (X86_CPUID_ECX_Get_Flags() >> 25) & 1; +} + +BoolInt CPU_IsSupported_SSSE3() +{ + return (X86_CPUID_ECX_Get_Flags() >> 9) & 1; +} + +BoolInt CPU_IsSupported_SSE41() +{ + return (X86_CPUID_ECX_Get_Flags() >> 19) & 1; +} + +BoolInt CPU_IsSupported_SHA() { Cx86cpuid p; CHECK_SYS_SSE_SUPPORT if (!x86cpuid_CheckAndRead(&p)) return False; - return (p.c >> 25) & 1; + + if (p.maxFunc < 7) + return False; + { + UInt32 d[4] = { 0 }; + MyCPUID(7, &d[0], &d[1], &d[2], &d[3]); + return (d[1] >> 29) & 1; + } +} + +// #include + +#ifdef _WIN32 +#include +#endif + +BoolInt CPU_IsSupported_AVX2() +{ + Cx86cpuid p; + CHECK_SYS_SSE_SUPPORT + + #ifdef _WIN32 + #define MY__PF_XSAVE_ENABLED 17 + if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED)) + return False; + #endif + + if (!x86cpuid_CheckAndRead(&p)) + return False; + if (p.maxFunc < 7) + return False; + { + UInt32 d[4] = { 0 }; + MyCPUID(7, &d[0], &d[1], &d[2], &d[3]); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (d[1] >> 5); // avx2 + } +} + +BoolInt CPU_IsSupported_VAES_AVX2() +{ + Cx86cpuid p; + CHECK_SYS_SSE_SUPPORT + + #ifdef _WIN32 + #define MY__PF_XSAVE_ENABLED 17 + if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED)) + return False; + #endif + + if (!x86cpuid_CheckAndRead(&p)) + return False; + if (p.maxFunc < 7) + return False; + { + UInt32 d[4] = { 0 }; + MyCPUID(7, &d[0], &d[1], &d[2], &d[3]); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (d[1] >> 5) // avx2 + // & (d[1] >> 31) // avx512vl + & (d[2] >> 9); // vaes // VEX-256/EVEX + } } BoolInt CPU_IsSupported_PageGB() @@ -215,4 +346,133 @@ BoolInt CPU_IsSupported_PageGB() } } + +#elif defined(MY_CPU_ARM_OR_ARM64) + +#ifdef _WIN32 + +#include + +BoolInt CPU_IsSupported_CRC32() { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_CRYPTO() { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_NEON() { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } + +#else + +#if defined(__APPLE__) + +/* +#include +#include +static void Print_sysctlbyname(const char *name) +{ + size_t bufSize = 256; + char buf[256]; + int res = sysctlbyname(name, &buf, &bufSize, NULL, 0); + { + int i; + printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize); + for (i = 0; i < 20; i++) + printf(" %2x", (unsigned)(Byte)buf[i]); + + } +} +*/ + +static BoolInt My_sysctlbyname_Get_BoolInt(const char *name) +{ + UInt32 val = 0; + if (My_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1) + return 1; + return 0; +} + + /* + Print_sysctlbyname("hw.pagesize"); + Print_sysctlbyname("machdep.cpu.brand_string"); + */ + +BoolInt CPU_IsSupported_CRC32(void) +{ + return My_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32"); +} + +BoolInt CPU_IsSupported_NEON(void) +{ + return My_sysctlbyname_Get_BoolInt("hw.optional.neon"); +} + +#ifdef MY_CPU_ARM64 +#define APPLE_CRYPTO_SUPPORT_VAL 1 +#else +#define APPLE_CRYPTO_SUPPORT_VAL 0 +#endif + +BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } + + +#else // __APPLE__ + +#include + +#define USE_HWCAP + +#ifdef USE_HWCAP + +#include + + #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ + BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } + +#ifdef MY_CPU_ARM64 + #define MY_HWCAP_CHECK_FUNC(name) \ + MY_HWCAP_CHECK_FUNC_2(name, name) + MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) +// MY_HWCAP_CHECK_FUNC (ASIMD) +#elif defined(MY_CPU_ARM) + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } + MY_HWCAP_CHECK_FUNC_2(NEON, NEON) +#endif + +#else // USE_HWCAP + + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name() { return 0; } + MY_HWCAP_CHECK_FUNC(NEON) + +#endif // USE_HWCAP + +MY_HWCAP_CHECK_FUNC (CRC32) +MY_HWCAP_CHECK_FUNC (SHA1) +MY_HWCAP_CHECK_FUNC (SHA2) +MY_HWCAP_CHECK_FUNC (AES) + +#endif // __APPLE__ +#endif // _WIN32 + +#endif // MY_CPU_ARM_OR_ARM64 + + + +#ifdef __APPLE__ + +#include + +int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize) +{ + return sysctlbyname(name, buf, bufSize, NULL, 0); +} + +int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val) +{ + size_t bufSize = sizeof(*val); + int res = My_sysctlbyname_Get(name, val, &bufSize); + if (res == 0 && bufSize != sizeof(*val)) + return EFAULT; + return res; +} + #endif diff --git a/3rdparty/7z/src/CpuArch.h b/3rdparty/7z/src/CpuArch.h index 5f74c1c0cb..b300a5af76 100644 --- a/3rdparty/7z/src/CpuArch.h +++ b/3rdparty/7z/src/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2018-02-18 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #ifndef __CPU_ARCH_H #define __CPU_ARCH_H @@ -14,6 +14,10 @@ MY_CPU_BE means that CPU is BIG ENDIAN. If MY_CPU_LE and MY_CPU_BE are not defined, we don't know about ENDIANNESS of platform. MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned memory accesses. + +MY_CPU_64BIT means that processor can work with 64-bit registers. + MY_CPU_64BIT can be used to select fast code branch + MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) */ #if defined(_M_X64) \ @@ -24,8 +28,10 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_AMD64 #ifdef __ILP32__ #define MY_CPU_NAME "x32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "x64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT #endif @@ -35,7 +41,8 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(__i386__) #define MY_CPU_X86 #define MY_CPU_NAME "x86" - #define MY_CPU_32BIT + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 #endif @@ -59,8 +66,14 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(__THUMBEL__) \ || defined(__THUMBEB__) #define MY_CPU_ARM - #define MY_CPU_NAME "arm" - #define MY_CPU_32BIT + + #if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT) + #define MY_CPU_NAME "armt" + #else + #define MY_CPU_NAME "arm" + #endif + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 #endif @@ -84,17 +97,29 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #if defined(__ppc64__) \ - || defined(__powerpc64__) + || defined(__powerpc64__) \ + || defined(__ppc__) \ + || defined(__powerpc__) \ + || defined(__PPC__) \ + || defined(_POWER) + +#if defined(__ppc64__) \ + || defined(__powerpc64__) \ + || defined(_LP64) \ + || defined(__64BIT__) #ifdef __ILP32__ #define MY_CPU_NAME "ppc64-32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "ppc64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT -#elif defined(__ppc__) \ - || defined(__powerpc__) +#else #define MY_CPU_NAME "ppc" - #define MY_CPU_32BIT + #define MY_CPU_SIZEOF_POINTER 4 + /* #define MY_CPU_32BIT */ +#endif #endif @@ -111,6 +136,10 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_X86_OR_AMD64 #endif +#if defined(MY_CPU_ARM) || defined(MY_CPU_ARM64) +#define MY_CPU_ARM_OR_ARM64 +#endif + #ifdef _WIN32 @@ -170,6 +199,40 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #error Stop_Compiling_Bad_32_64_BIT #endif +#ifdef __SIZEOF_POINTER__ + #ifdef MY_CPU_SIZEOF_POINTER + #if MY_CPU_SIZEOF_POINTER != __SIZEOF_POINTER__ + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE + #endif + #else + #define MY_CPU_SIZEOF_POINTER __SIZEOF_POINTER__ + #endif +#endif + +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +#if defined (_LP64) + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE +#endif +#endif + +#ifdef _MSC_VER + #if _MSC_VER >= 1300 + #define MY_CPU_pragma_pack_push_1 __pragma(pack(push, 1)) + #define MY_CPU_pragma_pop __pragma(pack(pop)) + #else + #define MY_CPU_pragma_pack_push_1 + #define MY_CPU_pragma_pop + #endif +#else + #ifdef __xlC__ + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(1)") + #define MY_CPU_pragma_pop _Pragma("pack()") + #else + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(push, 1)") + #define MY_CPU_pragma_pop _Pragma("pack(pop)") + #endif +#endif + #ifndef MY_CPU_NAME #ifdef MY_CPU_LE @@ -189,8 +252,12 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #ifdef MY_CPU_LE #if defined(MY_CPU_X86_OR_AMD64) \ - || defined(MY_CPU_ARM64) \ - || defined(__ARM_FEATURE_UNALIGNED) + || defined(MY_CPU_ARM64) + #define MY_CPU_LE_UNALIGN + #define MY_CPU_LE_UNALIGN_64 + #elif defined(__ARM_FEATURE_UNALIGNED) + /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. + So we can't use unaligned 64-bit operations. */ #define MY_CPU_LE_UNALIGN #endif #endif @@ -200,11 +267,15 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define GetUi16(p) (*(const UInt16 *)(const void *)(p)) #define GetUi32(p) (*(const UInt32 *)(const void *)(p)) +#ifdef MY_CPU_LE_UNALIGN_64 #define GetUi64(p) (*(const UInt64 *)(const void *)(p)) +#endif -#define SetUi16(p, v) { *(UInt16 *)(p) = (v); } -#define SetUi32(p, v) { *(UInt32 *)(p) = (v); } -#define SetUi64(p, v) { *(UInt64 *)(p) = (v); } +#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); } +#ifdef MY_CPU_LE_UNALIGN_64 +#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); } +#endif #else @@ -218,8 +289,6 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem ((UInt32)((const Byte *)(p))[2] << 16) | \ ((UInt32)((const Byte *)(p))[3] << 24)) -#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) - #define SetUi16(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)_vvv_; \ _ppp_[1] = (Byte)(_vvv_ >> 8); } @@ -230,19 +299,29 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem _ppp_[2] = (Byte)(_vvv_ >> 16); \ _ppp_[3] = (Byte)(_vvv_ >> 24); } +#endif + + +#ifndef MY_CPU_LE_UNALIGN_64 + +#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) + #define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ SetUi32(_ppp2_ , (UInt32)_vvv2_); \ SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); } #endif + + + #ifdef __has_builtin #define MY__has_builtin(x) __has_builtin(x) #else #define MY__has_builtin(x) 0 #endif -#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ (_MSC_VER >= 1300) +#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ defined(_MSC_VER) && (_MSC_VER >= 1300) /* Note: we use bswap instruction, that is unsupported in 386 cpu */ @@ -253,8 +332,8 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #pragma intrinsic(_byteswap_uint64) /* #define GetBe16(p) _byteswap_ushort(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) _byteswap_ulong(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const Byte *)(p)) +#define GetBe32(p) _byteswap_ulong (*(const UInt32 *)(const void *)(p)) +#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const void *)(p)) #define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = _byteswap_ulong(v) @@ -262,9 +341,9 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ || (defined(__clang__) && MY__has_builtin(__builtin_bswap16)) ) -/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const Byte *)(p)) +/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const void *)(p)) */ +#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const void *)(p)) +#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const void *)(p)) #define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = __builtin_bswap32(v) @@ -325,10 +404,37 @@ int x86cpuid_GetFirm(const Cx86cpuid *p); #define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF)) #define x86cpuid_GetStepping(ver) (ver & 0xF) -BoolInt CPU_Is_InOrder(); -BoolInt CPU_Is_Aes_Supported(); -BoolInt CPU_IsSupported_PageGB(); +BoolInt CPU_Is_InOrder(void); +BoolInt CPU_IsSupported_AES(void); +BoolInt CPU_IsSupported_AVX2(void); +BoolInt CPU_IsSupported_VAES_AVX2(void); +BoolInt CPU_IsSupported_SSSE3(void); +BoolInt CPU_IsSupported_SSE41(void); +BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_PageGB(void); + +#elif defined(MY_CPU_ARM_OR_ARM64) + +BoolInt CPU_IsSupported_CRC32(void); +BoolInt CPU_IsSupported_NEON(void); + +#if defined(_WIN32) +BoolInt CPU_IsSupported_CRYPTO(void); +#define CPU_IsSupported_SHA1 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_SHA2 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_AES CPU_IsSupported_CRYPTO +#else +BoolInt CPU_IsSupported_SHA1(void); +BoolInt CPU_IsSupported_SHA2(void); +BoolInt CPU_IsSupported_AES(void); +#endif + +#endif + +#if defined(__APPLE__) +int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize); +int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val); #endif EXTERN_C_END diff --git a/3rdparty/7z/src/Delta.c b/3rdparty/7z/src/Delta.c index 6cbbe46012..fc7e9fe96c 100644 --- a/3rdparty/7z/src/Delta.c +++ b/3rdparty/7z/src/Delta.c @@ -1,5 +1,5 @@ /* Delta.c -- Delta converter -2009-05-26 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,53 +12,158 @@ void Delta_Init(Byte *state) state[i] = 0; } -static void MyMemCpy(Byte *dest, const Byte *src, unsigned size) -{ - unsigned i; - for (i = 0; i < size; i++) - dest[i] = src[i]; -} void Delta_Encode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + Byte temp[DELTA_STATE_SIZE]; + + if (size == 0) + return; + { - SizeT i; - for (i = 0; i < size;) + unsigned i = 0; + do + temp[i] = state[i]; + while (++i != delta); + } + + if (size <= delta) + { + unsigned i = 0, k; + do { - for (j = 0; j < delta && i < size; i++, j++) + Byte b = *data; + *data++ = (Byte)(b - temp[i]); + temp[i] = b; + } + while (++i != size); + + k = 0; + + do + { + if (i == delta) + i = 0; + state[k] = temp[i++]; + } + while (++k != delta); + + return; + } + + { + Byte *p = data + size - delta; + { + unsigned i = 0; + do + state[i] = *p++; + while (++i != delta); + } + { + const Byte *lim = data + delta; + ptrdiff_t dif = -(ptrdiff_t)delta; + + if (((ptrdiff_t)size + dif) & 1) { - Byte b = data[i]; - data[i] = (Byte)(b - buf[j]); - buf[j] = b; + --p; *p = (Byte)(*p - p[dif]); } + + while (p != lim) + { + --p; *p = (Byte)(*p - p[dif]); + --p; *p = (Byte)(*p - p[dif]); + } + + dif = -dif; + + do + { + --p; *p = (Byte)(*p - temp[--dif]); + } + while (dif != 0); } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); } + void Delta_Decode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + unsigned i; + const Byte *lim; + + if (size == 0) + return; + + i = 0; + lim = data + size; + + if (size <= delta) { - SizeT i; - for (i = 0; i < size;) + do + *data = (Byte)(*data + state[i++]); + while (++data != lim); + + for (; delta != i; state++, delta--) + *state = state[i]; + data -= i; + } + else + { + /* + #define B(n) b ## n + #define I(n) Byte B(n) = state[n]; + #define U(n) { B(n) = (Byte)((B(n)) + *data++); data[-1] = (B(n)); } + #define F(n) if (data != lim) { U(n) } + + if (delta == 1) { - for (j = 0; j < delta && i < size; i++, j++) + I(0) + if ((lim - data) & 1) { U(0) } + while (data != lim) { U(0) U(0) } + data -= 1; + } + else if (delta == 2) + { + I(0) I(1) + lim -= 1; while (data < lim) { U(0) U(1) } + lim += 1; F(0) + data -= 2; + } + else if (delta == 3) + { + I(0) I(1) I(2) + lim -= 2; while (data < lim) { U(0) U(1) U(2) } + lim += 2; F(0) F(1) + data -= 3; + } + else if (delta == 4) + { + I(0) I(1) I(2) I(3) + lim -= 3; while (data < lim) { U(0) U(1) U(2) U(3) } + lim += 3; F(0) F(1) F(2) + data -= 4; + } + else + */ + { + do { - buf[j] = data[i] = (Byte)(buf[j] + data[i]); + *data = (Byte)(*data + state[i++]); + data++; + } + while (i != delta); + + { + ptrdiff_t dif = -(ptrdiff_t)delta; + do + *data = (Byte)(*data + data[dif]); + while (++data != lim); + data += dif; } } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); + + do + *state++ = *data; + while (++data != lim); } diff --git a/3rdparty/7z/src/DllSecur.c b/3rdparty/7z/src/DllSecur.c index 19a22a9f05..bfd6f61e30 100644 --- a/3rdparty/7z/src/DllSecur.c +++ b/3rdparty/7z/src/DllSecur.c @@ -1,11 +1,11 @@ /* DllSecur.c -- DLL loading security -2018-02-21 : Igor Pavlov : Public domain */ +2021-12-25 : Igor Pavlov : Public domain */ #include "Precomp.h" #ifdef _WIN32 -#include +#include #include "DllSecur.h" @@ -33,17 +33,19 @@ static const char * const g_Dlls = #endif +// #define MY_CAST_FUNC (void(*)()) +#define MY_CAST_FUNC + void My_SetDefaultDllDirectories() { #ifndef UNDER_CE OSVERSIONINFO vi; vi.dwOSVersionInfoSize = sizeof(vi); - GetVersionEx(&vi); if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) { Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); + MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; @@ -66,7 +68,7 @@ void LoadSecurityDlls() if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) { Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); + MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; diff --git a/3rdparty/7z/src/DllSecur.h b/3rdparty/7z/src/DllSecur.h index 4c113568e6..0fd8070e54 100644 --- a/3rdparty/7z/src/DllSecur.h +++ b/3rdparty/7z/src/DllSecur.h @@ -10,8 +10,8 @@ EXTERN_C_BEGIN #ifdef _WIN32 -void My_SetDefaultDllDirectories(); -void LoadSecurityDlls(); +void My_SetDefaultDllDirectories(void); +void LoadSecurityDlls(void); #endif diff --git a/3rdparty/7z/src/LzFind.c b/3rdparty/7z/src/LzFind.c index 4eefc17dd2..a17c06bc31 100644 --- a/3rdparty/7z/src/LzFind.c +++ b/3rdparty/7z/src/LzFind.c @@ -1,20 +1,69 @@ /* LzFind.c -- Match finder for LZ algorithms -2018-07-08 : Igor Pavlov : Public domain */ +2021-11-29 : Igor Pavlov : Public domain */ #include "Precomp.h" #include +// #include +#include "CpuArch.h" #include "LzFind.h" #include "LzHash.h" -#define kEmptyHashValue 0 -#define kMaxValForNormalize ((UInt32)0xFFFFFFFF) -#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ -#define kNormalizeMask (~(UInt32)(kNormalizeStepMin - 1)) -#define kMaxHistorySize ((UInt32)7 << 29) +#define kBlockMoveAlign (1 << 7) // alignment for memmove() +#define kBlockSizeAlign (1 << 16) // alignment for block allocation +#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictinary + +#define kEmptyHashValue 0 + +#define kMaxValForNormalize ((UInt32)0) +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug + +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#define GET_AVAIL_BYTES(p) \ + Inline_MatchFinder_GetNumAvailableBytes(p) + + +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) +#define kFix5HashSize kFix4HashSize + +/* + HASH2_CALC: + if (hv) match, then cur[0] and cur[1] also match +*/ +#define HASH2_CALC hv = GetUi16(cur); + +// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255] + +/* + HASH3_CALC: + if (cur[0]) and (h2) match, then cur[1] also match + if (cur[0]) and (hv) match, then cur[1] and cur[2] also match +*/ +#define HASH3_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } + +#define HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; } + +#define HASH5_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \ + /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \ + hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; } + +#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; -#define kStartMaxLen 3 static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) { @@ -25,46 +74,57 @@ static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) } } -/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ -static int LzInWindow_Create(CMatchFinder *p, UInt32 keepSizeReserv, ISzAllocPtr alloc) +static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc) { - UInt32 blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; - if (p->directInput) - { - p->blockSize = blockSize; - return 1; - } + if (blockSize == 0) + return 0; if (!p->bufferBase || p->blockSize != blockSize) { + // size_t blockSizeT; LzInWindow_Free(p, alloc); p->blockSize = blockSize; - p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, (size_t)blockSize); + // blockSizeT = blockSize; + + // printf("\nblockSize = 0x%x\n", blockSize); + /* + #if defined _WIN64 + // we can allocate 4GiB, but still use UInt32 for (p->blockSize) + // we use UInt32 type for (p->blockSize), because + // we don't want to wrap over 4 GiB, + // when we use (p->streamPos - p->pos) that is UInt32. + if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign) + { + blockSizeT = ((size_t)1 << 32); + printf("\nchanged to blockSizeT = 4GiB\n"); + } + #endif + */ + + p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufferBase); + // return 0; // for debug } return (p->bufferBase != NULL); } -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } +static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } -UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } +static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) -{ - p->posLimit -= subValue; - p->pos -= subValue; - p->streamPos -= subValue; -} +MY_NO_INLINE static void MatchFinder_ReadBlock(CMatchFinder *p) { if (p->streamEndWasReached || p->result != SZ_OK) return; - /* We use (p->streamPos - p->pos) value. (p->streamPos < p->pos) is allowed. */ + /* We use (p->streamPos - p->pos) value. + (p->streamPos < p->pos) is allowed. */ if (p->directInput) { - UInt32 curSize = 0xFFFFFFFF - (p->streamPos - p->pos); + UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); if (curSize > p->directInputRem) curSize = (UInt32)p->directInputRem; p->directInputRem -= curSize; @@ -76,10 +136,22 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) for (;;) { - Byte *dest = p->buffer + (p->streamPos - p->pos); - size_t size = (p->bufferBase + p->blockSize - dest); + Byte *dest = p->buffer + GET_AVAIL_BYTES(p); + size_t size = (size_t)(p->bufferBase + p->blockSize - dest); if (size == 0) + { + /* we call ReadBlock() after NeedMove() and MoveBlock(). + NeedMove() and MoveBlock() povide more than (keepSizeAfter) + to the end of (blockSize). + So we don't execute this branch in normal code flow. + We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock(). + */ + // p->result = SZ_ERROR_FAIL; // we can show error here return; + } + + // #define kRead 3 + // if (size > kRead) size = kRead; // for debug p->result = ISeqInStream_Read(p->stream, dest, &size); if (p->result != SZ_OK) @@ -90,41 +162,52 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) return; } p->streamPos += (UInt32)size; - if (p->streamPos - p->pos > p->keepSizeAfter) + if (GET_AVAIL_BYTES(p) > p->keepSizeAfter) return; + /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function + (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */ } + + // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter) } + + +MY_NO_INLINE void MatchFinder_MoveBlock(CMatchFinder *p) { + const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore; + const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; + p->buffer = p->bufferBase + keepBefore; memmove(p->bufferBase, - p->buffer - p->keepSizeBefore, - (size_t)(p->streamPos - p->pos) + p->keepSizeBefore); - p->buffer = p->bufferBase + p->keepSizeBefore; + p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + keepBefore + (size_t)GET_AVAIL_BYTES(p)); } +/* We call MoveBlock() before ReadBlock(). + So MoveBlock() can be wasteful operation, if the whole input data + can fit in current block even without calling MoveBlock(). + in important case where (dataSize <= historySize) + condition (p->blockSize > dataSize + p->keepSizeAfter) is met + So there is no MoveBlock() in that case case. +*/ + int MatchFinder_NeedMove(CMatchFinder *p) { if (p->directInput) return 0; - /* if (p->streamEndWasReached) return 0; */ + if (p->streamEndWasReached || p->result != SZ_OK) + return 0; return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); } void MatchFinder_ReadIfRequired(CMatchFinder *p) { - if (p->streamEndWasReached) - return; - if (p->keepSizeAfter >= p->streamPos - p->pos) + if (p->keepSizeAfter >= GET_AVAIL_BYTES(p)) MatchFinder_ReadBlock(p); } -static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) -{ - if (MatchFinder_NeedMove(p)) - MatchFinder_MoveBlock(p); - MatchFinder_ReadBlock(p); -} + static void MatchFinder_SetDefaultSettings(CMatchFinder *p) { @@ -175,39 +258,74 @@ static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); } +#if (kBlockSizeReserveMin < kBlockSizeAlign * 2) + #error Stop_Compiling_Bad_Reserve +#endif + + + +static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) +{ + UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter); + /* + if (historySize > kMaxHistorySize) + return 0; + */ + // printf("\nhistorySize == 0x%x\n", historySize); + + if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow + return 0; + + { + const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign; + const UInt32 rem = kBlockSizeMax - blockSize; + const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2)) + + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here + if (blockSize >= kBlockSizeMax + || rem < kBlockSizeReserveMin) // we reject settings that will be slow + return 0; + if (reserve >= rem) + blockSize = kBlockSizeMax; + else + { + blockSize += reserve; + blockSize &= ~(UInt32)(kBlockSizeAlign - 1); + } + } + // printf("\n LzFind_blockSize = %x\n", blockSize); + // printf("\n LzFind_blockSize = %d\n", blockSize >> 20); + return blockSize; +} + + int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) { - UInt32 sizeReserv; - - if (historySize > kMaxHistorySize) - { - MatchFinder_Free(p, alloc); - return 0; - } - - sizeReserv = historySize >> 1; - if (historySize >= ((UInt32)3 << 30)) sizeReserv = historySize >> 3; - else if (historySize >= ((UInt32)2 << 30)) sizeReserv = historySize >> 2; - - sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19); - + /* we need one additional byte in (p->keepSizeBefore), + since we use MoveBlock() after (p->pos++) and before dictionary using */ + // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug p->keepSizeBefore = historySize + keepAddBufferBefore + 1; - p->keepSizeAfter = matchMaxLen + keepAddBufferAfter; - - /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */ - - if (LzInWindow_Create(p, sizeReserv, alloc)) + + keepAddBufferAfter += matchMaxLen; + /* we need (p->keepSizeAfter >= p->numHashBytes) */ + if (keepAddBufferAfter < p->numHashBytes) + keepAddBufferAfter = p->numHashBytes; + // keepAddBufferAfter -= 2; // for debug + p->keepSizeAfter = keepAddBufferAfter; + + if (p->directInput) + p->blockSize = 0; + if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) { - UInt32 newCyclicBufferSize = historySize + 1; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it UInt32 hs; p->matchMaxLen = matchMaxLen; { + // UInt32 hs4; p->fixedHashSize = 0; - if (p->numHashBytes == 2) - hs = (1 << 16) - 1; - else + hs = (1 << 16) - 1; + if (p->numHashBytes != 2) { hs = historySize; if (hs > p->expectedDataSize) @@ -218,9 +336,9 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, hs |= (hs >> 2); hs |= (hs >> 4); hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later hs >>= 1; - hs |= 0xFFFF; /* don't change it! It's required for Deflate */ - if (hs > (1 << 24)) + if (hs >= (1 << 24)) { if (p->numHashBytes == 3) hs = (1 << 24) - 1; @@ -228,12 +346,30 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, hs >>= 1; /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ } + + // hs = ((UInt32)1 << 25) - 1; // for test + + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; } p->hashMask = hs; hs++; + + /* + hs4 = (1 << 20); + if (hs4 > hs) + hs4 = hs; + // hs4 = (1 << 16); // for test + p->hash4Mask = hs4 - 1; + */ + if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; - if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; + // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; hs += p->fixedHashSize; } @@ -242,13 +378,17 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, size_t numSons; p->historySize = historySize; p->hashSizeSum = hs; - p->cyclicBufferSize = newCyclicBufferSize; + p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) numSons = newCyclicBufferSize; if (p->btMode) numSons <<= 1; newSize = hs + numSons; + // aligned size is not required here, but it can be better for some loops + #define NUM_REFS_ALIGN_MASK 0xF + newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; + if (p->hash && p->numRefs == newSize) return 1; @@ -268,33 +408,43 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, return 0; } + static void MatchFinder_SetLimits(CMatchFinder *p) { - UInt32 limit = kMaxValForNormalize - p->pos; - UInt32 limit2 = p->cyclicBufferSize - p->cyclicBufferPos; + UInt32 k; + UInt32 n = kMaxValForNormalize - p->pos; + if (n == 0) + n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0) - if (limit2 < limit) - limit = limit2; - limit2 = p->streamPos - p->pos; - - if (limit2 <= p->keepSizeAfter) + k = p->cyclicBufferSize - p->cyclicBufferPos; + if (k < n) + n = k; + + k = GET_AVAIL_BYTES(p); { - if (limit2 > 0) - limit2 = 1; + const UInt32 ksa = p->keepSizeAfter; + UInt32 mm = p->matchMaxLen; + if (k > ksa) + k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock + else if (k >= mm) + { + // the limitation for (p->lenLimit) update + k -= mm; // optimization : to reduce the number of checks + k++; + // k = 1; // non-optimized version : for debug + } + else + { + mm = k; + if (k != 0) + k = 1; + } + p->lenLimit = mm; } - else - limit2 -= p->keepSizeAfter; + if (k < n) + n = k; - if (limit2 < limit) - limit = limit2; - - { - UInt32 lenLimit = p->streamPos - p->pos; - if (lenLimit > p->matchMaxLen) - lenLimit = p->matchMaxLen; - p->lenLimit = lenLimit; - } - p->posLimit = p->pos + limit; + p->posLimit = p->pos + n; } @@ -302,7 +452,7 @@ void MatchFinder_Init_LowHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash; - size_t numItems = p->fixedHashSize; + const size_t numItems = p->fixedHashSize; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } @@ -312,72 +462,322 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash + p->fixedHashSize; - size_t numItems = (size_t)p->hashMask + 1; + const size_t numItems = (size_t)p->hashMask + 1; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } -void MatchFinder_Init_3(CMatchFinder *p, int readData) +void MatchFinder_Init_4(CMatchFinder *p) { - p->cyclicBufferPos = 0; p->buffer = p->bufferBase; - p->pos = - p->streamPos = p->cyclicBufferSize; + { + /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. + the code in CMatchFinderMt expects (pos = 1) */ + p->pos = + p->streamPos = + 1; // it's smallest optimal value. do not change it + // 0; // for debug + } p->result = SZ_OK; p->streamEndWasReached = 0; - - if (readData) - MatchFinder_ReadBlock(p); - - MatchFinder_SetLimits(p); } +// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug + void MatchFinder_Init(CMatchFinder *p) { MatchFinder_Init_HighHash(p); MatchFinder_Init_LowHash(p); - MatchFinder_Init_3(p, True); + MatchFinder_Init_4(p); + // if (readData) + MatchFinder_ReadBlock(p); + + /* if we init (cyclicBufferPos = pos), then we can use one variable + instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */ + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos) + // p->cyclicBufferPos = 0; // smallest value + // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses. + MatchFinder_SetLimits(p); } - -static UInt32 MatchFinder_GetSubValue(CMatchFinder *p) + + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + #define USE_SATUR_SUB_128 + #define USE_AVX2 + #define ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1600) + #define USE_SATUR_SUB_128 + #if (_MSC_VER >= 1900) + #define USE_AVX2 + #include // avx + #endif + #endif + #endif + +// #elif defined(MY_CPU_ARM_OR_ARM64) +#elif defined(MY_CPU_ARM64) + + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_SATUR_SUB_128 + #ifdef MY_CPU_ARM64 + // #define ATTRIB_SSE41 __attribute__((__target__(""))) + #else + // #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_SATUR_SUB_128 + #endif + #endif + + #if defined(_MSC_VER) && defined(MY_CPU_ARM64) + #include + #else + #include + #endif + +#endif + +/* +#ifndef ATTRIB_SSE41 + #define ATTRIB_SSE41 +#endif +#ifndef ATTRIB_AVX2 + #define ATTRIB_AVX2 +#endif +*/ + +#ifdef USE_SATUR_SUB_128 + +// #define _SHOW_HW_STATUS + +#ifdef _SHOW_HW_STATUS +#include +#define _PRF(x) x +_PRF(;) +#else +#define _PRF(x) +#endif + +#ifdef MY_CPU_ARM_OR_ARM64 + +#ifdef MY_CPU_ARM64 +// #define FORCE_SATUR_SUB_128 +#endif + +typedef uint32x4_t v128; +#define SASUB_128(i) \ + *(v128 *)(void *)(items + (i) * 4) = \ + vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); + +#else + +#include // sse4.1 + +typedef __m128i v128; +#define SASUB_128(i) \ + *(v128 *)(void *)(items + (i) * 4) = \ + _mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1 + +#endif + + + +MY_NO_INLINE +static +#ifdef ATTRIB_SSE41 +ATTRIB_SSE41 +#endif +void +MY_FAST_CALL +LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - return (p->pos - p->historySize - 1) & kNormalizeMask; + v128 sub2 = + #ifdef MY_CPU_ARM_OR_ARM64 + vdupq_n_u32(subValue); + #else + _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + #endif + do + { + SASUB_128(0) + SASUB_128(1) + SASUB_128(2) + SASUB_128(3) + items += 4 * 4; + } + while (items != lim); } + + +#ifdef USE_AVX2 + +#include // avx + +#define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2 + +MY_NO_INLINE +static +#ifdef ATTRIB_AVX2 +ATTRIB_AVX2 +#endif +void +MY_FAST_CALL +LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + __m256i sub2 = _mm256_set_epi32( + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + do + { + SASUB_256(0) + SASUB_256(1) + items += 2 * 8; + } + while (items != lim); +} +#endif // USE_AVX2 + +#ifndef FORCE_SATUR_SUB_128 +typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)( + UInt32 subValue, CLzRef *items, const CLzRef *lim); +static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; +#endif // FORCE_SATUR_SUB_128 + +#endif // USE_SATUR_SUB_128 + + +// kEmptyHashValue must be zero +// #define SASUB_32(i) v = items[i]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; +#define SASUB_32(i) v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; + +#ifdef FORCE_SATUR_SUB_128 + +#define DEFAULT_SaturSub LzFind_SaturSub_128 + +#else + +#define DEFAULT_SaturSub LzFind_SaturSub_32 + +MY_NO_INLINE +static +void +MY_FAST_CALL +LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + do + { + UInt32 v; + SASUB_32(0) + SASUB_32(1) + SASUB_32(2) + SASUB_32(3) + SASUB_32(4) + SASUB_32(5) + SASUB_32(6) + SASUB_32(7) + items += 8; + } + while (items != lim); +} + +#endif + + +MY_NO_INLINE void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) { - size_t i; - for (i = 0; i < numItems; i++) + #define K_NORM_ALIGN_BLOCK_SIZE (1 << 6) + + CLzRef *lim; + + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) { - UInt32 value = items[i]; - if (value <= subValue) - value = kEmptyHashValue; - else - value -= subValue; - items[i] = value; + UInt32 v; + SASUB_32(0); + items++; + } + + { + #define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1) + lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK); + numItems &= K_NORM_ALIGN_MASK; + if (items != lim) + { + #if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128) + if (g_LzFind_SaturSub) + g_LzFind_SaturSub(subValue, items, lim); + else + #endif + DEFAULT_SaturSub(subValue, items, lim); + } + items = lim; + } + + + for (; numItems != 0; numItems--) + { + UInt32 v; + SASUB_32(0); + items++; } } -static void MatchFinder_Normalize(CMatchFinder *p) -{ - UInt32 subValue = MatchFinder_GetSubValue(p); - MatchFinder_Normalize3(subValue, p->hash, p->numRefs); - MatchFinder_ReduceOffsets(p, subValue); -} +// call MatchFinder_CheckLimits() only after (p->pos++) update + MY_NO_INLINE static void MatchFinder_CheckLimits(CMatchFinder *p) { + if (// !p->streamEndWasReached && p->result == SZ_OK && + p->keepSizeAfter == GET_AVAIL_BYTES(p)) + { + // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p)) + if (MatchFinder_NeedMove(p)) + MatchFinder_MoveBlock(p); + MatchFinder_ReadBlock(p); + } + if (p->pos == kMaxValForNormalize) - MatchFinder_Normalize(p); - if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) - MatchFinder_CheckAndMoveAndRead(p); + if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data. + /* + if we disable normalization for last bytes of data, and + if (data_size == 4 GiB), we don't call wastfull normalization, + but (pos) will be wrapped over Zero (0) in that case. + And we cannot resume later to normal operation + */ + { + // MatchFinder_Normalize(p); + /* after normalization we need (p->pos >= p->historySize + 1); */ + /* we can reduce subValue to aligned value, if want to keep alignment + of (p->pos) and (p->buffer) for speculated accesses. */ + const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; + // const UInt32 subValue = (1 << 15); // for debug + // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + Inline_MatchFinder_ReduceOffsets(p, subValue); + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs); + } + if (p->cyclicBufferPos == p->cyclicBufferSize) p->cyclicBufferPos = 0; + MatchFinder_SetLimits(p); } @@ -386,9 +786,9 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) (lenLimit > maxLen) */ MY_FORCE_INLINE -static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, unsigned maxLen) +static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, unsigned maxLen) { /* son[_cyclicBufferPos] = curMatch; @@ -396,7 +796,7 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos { UInt32 delta = pos - curMatch; if (cutValue-- == 0 || delta >= _cyclicBufferSize) - return distances; + return d; { const Byte *pb = cur - delta; curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; @@ -409,10 +809,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos if (maxLen < len) { maxLen = len; - *distances++ = len; - *distances++ = delta - 1; + *d++ = len; + *d++ = delta - 1; if (len == lenLimit) - return distances; + return d; } } } @@ -421,35 +821,41 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos const Byte *lim = cur + lenLimit; son[_cyclicBufferPos] = curMatch; + do { - UInt32 delta = pos - curMatch; + UInt32 delta; + + if (curMatch == 0) + break; + // if (curMatch2 >= curMatch) return NULL; + delta = pos - curMatch; if (delta >= _cyclicBufferSize) break; { ptrdiff_t diff; curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; - diff = (ptrdiff_t)0 - delta; - if (cur[maxLen] == cur[maxLen + diff]) + diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) { const Byte *c = cur; while (*c == c[diff]) { if (++c == lim) { - distances[0] = (UInt32)(lim - cur); - distances[1] = delta - 1; - return distances + 2; + d[0] = (UInt32)(lim - cur); + d[1] = delta - 1; + return d + 2; } } { - unsigned len = (unsigned)(c - cur); + const unsigned len = (unsigned)(c - cur); if (maxLen < len) { maxLen = len; - distances[0] = (UInt32)len; - distances[1] = delta - 1; - distances += 2; + d[0] = (UInt32)len; + d[1] = delta - 1; + d += 2; } } } @@ -457,31 +863,36 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos } while (--cutValue); - return distances; + return d; } MY_FORCE_INLINE UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, UInt32 maxLen) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, UInt32 maxLen) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return distances; - } + const UInt32 delta = pos - curMatch; { CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = pair[0]; + const UInt32 pair0 = pair[0]; if (pb[len] == cur[len]) { if (++len != lenLimit && pb[len] == cur[len]) @@ -491,48 +902,60 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt if (maxLen < len) { maxLen = (UInt32)len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; + *d++ = (UInt32)len; + *d++ = delta - 1; if (len == lenLimit) { *ptr1 = pair0; *ptr0 = pair[1]; - return distances; + return d; } } } if (pb[len] < cur[len]) { *ptr1 = curMatch; + // const UInt32 curMatch2 = pair[1]; + // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + // curMatch = curMatch2; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return d; } + static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (// curMatch >= pos || // failure + cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return; - } + const UInt32 delta = pos - curMatch; { CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; @@ -554,80 +977,108 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const if (pb[len] < cur[len]) { *ptr1 = curMatch; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return; } + #define MOVE_POS \ ++p->cyclicBufferPos; \ p->buffer++; \ - if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); + { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } -#define MOVE_POS_RET MOVE_POS return (UInt32)offset; +#define MOVE_POS_RET MOVE_POS return distances; -static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } +MY_NO_INLINE +static void MatchFinder_MovePos(CMatchFinder *p) +{ + /* we go here at the end of stream data, when (avail < num_hash_bytes) + We don't update sons[cyclicBufferPos << btMode]. + So (sons) record will contain junk. And we cannot resume match searching + to normal operation, even if we will provide more input data in buffer. + p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue + if (p->btMode) + p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue + */ + MOVE_POS; +} #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ + unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \ lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ cur = p->buffer; -#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) -#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) +#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) +#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) -#define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue + +#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num); + +#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ + distances = func(MF_PARAMS(p), \ + distances, (UInt32)_maxLen_); MOVE_POS_RET; + +#define GET_MATCHES_FOOTER_BT(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) + +#define GET_MATCHES_FOOTER_HC(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec) -#define GET_MATCHES_FOOTER(offset, maxLen) \ - offset = (unsigned)(GetMatchesSpec1((UInt32)lenLimit, curMatch, MF_PARAMS(p), \ - distances + offset, (UInt32)maxLen) - distances); MOVE_POS_RET; -#define SKIP_FOOTER \ - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; #define UPDATE_maxLen { \ - ptrdiff_t diff = (ptrdiff_t)0 - d2; \ + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ const Byte *c = cur + maxLen; \ const Byte *lim = cur + lenLimit; \ for (; c != lim; c++) if (*(c + diff) != *c) break; \ maxLen = (unsigned)(c - cur); } -static UInt32 Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(2) HASH2_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 1) + GET_MATCHES_FOOTER_BT(1) } -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 2) + GET_MATCHES_FOOTER_BT(2) } -static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +#define SET_mmm \ + mmm = p->cyclicBufferSize; \ + if (pos < mmm) \ + mmm = pos; + + +static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, d2, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(3) @@ -643,29 +1094,32 @@ static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) hash[h2] = pos; (hash + kFix3HashSize)[hv] = pos; - maxLen = 2; - offset = 0; + SET_mmm - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + maxLen = 2; + + if (d2 < mmm && *(cur - d2) == *cur) { UPDATE_maxLen distances[0] = (UInt32)maxLen; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); + SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS_RET; } } - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) @@ -676,53 +1130,63 @@ static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 3; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } - - if (offset != 0) - { UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } + break; } - if (maxLen < 3) - maxLen = 3; - - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -/* -static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos; + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -733,73 +1197,69 @@ static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + maxLen = 4; + + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); + SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS_RET; } + break; } - - if (maxLen < 4) - maxLen = 4; - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -*/ -static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) @@ -816,48 +1276,57 @@ static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) - { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } - - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } - - if (offset != 0) + maxLen = 3; + + for (;;) { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; MOVE_POS_RET; } + break; } - if (maxLen < 3) - maxLen = 3; - - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen); } -/* -static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -865,242 +1334,237 @@ static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) hash = p->hash; pos = p->pos; - + d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 4; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; MOVE_POS_RET; } + break; } - if (maxLen < 4) - maxLen = 4; - - offset = (UInt32)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen); } -*/ -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances, 2) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(2) } + static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(2) { - SKIP_HEADER(2) HASH2_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { - SKIP_HEADER(3) HASH_ZIP_CALC; curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { UInt32 h2; UInt32 *hash; - SKIP_HEADER(3) HASH3_CALC; hash = p->hash; curMatch = (hash + kFix3HashSize)[hv]; hash[h2] = (hash + kFix3HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(4) { UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(4) HASH4_CALC; hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = (hash + kFix4HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -/* static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(5) { - UInt32 h2, h3, h4; + UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(5) HASH5_CALC; hash = p->hash; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = + // (hash + kFix4HashSize)[h4] = (hash + kFix5HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -*/ + + +#define HC_SKIP_HEADER(minLen) \ + do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ + Byte *cur; \ + UInt32 *hash; \ + UInt32 *son; \ + UInt32 pos = p->pos; \ + UInt32 num2 = num; \ + /* (p->pos == p->posLimit) is not allowed here !!! */ \ + { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ + num -= num2; \ + { const UInt32 cycPos = p->cyclicBufferPos; \ + son = p->son + cycPos; \ + p->cyclicBufferPos = cycPos + num2; } \ + cur = p->buffer; \ + hash = p->hash; \ + do { \ + UInt32 curMatch; \ + UInt32 hv; + + +#define HC_SKIP_FOOTER \ + cur++; pos++; *son++ = curMatch; \ + } while (--num2); \ + p->buffer = cur; \ + p->pos = pos; \ + if (pos == p->posLimit) MatchFinder_CheckLimits(p); \ + }} while(num); \ + static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { + HC_SKIP_HEADER(4) + UInt32 h2, h3; - UInt32 *hash; - SKIP_HEADER(4) HASH4_CALC; - hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + (hash + kFix4HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -/* + static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { - UInt32 h2, h3, h4; - UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; - hash = p->hash; - curMatch = hash + kFix5HashSize)[hv]; + HC_SKIP_HEADER(5) + + UInt32 h2, h3; + HASH5_CALC + curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = - (hash + kFix5HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + // (hash + kFix4HashSize)[h4] = + (hash + kFix5HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -*/ + void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { - SKIP_HEADER(3) + HC_SKIP_HEADER(3) + HASH_ZIP_CALC; - curMatch = p->hash[hv]; - p->hash[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + curMatch = hash[hv]; + hash[hv] = pos; + + HC_SKIP_FOOTER } -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) + +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) { vTable->Init = (Mf_Init_Func)MatchFinder_Init; vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; if (!p->btMode) { - /* if (p->numHashBytes <= 4) */ + if (p->numHashBytes <= 4) { vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; } - /* else { vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; } - */ } else if (p->numHashBytes == 2) { @@ -1112,16 +1576,53 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; } - else /* if (p->numHashBytes == 4) */ + else if (p->numHashBytes == 4) { vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; } - /* else { vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; } - */ +} + + + +void LzFindPrepare() +{ + #ifndef FORCE_SATUR_SUB_128 + #ifdef USE_SATUR_SUB_128 + LZFIND_SATUR_SUB_CODE_FUNC f = NULL; + #ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== LzFind NEON") + _PRF(printf("\n=== LzFind NEON\n")); + f = LzFind_SaturSub_128; + } + // f = 0; // for debug + } + #else // MY_CPU_ARM_OR_ARM64 + if (CPU_IsSupported_SSE41()) + { + // #pragma message ("=== LzFind SSE41") + _PRF(printf("\n=== LzFind SSE41\n")); + f = LzFind_SaturSub_128; + + #ifdef USE_AVX2 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== LzFind AVX2") + _PRF(printf("\n=== LzFind AVX2\n")); + f = LzFind_SaturSub_256; + } + #endif + } + #endif // MY_CPU_ARM_OR_ARM64 + g_LzFind_SaturSub = f; + #endif // USE_SATUR_SUB_128 + #endif // FORCE_SATUR_SUB_128 } diff --git a/3rdparty/7z/src/LzFind.h b/3rdparty/7z/src/LzFind.h index c77added7b..8f9fade230 100644 --- a/3rdparty/7z/src/LzFind.h +++ b/3rdparty/7z/src/LzFind.h @@ -1,5 +1,5 @@ /* LzFind.h -- Match finder for LZ algorithms -2017-06-10 : Igor Pavlov : Public domain */ +2021-07-13 : Igor Pavlov : Public domain */ #ifndef __LZ_FIND_H #define __LZ_FIND_H @@ -15,7 +15,7 @@ typedef struct _CMatchFinder Byte *buffer; UInt32 pos; UInt32 posLimit; - UInt32 streamPos; + UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). Use (UInt32)(streamPos - pos) */ UInt32 lenLimit; UInt32 cyclicBufferPos; @@ -51,17 +51,19 @@ typedef struct _CMatchFinder UInt64 expectedDataSize; } CMatchFinder; -#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) +#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((const Byte *)(p)->buffer) -#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) +#define Inline_MatchFinder_GetNumAvailableBytes(p) ((UInt32)((p)->streamPos - (p)->pos)) +/* #define Inline_MatchFinder_IsFinishedOK(p) \ ((p)->streamEndWasReached \ && (p)->streamPos == (p)->pos \ && (!(p)->directInput || (p)->directInputRem == 0)) +*/ int MatchFinder_NeedMove(CMatchFinder *p); -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); +/* Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); */ void MatchFinder_MoveBlock(CMatchFinder *p); void MatchFinder_ReadIfRequired(CMatchFinder *p); @@ -76,10 +78,21 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, ISzAllocPtr alloc); void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc); void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems); -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); +// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); + +/* +#define Inline_MatchFinder_InitPos(p, val) \ + (p)->pos = (val); \ + (p)->streamPos = (val); +*/ + +#define Inline_MatchFinder_ReduceOffsets(p, subValue) \ + (p)->pos -= (subValue); \ + (p)->streamPos -= (subValue); + UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *buffer, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, UInt32 *distances, UInt32 maxLen); /* @@ -91,7 +104,7 @@ Conditions: typedef void (*Mf_Init_Func)(void *object); typedef UInt32 (*Mf_GetNumAvailableBytes_Func)(void *object); typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object); -typedef UInt32 (*Mf_GetMatches_Func)(void *object, UInt32 *distances); +typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances); typedef void (*Mf_Skip_Func)(void *object, UInt32); typedef struct _IMatchFinder @@ -101,21 +114,23 @@ typedef struct _IMatchFinder Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; Mf_GetMatches_Func GetMatches; Mf_Skip_Func Skip; -} IMatchFinder; +} IMatchFinder2; -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p); -void MatchFinder_Init_3(CMatchFinder *p, int readData); +void MatchFinder_Init_4(CMatchFinder *p); void MatchFinder_Init(CMatchFinder *p); -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); +void LzFindPrepare(void); + EXTERN_C_END #endif diff --git a/3rdparty/7z/src/LzFindMt.c b/3rdparty/7z/src/LzFindMt.c index df32146f92..3865263b7b 100644 --- a/3rdparty/7z/src/LzFindMt.c +++ b/3rdparty/7z/src/LzFindMt.c @@ -1,97 +1,215 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2018-12-29 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "LzHash.h" +// #include +#include "CpuArch.h" + +#include "LzHash.h" #include "LzFindMt.h" +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +extern UInt64 g_NumIters_Tree; +extern UInt64 g_NumIters_Loop; +extern UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +#define kMtHashBlockSize ((UInt32)1 << 17) +#define kMtHashNumBlocks (1 << 1) + +#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize) + +#define kMtBtBlockSize ((UInt32)1 << 16) +#define kMtBtNumBlocks (1 << 4) + +#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize) + +/* + HASH functions: + We use raw 8/16 bits from a[1] and a[2], + xored with crc(a[0]) and crc(a[3]). + We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches. + our crc() function provides one-to-one correspondence for low 8-bit values: + (crc[0...0xFF] & 0xFF) <-> [0...0xFF] +*/ + +#define MF(mt) ((mt)->MatchFinder) +#define MF_CRC (p->crc) + +// #define MF(mt) (&(mt)->MatchFinder) +// #define MF_CRC (p->MatchFinder.crc) + +#define MT_HASH2_CALC \ + h2 = (MF_CRC[cur[0]] ^ cur[1]) & (kHash2Size - 1); + +#define MT_HASH3_CALC { \ + UInt32 temp = MF_CRC[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +/* +#define MT_HASH3_CALC__NO_2 { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +#define __MT_HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; } + // (kHash4Size - 1); +*/ + + +MY_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; p->csWasEntered = False; Thread_Construct(&p->thread); Event_Construct(&p->canStart); - Event_Construct(&p->wasStarted); Event_Construct(&p->wasStopped); Semaphore_Construct(&p->freeSemaphore); Semaphore_Construct(&p->filledSemaphore); } -static void MtSync_GetNextBlock(CMtSync *p) + +#define DEBUG_BUFFER_LOCK // define it to debug lock state + +#ifdef DEBUG_BUFFER_LOCK +#include +#define BUFFER_MUST_BE_LOCKED(p) if (!(p)->csWasEntered) exit(1); +#define BUFFER_MUST_BE_UNLOCKED(p) if ( (p)->csWasEntered) exit(1); +#else +#define BUFFER_MUST_BE_LOCKED(p) +#define BUFFER_MUST_BE_UNLOCKED(p) +#endif + +#define LOCK_BUFFER(p) { \ + BUFFER_MUST_BE_UNLOCKED(p); \ + CriticalSection_Enter(&(p)->cs); \ + (p)->csWasEntered = True; } + +#define UNLOCK_BUFFER(p) { \ + BUFFER_MUST_BE_LOCKED(p); \ + CriticalSection_Leave(&(p)->cs); \ + (p)->csWasEntered = False; } + + +MY_NO_INLINE +static UInt32 MtSync_GetNextBlock(CMtSync *p) { + UInt32 numBlocks = 0; if (p->needStart) { + BUFFER_MUST_BE_UNLOCKED(p) p->numProcessedBlocks = 1; p->needStart = False; p->stopWriting = False; p->exit = False; - Event_Reset(&p->wasStarted); Event_Reset(&p->wasStopped); - Event_Set(&p->canStart); - Event_Wait(&p->wasStarted); - - // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder); } else { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; - p->numProcessedBlocks++; + UNLOCK_BUFFER(p) + // we free current block + numBlocks = p->numProcessedBlocks++; Semaphore_Release1(&p->freeSemaphore); } + + // buffer is UNLOCKED here Semaphore_Wait(&p->filledSemaphore); - CriticalSection_Enter(&p->cs); - p->csWasEntered = True; + LOCK_BUFFER(p); + return numBlocks; } -/* MtSync_StopWriting must be called if Writing was started */ +/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */ + +MY_NO_INLINE static void MtSync_StopWriting(CMtSync *p) { - UInt32 myNumBlocks = p->numProcessedBlocks; if (!Thread_WasCreated(&p->thread) || p->needStart) return; - p->stopWriting = True; + + PRF(printf("\nMtSync_StopWriting %p\n", p)); + if (p->csWasEntered) { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; + /* we don't use buffer in this thread after StopWriting(). + So we UNLOCK buffer. + And we restore default UNLOCKED state for stopped thread */ + UNLOCK_BUFFER(p) } - Semaphore_Release1(&p->freeSemaphore); - - Event_Wait(&p->wasStopped); - while (myNumBlocks++ != p->numProcessedBlocks) - { - Semaphore_Wait(&p->filledSemaphore); - Semaphore_Release1(&p->freeSemaphore); - } + /* We send (p->stopWriting) message and release freeSemaphore + to free current block. + So the thread will see (p->stopWriting) at some + iteration after Wait(freeSemaphore). + The thread doesn't need to fill all avail free blocks, + so we can get fast thread stop. + */ + + p->stopWriting = True; + Semaphore_Release1(&p->freeSemaphore); // check semaphore count !!! + + PRF(printf("\nMtSync_StopWriting %p : Event_Wait(&p->wasStopped)\n", p)); + Event_Wait(&p->wasStopped); + PRF(printf("\nMtSync_StopWriting %p : Event_Wait() finsihed\n", p)); + + /* 21.03 : we don't restore samaphore counters here. + We will recreate and reinit samaphores in next start */ + p->needStart = True; } + +MY_NO_INLINE static void MtSync_Destruct(CMtSync *p) { + PRF(printf("\nMtSync_Destruct %p\n", p)); + if (Thread_WasCreated(&p->thread)) { + /* we want thread to be in Stopped state before sending EXIT command. + note: stop(btSync) will stop (htSync) also */ MtSync_StopWriting(p); + /* thread in Stopped state here : (p->needStart == true) */ p->exit = True; - if (p->needStart) - Event_Set(&p->canStart); - Thread_Wait(&p->thread); - Thread_Close(&p->thread); + // if (p->needStart) // it's (true) + Event_Set(&p->canStart); // we send EXIT command to thread + Thread_Wait_Close(&p->thread); // we wait thread finishing } + if (p->csWasInitialized) { CriticalSection_Delete(&p->cs); p->csWasInitialized = False; } + p->csWasEntered = False; Event_Close(&p->canStart); - Event_Close(&p->wasStarted); Event_Close(&p->wasStopped); Semaphore_Close(&p->freeSemaphore); Semaphore_Close(&p->filledSemaphore); @@ -99,80 +217,251 @@ static void MtSync_Destruct(CMtSync *p) p->wasCreated = False; } -#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) +// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } +// we want to get real system error codes here instead of SZ_ERROR_THREAD +#define RINOK_THREAD(x) RINOK(x) + + +// call it before each new file (when new starting is required): +MY_NO_INLINE +static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks) { + WRes wres; + // BUFFER_MUST_BE_UNLOCKED(p) + if (!p->needStart || p->csWasEntered) + return SZ_ERROR_FAIL; + wres = Semaphore_OptCreateInit(&p->freeSemaphore, numBlocks, numBlocks); + if (wres == 0) + wres = Semaphore_OptCreateInit(&p->filledSemaphore, 0, numBlocks); + return MY_SRes_HRESULT_FROM_WRes(wres); +} + + +static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) +{ + WRes wres; + if (p->wasCreated) return SZ_OK; RINOK_THREAD(CriticalSection_Init(&p->cs)); p->csWasInitialized = True; + p->csWasEntered = False; RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted)); RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)); - - RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks)); - RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks)); p->needStart = True; - - RINOK_THREAD(Thread_Create(&p->thread, startAddress, obj)); + p->exit = True; /* p->exit is unused before (canStart) Event. + But in case of some unexpected code failure we will get fast exit from thread */ + + // return ERROR_TOO_MANY_POSTS; // for debug + // return EINVAL; // for debug + + if (p->affinity != 0) + wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); + else + wres = Thread_Create(&p->thread, startAddress, obj); + + RINOK_THREAD(wres); p->wasCreated = True; return SZ_OK; } -static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) + +MY_NO_INLINE +static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { - SRes res = MtSync_Create2(p, startAddress, obj, numBlocks); - if (res != SZ_OK) - MtSync_Destruct(p); - return res; + const WRes wres = MtSync_Create_WRes(p, startAddress, obj); + if (wres == 0) + return 0; + MtSync_Destruct(p); + return MY_SRes_HRESULT_FROM_WRes(wres); } -void MtSync_Init(CMtSync *p) { p->needStart = True; } + +// ---------- HASH THREAD ---------- #define kMtMaxValForNormalize 0xFFFFFFFF +// #define kMtMaxValForNormalize ((1 << 21)) // for debug +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#ifdef MY_CPU_LE_UNALIGN + #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8) +#else + #define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16)) +#endif + +#define GetHeads_DECL(name) \ + static void GetHeads ## name(const Byte *p, UInt32 pos, \ + UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) + +#define GetHeads_LOOP(v) \ + for (; numHeads != 0; numHeads--) { \ + const UInt32 value = (v); \ + p++; \ + *heads++ = pos - hash[value]; \ + hash[value] = pos++; } #define DEF_GetHeads2(name, v, action) \ - static void GetHeads ## name(const Byte *p, UInt32 pos, \ - UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) \ - { action; for (; numHeads != 0; numHeads--) { \ - const UInt32 value = (v); p++; *heads++ = pos - hash[value]; hash[value] = pos++; } } - + GetHeads_DECL(name) { action \ + GetHeads_LOOP(v) } + #define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;) -DEF_GetHeads2(2, (p[0] | ((UInt32)p[1] << 8)), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) -DEF_GetHeads(3, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8)) & hashMask) -DEF_GetHeads(4, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5)) & hashMask) -DEF_GetHeads(4b, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ ((UInt32)p[3] << 16)) & hashMask) -/* DEF_GetHeads(5, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5) ^ (crc[p[4]] << 3)) & hashMask) */ +DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +DEF_GetHeads(3, (crc[p[0]] ^ GetUi16(p + 1)) & hashMask) +DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +// BT3 is not good for crc collisions for big hashMask values. + +/* +GetHeads_DECL(3b) +{ + UNUSED_VAR(hashMask); + UNUSED_VAR(crc); + { + const Byte *pLim = p + numHeads; + if (numHeads == 0) + return; + pLim--; + while (p < pLim) + { + UInt32 v1 = GetUi32(p); + UInt32 v0 = v1 & 0xFFFFFF; + UInt32 h0, h1; + p += 2; + v1 >>= 8; + h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++; + h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++; + heads += 2; + } + if (p == pLim) + { + UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16); + *heads = pos - hash[v0]; + hash[v0] = pos; + } + } +} +*/ + +/* +GetHeads_DECL(4) +{ + unsigned sh = 0; + UNUSED_VAR(crc) + while ((hashMask & 0x80000000) == 0) + { + hashMask <<= 1; + sh++; + } + GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh) +} +#define GetHeads4b GetHeads4 +*/ + +#define USE_GetHeads_LOCAL_CRC + +#ifdef USE_GetHeads_LOCAL_CRC + +GetHeads_DECL(4) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + // crc1[i] = rotlFixed(v, 8) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(4b) +{ + UInt32 crc0[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + crc0[i] = crc[i] & hashMask; + } + GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p)) +} + +GetHeads_DECL(5) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + UInt32 crc2[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + crc2[i] = (v << kLzHash_CrcShift_2) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(5b) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p)) +} + +#else + +DEF_GetHeads(4, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask) +DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask) +DEF_GetHeads(5, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask) +DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask) + +#endif + static void HashThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->hashSync; + PRF(printf("\nHashThreadFunc\n")); + for (;;) { - UInt32 numProcessedBlocks = 0; + UInt32 blockIndex = 0; + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart)\n")); Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart) : after \n")); + if (p->exit) + { + PRF(printf("\nHashThreadFunc : exit \n")); + return; + } - MatchFinder_Init_HighHash(mt->MatchFinder); + MatchFinder_Init_HighHash(MF(mt)); for (;;) { - if (p->exit) - return; - if (p->stopWriting) - { - p->numProcessedBlocks = numProcessedBlocks; - Event_Set(&p->wasStopped); - break; - } + PRF(printf("Hash thread block = %d pos = %d\n", (unsigned)blockIndex, mt->MatchFinder->pos)); { - CMatchFinder *mf = mt->MatchFinder; + CMatchFinder *mf = MF(mt); if (MatchFinder_NeedMove(mf)) { CriticalSection_Enter(&mt->btSync.cs); @@ -185,194 +474,178 @@ static void HashThreadFunc(CMatchFinderMt *mt) mt->pointerToCurPos -= offset; mt->buffer -= offset; } - CriticalSection_Leave(&mt->btSync.cs); CriticalSection_Leave(&mt->hashSync.cs); + CriticalSection_Leave(&mt->btSync.cs); continue; } Semaphore_Wait(&p->freeSemaphore); + if (p->exit) // exit is unexpected here. But we check it here for some failure case + return; + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) + if (p->stopWriting) + break; + MatchFinder_ReadIfRequired(mf); - if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize)) { - UInt32 subValue = (mf->pos - mf->historySize - 1); - MatchFinder_ReduceOffsets(mf, subValue); - MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); - } - { - UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize; - UInt32 num = mf->streamPos - mf->pos; + UInt32 *heads = mt->hashBuf + GET_HASH_BLOCK_OFFSET(blockIndex++); + UInt32 num = Inline_MatchFinder_GetNumAvailableBytes(mf); heads[0] = 2; heads[1] = num; + + /* heads[1] contains the number of avail bytes: + if (avail < mf->numHashBytes) : + { + it means that stream was finished + HASH_THREAD and BT_TREAD must move position for heads[1] (avail) bytes. + HASH_THREAD doesn't stop, + HASH_THREAD fills only the header (2 numbers) for all next blocks: + {2, NumHashBytes - 1}, {2,0}, {2,0}, ... , {2,0} + } + else + { + HASH_THREAD and BT_TREAD must move position for (heads[0] - 2) bytes; + } + */ + if (num >= mf->numHashBytes) { num = num - mf->numHashBytes + 1; if (num > kMtHashBlockSize - 2) num = kMtHashBlockSize - 2; - mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); + + if (mf->pos > (UInt32)kMtMaxValForNormalize - num) + { + const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + Inline_MatchFinder_ReduceOffsets(mf, subValue); + MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); + } + heads[0] = 2 + num; + mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); } - mf->pos += num; + + mf->pos += num; // wrap over zero is allowed at the end of stream mf->buffer += num; } } Semaphore_Release1(&p->filledSemaphore); - } - } + } // for() processing end + + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); + } // for() thread end } -static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p) -{ - MtSync_GetNextBlock(&p->hashSync); - p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize; - p->hashBufPosLimit += p->hashBuf[p->hashBufPos++]; - p->hashNumAvail = p->hashBuf[p->hashBufPos++]; -} -#define kEmptyHashValue 0 + + +// ---------- BT THREAD ---------- + +/* we use one variable instead of two (cyclicBufferPos == pos) before CyclicBuf wrap. + here we define fixed offset of (p->pos) from (p->cyclicBufferPos) */ +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug #define MFMT_GM_INLINE #ifdef MFMT_GM_INLINE /* - we use size_t for _cyclicBufferPos instead of UInt32 + we use size_t for (pos) instead of UInt32 to eliminate "movsx" BUG in old MSVC x64 compiler. */ -MY_NO_INLINE -static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son, - size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, - UInt32 *distances, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes) -{ - do - { - UInt32 *_distances = ++distances; - UInt32 delta = *hash++; - CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; - CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); - unsigned len0 = 0, len1 = 0; - UInt32 cutValue = _cutValue; - unsigned maxLen = (unsigned)_maxLen; - - /* - if (size > 1) - { - UInt32 delta = *hash; - if (delta < _cyclicBufferSize) - { - UInt32 cyc1 = _cyclicBufferPos + 1; - CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1); - Byte b = *(cur + 1 - delta); - _distances[0] = pair[0]; - _distances[1] = b; - } - } - */ - if (cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - } - else - for(;;) - { - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1); - const Byte *pb = cur - delta; - unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = *pair; - if (pb[len] == cur[len]) - { - if (++len != lenLimit && pb[len] == cur[len]) - while (++len != lenLimit) - if (pb[len] != cur[len]) - break; - if (maxLen < len) - { - maxLen = len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; - if (len == lenLimit) - { - UInt32 pair1 = pair[1]; - *ptr1 = pair0; - *ptr0 = pair1; - break; - } - } - } - { - UInt32 curMatch = pos - delta; - // delta = pos - *pair; - // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31]; - if (pb[len] < cur[len]) - { - delta = pos - pair[1]; - *ptr1 = curMatch; - ptr1 = pair + 1; - len1 = len; - } - else - { - delta = pos - *pair; - *ptr0 = curMatch; - ptr0 = pair; - len0 = len; - } - } - } - if (--cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - break; - } - } - pos++; - _cyclicBufferPos++; - cur++; - { - UInt32 num = (UInt32)(distances - _distances); - _distances[-1] = num; - } - } - while (distances < limit && --size != 0); - *posRes = pos; - return distances; -} +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); #endif - -static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) +static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) { UInt32 numProcessed = 0; UInt32 curPos = 2; - UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); // * 2 - distances[1] = p->hashNumAvail; + /* GetMatchesSpec() functions don't create (len = 1) + in [len, dist] match pairs, if (p->numHashBytes >= 2) + Also we suppose here that (matchMaxLen >= 2). + So the following code for (reserve) is not required + UInt32 reserve = (p->matchMaxLen * 2); + const UInt32 kNumHashBytes_Max = 5; // BT_HASH_BYTES_MAX + if (reserve < kNumHashBytes_Max - 1) + reserve = kNumHashBytes_Max - 1; + const UInt32 limit = kMtBtBlockSize - (reserve); + */ + + const UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); + + d[1] = p->hashNumAvail; + + if (p->failure_BT) + { + // printf("\n == 1 BtGetMatches() p->failure_BT\n"); + d[0] = 0; + // d[1] = 0; + return; + } while (curPos < limit) { if (p->hashBufPos == p->hashBufPosLimit) { - MatchFinderMt_GetNextBlock_Hash(p); - distances[1] = numProcessed + p->hashNumAvail; - if (p->hashNumAvail >= p->numHashBytes) + // MatchFinderMt_GetNextBlock_Hash(p); + UInt32 avail; + { + const UInt32 bi = MtSync_GetNextBlock(&p->hashSync); + const UInt32 k = GET_HASH_BLOCK_OFFSET(bi); + const UInt32 *h = p->hashBuf + k; + avail = h[1]; + p->hashBufPosLimit = k + h[0]; + p->hashNumAvail = avail; + p->hashBufPos = k + 2; + } + + { + /* we must prevent UInt32 overflow for avail total value, + if avail was increased with new hash block */ + UInt32 availSum = numProcessed + avail; + if (availSum < numProcessed) + availSum = (UInt32)(Int32)-1; + d[1] = availSum; + } + + if (avail >= p->numHashBytes) continue; - distances[0] = curPos + p->hashNumAvail; - distances += curPos; - for (; p->hashNumAvail != 0; p->hashNumAvail--) - *distances++ = 0; + + // if (p->hashBufPos != p->hashBufPosLimit) exit(1); + + /* (avail < p->numHashBytes) + It means that stream was finished. + And (avail) - is a number of remaining bytes, + we fill (d) for (avail) bytes for LZ_THREAD (receiver). + but we don't update (p->pos) and (p->cyclicBufferPos) here in BT_THREAD */ + + /* here we suppose that we have space enough: + (kMtBtBlockSize - curPos >= p->hashNumAvail) */ + p->hashNumAvail = 0; + d[0] = curPos + avail; + d += curPos; + for (; avail != 0; avail--) + *d++ = 0; return; } { UInt32 size = p->hashBufPosLimit - p->hashBufPos; - UInt32 lenLimit = p->matchMaxLen; UInt32 pos = p->pos; UInt32 cyclicBufferPos = p->cyclicBufferPos; + UInt32 lenLimit = p->matchMaxLen; if (lenLimit >= p->hashNumAvail) lenLimit = p->hashNumAvail; { @@ -384,10 +657,18 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) size = size2; } + if (pos > (UInt32)kMtMaxValForNormalize - size) + { + const UInt32 subValue = (pos - p->cyclicBufferSize); // & ~(UInt32)(kNormalizeAlign - 1); + pos -= subValue; + p->pos = pos; + MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); + } + #ifndef MFMT_GM_INLINE while (curPos < limit && size-- != 0) { - UInt32 *startDistances = distances + curPos; + UInt32 *startDistances = d + curPos; UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++], pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, startDistances + 1, p->numHashBytes - 1) - startDistances); @@ -399,81 +680,112 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) } #else { - UInt32 posRes; - curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, - distances + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, - distances + limit, - size, &posRes) - distances); - p->hashBufPos += posRes - pos; - cyclicBufferPos += posRes - pos; - p->buffer += posRes - pos; - pos = posRes; + UInt32 posRes = pos; + const UInt32 *d_end; + { + d_end = GetMatchesSpecN_2( + p->buffer + lenLimit - 1, + pos, p->buffer, p->son, p->cutValue, d + curPos, + p->numHashBytes - 1, p->hashBuf + p->hashBufPos, + d + limit, p->hashBuf + p->hashBufPos + size, + cyclicBufferPos, p->cyclicBufferSize, + &posRes); + } + { + if (!d_end) + { + // printf("\n == 2 BtGetMatches() p->failure_BT\n"); + // internal data failure + p->failure_BT = True; + d[0] = 0; + // d[1] = 0; + return; + } + } + curPos = (UInt32)(d_end - d); + { + const UInt32 processed = posRes - pos; + pos = posRes; + p->hashBufPos += processed; + cyclicBufferPos += processed; + p->buffer += processed; + } } #endif - numProcessed += pos - p->pos; - p->hashNumAvail -= pos - p->pos; - p->pos = pos; + { + const UInt32 processed = pos - p->pos; + numProcessed += processed; + p->hashNumAvail -= processed; + p->pos = pos; + } if (cyclicBufferPos == p->cyclicBufferSize) cyclicBufferPos = 0; p->cyclicBufferPos = cyclicBufferPos; } } - distances[0] = curPos; + d[0] = curPos; } + static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex) { CMtSync *sync = &p->hashSync; + + BUFFER_MUST_BE_UNLOCKED(sync) + if (!sync->needStart) { - CriticalSection_Enter(&sync->cs); - sync->csWasEntered = True; + LOCK_BUFFER(sync) } - BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize); + BtGetMatches(p, p->btBuf + GET_BT_BLOCK_OFFSET(globalBlockIndex)); + + /* We suppose that we have called GetNextBlock() from start. + So buffer is LOCKED */ - if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize) - { - UInt32 subValue = p->pos - p->cyclicBufferSize; - MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); - p->pos -= subValue; - } - - if (!sync->needStart) - { - CriticalSection_Leave(&sync->cs); - sync->csWasEntered = False; - } + UNLOCK_BUFFER(sync) } -void BtThreadFunc(CMatchFinderMt *mt) + +MY_NO_INLINE +static void BtThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->btSync; for (;;) { UInt32 blockIndex = 0; Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + for (;;) { + PRF(printf(" BT thread block = %d pos = %d\n", (unsigned)blockIndex, mt->pos)); + /* (p->exit == true) is possible after (p->canStart) at first loop iteration + and is unexpected after more Wait(freeSemaphore) iterations */ if (p->exit) return; - if (p->stopWriting) - { - p->numProcessedBlocks = blockIndex; - MtSync_StopWriting(&mt->hashSync); - Event_Set(&p->wasStopped); - break; - } + Semaphore_Wait(&p->freeSemaphore); + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) + if (p->stopWriting) + break; + BtFillBlock(mt, blockIndex++); + Semaphore_Release1(&p->filledSemaphore); } + + // we stop HASH_THREAD here + MtSync_StopWriting(&mt->hashSync); + + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); } } + void MatchFinderMt_Construct(CMatchFinderMt *p) { p->hashBuf = NULL; @@ -489,16 +801,39 @@ static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc) void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc) { - MtSync_Destruct(&p->hashSync); + /* + HASH_THREAD can use CriticalSection(s) btSync.cs and hashSync.cs. + So we must be sure that HASH_THREAD will not use CriticalSection(s) + after deleting CriticalSection here. + + we call ReleaseStream(p) + that calls StopWriting(btSync) + that calls StopWriting(hashSync), if it's required to stop HASH_THREAD. + after StopWriting() it's safe to destruct MtSync(s) in any order */ + + MatchFinderMt_ReleaseStream(p); + MtSync_Destruct(&p->btSync); + MtSync_Destruct(&p->hashSync); + + LOG_ITER( + printf("\nTree %9d * %7d iter = %9d = sum : bytes = %9d\n", + (UInt32)(g_NumIters_Tree / 1000), + (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)), + (UInt32)(g_NumIters_Loop / 1000), + (UInt32)(g_NumIters_Bytes / 1000) + )); + MatchFinderMt_FreeMem(p, alloc); } + #define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks) #define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) + +static THREAD_FUNC_DECL HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } +static THREAD_FUNC_DECL BtThreadFunc2(void *p) { Byte allocaDummy[0x180]; unsigned i = 0; @@ -509,16 +844,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) return 0; } + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->historySize = historySize; if (kMtBtBlockSize <= matchMaxLen * 4) return SZ_ERROR_PARAM; if (!p->hashBuf) { - p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32)); + p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, ((size_t)kHashBufferSize + (size_t)kBtBufferSize) * sizeof(UInt32)); if (!p->hashBuf) return SZ_ERROR_MEM; p->btBuf = p->hashBuf + kHashBufferSize; @@ -528,253 +864,457 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc)) return SZ_ERROR_MEM; - RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks)); - RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks)); + RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p)); + RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p)); return SZ_OK; } -/* Call it after ReleaseStream / SetStream */ + +SRes MatchFinderMt_InitMt(CMatchFinderMt *p) +{ + RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks)); + return MtSync_Init(&p->btSync, kMtBtNumBlocks); +} + + static void MatchFinderMt_Init(CMatchFinderMt *p) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->btBufPos = - p->btBufPosLimit = 0; + p->btBufPosLimit = NULL; p->hashBufPos = p->hashBufPosLimit = 0; + p->hashNumAvail = 0; // 21.03 + + p->failure_BT = False; /* Init without data reading. We don't want to read data in this thread */ - MatchFinder_Init_3(mf, False); + MatchFinder_Init_4(mf); + MatchFinder_Init_LowHash(mf); p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf); p->btNumAvailBytes = 0; - p->lzPos = p->historySize + 1; + p->failure_LZ_BT = False; + // p->failure_LZ_LZ = False; + + p->lzPos = + 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug p->hash = mf->hash; p->fixedHashSize = mf->fixedHashSize; + // p->hash4Mask = mf->hash4Mask; p->crc = mf->crc; + // memcpy(p->crc, mf->crc, sizeof(mf->crc)); p->son = mf->son; p->matchMaxLen = mf->matchMaxLen; p->numHashBytes = mf->numHashBytes; - p->pos = mf->pos; - p->buffer = mf->buffer; - p->cyclicBufferPos = mf->cyclicBufferPos; + + /* (mf->pos) and (mf->streamPos) were already initialized to 1 in MatchFinder_Init_4() */ + // mf->streamPos = mf->pos = 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug + + /* we must init (p->pos = mf->pos) for BT, because + BT code needs (p->pos == delta_value_for_empty_hash_record == mf->pos) */ + p->pos = mf->pos; // do not change it + + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); p->cyclicBufferSize = mf->cyclicBufferSize; + p->buffer = mf->buffer; p->cutValue = mf->cutValue; + // p->son[0] = p->son[1] = 0; // unused: to init skipped record for speculated accesses. } + /* ReleaseStream is required to finish multithreading */ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p) { + // Sleep(1); // for debug MtSync_StopWriting(&p->btSync); + // Sleep(200); // for debug /* p->MatchFinder->ReleaseStream(); */ } -static void MatchFinderMt_Normalize(CMatchFinderMt *p) + +MY_NO_INLINE +static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) { - MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize); - p->lzPos = p->historySize + 1; + if (p->failure_LZ_BT) + p->btBufPos = p->failureBuf; + else + { + const UInt32 bi = MtSync_GetNextBlock(&p->btSync); + const UInt32 *bt = p->btBuf + GET_BT_BLOCK_OFFSET(bi); + { + const UInt32 numItems = bt[0]; + p->btBufPosLimit = bt + numItems; + p->btNumAvailBytes = bt[1]; + p->btBufPos = bt + 2; + if (numItems < 2 || numItems > kMtBtBlockSize) + { + p->failureBuf[0] = 0; + p->btBufPos = p->failureBuf; + p->btBufPosLimit = p->failureBuf + 1; + p->failure_LZ_BT = True; + // p->btNumAvailBytes = 0; + /* we don't want to decrease AvailBytes, that was load before. + that can be unxepected for the code that have loaded anopther value before */ + } + } + + if (p->lzPos >= (UInt32)kMtMaxValForNormalize - (UInt32)kMtBtBlockSize) + { + /* we don't check (lzPos) over exact avail bytes in (btBuf). + (fixedHashSize) is small, so normalization is fast */ + const UInt32 subValue = (p->lzPos - p->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + p->lzPos -= subValue; + MatchFinder_Normalize3(subValue, p->hash, p->fixedHashSize); + } + } + return p->btNumAvailBytes; } -static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) -{ - UInt32 blockIndex; - MtSync_GetNextBlock(&p->btSync); - blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask); - p->btBufPosLimit = p->btBufPos = blockIndex * kMtBtBlockSize; - p->btBufPosLimit += p->btBuf[p->btBufPos++]; - p->btNumAvailBytes = p->btBuf[p->btBufPos++]; - if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize) - MatchFinderMt_Normalize(p); -} + static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) { return p->pointerToCurPos; } + #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); + static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) { - GET_NEXT_BLOCK_IF_REQUIRED; - return p->btNumAvailBytes; + if (p->btBufPos != p->btBufPosLimit) + return p->btNumAvailBytes; + return MatchFinderMt_GetNextBlock_Bt(p); } -static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) + +// #define CHECK_FAILURE_LZ(_match_, _pos_) if (_match_ >= _pos_) { p->failure_LZ_LZ = True; return d; } +#define CHECK_FAILURE_LZ(_match_, _pos_) + +static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, curMatch2; + UInt32 h2, c2; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH2_CALC - curMatch2 = hash[h2]; - hash[h2] = lzPos; + c2 = hash[h2]; + hash[h2] = m; - if (curMatch2 >= matchMinPos) - if (cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) + { + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - *distances++ = 2; - *distances++ = lzPos - curMatch2 - 1; + *d++ = 2; + *d++ = m - c2 - 1; } + } - return distances; + return d; } -static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) +static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, h3, curMatch2, curMatch3; + UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH3_CALC - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - distances[0] = 3; - return distances + 2; + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + return d + 2; + } + d[0] = 2; + d += 2; } - distances[0] = 2; - distances += 2; } - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) + if (c3 >= matchMinPos) { - *distances++ = 3; - *distances++ = lzPos - curMatch3 - 1; + CHECK_FAILURE_LZ(c3, m) + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } } - return distances; + return d; } -/* -static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) -{ - UInt32 h2, h3, h4, curMatch2, curMatch3, curMatch4; - UInt32 *hash = p->hash; - const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; - MT_HASH4_CALC - - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; - curMatch4 = (hash + kFix4HashSize)[h4]; - - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; - (hash + kFix4HashSize)[h4] = lzPos; - - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) - { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) - { - distances[0] = (cur[(ptrdiff_t)curMatch2 - lzPos + 3] == cur[3]) ? 4 : 3; - return distances + 2; - } - distances[0] = 2; - distances += 2; - } - - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) - { - distances[1] = lzPos - curMatch3 - 1; - if (cur[(ptrdiff_t)curMatch3 - lzPos + 3] == cur[3]) - { - distances[0] = 4; - return distances + 2; - } - distances[0] = 3; - distances += 2; - } - - if (curMatch4 >= matchMinPos) - if ( - cur[(ptrdiff_t)curMatch4 - lzPos] == cur[0] && - cur[(ptrdiff_t)curMatch4 - lzPos + 3] == cur[3] - ) - { - *distances++ = 4; - *distances++ = lzPos - curMatch4 - 1; - } - - return distances; -} -*/ #define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++; -static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *distances) +/* +static +UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) { - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; - p->btNumAvailBytes--; + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + UInt32 matchMinPos; + UInt32 avail = p->btNumAvailBytes - 1; + p->btBufPos = btLim; + { - UInt32 i; - for (i = 0; i < len; i += 2) + p->btNumAvailBytes = avail; + + #define BT_HASH_BYTES_MAX 5 + + matchMinPos = p->lzPos; + + if (len != 0) + matchMinPos -= bt[1]; + else if (avail < (BT_HASH_BYTES_MAX - 1) - 1) { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances[0] = v0; - distances[1] = v1; - distances += 2; + INCREASE_LZ_POS + return d; + } + else + { + const UInt32 hs = p->historySize; + if (matchMinPos > hs) + matchMinPos -= hs; + else + matchMinPos = 1; } } + + for (;;) + { + + UInt32 h2, h3, c2, c3; + UInt32 *hash = p->hash; + const Byte *cur = p->pointerToCurPos; + UInt32 m = p->lzPos; + MT_HASH3_CALC + + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) + { + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + d += 2; + break; + } + // else + { + d[0] = 2; + d += 2; + } + } + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } + break; + } + + if (len != 0) + { + do + { + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; + } + while (bt != btLim); + } INCREASE_LZ_POS - return len; + return d; +} +*/ + + +static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) +{ + UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */; + UInt32 *hash = p->hash; + const Byte *cur = p->pointerToCurPos; + const UInt32 m = p->lzPos; + MT_HASH3_CALC + // MT_HASH4_CALC + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + // c4 = (hash + kFix4HashSize)[h4]; + + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + // (hash + kFix4HashSize)[h4] = m; + + #define _USE_H2 + + #ifdef _USE_H2 + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) + { + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 4 : 3; + // return d + 2; + + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) + { + d[0] = 4; + return d + 2; + } + d[0] = 3; + d += 2; + + #ifdef _USE_H4 + if (c4 >= matchMinPos) + if ( + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] + ) + { + *d++ = 4; + *d++ = m - c4 - 1; + } + #endif + return d; + } + d[0] = 2; + d += 2; + } + #endif + + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + d[1] = m - c3 - 1; + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3]) + { + d[0] = 4; + return d + 2; + } + d[0] = 3; + d += 2; + } + + #ifdef _USE_H4 + if (c4 >= matchMinPos) + if ( + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] + ) + { + *d++ = 4; + *d++ = m - c4 - 1; + } + #endif + + return d; } -static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *distances) -{ - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; +static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) +{ + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + p->btBufPos = btLim; + p->btNumAvailBytes--; + INCREASE_LZ_POS + { + while (bt != btLim) + { + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; + } + } + return d; +} + + + +static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) +{ + const UInt32 *bt = p->btBufPos; + UInt32 len = *bt++; + const UInt32 avail = p->btNumAvailBytes - 1; + p->btNumAvailBytes = avail; + p->btBufPos = bt + len; if (len == 0) { - /* change for bt5 ! */ - if (p->btNumAvailBytes-- >= 4) - len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, distances) - (distances)); + #define BT_HASH_BYTES_MAX 5 + if (avail >= (BT_HASH_BYTES_MAX - 1) - 1) + { + UInt32 m = p->lzPos; + if (m > p->historySize) + m -= p->historySize; + else + m = 1; + d = p->MixMatchesFunc(p, m, d); + } } else { - /* Condition: there are matches in btBuf with length < p->numHashBytes */ - UInt32 *distances2; - p->btNumAvailBytes--; - distances2 = p->MixMatchesFunc(p, p->lzPos - btBuf[1], distances); + /* + first match pair from BinTree: (match_len, match_dist), + (match_len >= numHashBytes). + MixMatchesFunc() inserts only hash matches that are nearer than (match_dist) + */ + d = p->MixMatchesFunc(p, p->lzPos - bt[1], d); + // if (d) // check for failure do { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances2[0] = v0; - distances2[1] = v1; - distances2 += 2; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; } - while ((len -= 2) != 0); - len = (UInt32)(distances2 - (distances)); + while (len -= 2); } INCREASE_LZ_POS - return len; + return d; } #define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; -#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0); +#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) { @@ -803,12 +1343,16 @@ static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) } /* +// MatchFinderMt4_Skip() is similar to MatchFinderMt3_Skip(). +// The difference is that MatchFinderMt3_Skip() updates hash for last 3 bytes of stream. + static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) { SKIP_HEADER_MT(4) - UInt32 h2, h3, h4; - MT_HASH4_CALC - (hash + kFix4HashSize)[h4] = + UInt32 h2, h3; // h4 + MT_HASH3_CALC + // MT_HASH4_CALC + // (hash + kFix4HashSize)[h4] = (hash + kFix3HashSize)[h3] = hash[ h2] = p->lzPos; @@ -816,14 +1360,14 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) } */ -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) { vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; - switch (p->MatchFinder->numHashBytes) + switch (MF(p)->numHashBytes) { case 2: p->GetHeadsFunc = GetHeads2; @@ -832,22 +1376,25 @@ void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; break; case 3: - p->GetHeadsFunc = GetHeads3; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; break; - default: - /* case 4: */ - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4; + case 4: + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; + + // it's fast inline version of GetMatches() + // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; + p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; break; - /* default: - p->GetHeadsFunc = GetHeads5; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip; + vTable->Skip = + (Mf_Skip_Func)MatchFinderMt3_Skip; + // (Mf_Skip_Func)MatchFinderMt4_Skip; break; - */ } } diff --git a/3rdparty/7z/src/LzFindMt.h b/3rdparty/7z/src/LzFindMt.h index fdd17008c2..ee9a1b6fe9 100644 --- a/3rdparty/7z/src/LzFindMt.h +++ b/3rdparty/7z/src/LzFindMt.h @@ -1,5 +1,5 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2018-07-04 : Igor Pavlov : Public domain */ +2021-07-12 : Igor Pavlov : Public domain */ #ifndef __LZ_FIND_MT_H #define __LZ_FIND_MT_H @@ -9,31 +9,26 @@ EXTERN_C_BEGIN -#define kMtHashBlockSize (1 << 13) -#define kMtHashNumBlocks (1 << 3) -#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1) - -#define kMtBtBlockSize (1 << 14) -#define kMtBtNumBlocks (1 << 6) -#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1) - typedef struct _CMtSync { + UInt32 numProcessedBlocks; + CThread thread; + UInt64 affinity; + BoolInt wasCreated; BoolInt needStart; + BoolInt csWasInitialized; + BoolInt csWasEntered; + BoolInt exit; BoolInt stopWriting; - CThread thread; CAutoResetEvent canStart; - CAutoResetEvent wasStarted; CAutoResetEvent wasStopped; CSemaphore freeSemaphore; CSemaphore filledSemaphore; - BoolInt csWasInitialized; - BoolInt csWasEntered; CCriticalSection cs; - UInt32 numProcessedBlocks; + // UInt32 numBlocks_Sent; } CMtSync; typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); @@ -49,18 +44,23 @@ typedef struct _CMatchFinderMt /* LZ */ const Byte *pointerToCurPos; UInt32 *btBuf; - UInt32 btBufPos; - UInt32 btBufPosLimit; + const UInt32 *btBufPos; + const UInt32 *btBufPosLimit; UInt32 lzPos; UInt32 btNumAvailBytes; UInt32 *hash; UInt32 fixedHashSize; + // UInt32 hash4Mask; UInt32 historySize; const UInt32 *crc; Mf_Mix_Matches MixMatchesFunc; - + UInt32 failure_LZ_BT; // failure in BT transfered to LZ + // UInt32 failure_LZ_LZ; // failure in LZ tables + UInt32 failureBuf[1]; + // UInt32 crc[256]; + /* LZ + BT */ CMtSync btSync; Byte btDummy[kMtCacheLineDummy]; @@ -70,6 +70,8 @@ typedef struct _CMatchFinderMt UInt32 hashBufPos; UInt32 hashBufPosLimit; UInt32 hashNumAvail; + UInt32 failure_BT; + CLzRef *son; UInt32 matchMaxLen; @@ -77,7 +79,7 @@ typedef struct _CMatchFinderMt UInt32 pos; const Byte *buffer; UInt32 cyclicBufferPos; - UInt32 cyclicBufferSize; /* it must be historySize + 1 */ + UInt32 cyclicBufferSize; /* it must be = (historySize + 1) */ UInt32 cutValue; /* BT + Hash */ @@ -87,13 +89,19 @@ typedef struct _CMatchFinderMt /* Hash */ Mf_GetHeads GetHeadsFunc; CMatchFinder *MatchFinder; + // CMatchFinder MatchFinder; } CMatchFinderMt; +// only for Mt part void MatchFinderMt_Construct(CMatchFinderMt *p); void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc); + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc); -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable); +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable); + +/* call MatchFinderMt_InitMt() before IMatchFinder::Init() */ +SRes MatchFinderMt_InitMt(CMatchFinderMt *p); void MatchFinderMt_ReleaseStream(CMatchFinderMt *p); EXTERN_C_END diff --git a/3rdparty/7z/src/LzFindOpt.c b/3rdparty/7z/src/LzFindOpt.c new file mode 100644 index 0000000000..dbb6ad9fc9 --- /dev/null +++ b/3rdparty/7z/src/LzFindOpt.c @@ -0,0 +1,578 @@ +/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms +2021-07-13 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "CpuArch.h" +#include "LzFind.h" + +// #include "LzFindMt.h" + +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +// #define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +UInt64 g_NumIters_Tree; +UInt64 g_NumIters_Loop; +UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +// ---------- BT THREAD ---------- + +#define USE_SON_PREFETCH +#define USE_LONG_MATCH_OPT + +#define kEmptyHashValue 0 + +// #define CYC_TO_POS_OFFSET 0 + +// #define CYC_TO_POS_OFFSET 1 // for debug + +/* +MY_NO_INLINE +UInt32 * MY_FAST_CALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes) +{ + do + { + UInt32 delta; + if (hash == size) + break; + delta = *hash++; + + if (delta == 0 || delta > (UInt32)pos) + return NULL; + + lenLimit++; + + if (delta == (UInt32)pos) + { + CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2; + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; + CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + + const Byte *len0 = cur, *len1 = cur; + UInt32 cutValue = _cutValue; + const Byte *maxLen = cur + _maxLen; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1); + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + hash++; + pos++; + cur++; + lenLimit++; + { + CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; + #else + const UInt32 p0 = ptr[0 + (diff * 2)]; + const UInt32 p1 = ptr[1 + (diff * 2)]; + ptr[0] = p0; + ptr[1] = p1; + // ptr[0] = ptr[0 + (diff * 2)]; + // ptr[1] = ptr[1 + (diff * 2)]; + #endif + } + // PrintSon(son + 2, pos - 1); + // printf("\npos = %x delta = %x\n", pos, delta); + len++; + *d++ = 2; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + if (delta >= curMatch) + return NULL; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + if (delta >= curMatch) + return NULL; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= pos) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ + +/* define cbs if you use 2 functions. + GetMatchesSpecN_1() : (pos < _cyclicBufferSize) + GetMatchesSpecN_2() : (pos >= _cyclicBufferSize) + + do not define cbs if you use 1 function: + GetMatchesSpecN_2() +*/ + +// #define cbs _cyclicBufferSize + +/* + we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32 + to eliminate "movsx" BUG in old MSVC x64 compiler. +*/ + +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); + +MY_NO_INLINE +UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + lenLimit++; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + + UInt32 cutValue = _cutValue; + const Byte *len0 = cur, *len1 = cur; + const Byte *maxLen = cur + _maxLen; + + // if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // SPEC code + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + *d++ = 2; + *d++ = (UInt32)(lenLimit - cur); + *d++ = delta - 1; + cur++; + lenLimit++; + // SPEC + _cyclicBufferPos++; + { + // SPEC code + CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? cbs : 0)) << 1); + // CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + pos++; + hash++; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } // for() end for long matches + } + #endif + + break; // break from TREE iterations + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + if (delta >= curMatch) + return NULL; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + if (delta >= curMatch) + return NULL; + } + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= cbs) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} + + + +/* +typedef UInt32 uint32plus; // size_t + +UInt32 * MY_FAST_CALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + UInt32 *_distances = ++d; + uint32plus len0 = 0, len1 = 0; + UInt32 cutValue = _cutValue; + uint32plus maxLen = _maxLen; + // lenLimit++; // const Byte *lenLimit = cur + _lenLimit; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + const Byte *pb = cur - delta; + uint32plus len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (pb[len] == cur[len]) + { + if (++len != lenLimit && pb[len] == cur[len]) + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)len; + *d++ = delta - 1; + if (len == lenLimit) + { + { + const UInt32 pair1 = pair[1]; + *ptr0 = pair1; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + } + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + for (;;) + { + *d++ = 2; + *d++ = (UInt32)lenLimit; + *d++ = delta - 1; + _cyclicBufferPos++; + { + CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1); + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + hash++; + pos++; + cur++; + pb++; + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; + if (pb[len] < cur[len]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + { + if (delta >= curMatch) + return NULL; + delta = (UInt32)pos - delta; + if (delta >= cbs + // delta >= _cyclicBufferSize || delta >= pos + || --cutValue == 0) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ diff --git a/3rdparty/7z/src/LzHash.h b/3rdparty/7z/src/LzHash.h index 2191444072..a682f83bec 100644 --- a/3rdparty/7z/src/LzHash.h +++ b/3rdparty/7z/src/LzHash.h @@ -1,57 +1,34 @@ /* LzHash.h -- HASH functions for LZ algorithms -2015-04-12 : Igor Pavlov : Public domain */ +2019-10-30 : Igor Pavlov : Public domain */ #ifndef __LZ_HASH_H #define __LZ_HASH_H +/* + (kHash2Size >= (1 << 8)) : Required + (kHash3Size >= (1 << 16)) : Required +*/ + #define kHash2Size (1 << 10) #define kHash3Size (1 << 16) -#define kHash4Size (1 << 20) +// #define kHash4Size (1 << 20) #define kFix3HashSize (kHash2Size) #define kFix4HashSize (kHash2Size + kHash3Size) -#define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) -#define HASH2_CALC hv = cur[0] | ((UInt32)cur[1] << 8); +/* + We use up to 3 crc values for hash: + crc0 + crc1 << Shift_1 + crc2 << Shift_2 + (Shift_1 = 5) and (Shift_2 = 10) is good tradeoff. + Small values for Shift are not good for collision rate. + Big value for Shift_2 increases the minimum size + of hash table, that will be slow for small files. +*/ -#define HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } - -#define HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - hv = (temp ^ (p->crc[cur[3]] << 5)) & p->hashMask; } - -#define HASH5_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - temp ^= (p->crc[cur[3]] << 5); \ - h4 = temp & (kHash4Size - 1); \ - hv = (temp ^ (p->crc[cur[4]] << 3)) & p->hashMask; } - -/* #define HASH_ZIP_CALC hv = ((cur[0] | ((UInt32)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */ -#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; - - -#define MT_HASH2_CALC \ - h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); - -#define MT_HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } - -#define MT_HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - h4 = (temp ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); } +#define kLzHash_CrcShift_1 5 +#define kLzHash_CrcShift_2 10 #endif diff --git a/3rdparty/7z/src/Lzma2Dec.c b/3rdparty/7z/src/Lzma2Dec.c index 2e631051ba..f9f98095df 100644 --- a/3rdparty/7z/src/Lzma2Dec.c +++ b/3rdparty/7z/src/Lzma2Dec.c @@ -1,5 +1,5 @@ /* Lzma2Dec.c -- LZMA2 Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ /* #define SHOW_DEBUG_INFO */ @@ -93,7 +93,8 @@ void Lzma2Dec_Init(CLzma2Dec *p) LzmaDec_Init(&p->decoder); } -static ELzma2State Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) +// ELzma2State +static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) { switch (p->state) { diff --git a/3rdparty/7z/src/Lzma2DecMt.c b/3rdparty/7z/src/Lzma2DecMt.c index 87d5567ad4..252b5be49f 100644 --- a/3rdparty/7z/src/Lzma2DecMt.c +++ b/3rdparty/7z/src/Lzma2DecMt.c @@ -1,25 +1,25 @@ /* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread -2019-02-02 : Igor Pavlov : Public domain */ +2021-04-01 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO +// #define _7ZIP_ST + #ifdef SHOW_DEBUG_INFO #include #endif +#ifndef _7ZIP_ST #ifdef SHOW_DEBUG_INFO #define PRF(x) x #else #define PRF(x) #endif - #define PRF_STR(s) PRF(printf("\n" s "\n")) -#define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) #define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2)) - -// #define _7ZIP_ST +#endif #include "Alloc.h" @@ -28,10 +28,10 @@ #ifndef _7ZIP_ST #include "MtDec.h" -#endif - #define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) +#endif + void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) { @@ -255,7 +255,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa const unsigned kNumAlignBits = 12; const unsigned kNumCacheLineBits = 7; /* <= kNumAlignBits */ t->alloc.numAlignBits = kNumAlignBits; - t->alloc.offset = ((UInt32)coderIndex * ((1 << 11) + (1 << 8) + (1 << 6))) & ((1 << kNumAlignBits) - (1 << kNumCacheLineBits)); + t->alloc.offset = ((UInt32)coderIndex * (((unsigned)1 << 11) + (1 << 8) + (1 << 6))) & (((unsigned)1 << kNumAlignBits) - ((unsigned)1 << kNumCacheLineBits)); t->alloc.baseAlloc = me->alignOffsetAlloc.baseAlloc; } } @@ -527,7 +527,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, BoolInt *needContinue, BoolInt *canRecode) { CLzma2DecMt *me = (CLzma2DecMt *)pp; @@ -536,12 +536,14 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, const Byte *data = t->outBuf; BoolInt needContinue2 = True; + UNUSED_VAR(src) + UNUSED_VAR(srcSize) + UNUSED_VAR(isCross) + PRF_STR_INT_2("Write", coderIndex, srcSize); *needContinue = False; *canRecode = True; - UNUSED_VAR(src) - UNUSED_VAR(srcSize) if ( // t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK @@ -696,7 +698,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)(p->inBuf), &inLim); // p->readProcessed += inLim; // inLim -= 5; p->readWasFinished = True; // for test if (inLim == 0 || p->readRes != SZ_OK) @@ -838,6 +840,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, p->inProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; *isMT = False; @@ -856,7 +859,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; + IMtDecCallback2 vt; Lzma2DecMt_FreeSt(p); @@ -955,7 +958,12 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, *inProcessed = p->inProcessed; // res = SZ_OK; // for test - if (res == SZ_OK && p->readRes != SZ_OK) + if (res == SZ_ERROR_INPUT_EOF) + { + if (p->readRes != SZ_OK) + res = p->readRes; + } + else if (res == SZ_OK && p->readRes != SZ_OK) res = p->readRes; /* diff --git a/3rdparty/7z/src/Lzma2Enc.c b/3rdparty/7z/src/Lzma2Enc.c index d541477527..c8b114cb48 100644 --- a/3rdparty/7z/src/Lzma2Enc.c +++ b/3rdparty/7z/src/Lzma2Enc.c @@ -1,5 +1,5 @@ /* Lzma2Enc.c -- LZMA2 Encoder -2018-07-04 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -330,7 +330,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -632,15 +632,15 @@ static SRes Lzma2Enc_EncodeMt1( { if (outBuf) { - size_t destPos = *outBufSize; + const size_t destPos = *outBufSize; if (destPos >= outLim) return SZ_ERROR_OUTPUT_EOF; - outBuf[destPos] = 0; + outBuf[destPos] = LZMA2_CONTROL_EOF; // 0 *outBufSize = destPos + 1; } else { - Byte b = 0; + const Byte b = LZMA2_CONTROL_EOF; // 0; if (ISeqOutStream_Write(outStream, &b, 1) != 1) return SZ_ERROR_WRITE; } @@ -780,13 +780,13 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = p->props.numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; p->mtCoder.expectedDataSize = p->expectedDataSize; { SRes res = MtCoder_Code(&p->mtCoder); if (!outStream) - *outBufSize = p->outBuf - outBuf; + *outBufSize = (size_t)(p->outBuf - outBuf); return res; } } diff --git a/3rdparty/7z/src/Lzma86Enc.c b/3rdparty/7z/src/Lzma86Enc.c index 8d35e6dc5e..99397bc5e3 100644 --- a/3rdparty/7z/src/Lzma86Enc.c +++ b/3rdparty/7z/src/Lzma86Enc.c @@ -11,8 +11,6 @@ #include "Bra.h" #include "LzmaEnc.h" -#define SZE_OUT_OVERFLOW SZE_DATA_ERROR - int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen, int level, UInt32 dictSize, int filterMode) { diff --git a/3rdparty/7z/src/LzmaDec.c b/3rdparty/7z/src/LzmaDec.c index 4d1576419f..80b70a9eee 100644 --- a/3rdparty/7z/src/LzmaDec.c +++ b/3rdparty/7z/src/LzmaDec.c @@ -1,5 +1,5 @@ /* LzmaDec.c -- LZMA Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2021-04-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -13,10 +13,12 @@ #define kNumBitModelTotalBits 11 #define kBitModelTotal (1 << kNumBitModelTotalBits) -#define kNumMoveBits 5 #define RC_INIT_SIZE 5 +#ifndef _LZMA_DEC_OPT + +#define kNumMoveBits 5 #define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } #define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) @@ -62,9 +64,10 @@ probLit = prob + (offs + bit + symbol); \ GET_BIT2(probLit, symbol, offs ^= bit; , ;) +#endif // _LZMA_DEC_OPT -#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } +#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); } #define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define UPDATE_0_CHECK range = bound; @@ -114,6 +117,9 @@ #define kMatchMinLen 2 #define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) +#define kMatchSpecLen_Error_Data (1 << 9) +#define kMatchSpecLen_Error_Fail (kMatchSpecLen_Error_Data - 1) + /* External ASM code needs same CLzmaProb array layout. So don't change it. */ /* (probs_1664) is faster and better for code size at some platforms */ @@ -166,10 +172,12 @@ /* p->remainLen : shows status of LZMA decoder: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished - = kMatchSpecLenStart + 1 : need init range coder - = kMatchSpecLenStart + 2 : need init range coder and state + < kMatchSpecLenStart : the number of bytes to be copied with (p->rep0) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + = kMatchSpecLenStart + 1 : need init range coder + = kMatchSpecLenStart + 2 : need init range coder and state + = kMatchSpecLen_Error_Fail : Internal Code Failure + = kMatchSpecLen_Error_Data + [0 ... 273] : LZMA Data Error */ /* ---------- LZMA_DECODE_REAL ---------- */ @@ -188,23 +196,31 @@ In: { LzmaDec_TryDummy() was called before to exclude LITERAL and MATCH-REP cases. So first symbol can be only MATCH-NON-REP. And if that MATCH-NON-REP symbol - is not END_OF_PAYALOAD_MARKER, then function returns error code. + is not END_OF_PAYALOAD_MARKER, then the function doesn't write any byte to dictionary, + the function returns SZ_OK, and the caller can use (p->remainLen) and (p->reps[0]) later. } Processing: - first LZMA symbol will be decoded in any case - All checks for limits are at the end of main loop, - It will decode new LZMA-symbols while (p->buf < bufLimit && dicPos < limit), + The first LZMA symbol will be decoded in any case. + All main checks for limits are at the end of main loop, + It decodes additional LZMA-symbols while (p->buf < bufLimit && dicPos < limit), RangeCoder is still without last normalization when (p->buf < bufLimit) is being checked. + But if (p->buf < bufLimit), the caller provided at least (LZMA_REQUIRED_INPUT_MAX + 1) bytes for + next iteration before limit (bufLimit + LZMA_REQUIRED_INPUT_MAX), + that is enough for worst case LZMA symbol with one additional RangeCoder normalization for one bit. + So that function never reads bufLimit [LZMA_REQUIRED_INPUT_MAX] byte. Out: RangeCoder is normalized Result: SZ_OK - OK - SZ_ERROR_DATA - Error - p->remainLen: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished + p->remainLen: + < kMatchSpecLenStart : the number of bytes to be copied with (p->reps[0]) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + + SZ_ERROR_DATA - error, when the MATCH-Symbol refers out of dictionary + p->remainLen : undefined + p->reps[*] : undefined */ @@ -316,11 +332,6 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit else { UPDATE_1(prob); - /* - // that case was checked before with kBadRepCode - if (checkDicSize == 0 && processedPos == 0) - return SZ_ERROR_DATA; - */ prob = probs + IsRepG0 + state; IF_BIT_0(prob) { @@ -329,6 +340,13 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit IF_BIT_0(prob) { UPDATE_0(prob); + + // that case was checked before with kBadRepCode + // if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; } + // The caller doesn't allow (dicPos == limit) case here + // so we don't need the following check: + // if (dicPos == limit) { state = state < kNumLitStates ? 9 : 11; len = 1; break; } + dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; processedPos++; @@ -518,8 +536,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + len += kMatchSpecLen_Error_Data + kMatchMinLen; + // len = kMatchSpecLen_Error_Data; + // len += kMatchMinLen; + break; } } @@ -532,8 +552,13 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit if ((rem = limit - dicPos) == 0) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + /* + We stop decoding and return SZ_OK, and we can resume decoding later. + Any error conditions can be tested later in caller code. + For more strict mode we can stop decoding with error + // len += kMatchSpecLen_Error_Data; + */ + break; } curLen = ((rem < len) ? (unsigned)rem : len); @@ -572,7 +597,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit p->buf = buf; p->range = range; p->code = code; - p->remainLen = (UInt32)len; + p->remainLen = (UInt32)len; // & (kMatchSpecLen_Error_Data - 1); // we can write real length for error matches too. p->dicPos = dicPos; p->processedPos = processedPos; p->reps[0] = rep0; @@ -580,40 +605,61 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit p->reps[2] = rep2; p->reps[3] = rep3; p->state = (UInt32)state; - + if (len >= kMatchSpecLen_Error_Data) + return SZ_ERROR_DATA; return SZ_OK; } #endif + + static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) { - if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) + unsigned len = (unsigned)p->remainLen; + if (len == 0 /* || len >= kMatchSpecLenStart */) + return; { - Byte *dic = p->dic; SizeT dicPos = p->dicPos; - SizeT dicBufSize = p->dicBufSize; - unsigned len = (unsigned)p->remainLen; - SizeT rep0 = p->reps[0]; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ - SizeT rem = limit - dicPos; - if (rem < len) - len = (unsigned)(rem); + Byte *dic; + SizeT dicBufSize; + SizeT rep0; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ + { + SizeT rem = limit - dicPos; + if (rem < len) + { + len = (unsigned)(rem); + if (len == 0) + return; + } + } if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) p->checkDicSize = p->prop.dicSize; p->processedPos += (UInt32)len; p->remainLen -= (UInt32)len; - while (len != 0) + dic = p->dic; + rep0 = p->reps[0]; + dicBufSize = p->dicBufSize; + do { - len--; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; } + while (--len); p->dicPos = dicPos; } } +/* +At staring of new stream we have one of the following symbols: + - Literal - is allowed + - Non-Rep-Match - is allowed only if it's end marker symbol + - Rep-Match - is not allowed +We use early check of (RangeCoder:Code) over kBadRepCode to simplify main decoding code +*/ + #define kRange0 0xFFFFFFFF #define kBound0 ((kRange0 >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1)) #define kBadRepCode (kBound0 + (((kRange0 - kBound0) >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1))) @@ -621,69 +667,77 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) #error Stop_Compiling_Bad_LZMA_Check #endif + +/* +LzmaDec_DecodeReal2(): + It calls LZMA_DECODE_REAL() and it adjusts limit according (p->checkDicSize). + +We correct (p->checkDicSize) after LZMA_DECODE_REAL() and in LzmaDec_WriteRem(), +and we support the following state of (p->checkDicSize): + if (total_processed < p->prop.dicSize) then + { + (total_processed == p->processedPos) + (p->checkDicSize == 0) + } + else + (p->checkDicSize == p->prop.dicSize) +*/ + static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) { - do + if (p->checkDicSize == 0) { - SizeT limit2 = limit; - if (p->checkDicSize == 0) - { - UInt32 rem = p->prop.dicSize - p->processedPos; - if (limit - p->dicPos > rem) - limit2 = p->dicPos + rem; - - if (p->processedPos == 0) - if (p->code >= kBadRepCode) - return SZ_ERROR_DATA; - } - - RINOK(LZMA_DECODE_REAL(p, limit2, bufLimit)); - + UInt32 rem = p->prop.dicSize - p->processedPos; + if (limit - p->dicPos > rem) + limit = p->dicPos + rem; + } + { + int res = LZMA_DECODE_REAL(p, limit, bufLimit); if (p->checkDicSize == 0 && p->processedPos >= p->prop.dicSize) p->checkDicSize = p->prop.dicSize; - - LzmaDec_WriteRem(p, limit); + return res; } - while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); - - return 0; } + + typedef enum { - DUMMY_ERROR, /* unexpected end of input stream */ + DUMMY_INPUT_EOF, /* need more input data */ DUMMY_LIT, DUMMY_MATCH, DUMMY_REP } ELzmaDummy; -static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inSize) + +#define IS_DUMMY_END_MARKER_POSSIBLE(dummyRes) ((dummyRes) == DUMMY_MATCH) + +static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byte **bufOut) { UInt32 range = p->range; UInt32 code = p->code; - const Byte *bufLimit = buf + inSize; + const Byte *bufLimit = *bufOut; const CLzmaProb *probs = GET_PROBS; unsigned state = (unsigned)p->state; ELzmaDummy res; + for (;;) { const CLzmaProb *prob; UInt32 bound; unsigned ttt; - unsigned posState = CALC_POS_STATE(p->processedPos, (1 << p->prop.pb) - 1); + unsigned posState = CALC_POS_STATE(p->processedPos, ((unsigned)1 << p->prop.pb) - 1); prob = probs + IsMatch + COMBINED_PS_STATE; IF_BIT_0_CHECK(prob) { UPDATE_0_CHECK - /* if (bufLimit - buf >= 7) return DUMMY_LIT; */ - prob = probs + Literal; if (p->checkDicSize != 0 || p->processedPos != 0) prob += ((UInt32)LZMA_LIT_SIZE * - ((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) + - (p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); + ((((p->processedPos) & (((unsigned)1 << (p->prop.lp)) - 1)) << p->prop.lc) + + ((unsigned)p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); if (state < kNumLitStates) { @@ -735,8 +789,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS IF_BIT_0_CHECK(prob) { UPDATE_0_CHECK; - NORMALIZE_CHECK; - return DUMMY_REP; + break; } else { @@ -812,8 +865,6 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS { unsigned numDirectBits = ((posSlot >> 1) - 1); - /* if (bufLimit - buf >= 8) return DUMMY_MATCH; */ - if (posSlot < kEndPosModelIndex) { prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits); @@ -844,12 +895,15 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS } } } + break; } NORMALIZE_CHECK; + + *bufOut = buf; return res; } - +void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState); void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState) { p->remainLen = kMatchSpecLenStart + 1; @@ -872,16 +926,41 @@ void LzmaDec_Init(CLzmaDec *p) } +/* +LZMA supports optional end_marker. +So the decoder can lookahead for one additional LZMA-Symbol to check end_marker. +That additional LZMA-Symbol can require up to LZMA_REQUIRED_INPUT_MAX bytes in input stream. +When the decoder reaches dicLimit, it looks (finishMode) parameter: + if (finishMode == LZMA_FINISH_ANY), the decoder doesn't lookahead + if (finishMode != LZMA_FINISH_ANY), the decoder lookahead, if end_marker is possible for current position + +When the decoder lookahead, and the lookahead symbol is not end_marker, we have two ways: + 1) Strict mode (default) : the decoder returns SZ_ERROR_DATA. + 2) The relaxed mode (alternative mode) : we could return SZ_OK, and the caller + must check (status) value. The caller can show the error, + if the end of stream is expected, and the (status) is noit + LZMA_STATUS_FINISHED_WITH_MARK or LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK. +*/ + + +#define RETURN__NOT_FINISHED__FOR_FINISH \ + *status = LZMA_STATUS_NOT_FINISHED; \ + return SZ_ERROR_DATA; // for strict mode + // return SZ_OK; // for relaxed mode + + SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT inSize = *srcLen; (*srcLen) = 0; - *status = LZMA_STATUS_NOT_SPECIFIED; if (p->remainLen > kMatchSpecLenStart) { + if (p->remainLen > kMatchSpecLenStart + 2) + return p->remainLen == kMatchSpecLen_Error_Fail ? SZ_ERROR_FAIL : SZ_ERROR_DATA; + for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) p->tempBuf[p->tempBufSize++] = *src++; if (p->tempBufSize != 0 && p->tempBuf[0] != 0) @@ -896,6 +975,12 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr | ((UInt32)p->tempBuf[2] << 16) | ((UInt32)p->tempBuf[3] << 8) | ((UInt32)p->tempBuf[4]); + + if (p->checkDicSize == 0 + && p->processedPos == 0 + && p->code >= kBadRepCode) + return SZ_ERROR_DATA; + p->range = 0xFFFFFFFF; p->tempBufSize = 0; @@ -913,10 +998,21 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr p->remainLen = 0; } - LzmaDec_WriteRem(p, dicLimit); - - while (p->remainLen != kMatchSpecLenStart) + for (;;) { + if (p->remainLen == kMatchSpecLenStart) + { + if (p->code != 0) + return SZ_ERROR_DATA; + *status = LZMA_STATUS_FINISHED_WITH_MARK; + return SZ_OK; + } + + LzmaDec_WriteRem(p, dicLimit); + + { + // (p->remainLen == 0 || p->dicPos == dicLimit) + int checkEndMarkNow = 0; if (p->dicPos >= dicLimit) @@ -933,92 +1029,174 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr } if (p->remainLen != 0) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + RETURN__NOT_FINISHED__FOR_FINISH; } checkEndMarkNow = 1; } + // (p->remainLen == 0) + if (p->tempBufSize == 0) { - SizeT processed; const Byte *bufLimit; + int dummyProcessed = -1; + if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) { - int dummyRes = LzmaDec_TryDummy(p, src, inSize); - if (dummyRes == DUMMY_ERROR) + const Byte *bufOut = src + inSize; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, src, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) { - memcpy(p->tempBuf, src, inSize); - p->tempBufSize = (unsigned)inSize; + size_t i; + if (inSize >= LZMA_REQUIRED_INPUT_MAX) + break; (*srcLen) += inSize; + p->tempBufSize = (unsigned)inSize; + for (i = 0; i < inSize; i++) + p->tempBuf[i] = src[i]; *status = LZMA_STATUS_NEEDS_MORE_INPUT; return SZ_OK; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + + dummyProcessed = (int)(bufOut - src); + if ((unsigned)dummyProcessed > LZMA_REQUIRED_INPUT_MAX) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + unsigned i; + (*srcLen) += (unsigned)dummyProcessed; + p->tempBufSize = (unsigned)dummyProcessed; + for (i = 0; i < (unsigned)dummyProcessed; i++) + p->tempBuf[i] = src[i]; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN__NOT_FINISHED__FOR_FINISH; } + bufLimit = src; + // we will decode only one iteration } else bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX; + p->buf = src; - if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) - return SZ_ERROR_DATA; - processed = (SizeT)(p->buf - src); - (*srcLen) += processed; - src += processed; - inSize -= processed; - } - else - { - unsigned rem = p->tempBufSize, lookAhead = 0; - while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) - p->tempBuf[rem++] = src[lookAhead++]; - p->tempBufSize = rem; - if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) + { - int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, (SizeT)rem); - if (dummyRes == DUMMY_ERROR) + int res = LzmaDec_DecodeReal2(p, dicLimit, bufLimit); + + SizeT processed = (SizeT)(p->buf - src); + + if (dummyProcessed < 0) { - (*srcLen) += (SizeT)lookAhead; - *status = LZMA_STATUS_NEEDS_MORE_INPUT; - return SZ_OK; + if (processed > inSize) + break; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + else if ((unsigned)dummyProcessed != processed) + break; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + + if (res != SZ_OK) { - *status = LZMA_STATUS_NOT_FINISHED; + p->remainLen = kMatchSpecLen_Error_Data; return SZ_ERROR_DATA; } } + continue; + } + + { + // we have some data in (p->tempBuf) + // in strict mode: tempBufSize is not enough for one Symbol decoding. + // in relaxed mode: tempBufSize not larger than required for one Symbol decoding. + + unsigned rem = p->tempBufSize; + unsigned ahead = 0; + int dummyProcessed = -1; + + while (rem < LZMA_REQUIRED_INPUT_MAX && ahead < inSize) + p->tempBuf[rem++] = src[ahead++]; + + // ahead - the size of new data copied from (src) to (p->tempBuf) + // rem - the size of temp buffer including new data from (src) + + if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) + { + const Byte *bufOut = p->tempBuf + rem; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, p->tempBuf, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) + { + if (rem >= LZMA_REQUIRED_INPUT_MAX) + break; + p->tempBufSize = rem; + (*srcLen) += (SizeT)ahead; + *status = LZMA_STATUS_NEEDS_MORE_INPUT; + return SZ_OK; + } + + dummyProcessed = (int)(bufOut - p->tempBuf); + + if ((unsigned)dummyProcessed < p->tempBufSize) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) + { + (*srcLen) += (unsigned)dummyProcessed - p->tempBufSize; + p->tempBufSize = (unsigned)dummyProcessed; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN__NOT_FINISHED__FOR_FINISH; + } + } + p->buf = p->tempBuf; - if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) - return SZ_ERROR_DATA; { - unsigned kkk = (unsigned)(p->buf - p->tempBuf); - if (rem < kkk) - return SZ_ERROR_FAIL; /* some internal error */ - rem -= kkk; - if (lookAhead < rem) - return SZ_ERROR_FAIL; /* some internal error */ - lookAhead -= rem; + // we decode one symbol from (p->tempBuf) here, so the (bufLimit) is equal to (p->buf) + int res = LzmaDec_DecodeReal2(p, dicLimit, p->buf); + + SizeT processed = (SizeT)(p->buf - p->tempBuf); + rem = p->tempBufSize; + + if (dummyProcessed < 0) + { + if (processed > LZMA_REQUIRED_INPUT_MAX) + break; + if (processed < rem) + break; + } + else if ((unsigned)dummyProcessed != processed) + break; + + processed -= rem; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + p->tempBufSize = 0; + + if (res != SZ_OK) + { + p->remainLen = kMatchSpecLen_Error_Data; + return SZ_ERROR_DATA; + } } - (*srcLen) += (SizeT)lookAhead; - src += lookAhead; - inSize -= (SizeT)lookAhead; - p->tempBufSize = 0; } + } } - - if (p->code != 0) - return SZ_ERROR_DATA; - *status = LZMA_STATUS_FINISHED_WITH_MARK; - return SZ_OK; + + /* Some unexpected error: internal error of code, memory corruption or hardware failure */ + p->remainLen = kMatchSpecLen_Error_Fail; + return SZ_ERROR_FAIL; } + SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT outSize = *destLen; diff --git a/3rdparty/7z/src/LzmaDec.h b/3rdparty/7z/src/LzmaDec.h index 28ce60c3ea..6194b7d127 100644 --- a/3rdparty/7z/src/LzmaDec.h +++ b/3rdparty/7z/src/LzmaDec.h @@ -1,5 +1,5 @@ /* LzmaDec.h -- LZMA Decoder -2018-04-21 : Igor Pavlov : Public domain */ +2020-03-19 : Igor Pavlov : Public domain */ #ifndef __LZMA_DEC_H #define __LZMA_DEC_H @@ -181,6 +181,7 @@ Returns: LZMA_STATUS_NEEDS_MORE_INPUT LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK SZ_ERROR_DATA - Data error + SZ_ERROR_FAIL - Some unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, @@ -223,6 +224,7 @@ Returns: SZ_ERROR_MEM - Memory allocation error SZ_ERROR_UNSUPPORTED - Unsupported properties SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). + SZ_ERROR_FAIL - Some unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, diff --git a/3rdparty/7z/src/LzmaEnc.c b/3rdparty/7z/src/LzmaEnc.c index 14086fc4f9..5705506bc9 100644 --- a/3rdparty/7z/src/LzmaEnc.c +++ b/3rdparty/7z/src/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2019-01-10: Igor Pavlov : Public domain */ +2021-11-18: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,6 +12,7 @@ #include #endif +#include "CpuArch.h" #include "LzmaEnc.h" #include "LzFind.h" @@ -19,12 +20,25 @@ #include "LzFindMt.h" #endif +/* the following LzmaEnc_* declarations is internal LZMA interface for LZMA2 encoder */ + +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISeqInStream *inStream, UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, + UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, + Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp); +void LzmaEnc_Finish(CLzmaEncHandle pp); +void LzmaEnc_SaveState(CLzmaEncHandle pp); +void LzmaEnc_RestoreState(CLzmaEncHandle pp); + #ifdef SHOW_STAT static unsigned g_STAT_OFFSET = 0; #endif -#define kLzmaMaxHistorySize ((UInt32)3 << 29) -/* #define kLzmaMaxHistorySize ((UInt32)7 << 29) */ +/* for good normalization speed we still reserve 256 MB before 4 GB range */ +#define kLzmaMaxHistorySize ((UInt32)15 << 28) #define kNumTopBits 24 #define kTopValue ((UInt32)1 << kNumTopBits) @@ -36,7 +50,7 @@ static unsigned g_STAT_OFFSET = 0; #define kNumMoveReducingBits 4 #define kNumBitPriceShiftBits 4 -#define kBitPrice (1 << kNumBitPriceShiftBits) +// #define kBitPrice (1 << kNumBitPriceShiftBits) #define REP_LEN_COUNT 64 @@ -47,6 +61,7 @@ void LzmaEncProps_Init(CLzmaEncProps *p) p->reduceSize = (UInt64)(Int64)-1; p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; p->writeEndMark = 0; + p->affinity = 0; } void LzmaEncProps_Normalize(CLzmaEncProps *p) @@ -55,16 +70,21 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (level < 0) level = 5; p->level = level; - if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level <= 7 ? (1 << 25) : (1 << 26))); + if (p->dictSize == 0) + p->dictSize = + ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : + ( level <= 6 ? ((UInt32)1 << (level + 19)) : + ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) + ))); + if (p->dictSize > p->reduceSize) { - unsigned i; - UInt32 reduceSize = (UInt32)p->reduceSize; - for (i = 11; i <= 30; i++) - { - if (reduceSize <= ((UInt32)2 << i)) { p->dictSize = ((UInt32)2 << i); break; } - if (reduceSize <= ((UInt32)3 << i)) { p->dictSize = ((UInt32)3 << i); break; } - } + UInt32 v = (UInt32)p->reduceSize; + const UInt32 kReduceMin = ((UInt32)1 << 12); + if (v < kReduceMin) + v = kReduceMin; + if (p->dictSize > v) + p->dictSize = v; } if (p->lc < 0) p->lc = 3; @@ -74,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); - if (p->numHashBytes < 0) p->numHashBytes = 4; - if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); + if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); + if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); if (p->numThreads < 0) p->numThreads = @@ -93,18 +113,85 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) return props.dictSize; } -#if (_MSC_VER >= 1400) -/* BSR code is fast for some new CPUs */ -/* #define LZMA_LOG_BSR */ + +/* +x86/x64: + +BSR: + IF (SRC == 0) ZF = 1, DEST is undefined; + AMD : DEST is unchanged; + IF (SRC != 0) ZF = 0; DEST is index of top non-zero bit + BSR is slow in some processors + +LZCNT: + IF (SRC == 0) CF = 1, DEST is size_in_bits_of_register(src) (32 or 64) + IF (SRC != 0) CF = 0, DEST = num_lead_zero_bits + IF (DEST == 0) ZF = 1; + +LZCNT works only in new processors starting from Haswell. +if LZCNT is not supported by processor, then it's executed as BSR. +LZCNT can be faster than BSR, if supported. +*/ + +// #define LZMA_LOG_BSR + +#if defined(MY_CPU_ARM_OR_ARM64) /* || defined(MY_CPU_X86_OR_AMD64) */ + + #if (defined(__clang__) && (__clang_major__ >= 6)) \ + || (defined(__GNUC__) && (__GNUC__ >= 6)) + #define LZMA_LOG_BSR + #elif defined(_MSC_VER) && (_MSC_VER >= 1300) + // #if defined(MY_CPU_ARM_OR_ARM64) + #define LZMA_LOG_BSR + // #endif + #endif #endif +// #include + #ifdef LZMA_LOG_BSR -#define kDicLogSizeMaxCompress 32 +#if defined(__clang__) \ + || defined(__GNUC__) -#define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); res = (zz + zz) + ((pos >> (zz - 1)) & 1); } +/* + C code: : (30 - __builtin_clz(x)) + gcc9/gcc10 for x64 /x86 : 30 - (bsr(x) xor 31) + clang10 for x64 : 31 + (bsr(x) xor -32) +*/ -static unsigned GetPosSlot1(UInt32 pos) + #define MY_clz(x) ((unsigned)__builtin_clz(x)) + // __lzcnt32 + // __builtin_ia32_lzcnt_u32 + +#else // #if defined(_MSC_VER) + + #ifdef MY_CPU_ARM_OR_ARM64 + + #define MY_clz _CountLeadingZeros + + #else // if defined(MY_CPU_X86_OR_AMD64) + + // #define MY_clz __lzcnt // we can use lzcnt (unsupported by old CPU) + // _BitScanReverse code is not optimal for some MSVC compilers + #define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); zz--; \ + res = (zz + zz) + (pos >> zz); } + + #endif // MY_CPU_X86_OR_AMD64 + +#endif // _MSC_VER + + +#ifndef BSR2_RET + + #define BSR2_RET(pos, res) { unsigned zz = 30 - MY_clz(pos); \ + res = (zz + zz) + (pos >> zz); } + +#endif + + +unsigned GetPosSlot1(UInt32 pos); +unsigned GetPosSlot1(UInt32 pos) { unsigned res; BSR2_RET(pos, res); @@ -113,10 +200,10 @@ static unsigned GetPosSlot1(UInt32 pos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } -#else -#define kNumLogBits (9 + sizeof(size_t) / 2) -/* #define kNumLogBits (11 + sizeof(size_t) / 8 * 3) */ +#else // ! LZMA_LOG_BSR + +#define kNumLogBits (11 + sizeof(size_t) / 8 * 3) #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) @@ -163,7 +250,7 @@ static void LzmaEnc_FastPosInit(Byte *g_FastPos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos & (kNumFullDistances - 1)]; else BSR2_RET(pos, res); } -#endif +#endif // LZMA_LOG_BSR #define LZMA_NUM_REPS 4 @@ -193,7 +280,7 @@ typedef struct #define kNumLenToPosStates 4 #define kNumPosSlotBits 6 -#define kDicLogSizeMin 0 +// #define kDicLogSizeMin 0 #define kDicLogSizeMax 32 #define kDistTableSizeMax (kDicLogSizeMax * 2) @@ -299,7 +386,7 @@ typedef UInt32 CProbPrice; typedef struct { void *matchFinderObj; - IMatchFinder matchFinder; + IMatchFinder2 matchFinder; unsigned optCur; unsigned optEnd; @@ -344,10 +431,14 @@ typedef struct // begin of CMatchFinderMt is used in LZ thread CMatchFinderMt matchFinderMt; // end of CMatchFinderMt is used in BT and HASH threads + // #else + // CMatchFinder matchFinderBase; #endif - CMatchFinder matchFinderBase; + + // we suppose that we have 8-bytes alignment after CMatchFinder + #ifndef _7ZIP_ST Byte pad[128]; #endif @@ -355,8 +446,10 @@ typedef struct // LZ thread CProbPrice ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; - UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; + // we want {len , dist} pairs to be 8-bytes aligned in matches array + UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2]; + // we want 8-bytes alignment here UInt32 alignPrices[kAlignTableSize]; UInt32 posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; UInt32 distancesPrices[kNumLenToPosStates][kNumFullDistances]; @@ -385,12 +478,19 @@ typedef struct CSaveState saveState; + // BoolInt mf_Failure; #ifndef _7ZIP_ST Byte pad2[128]; #endif } CLzmaEnc; +#define MFB (p->matchFinderBase) +/* +#ifndef _7ZIP_ST +#define MFB (p->matchFinderMt.MatchFinder) +#endif +*/ #define COPY_ARR(dest, src, arr) memcpy(dest->arr, src->arr, sizeof(src->arr)); @@ -455,41 +555,51 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX - || props.pb > LZMA_PB_MAX - || props.dictSize > ((UInt64)1 << kDicLogSizeMaxCompress) - || props.dictSize > kLzmaMaxHistorySize) + || props.pb > LZMA_PB_MAX) return SZ_ERROR_PARAM; + + if (props.dictSize > kLzmaMaxHistorySize) + props.dictSize = kLzmaMaxHistorySize; + + #ifndef LZMA_LOG_BSR + { + const UInt64 dict64 = props.dictSize; + if (dict64 > ((UInt64)1 << kDicLogSizeMaxCompress)) + return SZ_ERROR_PARAM; + } + #endif + p->dictSize = props.dictSize; { - unsigned fb = props.fb; + unsigned fb = (unsigned)props.fb; if (fb < 5) fb = 5; if (fb > LZMA_MATCH_LEN_MAX) fb = LZMA_MATCH_LEN_MAX; p->numFastBytes = fb; } - p->lc = props.lc; - p->lp = props.lp; - p->pb = props.pb; + p->lc = (unsigned)props.lc; + p->lp = (unsigned)props.lp; + p->pb = (unsigned)props.pb; p->fastMode = (props.algo == 0); // p->_maxMode = True; - p->matchFinderBase.btMode = (Byte)(props.btMode ? 1 : 0); + MFB.btMode = (Byte)(props.btMode ? 1 : 0); { unsigned numHashBytes = 4; if (props.btMode) { - if (props.numHashBytes < 2) - numHashBytes = 2; - else if (props.numHashBytes < 4) - numHashBytes = props.numHashBytes; + if (props.numHashBytes < 2) numHashBytes = 2; + else if (props.numHashBytes < 4) numHashBytes = (unsigned)props.numHashBytes; } - p->matchFinderBase.numHashBytes = numHashBytes; + if (props.numHashBytes >= 5) numHashBytes = 5; + + MFB.numHashBytes = numHashBytes; } - p->matchFinderBase.cutValue = props.mc; + MFB.cutValue = props.mc; - p->writeEndMark = props.writeEndMark; + p->writeEndMark = (BoolInt)props.writeEndMark; #ifndef _7ZIP_ST /* @@ -500,6 +610,8 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) } */ p->multiThread = (props.numThreads > 1); + p->matchFinderMt.btSync.affinity = + p->matchFinderMt.hashSync.affinity = props.affinity; #endif return SZ_OK; @@ -509,7 +621,7 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) void LzmaEnc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.expectedDataSize = expectedDataSiize; + MFB.expectedDataSize = expectedDataSiize; } @@ -536,8 +648,8 @@ static void RangeEnc_Construct(CRangeEnc *p) p->bufBase = NULL; } -#define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize) -#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + ((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) +#define RangeEnc_GetProcessed(p) ( (p)->processed + (size_t)((p)->buf - (p)->bufBase) + (p)->cacheSize) +#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + (size_t)((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) #define RC_BUF_SIZE (1 << 16) @@ -556,12 +668,11 @@ static int RangeEnc_Alloc(CRangeEnc *p, ISzAllocPtr alloc) static void RangeEnc_Free(CRangeEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->bufBase); - p->bufBase = 0; + p->bufBase = NULL; } static void RangeEnc_Init(CRangeEnc *p) { - /* Stream.Init(); */ p->range = 0xFFFFFFFF; p->cache = 0; p->low = 0; @@ -575,12 +686,12 @@ static void RangeEnc_Init(CRangeEnc *p) MY_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) { - size_t num; - if (p->res != SZ_OK) - return; - num = p->buf - p->bufBase; - if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) - p->res = SZ_ERROR_WRITE; + const size_t num = (size_t)(p->buf - p->bufBase); + if (p->res == SZ_OK) + { + if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) + p->res = SZ_ERROR_WRITE; + } p->processed += num; p->buf = p->bufBase; } @@ -656,7 +767,7 @@ static void RangeEnc_FlushData(CRangeEnc *p) range += newBound & mask; \ mask &= (kBitModelTotal - ((1 << kNumMoveBits) - 1)); \ mask += ((1 << kNumMoveBits) - 1); \ - ttt += (Int32)(mask - ttt) >> kNumMoveBits; \ + ttt += (UInt32)((Int32)(mask - ttt) >> kNumMoveBits); \ *(prob) = (CLzmaProb)ttt; \ RC_NORM(p) \ } @@ -749,7 +860,7 @@ static void LzmaEnc_InitPriceTables(CProbPrice *ProbPrices) bitCount++; } } - ProbPrices[i] = (CProbPrice)((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); + ProbPrices[i] = (CProbPrice)(((unsigned)kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); // printf("\n%3d: %5d", i, ProbPrices[i]); } } @@ -985,7 +1096,11 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) p->additionalOffset++; p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); - numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + { + const UInt32 *d = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + // if (!d) { p->mf_Failure = True; *numPairsRes = 0; return 0; } + numPairs = (unsigned)(d - p->matches); + } *numPairsRes = numPairs; #ifdef SHOW_STAT @@ -1001,7 +1116,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) if (numPairs == 0) return 0; { - unsigned len = p->matches[(size_t)numPairs - 2]; + const unsigned len = p->matches[(size_t)numPairs - 2]; if (len != p->numFastBytes) return len; { @@ -1011,7 +1126,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) { const Byte *p1 = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; const Byte *p2 = p1 + len; - ptrdiff_t dif = (ptrdiff_t)-1 - p->matches[(size_t)numPairs - 1]; + const ptrdiff_t dif = (ptrdiff_t)-1 - (ptrdiff_t)p->matches[(size_t)numPairs - 1]; const Byte *lim = p1 + numAvail; for (; p2 != lim && *p2 == p2[dif]; p2++) {} @@ -1167,6 +1282,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) repLens[i] = len; if (len > repLens[repMaxIndex]) repMaxIndex = i; + if (len == LZMA_MATCH_LEN_MAX) // 21.03 : optimization + break; } if (repLens[repMaxIndex] >= p->numFastBytes) @@ -1179,10 +1296,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } matches = p->matches; + #define MATCHES matches + // #define MATCHES p->matches if (mainLen >= p->numFastBytes) { - p->backRes = matches[(size_t)numPairs - 1] + LZMA_NUM_REPS; + p->backRes = MATCHES[(size_t)numPairs - 1] + LZMA_NUM_REPS; MOVE_POS(p, mainLen - 1) return mainLen; } @@ -1276,13 +1395,13 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (len < 2) len = 2; else - while (len > matches[offs]) + while (len > MATCHES[offs]) offs += 2; for (; ; len++) { COptimal *opt; - UInt32 dist = matches[(size_t)offs + 1]; + UInt32 dist = MATCHES[(size_t)offs + 1]; UInt32 price = normalMatchPrice + GET_PRICE_LEN(&p->lenEnc, posState, len); unsigned lenToPosState = GetLenToPosState(len); @@ -1306,7 +1425,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) opt->extra = 0; } - if (len == matches[offs]) + if (len == MATCHES[offs]) { offs += 2; if (offs == numPairs) @@ -1727,8 +1846,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (newLen > numAvail) { newLen = numAvail; - for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); - matches[numPairs] = (UInt32)newLen; + for (numPairs = 0; newLen > MATCHES[numPairs]; numPairs += 2); + MATCHES[numPairs] = (UInt32)newLen; numPairs += 2; } @@ -1747,9 +1866,9 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } offs = 0; - while (startLen > matches[offs]) + while (startLen > MATCHES[offs]) offs += 2; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) GetPosSlot2(dist, posSlot); @@ -1776,7 +1895,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } } - if (len == matches[offs]) + if (len == MATCHES[offs]) { // if (p->_maxMode) { // MATCH : LIT : REP_0 @@ -1841,7 +1960,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) offs += 2; if (offs == numPairs) break; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) GetPosSlot2(dist, posSlot); } @@ -2059,8 +2178,23 @@ static SRes CheckErrors(CLzmaEnc *p) return p->result; if (p->rc.res != SZ_OK) p->result = SZ_ERROR_WRITE; - if (p->matchFinderBase.result != SZ_OK) + + #ifndef _7ZIP_ST + if ( + // p->mf_Failure || + (p->mtMode && + ( // p->matchFinderMt.failure_LZ_LZ || + p->matchFinderMt.failure_LZ_BT)) + ) + { + p->result = MY_HRES_ERROR__INTERNAL_ERROR; + // printf("\nCheckErrors p->matchFinderMt.failureLZ\n"); + } + #endif + + if (MFB.result != SZ_OK) p->result = SZ_ERROR_READ; + if (p->result != SZ_OK) p->finished = True; return p->result; @@ -2198,14 +2332,14 @@ MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) -void LzmaEnc_Construct(CLzmaEnc *p) +static void LzmaEnc_Construct(CLzmaEnc *p) { RangeEnc_Construct(&p->rc); - MatchFinder_Construct(&p->matchFinderBase); + MatchFinder_Construct(&MFB); #ifndef _7ZIP_ST + p->matchFinderMt.MatchFinder = &MFB; MatchFinderMt_Construct(&p->matchFinderMt); - p->matchFinderMt.MatchFinder = &p->matchFinderBase; #endif { @@ -2221,7 +2355,6 @@ void LzmaEnc_Construct(CLzmaEnc *p) LzmaEnc_InitPriceTables(p->ProbPrices); p->litProbs = NULL; p->saveState.litProbs = NULL; - } CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) @@ -2233,7 +2366,7 @@ CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) return p; } -void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) +static void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->litProbs); ISzAlloc_Free(alloc, p->saveState.litProbs); @@ -2241,13 +2374,13 @@ void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) p->saveState.litProbs = NULL; } -void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) +static void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) { #ifndef _7ZIP_ST MatchFinderMt_Destruct(&p->matchFinderMt, allocBig); #endif - MatchFinder_Free(&p->matchFinderBase, allocBig); + MatchFinder_Free(&MFB, allocBig); LzmaEnc_FreeLits(p, alloc); RangeEnc_Free(&p->rc, alloc); } @@ -2259,11 +2392,18 @@ void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig) } +MY_NO_INLINE static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpackSize) { UInt32 nowPos32, startPos32; if (p->needInit) { + #ifndef _7ZIP_ST + if (p->mtMode) + { + RINOK(MatchFinderMt_InitMt(&p->matchFinderMt)); + } + #endif p->matchFinder.Init(p->matchFinderObj); p->needInit = 0; } @@ -2521,12 +2661,12 @@ static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpa // { int y; for (y = 0; y < 100; y++) { FillDistancesPrices(p); // }} - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); } if (p->repLenEncCounter <= 0) { p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } } @@ -2559,11 +2699,13 @@ static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpa static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) { UInt32 beforeSize = kNumOpts; + UInt32 dictSize; + if (!RangeEnc_Alloc(&p->rc, alloc)) return SZ_ERROR_MEM; #ifndef _7ZIP_ST - p->mtMode = (p->multiThread && !p->fastMode && (p->matchFinderBase.btMode != 0)); + p->mtMode = (p->multiThread && !p->fastMode && (MFB.btMode != 0)); #endif { @@ -2582,36 +2724,56 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, } } - p->matchFinderBase.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); + MFB.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); - if (beforeSize + p->dictSize < keepWindowSize) - beforeSize = keepWindowSize - p->dictSize; + + dictSize = p->dictSize; + if (dictSize == ((UInt32)2 << 30) || + dictSize == ((UInt32)3 << 30)) + { + /* 21.03 : here we reduce the dictionary for 2 reasons: + 1) we don't want 32-bit back_distance matches in decoder for 2 GB dictionary. + 2) we want to elimate useless last MatchFinder_Normalize3() for corner cases, + where data size is aligned for 1 GB: 5/6/8 GB. + That reducing must be >= 1 for such corner cases. */ + dictSize -= 1; + } + + if (beforeSize + dictSize < keepWindowSize) + beforeSize = keepWindowSize - dictSize; + + /* in worst case we can look ahead for + max(LZMA_MATCH_LEN_MAX, numFastBytes + 1 + numFastBytes) bytes. + we send larger value for (keepAfter) to MantchFinder_Create(): + (numFastBytes + LZMA_MATCH_LEN_MAX + 1) + */ #ifndef _7ZIP_ST if (p->mtMode) { - RINOK(MatchFinderMt_Create(&p->matchFinderMt, p->dictSize, beforeSize, p->numFastBytes, - LZMA_MATCH_LEN_MAX - + 1 /* 18.04 */ + RINOK(MatchFinderMt_Create(&p->matchFinderMt, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 18.04 */ , allocBig)); p->matchFinderObj = &p->matchFinderMt; - p->matchFinderBase.bigHash = (Byte)( - (p->dictSize > kBigHashDicLimit && p->matchFinderBase.hashMask >= 0xFFFFFF) ? 1 : 0); + MFB.bigHash = (Byte)( + (p->dictSize > kBigHashDicLimit && MFB.hashMask >= 0xFFFFFF) ? 1 : 0); MatchFinderMt_CreateVTable(&p->matchFinderMt, &p->matchFinder); } else #endif { - if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) + if (!MatchFinder_Create(&MFB, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 21.03 */ + , allocBig)) return SZ_ERROR_MEM; - p->matchFinderObj = &p->matchFinderBase; - MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); + p->matchFinderObj = &MFB; + MatchFinder_CreateVTable(&MFB, &p->matchFinder); } return SZ_OK; } -void LzmaEnc_Init(CLzmaEnc *p) +static void LzmaEnc_Init(CLzmaEnc *p) { unsigned i; p->state = 0; @@ -2675,12 +2837,14 @@ void LzmaEnc_Init(CLzmaEnc *p) p->additionalOffset = 0; - p->pbMask = (1 << p->pb) - 1; + p->pbMask = ((unsigned)1 << p->pb) - 1; p->lpMask = ((UInt32)0x100 << p->lp) - ((unsigned)0x100 >> p->lc); + + // p->mf_Failure = False; } -void LzmaEnc_InitPrices(CLzmaEnc *p) +static void LzmaEnc_InitPrices(CLzmaEnc *p) { if (!p->fastMode) { @@ -2694,8 +2858,8 @@ void LzmaEnc_InitPrices(CLzmaEnc *p) p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) @@ -2719,7 +2883,7 @@ static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInS ISzAllocPtr alloc, ISzAllocPtr allocBig) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; + MFB.stream = inStream; p->needInit = 1; p->rc.outStream = outStream; return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); @@ -2730,16 +2894,16 @@ SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISzAllocPtr alloc, ISzAllocPtr allocBig) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; + MFB.stream = inStream; p->needInit = 1; return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const Byte *src, SizeT srcLen) { - p->matchFinderBase.directInput = 1; - p->matchFinderBase.bufferBase = (Byte *)src; - p->matchFinderBase.directInputRem = srcLen; + MFB.directInput = 1; + MFB.bufferBase = (Byte *)src; + MFB.directInputRem = srcLen; } SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, @@ -2781,19 +2945,23 @@ static size_t SeqOutStreamBuf_Write(const ISeqOutStream *pp, const void *data, s size = p->rem; p->overflow = True; } - memcpy(p->data, data, size); - p->rem -= size; - p->data += size; + if (size != 0) + { + memcpy(p->data, data, size); + p->rem -= size; + p->data += size; + } return size; } +/* UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp) { const CLzmaEnc *p = (CLzmaEnc *)pp; return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); } - +*/ const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) { @@ -2841,6 +3009,7 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, } +MY_NO_INLINE static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) { SRes res = SZ_OK; @@ -2870,7 +3039,7 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) LzmaEnc_Finish(p); /* - if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&p->matchFinderBase)) + if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&MFB)) res = SZ_ERROR_FAIL; } */ @@ -2889,35 +3058,43 @@ SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *i SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, Byte *props, SizeT *size) { - CLzmaEnc *p = (CLzmaEnc *)pp; - unsigned i; - UInt32 dictSize = p->dictSize; if (*size < LZMA_PROPS_SIZE) return SZ_ERROR_PARAM; *size = LZMA_PROPS_SIZE; - props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); - - if (dictSize >= ((UInt32)1 << 22)) { - UInt32 kDictMask = ((UInt32)1 << 20) - 1; - if (dictSize < (UInt32)0xFFFFFFFF - kDictMask) - dictSize = (dictSize + kDictMask) & ~kDictMask; - } - else for (i = 11; i <= 30; i++) - { - if (dictSize <= ((UInt32)2 << i)) { dictSize = (2 << i); break; } - if (dictSize <= ((UInt32)3 << i)) { dictSize = (3 << i); break; } - } + const CLzmaEnc *p = (const CLzmaEnc *)pp; + const UInt32 dictSize = p->dictSize; + UInt32 v; + props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); + + // we write aligned dictionary value to properties for lzma decoder + if (dictSize >= ((UInt32)1 << 21)) + { + const UInt32 kDictMask = ((UInt32)1 << 20) - 1; + v = (dictSize + kDictMask) & ~kDictMask; + if (v < dictSize) + v = dictSize; + } + else + { + unsigned i = 11 * 2; + do + { + v = (UInt32)(2 + (i & 1)) << (i >> 1); + i++; + } + while (v < dictSize); + } - for (i = 0; i < 4; i++) - props[1 + i] = (Byte)(dictSize >> (8 * i)); - return SZ_OK; + SetUi32(props + 1, v); + return SZ_OK; + } } unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle pp) { - return ((CLzmaEnc *)pp)->writeEndMark; + return (unsigned)((CLzmaEnc *)pp)->writeEndMark; } @@ -2974,3 +3151,15 @@ SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, LzmaEnc_Destroy(p, alloc, allocBig); return res; } + + +/* +#ifndef _7ZIP_ST +void LzmaEnc_GetLzThreads(CLzmaEncHandle pp, HANDLE lz_threads[2]) +{ + const CLzmaEnc *p = (CLzmaEnc *)pp; + lz_threads[0] = p->matchFinderMt.hashSync.thread; + lz_threads[1] = p->matchFinderMt.btSync.thread; +} +#endif +*/ diff --git a/3rdparty/7z/src/LzmaEnc.h b/3rdparty/7z/src/LzmaEnc.h index c9938f04bc..26757ba6b0 100644 --- a/3rdparty/7z/src/LzmaEnc.h +++ b/3rdparty/7z/src/LzmaEnc.h @@ -1,5 +1,5 @@ /* LzmaEnc.h -- LZMA Encoder -2017-07-27 : Igor Pavlov : Public domain */ +2019-10-30 : Igor Pavlov : Public domain */ #ifndef __LZMA_ENC_H #define __LZMA_ENC_H @@ -29,6 +29,8 @@ typedef struct _CLzmaEncProps UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. Encoder uses this value to reduce dictionary size */ + + UInt64 affinity; } CLzmaEncProps; void LzmaEncProps_Init(CLzmaEncProps *p); diff --git a/3rdparty/7z/src/LzmaLib.h b/3rdparty/7z/src/LzmaLib.h index 5c35e53654..4103e224ac 100644 --- a/3rdparty/7z/src/LzmaLib.h +++ b/3rdparty/7z/src/LzmaLib.h @@ -1,5 +1,5 @@ /* LzmaLib.h -- LZMA library interface -2013-01-18 : Igor Pavlov : Public domain */ +2021-04-03 : Igor Pavlov : Public domain */ #ifndef __LZMA_LIB_H #define __LZMA_LIB_H @@ -40,14 +40,16 @@ outPropsSize - level - compression level: 0 <= level <= 9; level dictSize algo fb - 0: 16 KB 0 32 - 1: 64 KB 0 32 - 2: 256 KB 0 32 - 3: 1 MB 0 32 - 4: 4 MB 0 32 + 0: 64 KB 0 32 + 1: 256 KB 0 32 + 2: 1 MB 0 32 + 3: 4 MB 0 32 + 4: 16 MB 0 32 5: 16 MB 1 32 6: 32 MB 1 32 - 7+: 64 MB 1 64 + 7: 32 MB 1 64 + 8: 64 MB 1 64 + 9: 64 MB 1 64 The default value for "level" is 5. @@ -83,6 +85,11 @@ fb - Word size (the number of fast bytes). numThreads - The number of thereads. 1 or 2. The default value is 2. Fast mode (algo = 0) can use only 1 thread. +In: + dest - output data buffer + destLen - output data buffer size + src - input data + srcLen - input data size Out: destLen - processed output size Returns: @@ -108,8 +115,8 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char LzmaUncompress -------------- In: - dest - output data - destLen - output data size + dest - output data buffer + destLen - output data buffer size src - input data srcLen - input data size Out: diff --git a/3rdparty/7z/src/MtCoder.c b/3rdparty/7z/src/MtCoder.c index 5667f2d5b5..e39d9cb191 100644 --- a/3rdparty/7z/src/MtCoder.c +++ b/3rdparty/7z/src/MtCoder.c @@ -1,5 +1,5 @@ /* MtCoder.c -- Multi-thread Coder -2018-07-04 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,7 +7,7 @@ #ifndef _7ZIP_ST -SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) +static SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) { CMtProgressThunk *thunk = CONTAINER_FROM_VTBL(pp, CMtProgressThunk, vt); UInt64 inSize2 = 0; @@ -44,7 +44,7 @@ static WRes ArEvent_OptCreate_And_Reset(CEvent *p) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL ThreadFunc(void *pp); static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) @@ -70,8 +70,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) { t->stop = 1; Event_Set(&t->startEvent); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->startEvent); @@ -336,13 +335,13 @@ static SRes ThreadFunc2(CMtCoderThread *t) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static THREAD_FUNC_DECL ThreadFunc(void *pp) { CMtCoderThread *t = (CMtCoderThread *)pp; for (;;) { if (Event_Wait(&t->startEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; if (t->stop) return 0; { @@ -358,7 +357,7 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); if (numFinished == mtc->numStartedThreads) if (Event_Set(&mtc->finishedEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; } #endif } @@ -496,12 +495,7 @@ SRes MtCoder_Code(CMtCoder *p) { RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent)); - - if (Semaphore_IsCreated(&p->blocksSemaphore)) - { - RINOK_THREAD(Semaphore_Close(&p->blocksSemaphore)); - } - RINOK_THREAD(Semaphore_Create(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); + RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); } for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++) diff --git a/3rdparty/7z/src/MtDec.c b/3rdparty/7z/src/MtDec.c index 25a8b046d9..854087b929 100644 --- a/3rdparty/7z/src/MtDec.c +++ b/3rdparty/7z/src/MtDec.c @@ -1,16 +1,21 @@ /* MtDec.c -- Multi-thread Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO // #include +#include #ifdef SHOW_DEBUG_INFO #include #endif +#include "MtDec.h" + +#ifndef _7ZIP_ST + #ifdef SHOW_DEBUG_INFO #define PRF(x) x #else @@ -19,10 +24,6 @@ #define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) -#include "MtDec.h" - -#ifndef _7ZIP_ST - void MtProgress_Init(CMtProgress *p, ICompressProgress *progress) { p->progress = progress; @@ -77,7 +78,7 @@ void MtProgress_SetError(CMtProgress *p, SRes res) } -#define RINOK_THREAD(x) RINOK(x) +#define RINOK_THREAD(x) RINOK_WRes(x) static WRes ArEvent_OptCreate_And_Reset(CEvent *p) @@ -101,7 +102,7 @@ typedef struct __CMtDecBufLink CMtDecBufLink; -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL ThreadFunc(void *pp); static WRes MtDecThread_CreateEvents(CMtDecThread *t) @@ -156,8 +157,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t) { Event_Set(&t->canWrite); /* we can disable it. There are no threads waiting canWrite in normal cases */ Event_Set(&t->canRead); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->canRead); @@ -289,12 +289,13 @@ static WRes ThreadFunc2(CMtDecThread *t) Byte *afterEndData = NULL; size_t afterEndData_Size = 0; + BoolInt afterEndData_IsCross = False; BoolInt canCreateNewThread = False; // CMtDecCallbackInfo parse; CMtDecThread *nextThread; - PRF_STR_INT("Event_Wait(&t->canRead)", t->index); + PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index); RINOK_THREAD(Event_Wait(&t->canRead)); if (p->exitThread) @@ -418,10 +419,12 @@ static WRes ThreadFunc2(CMtDecThread *t) parse.srcFinished = finish; parse.canCreateNewThread = True; - // PRF(printf("\nParse size = %d\n", (unsigned)size)) + PRF(printf("\nParse size = %d\n", (unsigned)size)); p->mtCallback->Parse(p->mtCallbackObject, t->index, &parse); + PRF(printf(" Parse processed = %d, state = %d \n", (unsigned)parse.srcSize, (unsigned)parse.state)); + needWrite = True; canCreateNewThread = parse.canCreateNewThread; @@ -478,16 +481,12 @@ static WRes ThreadFunc2(CMtDecThread *t) if (parse.state == MTDEC_PARSE_END) { - p->crossStart = 0; - p->crossEnd = 0; - - if (crossSize != 0) - memcpy(data + parse.srcSize, parseData + parse.srcSize, size - parse.srcSize); // we need all data - afterEndData_Size = size - parse.srcSize; afterEndData = parseData + parse.srcSize; - + afterEndData_Size = size - parse.srcSize; + if (crossSize != 0) + afterEndData_IsCross = True; // we reduce data size to required bytes (parsed only) - inDataSize -= (size - parse.srcSize); + inDataSize -= afterEndData_Size; if (!prev) inDataSize_Start = parse.srcSize; break; @@ -752,13 +751,15 @@ static WRes ThreadFunc2(CMtDecThread *t) { // p->inProcessed += inCodePos; + PRF(printf("\n--Write afterSize = %d\n", (unsigned)afterEndData_Size)); + res = p->mtCallback->Write(p->mtCallbackObject, t->index, res == SZ_OK && needWriteToStream && !wasInterrupted, // needWrite - afterEndData, afterEndData_Size, + afterEndData, afterEndData_Size, afterEndData_IsCross, &needContinue, &canRecode); - - // res= E_INVALIDARG; // for test + + // res = SZ_ERROR_FAIL; // for test PRF(printf("\nAfter Write needContinue = %d\n", (unsigned)needContinue)); PRF(printf("\nprocessed = %d\n", (unsigned)p->inProcessed)); @@ -835,7 +836,7 @@ static WRes ThreadFunc2(CMtDecThread *t) #endif -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) +static THREAD_FUNC_DECL ThreadFunc1(void *pp) { WRes res; @@ -847,7 +848,7 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) res = ThreadFunc2(t); p = t->mtDec; if (res == 0) - return p->exitThreadWRes; + return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; { // it's unexpected situation for some threading function error if (p->exitThreadWRes == 0) @@ -858,15 +859,14 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) Event_Set(&p->threads[0].canWrite); MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); } - return res; + return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; } -static MY_NO_INLINE THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static MY_NO_INLINE THREAD_FUNC_DECL ThreadFunc(void *pp) { - CMtDecThread *t = (CMtDecThread *)pp; - - // fprintf(stderr, "\n%d = %p - before", t->index, &t); #ifdef USE_ALLOCA + CMtDecThread *t = (CMtDecThread *)pp; + // fprintf(stderr, "\n%d = %p - before", t->index, &t); t->allocaPtr = alloca(t->index * 128); #endif return ThreadFunc1(pp); @@ -1092,13 +1092,14 @@ SRes MtDec_Code(CMtDec *p) { WRes wres; - WRes sres; + SRes sres; CMtDecThread *nextThread = &p->threads[p->numStartedThreads++]; // wres = MtDecThread_CreateAndStart(nextThread); wres = MtDecThread_CreateEvents(nextThread); if (wres == 0) { wres = Event_Set(&nextThread->canWrite); if (wres == 0) { wres = Event_Set(&nextThread->canRead); - if (wres == 0) { wres = ThreadFunc(nextThread); + if (wres == 0) { THREAD_FUNC_RET_TYPE res = ThreadFunc(nextThread); + wres = (WRes)(UINT_PTR)res; if (wres != 0) { p->needContinue = False; @@ -1130,8 +1131,8 @@ SRes MtDec_Code(CMtDec *p) return SZ_OK; // if (sres != SZ_OK) - return sres; - // return E_FAIL; + return sres; + // return SZ_ERROR_FAIL; } } diff --git a/3rdparty/7z/src/MtDec.h b/3rdparty/7z/src/MtDec.h index 9864cc8741..7a30b6a9e5 100644 --- a/3rdparty/7z/src/MtDec.h +++ b/3rdparty/7z/src/MtDec.h @@ -1,5 +1,5 @@ /* MtDec.h -- Multi-thread Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2020-03-05 : Igor Pavlov : Public domain */ #ifndef __MT_DEC_H #define __MT_DEC_H @@ -108,11 +108,12 @@ typedef struct */ SRes (*Write)(void *p, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode); -} IMtDecCallback; + +} IMtDecCallback2; @@ -132,7 +133,7 @@ typedef struct _CMtDec ICompressProgress *progress; ISzAllocPtr alloc; - IMtDecCallback *mtCallback; + IMtDecCallback2 *mtCallback; void *mtCallbackObject; diff --git a/3rdparty/7z/src/Ppmd.h b/3rdparty/7z/src/Ppmd.h index 4b99415212..ee93ecece4 100644 --- a/3rdparty/7z/src/Ppmd.h +++ b/3rdparty/7z/src/Ppmd.h @@ -1,5 +1,5 @@ /* Ppmd.h -- PPMD codec common code -2017-04-03 : Igor Pavlov : Public domain +2021-04-13 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #ifndef __PPMD_H @@ -9,7 +9,16 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ EXTERN_C_BEGIN -#ifdef MY_CPU_32BIT +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +/* + PPMD code always uses 32-bit internal fields in PPMD structures to store internal references in main block. + if (PPMD_32BIT is defined), the PPMD code stores internal pointers to 32-bit reference fields. + if (PPMD_32BIT is NOT defined), the PPMD code stores internal UInt32 offsets to reference fields. + if (pointer size is 64-bit), then (PPMD_32BIT) mode is not allowed, + if (pointer size is 32-bit), then (PPMD_32BIT) mode is optional, + and it's allowed to disable PPMD_32BIT mode even if pointer is 32-bit. + PPMD code works slightly faster in (PPMD_32BIT) mode. +*/ #define PPMD_32BIT #endif @@ -28,7 +37,7 @@ EXTERN_C_BEGIN #define PPMD_N4 ((128 + 3 - 1 * PPMD_N1 - 2 * PPMD_N2 - 3 * PPMD_N3) / 4) #define PPMD_NUM_INDEXES (PPMD_N1 + PPMD_N2 + PPMD_N3 + PPMD_N4) -#pragma pack(push, 1) +MY_CPU_pragma_pack_push_1 /* Most compilers works OK here even without #pragma pack(push, 1), but some GCC compilers need it. */ /* SEE-contexts for PPM-contexts with masked symbols */ @@ -40,41 +49,114 @@ typedef struct } CPpmd_See; #define Ppmd_See_Update(p) if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ - { (p)->Summ <<= 1; (p)->Count = (Byte)(3 << (p)->Shift++); } + { (p)->Summ = (UInt16)((p)->Summ << 1); (p)->Count = (Byte)(3 << (p)->Shift++); } + typedef struct { Byte Symbol; Byte Freq; - UInt16 SuccessorLow; - UInt16 SuccessorHigh; + UInt16 Successor_0; + UInt16 Successor_1; } CPpmd_State; -#pragma pack(pop) +typedef struct CPpmd_State2_ +{ + Byte Symbol; + Byte Freq; +} CPpmd_State2; -typedef - #ifdef PPMD_32BIT - CPpmd_State * - #else - UInt32 - #endif - CPpmd_State_Ref; +typedef struct CPpmd_State4_ +{ + UInt16 Successor_0; + UInt16 Successor_1; +} CPpmd_State4; -typedef - #ifdef PPMD_32BIT - void * - #else - UInt32 - #endif - CPpmd_Void_Ref; +MY_CPU_pragma_pop + +/* + PPMD code can write full CPpmd_State structure data to CPpmd*_Context + at (byte offset = 2) instead of some fields of original CPpmd*_Context structure. + + If we use pointers to different types, but that point to shared + memory space, we can have aliasing problem (strict aliasing). + + XLC compiler in -O2 mode can change the order of memory write instructions + in relation to read instructions, if we have use pointers to different types. + + To solve that aliasing problem we use combined CPpmd*_Context structure + with unions that contain the fields from both structures: + the original CPpmd*_Context and CPpmd_State. + So we can access the fields from both structures via one pointer, + and the compiler doesn't change the order of write instructions + in relation to read instructions. + + If we don't use memory write instructions to shared memory in + some local code, and we use only reading instructions (read only), + then probably it's safe to use pointers to different types for reading. +*/ + + + +#ifdef PPMD_32BIT + + #define Ppmd_Ref_Type(type) type * + #define Ppmd_GetRef(p, ptr) (ptr) + #define Ppmd_GetPtr(p, ptr) (ptr) + #define Ppmd_GetPtr_Type(p, ptr, note_type) (ptr) + +#else + + #define Ppmd_Ref_Type(type) UInt32 + #define Ppmd_GetRef(p, ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) + #define Ppmd_GetPtr(p, offs) ((void *)((p)->Base + (offs))) + #define Ppmd_GetPtr_Type(p, offs, type) ((type *)Ppmd_GetPtr(p, offs)) + +#endif // PPMD_32BIT + + +typedef Ppmd_Ref_Type(CPpmd_State) CPpmd_State_Ref; +typedef Ppmd_Ref_Type(void) CPpmd_Void_Ref; +typedef Ppmd_Ref_Type(Byte) CPpmd_Byte_Ref; + + +/* +#ifdef MY_CPU_LE_UNALIGN +// the unaligned 32-bit access latency can be too large, if the data is not in L1 cache. +#define Ppmd_GET_SUCCESSOR(p) ((CPpmd_Void_Ref)*(const UInt32 *)(const void *)&(p)->Successor_0) +#define Ppmd_SET_SUCCESSOR(p, v) *(UInt32 *)(void *)(void *)&(p)->Successor_0 = (UInt32)(v) + +#else +*/ + +/* + We can write 16-bit halves to 32-bit (Successor) field in any selected order. + But the native order is more consistent way. + So we use the native order, if LE/BE order can be detected here at compile time. +*/ + +#ifdef MY_CPU_BE + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) (((UInt32)(p)->Successor_0 << 16) | (p)->Successor_1) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)((UInt32)(v) /* & 0xFFFF */); } + +#else + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) ((p)->Successor_0 | ((UInt32)(p)->Successor_1 << 16)) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)((UInt32)(v) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); } + +#endif + +// #endif -typedef - #ifdef PPMD_32BIT - Byte * - #else - UInt32 - #endif - CPpmd_Byte_Ref; #define PPMD_SetAllBitsIn256Bytes(p) \ { size_t z; for (z = 0; z < 256 / sizeof(p[0]); z += 8) { \ diff --git a/3rdparty/7z/src/Ppmd7.c b/3rdparty/7z/src/Ppmd7.c index 80e7de9a63..b6ecf1430d 100644 --- a/3rdparty/7z/src/Ppmd7.c +++ b/3rdparty/7z/src/Ppmd7.c @@ -1,5 +1,5 @@ /* Ppmd7.c -- PPMdH codec -2018-07-04 : Igor Pavlov : Public domain +2021-04-13 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Precomp.h" @@ -8,7 +8,12 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Ppmd7.h" -const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; +/* define PPMD7_ORDER_0_SUPPPORT to suport order-0 mode, unsupported by orignal PPMd var.H. code */ +// #define PPMD7_ORDER_0_SUPPPORT + +MY_ALIGN(16) +static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; +MY_ALIGN(16) static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; #define MAX_FREQ 124 @@ -16,13 +21,10 @@ static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x #define U2B(nu) ((UInt32)(nu) * UNIT_SIZE) #define U2I(nu) (p->Units2Indx[(size_t)(nu) - 1]) -#define I2U(indx) (p->Indx2Units[indx]) +#define I2U(indx) ((unsigned)p->Indx2Units[indx]) +#define I2U_UInt16(indx) ((UInt16)p->Indx2Units[indx]) -#ifdef PPMD_32BIT - #define REF(ptr) (ptr) -#else - #define REF(ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) -#endif +#define REF(ptr) Ppmd_GetRef(p, ptr) #define STATS_REF(ptr) ((CPpmd_State_Ref)REF(ptr)) @@ -35,13 +37,7 @@ typedef CPpmd7_Context * CTX_PTR; struct CPpmd7_Node_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Node_ * - #else - UInt32 - #endif - CPpmd7_Node_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Node_) CPpmd7_Node_Ref; typedef struct CPpmd7_Node_ { @@ -51,17 +47,13 @@ typedef struct CPpmd7_Node_ CPpmd7_Node_Ref Prev; } CPpmd7_Node; -#ifdef PPMD_32BIT - #define NODE(ptr) (ptr) -#else - #define NODE(offs) ((CPpmd7_Node *)(p->Base + (offs))) -#endif +#define NODE(r) Ppmd_GetPtr_Type(p, r, CPpmd7_Node) void Ppmd7_Construct(CPpmd7 *p) { unsigned i, k, m; - p->Base = 0; + p->Base = NULL; for (i = 0, k = 0; i < PPMD_NUM_INDEXES; i++) { @@ -77,6 +69,7 @@ void Ppmd7_Construct(CPpmd7 *p) for (i = 0; i < 3; i++) p->NS2Indx[i] = (Byte)i; + for (m = i, k = 1; i < 256; i++) { p->NS2Indx[i] = (Byte)m; @@ -84,54 +77,63 @@ void Ppmd7_Construct(CPpmd7 *p) k = (++m) - 2; } - memset(p->HB2Flag, 0, 0x40); - memset(p->HB2Flag + 0x40, 8, 0x100 - 0x40); + memcpy(p->ExpEscape, PPMD7_kExpEscape, 16); } + void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Base); p->Size = 0; - p->Base = 0; + p->Base = NULL; } + BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc) { if (!p->Base || p->Size != size) { - size_t size2; Ppmd7_Free(p, alloc); - size2 = 0 - #ifndef PPMD_32BIT - + UNIT_SIZE - #endif - ; - p->AlignOffset = - #ifdef PPMD_32BIT - (4 - size) & 3; - #else - 4 - (size & 3); - #endif - if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size + size2)) == 0) + p->AlignOffset = (4 - size) & 3; + if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size)) == NULL) return False; p->Size = size; } return True; } + + +// ---------- Internal Memory Allocator ---------- + +/* We can use CPpmd7_Node in list of free units (as in Ppmd8) + But we still need one additional list walk pass in GlueFreeBlocks(). + So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in InsertNode() / RemoveNode() +*/ + +#define EMPTY_NODE 0 + + static void InsertNode(CPpmd7 *p, void *node, unsigned indx) { *((CPpmd_Void_Ref *)node) = p->FreeList[indx]; + // ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx]; + p->FreeList[indx] = REF(node); + } + static void *RemoveNode(CPpmd7 *p, unsigned indx) { CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]); p->FreeList[indx] = *node; + // CPpmd7_Node *node = NODE((CPpmd7_Node_Ref)p->FreeList[indx]); + // p->FreeList[indx] = node->Next; return node; } + static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) { unsigned i, nu = I2U(oldIndx) - I2U(newIndx); @@ -144,123 +146,167 @@ static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) InsertNode(p, ptr, i); } + +/* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */ + +typedef union _CPpmd7_Node_Union +{ + CPpmd7_Node Node; + CPpmd7_Node_Ref NextRef; +} CPpmd7_Node_Union; + +/* Original PPmdH (Ppmd7) code uses doubly linked list in GlueFreeBlocks() + we use single linked list similar to Ppmd8 code */ + + static void GlueFreeBlocks(CPpmd7 *p) { - #ifdef PPMD_32BIT - CPpmd7_Node headItem; - CPpmd7_Node_Ref head = &headItem; - #else - CPpmd7_Node_Ref head = p->AlignOffset + p->Size; - #endif - - CPpmd7_Node_Ref n = head; - unsigned i; - + /* + we use first UInt16 field of 12-bytes UNITs as record type stamp + CPpmd_State { Byte Symbol; Byte Freq; : Freq != 0 + CPpmd7_Context { UInt16 NumStats; : NumStats != 0 + CPpmd7_Node { UInt16 Stamp : Stamp == 0 for free record + : Stamp == 1 for head record and guard + Last 12-bytes UNIT in array is always contains 12-bytes order-0 CPpmd7_Context record. + */ + CPpmd7_Node_Ref head, n = 0; + p->GlueCount = 255; - /* create doubly-linked list of free blocks */ - for (i = 0; i < PPMD_NUM_INDEXES; i++) - { - UInt16 nu = I2U(i); - CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; - p->FreeList[i] = 0; - while (next != 0) - { - CPpmd7_Node *node = NODE(next); - node->Next = n; - n = NODE(n)->Prev = next; - next = *(const CPpmd7_Node_Ref *)node; - node->Stamp = 0; - node->NU = (UInt16)nu; - } - } - NODE(head)->Stamp = 1; - NODE(head)->Next = n; - NODE(n)->Prev = head; + + /* we set guard NODE at LoUnit */ if (p->LoUnit != p->HiUnit) - ((CPpmd7_Node *)p->LoUnit)->Stamp = 1; - - /* Glue free blocks */ - while (n != head) + ((CPpmd7_Node *)(void *)p->LoUnit)->Stamp = 1; + { - CPpmd7_Node *node = NODE(n); - UInt32 nu = (UInt32)node->NU; - for (;;) + /* Create list of free blocks. + We still need one additional list walk pass before Glue. */ + unsigned i; + for (i = 0; i < PPMD_NUM_INDEXES; i++) { - CPpmd7_Node *node2 = NODE(n) + nu; - nu += node2->NU; - if (node2->Stamp != 0 || nu >= 0x10000) - break; - NODE(node2->Prev)->Next = node2->Next; - NODE(node2->Next)->Prev = node2->Prev; - node->NU = (UInt16)nu; + const UInt16 nu = I2U_UInt16(i); + CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; + p->FreeList[i] = 0; + while (next != 0) + { + /* Don't change the order of the following commands: */ + CPpmd7_Node_Union *un = (CPpmd7_Node_Union *)NODE(next); + const CPpmd7_Node_Ref tmp = next; + next = un->NextRef; + un->Node.Stamp = EMPTY_NODE; + un->Node.NU = nu; + un->Node.Next = n; + n = tmp; + } } - n = node->Next; } - + + head = n; + /* Glue and Fill must walk the list in same direction */ + { + /* Glue free blocks */ + CPpmd7_Node_Ref *prev = &head; + while (n) + { + CPpmd7_Node *node = NODE(n); + UInt32 nu = node->NU; + n = node->Next; + if (nu == 0) + { + *prev = n; + continue; + } + prev = &node->Next; + for (;;) + { + CPpmd7_Node *node2 = node + nu; + nu += node2->NU; + if (node2->Stamp != EMPTY_NODE || nu >= 0x10000) + break; + node->NU = (UInt16)nu; + node2->NU = 0; + } + } + } + /* Fill lists of free blocks */ - for (n = NODE(head)->Next; n != head;) + for (n = head; n != 0;) { CPpmd7_Node *node = NODE(n); - unsigned nu; - CPpmd7_Node_Ref next = node->Next; - for (nu = node->NU; nu > 128; nu -= 128, node += 128) + UInt32 nu = node->NU; + unsigned i; + n = node->Next; + if (nu == 0) + continue; + for (; nu > 128; nu -= 128, node += 128) InsertNode(p, node, PPMD_NUM_INDEXES - 1); if (I2U(i = U2I(nu)) != nu) { unsigned k = I2U(--i); - InsertNode(p, node + k, nu - k - 1); + InsertNode(p, node + k, (unsigned)nu - k - 1); } InsertNode(p, node, i); - n = next; } } + +MY_NO_INLINE static void *AllocUnitsRare(CPpmd7 *p, unsigned indx) { unsigned i; - void *retVal; + if (p->GlueCount == 0) { GlueFreeBlocks(p); if (p->FreeList[indx] != 0) return RemoveNode(p, indx); } + i = indx; + do { if (++i == PPMD_NUM_INDEXES) { UInt32 numBytes = U2B(I2U(indx)); + Byte *us = p->UnitsStart; p->GlueCount--; - return ((UInt32)(p->UnitsStart - p->Text) > numBytes) ? (p->UnitsStart -= numBytes) : (NULL); + return ((UInt32)(us - p->Text) > numBytes) ? (p->UnitsStart = us - numBytes) : NULL; } } while (p->FreeList[i] == 0); - retVal = RemoveNode(p, i); - SplitBlock(p, retVal, i, indx); - return retVal; + + { + void *block = RemoveNode(p, i); + SplitBlock(p, block, i, indx); + return block; + } } + static void *AllocUnits(CPpmd7 *p, unsigned indx) { - UInt32 numBytes; if (p->FreeList[indx] != 0) return RemoveNode(p, indx); - numBytes = U2B(I2U(indx)); - if (numBytes <= (UInt32)(p->HiUnit - p->LoUnit)) { - void *retVal = p->LoUnit; - p->LoUnit += numBytes; - return retVal; + UInt32 numBytes = U2B(I2U(indx)); + Byte *lo = p->LoUnit; + if ((UInt32)(p->HiUnit - lo) >= numBytes) + { + p->LoUnit = lo + numBytes; + return lo; + } } return AllocUnitsRare(p, indx); } -#define MyMem12Cpy(dest, src, num) \ - { UInt32 *d = (UInt32 *)dest; const UInt32 *s = (const UInt32 *)src; UInt32 n = num; \ - do { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; s += 3; d += 3; } while (--n); } +#define MyMem12Cpy(dest, src, num) \ + { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ + do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } + + +/* static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU) { unsigned i0 = U2I(oldNU); @@ -277,20 +323,25 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU SplitBlock(p, oldPtr, i0, i1); return oldPtr; } +*/ -#define SUCCESSOR(p) ((CPpmd_Void_Ref)((p)->SuccessorLow | ((UInt32)(p)->SuccessorHigh << 16))) +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v) { - (p)->SuccessorLow = (UInt16)((UInt32)(v) & 0xFFFF); - (p)->SuccessorHigh = (UInt16)(((UInt32)(v) >> 16) & 0xFFFF); + Ppmd_SET_SUCCESSOR(p, v); } -static void RestartModel(CPpmd7 *p) + + +MY_NO_INLINE +static +void RestartModel(CPpmd7 *p) { - unsigned i, k, m; + unsigned i, k; memset(p->FreeList, 0, sizeof(p->FreeList)); + p->Text = p->Base + p->AlignOffset; p->HiUnit = p->Text + p->Size; p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE; @@ -300,57 +351,110 @@ static void RestartModel(CPpmd7 *p) p->RunLength = p->InitRL = -(Int32)((p->MaxOrder < 12) ? p->MaxOrder : 12) - 1; p->PrevSuccess = 0; - p->MinContext = p->MaxContext = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ - p->MinContext->Suffix = 0; - p->MinContext->NumStats = 256; - p->MinContext->SummFreq = 256 + 1; - p->FoundState = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ - p->LoUnit += U2B(256 / 2); - p->MinContext->Stats = REF(p->FoundState); - for (i = 0; i < 256; i++) { - CPpmd_State *s = &p->FoundState[i]; - s->Symbol = (Byte)i; - s->Freq = 1; - SetSuccessor(s, 0); + CPpmd7_Context *mc = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ + CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ + + p->LoUnit += U2B(256 / 2); + p->MaxContext = p->MinContext = mc; + p->FoundState = s; + + mc->NumStats = 256; + mc->Union2.SummFreq = 256 + 1; + mc->Union4.Stats = REF(s); + mc->Suffix = 0; + + for (i = 0; i < 256; i++, s++) + { + s->Symbol = (Byte)i; + s->Freq = 1; + SetSuccessor(s, 0); + } + + #ifdef PPMD7_ORDER_0_SUPPPORT + if (p->MaxOrder == 0) + { + CPpmd_Void_Ref r = REF(mc); + s = p->FoundState; + for (i = 0; i < 256; i++, s++) + SetSuccessor(s, r); + return; + } + #endif } for (i = 0; i < 128; i++) + + + for (k = 0; k < 8; k++) { + unsigned m; UInt16 *dest = p->BinSumm[i] + k; UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 2)); for (m = 0; m < 64; m += 8) dest[m] = val; } - + + for (i = 0; i < 25; i++) - for (k = 0; k < 16; k++) + { + + CPpmd_See *s = p->See[i]; + + + + unsigned summ = ((5 * i + 10) << (PPMD_PERIOD_BITS - 4)); + for (k = 0; k < 16; k++, s++) { - CPpmd_See *s = &p->See[i][k]; - s->Summ = (UInt16)((5 * i + 10) << (s->Shift = PPMD_PERIOD_BITS - 4)); + s->Summ = (UInt16)summ; + s->Shift = (PPMD_PERIOD_BITS - 4); s->Count = 4; } + } + + p->DummySee.Summ = 0; /* unused */ + p->DummySee.Shift = PPMD_PERIOD_BITS; + p->DummySee.Count = 64; /* unused */ } + void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder) { p->MaxOrder = maxOrder; + RestartModel(p); - p->DummySee.Shift = PPMD_PERIOD_BITS; - p->DummySee.Summ = 0; /* unused */ - p->DummySee.Count = 64; /* unused */ } -static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) + + +/* + CreateSuccessors() + It's called when (FoundState->Successor) is RAW-Successor, + that is the link to position in Raw text. + So we create Context records and write the links to + FoundState->Successor and to identical RAW-Successors in suffix + contexts of MinContex. + + The function returns: + if (OrderFall == 0) then MinContext is already at MAX order, + { return pointer to new or existing context of same MAX order } + else + { return pointer to new real context that will be (Order+1) in comparison with MinContext + + also it can return pointer to real context of same order, +*/ + +MY_NO_INLINE +static CTX_PTR CreateSuccessors(CPpmd7 *p) { - CPpmd_State upState; CTX_PTR c = p->MinContext; CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState); - CPpmd_State *ps[PPMD7_MAX_ORDER]; + Byte newSym, newFreq; unsigned numPs = 0; - - if (!skip) + CPpmd_State *ps[PPMD7_MAX_ORDER]; + + if (p->OrderFall != 0) ps[numPs++] = p->FoundState; while (c->Suffix) @@ -358,44 +462,70 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) CPpmd_Void_Ref successor; CPpmd_State *s; c = SUFFIX(c); + + if (c->NumStats != 1) { - for (s = STATS(c); s->Symbol != p->FoundState->Symbol; s++); + Byte sym = p->FoundState->Symbol; + for (s = STATS(c); s->Symbol != sym; s++); + } else + { s = ONE_STATE(c); + + } successor = SUCCESSOR(s); if (successor != upBranch) { + // (c) is real record Context here, c = CTX(successor); if (numPs == 0) + { + // (c) is real record MAX Order Context here, + // So we don't need to create any new contexts. return c; + } break; } ps[numPs++] = s; } - upState.Symbol = *(const Byte *)Ppmd7_GetPtr(p, upBranch); - SetSuccessor(&upState, upBranch + 1); + // All created contexts will have single-symbol with new RAW-Successor + // All new RAW-Successors will point to next position in RAW text + // after FoundState->Successor + + newSym = *(const Byte *)Ppmd7_GetPtr(p, upBranch); + upBranch++; + if (c->NumStats == 1) - upState.Freq = ONE_STATE(c)->Freq; + newFreq = ONE_STATE(c)->Freq; else { UInt32 cf, s0; CPpmd_State *s; - for (s = STATS(c); s->Symbol != upState.Symbol; s++); - cf = s->Freq - 1; - s0 = c->SummFreq - c->NumStats - cf; - upState.Freq = (Byte)(1 + ((2 * cf <= s0) ? (5 * cf > s0) : ((2 * cf + 3 * s0 - 1) / (2 * s0)))); + for (s = STATS(c); s->Symbol != newSym; s++); + cf = (UInt32)s->Freq - 1; + s0 = (UInt32)c->Union2.SummFreq - c->NumStats - cf; + /* + cf - is frequency of symbol that will be Successor in new context records. + s0 - is commulative frequency sum of another symbols from parent context. + max(newFreq)= (s->Freq + 1), when (s0 == 1) + we have requirement (Ppmd7Context_OneState()->Freq <= 128) in BinSumm[] + so (s->Freq < 128) - is requirement for multi-symbol contexts + */ + newFreq = (Byte)(1 + ((2 * cf <= s0) ? (5 * cf > s0) : (2 * cf + s0 - 1) / (2 * s0) + 1)); } + // Create new single-symbol contexts from low order to high order in loop + do { - /* Create Child */ - CTX_PTR c1; /* = AllocContext(p); */ + CTX_PTR c1; + /* = AllocContext(p); */ if (p->HiUnit != p->LoUnit) - c1 = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); + c1 = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); else if (p->FreeList[0] != 0) c1 = (CTX_PTR)RemoveNode(p, 0); else @@ -404,8 +534,11 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) if (!c1) return NULL; } + c1->NumStats = 1; - *ONE_STATE(c1) = upState; + ONE_STATE(c1)->Symbol = newSym; + ONE_STATE(c1)->Freq = newFreq; + SetSuccessor(ONE_STATE(c1), upBranch); c1->Suffix = REF(c); SetSuccessor(ps[--numPs], REF(c1)); c = c1; @@ -415,21 +548,26 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) return c; } -static void SwapStates(CPpmd_State *t1, CPpmd_State *t2) -{ - CPpmd_State tmp = *t1; - *t1 = *t2; - *t2 = tmp; -} -static void UpdateModel(CPpmd7 *p) + +#define SwapStates(s) \ + { CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; } + + +void Ppmd7_UpdateModel(CPpmd7 *p); +MY_NO_INLINE +void Ppmd7_UpdateModel(CPpmd7 *p) { - CPpmd_Void_Ref successor, fSuccessor = SUCCESSOR(p->FoundState); - CTX_PTR c; + CPpmd_Void_Ref maxSuccessor, minSuccessor; + CTX_PTR c, mc; unsigned s0, ns; - + + + if (p->FoundState->Freq < MAX_FREQ / 4 && p->MinContext->Suffix != 0) { + /* Update Freqs in Suffix Context */ + c = SUFFIX(p->MinContext); if (c->NumStats == 1) @@ -441,27 +579,39 @@ static void UpdateModel(CPpmd7 *p) else { CPpmd_State *s = STATS(c); - if (s->Symbol != p->FoundState->Symbol) + Byte sym = p->FoundState->Symbol; + + if (s->Symbol != sym) { - do { s++; } while (s->Symbol != p->FoundState->Symbol); + do + { + // s++; if (s->Symbol == sym) break; + s++; + } + while (s->Symbol != sym); + if (s[0].Freq >= s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SwapStates(s); s--; } } + if (s->Freq < MAX_FREQ - 9) { - s->Freq += 2; - c->SummFreq += 2; + s->Freq = (Byte)(s->Freq + 2); + c->Union2.SummFreq = (UInt16)(c->Union2.SummFreq + 2); } } } + if (p->OrderFall == 0) { - p->MinContext = p->MaxContext = CreateSuccessors(p, True); - if (p->MinContext == 0) + /* MAX ORDER context */ + /* (FoundState->Successor) is RAW-Successor. */ + p->MaxContext = p->MinContext = CreateSuccessors(p); + if (!p->MinContext) { RestartModel(p); return; @@ -469,45 +619,93 @@ static void UpdateModel(CPpmd7 *p) SetSuccessor(p->FoundState, REF(p->MinContext)); return; } + + + /* NON-MAX ORDER context */ - *p->Text++ = p->FoundState->Symbol; - successor = REF(p->Text); - if (p->Text >= p->UnitsStart) { - RestartModel(p); - return; + Byte *text = p->Text; + *text++ = p->FoundState->Symbol; + p->Text = text; + if (text >= p->UnitsStart) + { + RestartModel(p); + return; + } + maxSuccessor = REF(text); } - if (fSuccessor) + minSuccessor = SUCCESSOR(p->FoundState); + + if (minSuccessor) { - if (fSuccessor <= successor) + // there is Successor for FoundState in MinContext. + // So the next context will be one order higher than MinContext. + + if (minSuccessor <= maxSuccessor) { - CTX_PTR cs = CreateSuccessors(p, False); - if (cs == NULL) + // minSuccessor is RAW-Successor. So we will create real contexts records: + CTX_PTR cs = CreateSuccessors(p); + if (!cs) { RestartModel(p); return; } - fSuccessor = REF(cs); + minSuccessor = REF(cs); } + + // minSuccessor now is real Context pointer that points to existing (Order+1) context + if (--p->OrderFall == 0) { - successor = fSuccessor; + /* + if we move to MaxOrder context, then minSuccessor will be common Succesor for both: + MinContext that is (MaxOrder - 1) + MaxContext that is (MaxOrder) + so we don't need new RAW-Successor, and we can use real minSuccessor + as succssors for both MinContext and MaxContext. + */ + maxSuccessor = minSuccessor; + + /* + if (MaxContext != MinContext) + { + there was order fall from MaxOrder and we don't need current symbol + to transfer some RAW-Succesors to real contexts. + So we roll back pointer in raw data for one position. + } + */ p->Text -= (p->MaxContext != p->MinContext); } } else { - SetSuccessor(p->FoundState, successor); - fSuccessor = REF(p->MinContext); + /* + FoundState has NULL-Successor here. + And only root 0-order context can contain NULL-Successors. + We change Successor in FoundState to RAW-Successor, + And next context will be same 0-order root Context. + */ + SetSuccessor(p->FoundState, maxSuccessor); + minSuccessor = REF(p->MinContext); } - - s0 = p->MinContext->SummFreq - (ns = p->MinContext->NumStats) - (p->FoundState->Freq - 1); - - for (c = p->MaxContext; c != p->MinContext; c = SUFFIX(c)) + + mc = p->MinContext; + c = p->MaxContext; + + p->MaxContext = p->MinContext = CTX(minSuccessor); + + if (c == mc) + return; + + // s0 : is pure Escape Freq + s0 = mc->Union2.SummFreq - (ns = mc->NumStats) - ((unsigned)p->FoundState->Freq - 1); + + do { unsigned ns1; - UInt32 cf, sf; + UInt32 sum; + if ((ns1 = c->NumStats) != 1) { if ((ns1 & 1) == 0) @@ -527,80 +725,127 @@ static void UpdateModel(CPpmd7 *p) oldPtr = STATS(c); MyMem12Cpy(ptr, oldPtr, oldNU); InsertNode(p, oldPtr, i); - c->Stats = STATS_REF(ptr); + c->Union4.Stats = STATS_REF(ptr); } } - c->SummFreq = (UInt16)(c->SummFreq + (2 * ns1 < ns) + 2 * ((4 * ns1 <= ns) & (c->SummFreq <= 8 * ns1))); + sum = c->Union2.SummFreq; + /* max increase of Escape_Freq is 3 here. + total increase of Union2.SummFreq for all symbols is less than 256 here */ + sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); + /* original PPMdH uses 16-bit variable for (sum) here. + But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ + // sum = (UInt16)sum; } else { + // instead of One-symbol context we create 2-symbol context CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0); if (!s) { RestartModel(p); return; } - *s = *ONE_STATE(c); - c->Stats = REF(s); - if (s->Freq < MAX_FREQ / 4 - 1) - s->Freq <<= 1; - else - s->Freq = MAX_FREQ - 4; - c->SummFreq = (UInt16)(s->Freq + p->InitEsc + (ns > 3)); - } - cf = 2 * (UInt32)p->FoundState->Freq * (c->SummFreq + 6); - sf = (UInt32)s0 + c->SummFreq; - if (cf < 6 * sf) - { - cf = 1 + (cf > sf) + (cf >= 4 * sf); - c->SummFreq += 3; - } - else - { - cf = 4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); - c->SummFreq = (UInt16)(c->SummFreq + cf); + { + unsigned freq = c->Union2.State2.Freq; + // s = *ONE_STATE(c); + s->Symbol = c->Union2.State2.Symbol; + s->Successor_0 = c->Union4.State4.Successor_0; + s->Successor_1 = c->Union4.State4.Successor_1; + // SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of + // (Successor_0 and Successor_1) in LE/BE. + c->Union4.Stats = REF(s); + if (freq < MAX_FREQ / 4 - 1) + freq <<= 1; + else + freq = MAX_FREQ - 4; + // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context + s->Freq = (Byte)freq; + // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here + sum = freq + p->InitEsc + (ns > 3); + } } + { CPpmd_State *s = STATS(c) + ns1; - SetSuccessor(s, successor); + UInt32 cf = 2 * (sum + 6) * (UInt32)p->FoundState->Freq; + UInt32 sf = (UInt32)s0 + sum; s->Symbol = p->FoundState->Symbol; - s->Freq = (Byte)cf; c->NumStats = (UInt16)(ns1 + 1); + SetSuccessor(s, maxSuccessor); + + if (cf < 6 * sf) + { + cf = (UInt32)1 + (cf > sf) + (cf >= 4 * sf); + sum += 3; + /* It can add (0, 1, 2) to Escape_Freq */ + } + else + { + cf = (UInt32)4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); + sum += cf; + } + + c->Union2.SummFreq = (UInt16)sum; + s->Freq = (Byte)cf; } + c = SUFFIX(c); } - p->MaxContext = p->MinContext = CTX(fSuccessor); + while (c != mc); } + + +MY_NO_INLINE static void Rescale(CPpmd7 *p) { unsigned i, adder, sumFreq, escFreq; CPpmd_State *stats = STATS(p->MinContext); CPpmd_State *s = p->FoundState; + + /* Sort the list by Freq */ + if (s != stats) { CPpmd_State tmp = *s; - for (; s != stats; s--) + do s[0] = s[-1]; + while (--s != stats); *s = tmp; } - escFreq = p->MinContext->SummFreq - s->Freq; - s->Freq += 4; - adder = (p->OrderFall != 0); - s->Freq = (Byte)((s->Freq + adder) >> 1); + sumFreq = s->Freq; + escFreq = p->MinContext->Union2.SummFreq - sumFreq; + + /* + if (p->OrderFall == 0), adder = 0 : it's allowed to remove symbol from MAX Order context + if (p->OrderFall != 0), adder = 1 : it's NOT allowed to remove symbol from NON-MAX Order context + */ + + adder = (p->OrderFall != 0); + + #ifdef PPMD7_ORDER_0_SUPPPORT + adder |= (p->MaxOrder == 0); // we don't remove symbols from order-0 context + #endif + + sumFreq = (sumFreq + 4 + adder) >> 1; + i = (unsigned)p->MinContext->NumStats - 1; + s->Freq = (Byte)sumFreq; - i = p->MinContext->NumStats - 1; do { - escFreq -= (++s)->Freq; - s->Freq = (Byte)((s->Freq + adder) >> 1); - sumFreq += s->Freq; - if (s[0].Freq > s[-1].Freq) + unsigned freq = (++s)->Freq; + escFreq -= freq; + freq = (freq + adder) >> 1; + sumFreq += freq; + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { + CPpmd_State tmp = *s; CPpmd_State *s1 = s; - CPpmd_State tmp = *s1; do + { s1[0] = s1[-1]; - while (--s1 != stats && tmp.Freq > s1[-1].Freq); + } + while (--s1 != stats && freq > s1[-1].Freq); *s1 = tmp; } } @@ -608,47 +853,89 @@ static void Rescale(CPpmd7 *p) if (s->Freq == 0) { - unsigned numStats = p->MinContext->NumStats; - unsigned n0, n1; - do { i++; } while ((--s)->Freq == 0); + /* Remove all items with Freq == 0 */ + CPpmd7_Context *mc; + unsigned numStats, numStatsNew, n0, n1; + + i = 0; do { i++; } while ((--s)->Freq == 0); + + /* We increase (escFreq) for the number of removed symbols. + So we will have (0.5) increase for Escape_Freq in avarage per + removed symbol after Escape_Freq halving */ escFreq += i; - p->MinContext->NumStats = (UInt16)(p->MinContext->NumStats - i); - if (p->MinContext->NumStats == 1) + mc = p->MinContext; + numStats = mc->NumStats; + numStatsNew = numStats - i; + mc->NumStats = (UInt16)(numStatsNew); + n0 = (numStats + 1) >> 1; + + if (numStatsNew == 1) { - CPpmd_State tmp = *stats; + /* Create Single-Symbol context */ + unsigned freq = stats->Freq; + do { - tmp.Freq = (Byte)(tmp.Freq - (tmp.Freq >> 1)); escFreq >>= 1; + freq = (freq + 1) >> 1; } while (escFreq > 1); - InsertNode(p, stats, U2I(((numStats + 1) >> 1))); - *(p->FoundState = ONE_STATE(p->MinContext)) = tmp; + + s = ONE_STATE(mc); + *s = *stats; + s->Freq = (Byte)freq; // (freq <= 260 / 4) + p->FoundState = s; + InsertNode(p, stats, U2I(n0)); return; } - n0 = (numStats + 1) >> 1; - n1 = (p->MinContext->NumStats + 1) >> 1; + + n1 = (numStatsNew + 1) >> 1; if (n0 != n1) - p->MinContext->Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + { + // p->MinContext->Union4.Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + unsigned i0 = U2I(n0); + unsigned i1 = U2I(n1); + if (i0 != i1) + { + if (p->FreeList[i1] != 0) + { + void *ptr = RemoveNode(p, i1); + p->MinContext->Union4.Stats = STATS_REF(ptr); + MyMem12Cpy(ptr, (const void *)stats, n1); + InsertNode(p, stats, i0); + } + else + SplitBlock(p, stats, i0, i1); + } + } + } + { + CPpmd7_Context *mc = p->MinContext; + mc->Union2.SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); + // Escape_Freq halving here + p->FoundState = STATS(mc); } - p->MinContext->SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); - p->FoundState = STATS(p->MinContext); } + CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) { CPpmd_See *see; - unsigned nonMasked = p->MinContext->NumStats - numMasked; - if (p->MinContext->NumStats != 256) + const CPpmd7_Context *mc = p->MinContext; + unsigned numStats = mc->NumStats; + if (numStats != 256) { - see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + - (nonMasked < (unsigned)SUFFIX(p->MinContext)->NumStats - p->MinContext->NumStats) + - 2 * (unsigned)(p->MinContext->SummFreq < 11 * p->MinContext->NumStats) + - 4 * (unsigned)(numMasked > nonMasked) + + unsigned nonMasked = numStats - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - numStats) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * numStats) + + 4 * (unsigned)(numMasked > nonMasked) + p->HiBitsFlag; { - unsigned r = (see->Summ >> see->Shift); - see->Summ = (UInt16)(see->Summ - r); + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + unsigned summ = (UInt16)see->Summ; // & 0xFFFF + unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); *escFreq = r + (r == 0); } } @@ -660,53 +947,158 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) return see; } + static void NextContext(CPpmd7 *p) { CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); - if (p->OrderFall == 0 && (Byte *)c > p->Text) - p->MinContext = p->MaxContext = c; + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; else - UpdateModel(p); + Ppmd7_UpdateModel(p); } + void Ppmd7_Update1(CPpmd7 *p) { CPpmd_State *s = p->FoundState; - s->Freq += 4; - p->MinContext->SummFreq += 4; - if (s[0].Freq > s[-1].Freq) + unsigned freq = s->Freq; + freq += 4; + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SwapStates(s); p->FoundState = --s; - if (s->Freq > MAX_FREQ) + if (freq > MAX_FREQ) Rescale(p); } NextContext(p); } + void Ppmd7_Update1_0(CPpmd7 *p) { - p->PrevSuccess = (2 * p->FoundState->Freq > p->MinContext->SummFreq); - p->RunLength += p->PrevSuccess; - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) + CPpmd_State *s = p->FoundState; + CPpmd7_Context *mc = p->MinContext; + unsigned freq = s->Freq; + unsigned summFreq = mc->Union2.SummFreq; + p->PrevSuccess = (2 * freq > summFreq); + p->RunLength += (int)p->PrevSuccess; + mc->Union2.SummFreq = (UInt16)(summFreq + 4); + freq += 4; + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) Rescale(p); NextContext(p); } + +/* void Ppmd7_UpdateBin(CPpmd7 *p) { - p->FoundState->Freq = (Byte)(p->FoundState->Freq + (p->FoundState->Freq < 128 ? 1: 0)); + unsigned freq = p->FoundState->Freq; + p->FoundState->Freq = (Byte)(freq + (freq < 128)); p->PrevSuccess = 1; p->RunLength++; NextContext(p); } +*/ void Ppmd7_Update2(CPpmd7 *p) { - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) - Rescale(p); + CPpmd_State *s = p->FoundState; + unsigned freq = s->Freq; + freq += 4; p->RunLength = p->InitRL; - UpdateModel(p); + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) + Rescale(p); + Ppmd7_UpdateModel(p); } + + + +/* +PPMd Memory Map: +{ + [ 0 ] contains subset of original raw text, that is required to create context + records, Some symbols are not written, when max order context was reached + [ Text ] free area + [ UnitsStart ] CPpmd_State vectors and CPpmd7_Context records + [ LoUnit ] free area for CPpmd_State and CPpmd7_Context items +[ HiUnit ] CPpmd7_Context records + [ Size ] end of array +} + +These addresses don't cross at any time. +And the following condtions is true for addresses: + (0 <= Text < UnitsStart <= LoUnit <= HiUnit <= Size) + +Raw text is BYTE--aligned. +the data in block [ UnitsStart ... Size ] contains 12-bytes aligned UNITs. + +Last UNIT of array at offset (Size - 12) is root order-0 CPpmd7_Context record. +The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors. +The code doesn't free UNITs allocated for CPpmd7_Context records. + +The code calls RestartModel(), when there is no free memory for allocation. +And RestartModel() changes the state to orignal start state, with full free block. + + +The code allocates UNITs with the following order: + +Allocation of 1 UNIT for Context record + - from free space (HiUnit) down to (LoUnit) + - from FreeList[0] + - AllocUnitsRare() + +AllocUnits() for CPpmd_State vectors: + - from FreeList[i] + - from free space (LoUnit) up to (HiUnit) + - AllocUnitsRare() + +AllocUnitsRare() + - if (GlueCount == 0) + { Glue lists, GlueCount = 255, allocate from FreeList[i]] } + - loop for all higher sized FreeList[...] lists + - from (UnitsStart - Text), GlueCount-- + - ERROR + + +Each Record with Context contains the CPpmd_State vector, where each +CPpmd_State contains the link to Successor. +There are 3 types of Successor: + 1) NULL-Successor - NULL pointer. NULL-Successor links can be stored + only in 0-order Root Context Record. + We use 0 value as NULL-Successor + 2) RAW-Successor - the link to position in raw text, + that "RAW-Successor" is being created after first + occurrence of new symbol for some existing context record. + (RAW-Successor > 0). + 3) RECORD-Successor - the link to CPpmd7_Context record of (Order+1), + that record is being created when we go via RAW-Successor again. + +For any successors at any time: the following condtions are true for Successor links: +(NULL-Successor < RAW-Successor < UnitsStart <= RECORD-Successor) + + +---------- Symbol Frequency, SummFreq and Range in Range_Coder ---------- + +CPpmd7_Context::SummFreq = Sum(Stats[].Freq) + Escape_Freq + +The PPMd code tries to fulfill the condition: + (SummFreq <= (256 * 128 = RC::kBot)) + +We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124) +So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol. +If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7. +SummFreq and Escape_Freq can be changed in Rescale() and *Update*() functions. +Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Rescale() for +max-order context. + +When the PPMd code still break (Total <= RC::Range) condition in range coder, +we have two ways to resolve that problem: + 1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases. + 2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value. +*/ diff --git a/3rdparty/7z/src/Ppmd7.h b/3rdparty/7z/src/Ppmd7.h index cce93f1209..297e35fe9f 100644 --- a/3rdparty/7z/src/Ppmd7.h +++ b/3rdparty/7z/src/Ppmd7.h @@ -1,10 +1,8 @@ -/* Ppmd7.h -- PPMdH compression codec -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ - -/* This code supports virtual RangeDecoder and includes the implementation -of RangeCoder from 7z, instead of RangeCoder from original PPMd var.H. -If you need the compatibility with original PPMd var.H, you can use external RangeDecoder */ +/* Ppmd7.h -- Ppmd7 (PPMdH) compression codec +2021-04-13 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #ifndef __PPMD7_H #define __PPMD7_H @@ -21,23 +19,56 @@ EXTERN_C_BEGIN struct CPpmd7_Context_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Context_ * - #else - UInt32 - #endif - CPpmd7_Context_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Context_) CPpmd7_Context_Ref; + +// MY_CPU_pragma_pack_push_1 typedef struct CPpmd7_Context_ { UInt16 NumStats; - UInt16 SummFreq; - CPpmd_State_Ref Stats; + + + union + { + UInt16 SummFreq; + CPpmd_State2 State2; + } Union2; + + union + { + CPpmd_State_Ref Stats; + CPpmd_State4 State4; + } Union4; + CPpmd7_Context_Ref Suffix; } CPpmd7_Context; -#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->SummFreq) +// MY_CPU_pragma_pop + +#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->Union2) + + + + +typedef struct +{ + UInt32 Range; + UInt32 Code; + UInt32 Low; + IByteIn *Stream; +} CPpmd7_RangeDec; + + +typedef struct +{ + UInt32 Range; + Byte Cache; + // Byte _dummy_[3]; + UInt64 Low; + UInt64 CacheSize; + IByteOut *Stream; +} CPpmd7z_RangeEnc; + typedef struct { @@ -48,17 +79,30 @@ typedef struct UInt32 Size; UInt32 GlueCount; - Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; UInt32 AlignOffset; + Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; - Byte Indx2Units[PPMD_NUM_INDEXES]; + + + + union + { + CPpmd7_RangeDec dec; + CPpmd7z_RangeEnc enc; + } rc; + + Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment Byte Units2Indx[128]; CPpmd_Void_Ref FreeList[PPMD_NUM_INDEXES]; - Byte NS2Indx[256], NS2BSIndx[256], HB2Flag[256]; + + Byte NS2BSIndx[256], NS2Indx[256]; + Byte ExpEscape[16]; CPpmd_See DummySee, See[25][16]; UInt16 BinSumm[128][64]; + // int LastSymbol; } CPpmd7; + void Ppmd7_Construct(CPpmd7 *p); BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc); void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc); @@ -68,74 +112,69 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder); /* ---------- Internal Functions ---------- */ -extern const Byte PPMD7_kExpEscape[16]; - -#ifdef PPMD_32BIT - #define Ppmd7_GetPtr(p, ptr) (ptr) - #define Ppmd7_GetContext(p, ptr) (ptr) - #define Ppmd7_GetStats(p, ctx) ((ctx)->Stats) -#else - #define Ppmd7_GetPtr(p, offs) ((void *)((p)->Base + (offs))) - #define Ppmd7_GetContext(p, offs) ((CPpmd7_Context *)Ppmd7_GetPtr((p), (offs))) - #define Ppmd7_GetStats(p, ctx) ((CPpmd_State *)Ppmd7_GetPtr((p), ((ctx)->Stats))) -#endif +#define Ppmd7_GetPtr(p, ptr) Ppmd_GetPtr(p, ptr) +#define Ppmd7_GetContext(p, ptr) Ppmd_GetPtr_Type(p, ptr, CPpmd7_Context) +#define Ppmd7_GetStats(p, ctx) Ppmd_GetPtr_Type(p, (ctx)->Union4.Stats, CPpmd_State) void Ppmd7_Update1(CPpmd7 *p); void Ppmd7_Update1_0(CPpmd7 *p); void Ppmd7_Update2(CPpmd7 *p); -void Ppmd7_UpdateBin(CPpmd7 *p); + +#define PPMD7_HiBitsFlag_3(sym) ((((unsigned)sym + 0xC0) >> (8 - 3)) & (1 << 3)) +#define PPMD7_HiBitsFlag_4(sym) ((((unsigned)sym + 0xC0) >> (8 - 4)) & (1 << 4)) +// #define PPMD7_HiBitsFlag_3(sym) ((sym) < 0x40 ? 0 : (1 << 3)) +// #define PPMD7_HiBitsFlag_4(sym) ((sym) < 0x40 ? 0 : (1 << 4)) #define Ppmd7_GetBinSumm(p) \ - &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1][p->PrevSuccess + \ - p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] + \ - (p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]) + \ - 2 * p->HB2Flag[(unsigned)Ppmd7Context_OneState(p->MinContext)->Symbol] + \ - ((p->RunLength >> 26) & 0x20)] + &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1] \ + [ p->PrevSuccess + ((p->RunLength >> 26) & 0x20) \ + + p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] \ + + PPMD7_HiBitsFlag_4(Ppmd7Context_OneState(p->MinContext)->Symbol) \ + + (p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol)) ] CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *scale); +/* +We support two versions of Ppmd7 (PPMdH) methods that use same CPpmd7 structure: + 1) Ppmd7a_*: original PPMdH + 2) Ppmd7z_*: modified PPMdH with 7z Range Coder +Ppmd7_*: the structures and functions that are common for both versions of PPMd7 (PPMdH) +*/ + /* ---------- Decode ---------- */ -typedef struct IPpmd7_RangeDec IPpmd7_RangeDec; +#define PPMD7_SYM_END (-1) +#define PPMD7_SYM_ERROR (-2) -struct IPpmd7_RangeDec -{ - UInt32 (*GetThreshold)(const IPpmd7_RangeDec *p, UInt32 total); - void (*Decode)(const IPpmd7_RangeDec *p, UInt32 start, UInt32 size); - UInt32 (*DecodeBit)(const IPpmd7_RangeDec *p, UInt32 size0); -}; +/* +You must set (CPpmd7::rc.dec.Stream) before Ppmd7*_RangeDec_Init() -typedef struct -{ - IPpmd7_RangeDec vt; - UInt32 Range; - UInt32 Code; - IByteIn *Stream; -} CPpmd7z_RangeDec; +Ppmd7*_DecodeSymbol() +out: + >= 0 : decoded byte + -1 : PPMD7_SYM_END : End of payload marker + -2 : PPMD7_SYM_ERROR : Data error +*/ -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p); -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p); +/* Ppmd7a_* : original PPMdH */ +BoolInt Ppmd7a_RangeDec_Init(CPpmd7_RangeDec *p); +#define Ppmd7a_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +int Ppmd7a_DecodeSymbol(CPpmd7 *p); + +/* Ppmd7z_* : modified PPMdH with 7z Range Coder */ +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p); #define Ppmd7z_RangeDec_IsFinishedOK(p) ((p)->Code == 0) - -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc); +int Ppmd7z_DecodeSymbol(CPpmd7 *p); +// Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim); /* ---------- Encode ---------- */ -typedef struct -{ - UInt64 Low; - UInt32 Range; - Byte Cache; - UInt64 CacheSize; - IByteOut *Stream; -} CPpmd7z_RangeEnc; - -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p); -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p); - -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol); +void Ppmd7z_Init_RangeEnc(CPpmd7 *p); +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p); +// void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol); +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim); EXTERN_C_END diff --git a/3rdparty/7z/src/Ppmd7Dec.c b/3rdparty/7z/src/Ppmd7Dec.c index 2026407108..a18f0b8733 100644 --- a/3rdparty/7z/src/Ppmd7Dec.c +++ b/3rdparty/7z/src/Ppmd7Dec.c @@ -1,6 +1,8 @@ -/* Ppmd7Dec.c -- PPMdH Decoder -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder +2021-04-13 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" @@ -8,184 +10,288 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #define kTopValue (1 << 24) -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p) + +#define READ_BYTE(p) IByteIn_Read((p)->Stream) + +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p) { unsigned i; p->Code = 0; p->Range = 0xFFFFFFFF; - if (IByteIn_Read(p->Stream) != 0) + if (READ_BYTE(p) != 0) return False; for (i = 0; i < 4; i++) - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); + p->Code = (p->Code << 8) | READ_BYTE(p); return (p->Code < 0xFFFFFFFF); } -#define GET_Ppmd7z_RangeDec CPpmd7z_RangeDec *p = CONTAINER_FROM_VTBL(pp, CPpmd7z_RangeDec, vt); - -static UInt32 Range_GetThreshold(const IPpmd7_RangeDec *pp, UInt32 total) +#define RC_NORM_BASE(p) if ((p)->Range < kTopValue) \ + { (p)->Code = ((p)->Code << 8) | READ_BYTE(p); (p)->Range <<= 8; + +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} + +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) + +#define R (&p->rc.dec) + +MY_FORCE_INLINE +// MY_NO_INLINE +static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size) { - GET_Ppmd7z_RangeDec - return p->Code / (p->Range /= total); + + + R->Code -= start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(R) } -static void Range_Normalize(CPpmd7z_RangeDec *p) -{ - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - } - } -} - -static void Range_Decode(const IPpmd7_RangeDec *pp, UInt32 start, UInt32 size) -{ - GET_Ppmd7z_RangeDec - p->Code -= start * p->Range; - p->Range *= size; - Range_Normalize(p); -} - -static UInt32 Range_DecodeBit(const IPpmd7_RangeDec *pp, UInt32 size0) -{ - GET_Ppmd7z_RangeDec - UInt32 newBound = (p->Range >> 14) * size0; - UInt32 symbol; - if (p->Code < newBound) - { - symbol = 0; - p->Range = newBound; - } - else - { - symbol = 1; - p->Code -= newBound; - p->Range -= newBound; - } - Range_Normalize(p); - return symbol; -} - -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p) -{ - p->vt.GetThreshold = Range_GetThreshold; - p->vt.Decode = Range_Decode; - p->vt.DecodeBit = Range_DecodeBit; -} +#define RC_Decode(start, size) RangeDec_Decode(p, start, size); +#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) +#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) -#define MASK(sym) ((signed char *)charMask)[sym] +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) +void Ppmd7_UpdateModel(CPpmd7 *p); -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc) +#define MASK(sym) ((unsigned char *)charMask)[sym] +// MY_FORCE_INLINE +// static +int Ppmd7z_DecodeSymbol(CPpmd7 *p) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); unsigned i; UInt32 count, hiCnt; - if ((count = rc->GetThreshold(rc, p->MinContext->SummFreq)) < (hiCnt = s->Freq)) + UInt32 summFreq = p->MinContext->Union2.SummFreq; + + + + + count = RC_GetThreshold(summFreq); + hiCnt = count; + + if ((Int32)(count -= s->Freq) < 0) { - Byte symbol; - rc->Decode(rc, 0, s->Freq); + Byte sym; + RC_DecodeFinal(0, s->Freq); p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1_0(p); - return symbol; + return sym; } + p->PrevSuccess = 0; - i = p->MinContext->NumStats - 1; + i = (unsigned)p->MinContext->NumStats - 1; + do { - if ((hiCnt += (++s)->Freq) > count) + if ((Int32)(count -= (++s)->Freq) < 0) { - Byte symbol; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); + Byte sym; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1(p); - return symbol; + return sym; } } while (--i); - if (count >= p->MinContext->SummFreq) - return -2; - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; - rc->Decode(rc, hiCnt, p->MinContext->SummFreq - hiCnt); + + if (hiCnt >= summFreq) + return PPMD7_SYM_ERROR; + + hiCnt -= count; + RC_Decode(hiCnt, summFreq - hiCnt); + + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + unsigned sym0 = s2[0].Symbol; + unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { + CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); UInt16 *prob = Ppmd7_GetBinSumm(p); - if (rc->DecodeBit(rc, *prob) == 0) + UInt32 pr = *prob; + UInt32 size0 = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); + + if (R->Code < size0) { - Byte symbol; - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - symbol = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; - Ppmd7_UpdateBin(p); - return symbol; + Byte sym; + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + + // RangeDec_DecodeBit0(size0); + R->Range = size0; + RC_NORM_1(R) + /* we can use single byte normalization here because of + (min(BinSumm[][]) = 95) > (1 << (14 - 8)) */ + + // sym = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; + // Ppmd7_UpdateBin(p); + { + unsigned freq = s->Freq; + CTX_PTR c = CTX(SUCCESSOR(s)); + sym = s->Symbol; + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } + return sym; } - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + + // RangeDec_DecodeBit1(size0); + + R->Code -= size0; + R->Range -= size0; + RC_NORM_LOCAL(R) + PPMD_SetAllBitsIn256Bytes(charMask); MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0; p->PrevSuccess = 0; } + for (;;) { - CPpmd_State *ps[256], *s; + CPpmd_State *s, *s2; UInt32 freqSum, count, hiCnt; + CPpmd_See *see; - unsigned i, num, numMasked = p->MinContext->NumStats; + CPpmd7_Context *mc; + unsigned numMasked; + RC_NORM_REMOTE(R) + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) - return -1; - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + if (!mc->Suffix) + return PPMD7_SYM_END; + mc = Ppmd7_GetContext(p, mc->Suffix); } - while (p->MinContext->NumStats == numMasked); - hiCnt = 0; - s = Ppmd7_GetStats(p, p->MinContext); - i = 0; - num = p->MinContext->NumStats - numMasked; - do - { - int k = (int)(MASK(s->Symbol)); - hiCnt += (s->Freq & k); - ps[i] = s++; - i -= k; - } - while (i != num); + while (mc->NumStats == numMasked); + s = Ppmd7_GetStats(p, mc); + + { + unsigned num = mc->NumStats; + unsigned num2 = num / 2; + + num &= 1; + hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); + s += num; + p->MinContext = mc; + + do + { + unsigned sym0 = s[0].Symbol; + unsigned sym1 = s[1].Symbol; + s += 2; + hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); + hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); + } + while (--num2); + } + see = Ppmd7_MakeEscFreq(p, numMasked, &freqSum); freqSum += hiCnt; - count = rc->GetThreshold(rc, freqSum); + + + + + count = RC_GetThreshold(freqSum); if (count < hiCnt) { - Byte symbol; - CPpmd_State **pps = ps; - for (hiCnt = 0; (hiCnt += (*pps)->Freq) <= count; pps++); - s = *pps; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); + Byte sym; + + s = Ppmd7_GetStats(p, p->MinContext); + hiCnt = count; + // count -= s->Freq & (unsigned)(MASK(s->Symbol)); + // if ((Int32)count >= 0) + { + for (;;) + { + count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + }; + } + s--; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); + + // new (see->Summ) value can overflow over 16-bits in some rare cases Ppmd_See_Update(see); p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update2(p); - return symbol; + return sym; } + if (count >= freqSum) - return -2; - rc->Decode(rc, hiCnt, freqSum - hiCnt); + return PPMD7_SYM_ERROR; + + RC_Decode(hiCnt, freqSum - hiCnt); + + // We increase (see->Summ) for sum of Freqs of all non_Masked symbols. + // new (see->Summ) value can overflow over 16-bits in some rare cases see->Summ = (UInt16)(see->Summ + freqSum); - do { MASK(ps[--i]->Symbol) = 0; } while (i != 0); + + s = Ppmd7_GetStats(p, p->MinContext); + s2 = s + p->MinContext->NumStats; + do + { + MASK(s->Symbol) = 0; + s++; + } + while (s != s2); } } + +/* +Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim) +{ + int sym = 0; + if (buf != lim) + do + { + sym = Ppmd7z_DecodeSymbol(p); + if (sym < 0) + break; + *buf = (Byte)sym; + } + while (++buf < lim); + p->LastSymbol = sym; + return buf; +} +*/ diff --git a/3rdparty/7z/src/Ppmd7Enc.c b/3rdparty/7z/src/Ppmd7Enc.c index a74d3002b6..6af1ec15ec 100644 --- a/3rdparty/7z/src/Ppmd7Enc.c +++ b/3rdparty/7z/src/Ppmd7Enc.c @@ -1,6 +1,8 @@ -/* Ppmd7Enc.c -- PPMdH Encoder -2017-04-03 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder +2021-04-13 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" @@ -8,65 +10,60 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #define kTopValue (1 << 24) -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p) +#define R (&p->rc.enc) + +void Ppmd7z_Init_RangeEnc(CPpmd7 *p) { - p->Low = 0; - p->Range = 0xFFFFFFFF; - p->Cache = 0; - p->CacheSize = 1; + R->Low = 0; + R->Range = 0xFFFFFFFF; + R->Cache = 0; + R->CacheSize = 1; } -static void RangeEnc_ShiftLow(CPpmd7z_RangeEnc *p) +MY_NO_INLINE +static void RangeEnc_ShiftLow(CPpmd7 *p) { - if ((UInt32)p->Low < (UInt32)0xFF000000 || (unsigned)(p->Low >> 32) != 0) + if ((UInt32)R->Low < (UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0) { - Byte temp = p->Cache; + Byte temp = R->Cache; do { - IByteOut_Write(p->Stream, (Byte)(temp + (Byte)(p->Low >> 32))); + IByteOut_Write(R->Stream, (Byte)(temp + (Byte)(R->Low >> 32))); temp = 0xFF; } - while (--p->CacheSize != 0); - p->Cache = (Byte)((UInt32)p->Low >> 24); + while (--R->CacheSize != 0); + R->Cache = (Byte)((UInt32)R->Low >> 24); } - p->CacheSize++; - p->Low = (UInt32)p->Low << 8; + R->CacheSize++; + R->Low = (UInt32)((UInt32)R->Low << 8); } -static void RangeEnc_Encode(CPpmd7z_RangeEnc *p, UInt32 start, UInt32 size, UInt32 total) +#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; RangeEnc_ShiftLow(p); +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} + +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) + +/* +#define RangeEnc_Encode(p, start, _size_) \ + { UInt32 size = _size_; \ + R->Low += start * R->Range; \ + R->Range *= size; \ + RC_NORM_LOCAL(p); } +*/ + +MY_FORCE_INLINE +// MY_NO_INLINE +static void RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size) { - p->Low += start * (p->Range /= total); - p->Range *= size; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } + R->Low += start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(p); } -static void RangeEnc_EncodeBit_0(CPpmd7z_RangeEnc *p, UInt32 size0) -{ - p->Range = (p->Range >> 14) * size0; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} - -static void RangeEnc_EncodeBit_1(CPpmd7z_RangeEnc *p, UInt32 size0) -{ - UInt32 newBound = (p->Range >> 14) * size0; - p->Low += newBound; - p->Range -= newBound; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} - -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p) +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) { unsigned i; for (i = 0; i < 5; i++) @@ -74,31 +71,53 @@ void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p) } -#define MASK(sym) ((signed char *)charMask)[sym] -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) +#define RC_Encode(start, size) RangeEnc_Encode(p, start, size); +#define RC_EncodeFinal(start, size) RC_Encode(start, size); RC_NORM_REMOTE(p); + +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +#define SUFFIX(ctx) CTX((ctx)->Suffix) +typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) + +void Ppmd7_UpdateModel(CPpmd7 *p); + +#define MASK(sym) ((unsigned char *)charMask)[sym] + +MY_FORCE_INLINE +static +void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); UInt32 sum; unsigned i; + + + + + R->Range /= p->MinContext->Union2.SummFreq; + if (s->Symbol == symbol) { - RangeEnc_Encode(rc, 0, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(0, s->Freq); p->FoundState = s; Ppmd7_Update1_0(p); return; } p->PrevSuccess = 0; sum = s->Freq; - i = p->MinContext->NumStats - 1; + i = (unsigned)p->MinContext->NumStats - 1; do { if ((++s)->Symbol == symbol) { - RangeEnc_Encode(rc, sum, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(sum, s->Freq); p->FoundState = s; Ppmd7_Update1(p); return; @@ -106,82 +125,199 @@ void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) sum += s->Freq; } while (--i); + + // R->Range /= p->MinContext->Union2.SummFreq; + RC_Encode(sum, p->MinContext->Union2.SummFreq - sum); - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); - RangeEnc_Encode(rc, sum, p->MinContext->SummFreq - sum, p->MinContext->SummFreq); + // MASK(s->Symbol) = 0; + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + unsigned sym0 = s2[0].Symbol; + unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { UInt16 *prob = Ppmd7_GetBinSumm(p); CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); + UInt32 pr = *prob; + UInt32 bound = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); if (s->Symbol == symbol) { - RangeEnc_EncodeBit_0(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - p->FoundState = s; - Ppmd7_UpdateBin(p); + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + // RangeEnc_EncodeBit_0(p, bound); + R->Range = bound; + RC_NORM_1(p); + + // p->FoundState = s; + // Ppmd7_UpdateBin(p); + { + unsigned freq = s->Freq; + CTX_PTR c = CTX(SUCCESSOR(s)); + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } return; } - else - { - RangeEnc_EncodeBit_1(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - p->PrevSuccess = 0; - } + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + // RangeEnc_EncodeBit_1(p, bound); + R->Low += bound; + R->Range -= bound; + RC_NORM_LOCAL(p) + + PPMD_SetAllBitsIn256Bytes(charMask); + MASK(s->Symbol) = 0; + p->PrevSuccess = 0; } + for (;;) { - UInt32 escFreq; CPpmd_See *see; CPpmd_State *s; - UInt32 sum; - unsigned i, numMasked = p->MinContext->NumStats; + UInt32 sum, escFreq; + CPpmd7_Context *mc; + unsigned i, numMasked; + + RC_NORM_REMOTE(p) + + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) + if (!mc->Suffix) return; /* EndMarker (symbol = -1) */ - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + mc = Ppmd7_GetContext(p, mc->Suffix); + i = mc->NumStats; } - while (p->MinContext->NumStats == numMasked); + while (i == numMasked); + + p->MinContext = mc; - see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); - s = Ppmd7_GetStats(p, p->MinContext); + // see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); + { + if (i != 256) + { + unsigned nonMasked = i - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + p->HiBitsFlag + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - i) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * i) + + 4 * (unsigned)(numMasked > nonMasked); + { + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + unsigned summ = (UInt16)see->Summ; // & 0xFFFF + unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); + escFreq = r + (r == 0); + } + } + else + { + see = &p->DummySee; + escFreq = 1; + } + } + + s = Ppmd7_GetStats(p, mc); sum = 0; - i = p->MinContext->NumStats; + // i = mc->NumStats; + do { - int cur = s->Symbol; - if (cur == symbol) + unsigned cur = s->Symbol; + if ((int)cur == symbol) { UInt32 low = sum; - CPpmd_State *s1 = s; - do - { - sum += (s->Freq & (int)(MASK(s->Symbol))); - s++; - } - while (--i); - RangeEnc_Encode(rc, low, s1->Freq, sum + escFreq); + UInt32 freq = s->Freq; + unsigned num2; + Ppmd_See_Update(see); - p->FoundState = s1; + p->FoundState = s; + sum += escFreq; + + num2 = i / 2; + i &= 1; + sum += freq & (0 - (UInt32)i); + if (num2 != 0) + { + s += i; + for (;;) + { + unsigned sym0 = s[0].Symbol; + unsigned sym1 = s[1].Symbol; + s += 2; + sum += (s[-2].Freq & (unsigned)(MASK(sym0))); + sum += (s[-1].Freq & (unsigned)(MASK(sym1))); + if (--num2 == 0) + break; + } + } + + + R->Range /= sum; + RC_EncodeFinal(low, freq); Ppmd7_Update2(p); return; } - sum += (s->Freq & (int)(MASK(cur))); - MASK(cur) = 0; + sum += (s->Freq & (unsigned)(MASK(cur))); s++; } while (--i); - RangeEnc_Encode(rc, sum, escFreq, sum + escFreq); - see->Summ = (UInt16)(see->Summ + sum + escFreq); + { + UInt32 total = sum + escFreq; + see->Summ = (UInt16)(see->Summ + total); + + R->Range /= total; + RC_Encode(sum, escFreq); + } + + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + s--; + MASK(s->Symbol) = 0; + do + { + unsigned sym0 = s2[0].Symbol; + unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } + } +} + + +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim) +{ + for (; buf < lim; buf++) + { + Ppmd7z_EncodeSymbol(p, *buf); } } diff --git a/3rdparty/7z/src/Sha256.c b/3rdparty/7z/src/Sha256.c index 90994e5abb..c03b75afe6 100644 --- a/3rdparty/7z/src/Sha256.c +++ b/3rdparty/7z/src/Sha256.c @@ -1,5 +1,5 @@ -/* Crypto/Sha256.c -- SHA-256 Hash -2017-04-03 : Igor Pavlov : Public domain +/* Sha256.c -- SHA-256 Hash +2021-04-01 : Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. */ #include "Precomp.h" @@ -10,16 +10,107 @@ This code is based on public domain code from Wei Dai's Crypto++ library. */ #include "RotateDefs.h" #include "Sha256.h" -/* define it for speed optimization */ -#ifndef _SFX -#define _SHA256_UNROLL -#define _SHA256_UNROLL2 +#if defined(_MSC_VER) && (_MSC_VER < 1900) +// #define USE_MY_MM #endif -/* #define _SHA256_UNROLL2 */ +#ifdef MY_CPU_X86_OR_AMD64 + #ifdef _MSC_VER + #if _MSC_VER >= 1200 + #define _SHA_SUPPORTED + #endif + #elif defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define _SHA_SUPPORTED + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 8) // fix that check + #define _SHA_SUPPORTED + #endif + #elif defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1800) // fix that check + #define _SHA_SUPPORTED + #endif + #endif +#elif defined(MY_CPU_ARM_OR_ARM64) + #ifdef _MSC_VER + #if _MSC_VER >= 1910 + #define _SHA_SUPPORTED + #endif + #elif defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define _SHA_SUPPORTED + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 6) // fix that check + #define _SHA_SUPPORTED + #endif + #endif +#endif -void Sha256_Init(CSha256 *p) +void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +#ifdef _SHA_SUPPORTED + void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); + + static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; + static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS_HW; + + #define UPDATE_BLOCKS(p) p->func_UpdateBlocks +#else + #define UPDATE_BLOCKS(p) Sha256_UpdateBlocks +#endif + + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) { + SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks; + + #ifdef _SHA_SUPPORTED + if (algo != SHA256_ALGO_SW) + { + if (algo == SHA256_ALGO_DEFAULT) + func = g_FUNC_UPDATE_BLOCKS; + else + { + if (algo != SHA256_ALGO_HW) + return False; + func = g_FUNC_UPDATE_BLOCKS_HW; + if (!func) + return False; + } + } + #else + if (algo > 1) + return False; + #endif + + p->func_UpdateBlocks = func; + return True; +} + + +/* define it for speed optimization */ + +#ifdef _SFX + #define STEP_PRE 1 + #define STEP_MAIN 1 +#else + #define STEP_PRE 2 + #define STEP_MAIN 4 + // #define _SHA256_UNROLL +#endif + +#if STEP_MAIN != 16 + #define _SHA256_BIG_W +#endif + + + + +void Sha256_InitState(CSha256 *p) +{ + p->count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -28,7 +119,17 @@ void Sha256_Init(CSha256 *p) p->state[5] = 0x9b05688c; p->state[6] = 0x1f83d9ab; p->state[7] = 0x5be0cd19; - p->count = 0; +} + +void Sha256_Init(CSha256 *p) +{ + p->func_UpdateBlocks = + #ifdef _SHA_SUPPORTED + g_FUNC_UPDATE_BLOCKS; + #else + NULL; + #endif + Sha256_InitState(p); } #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) @@ -36,61 +137,100 @@ void Sha256_Init(CSha256 *p) #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) -#define blk0(i) (W[i]) -#define blk2(i) (W[i] += s1(W[((i)-2)&15]) + W[((i)-7)&15] + s0(W[((i)-15)&15])) - #define Ch(x,y,z) (z^(x&(y^z))) #define Maj(x,y,z) ((x&y)|(z&(x|y))) -#ifdef _SHA256_UNROLL2 -#define R(a,b,c,d,e,f,g,h, i) \ - h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ +#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe32(data + ((size_t)(j) + i) * 4)) + +#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) + +#ifdef _SHA256_BIG_W + // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned. + #define w(j, i) W[(size_t)(j) + i] + #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) +#else + #if STEP_MAIN == 16 + #define w(j, i) W[(i) & 15] + #else + #define w(j, i) W[((size_t)(j) + (i)) & 15] + #endif + #define blk2(j, i) (w(j, i) += blk2_main(j, i)) +#endif + +#define W_MAIN(i) blk2(j, i) + + +#define T1(wx, i) \ + tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + h = g; \ + g = f; \ + f = e; \ + e = d + tmp; \ + tmp += S0(a) + Maj(a, b, c); \ + d = c; \ + c = b; \ + b = a; \ + a = tmp; \ + +#define R1_PRE(i) T1( W_PRE, i) +#define R1_MAIN(i) T1( W_MAIN, i) + +#if (!defined(_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) +#define R2_MAIN(i) \ + R1_MAIN(i) \ + R1_MAIN(i + 1) \ + +#endif + + + +#if defined(_SHA256_UNROLL) && STEP_MAIN >= 8 + +#define T4( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + tmp = h; \ + h += d; \ + d = tmp + S0(a) + Maj(a, b, c); \ + +#define R4( wx, i) \ + T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ + T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ + T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ + T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ + +#define R4_PRE(i) R4( W_PRE, i) +#define R4_MAIN(i) R4( W_MAIN, i) + + +#define T8( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ d += h; \ - h += S0(a) + Maj(a, b, c) + h += S0(a) + Maj(a, b, c); \ -#define RX_8(i) \ - R(a,b,c,d,e,f,g,h, i); \ - R(h,a,b,c,d,e,f,g, i+1); \ - R(g,h,a,b,c,d,e,f, i+2); \ - R(f,g,h,a,b,c,d,e, i+3); \ - R(e,f,g,h,a,b,c,d, i+4); \ - R(d,e,f,g,h,a,b,c, i+5); \ - R(c,d,e,f,g,h,a,b, i+6); \ - R(b,c,d,e,f,g,h,a, i+7) +#define R8( wx, i) \ + T8 ( a,b,c,d,e,f,g,h, wx, i ); \ + T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ + T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ + T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ + T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ + T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ + T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ + T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ -#define RX_16 RX_8(0); RX_8(8); - -#else - -#define a(i) T[(0-(i))&7] -#define b(i) T[(1-(i))&7] -#define c(i) T[(2-(i))&7] -#define d(i) T[(3-(i))&7] -#define e(i) T[(4-(i))&7] -#define f(i) T[(5-(i))&7] -#define g(i) T[(6-(i))&7] -#define h(i) T[(7-(i))&7] - -#define R(i) \ - h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ - d(i) += h(i); \ - h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) \ - -#ifdef _SHA256_UNROLL - -#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); -#define RX_16 RX_8(0); RX_8(8); - -#else - -#define RX_16 unsigned i; for (i = 0; i < 16; i++) { R(i); } +#define R8_PRE(i) R8( W_PRE, i) +#define R8_MAIN(i) R8( W_MAIN, i) #endif -#endif +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); -static const UInt32 K[64] = { +// static +extern MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; + +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -109,30 +249,27 @@ static const UInt32 K[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -static void Sha256_WriteByteBlock(CSha256 *p) -{ - UInt32 W[16]; - unsigned j; - UInt32 *state; +#define K SHA256_K_ARRAY - #ifdef _SHA256_UNROLL2 - UInt32 a,b,c,d,e,f,g,h; + +MY_NO_INLINE +void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + UInt32 W + #ifdef _SHA256_BIG_W + [64]; #else - UInt32 T[8]; + [16]; #endif - for (j = 0; j < 16; j += 4) - { - const Byte *ccc = p->buffer + j * 4; - W[j ] = GetBe32(ccc); - W[j + 1] = GetBe32(ccc + 4); - W[j + 2] = GetBe32(ccc + 8); - W[j + 3] = GetBe32(ccc + 12); - } + unsigned j; - state = p->state; + UInt32 a,b,c,d,e,f,g,h; - #ifdef _SHA256_UNROLL2 + #if !defined(_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) + UInt32 tmp; + #endif + a = state[0]; b = state[1]; c = state[2]; @@ -141,39 +278,96 @@ static void Sha256_WriteByteBlock(CSha256 *p) f = state[5]; g = state[6]; h = state[7]; - #else - for (j = 0; j < 8; j++) - T[j] = state[j]; - #endif - for (j = 0; j < 64; j += 16) + while (numBlocks) { - RX_16 + + for (j = 0; j < 16; j += STEP_PRE) + { + #if STEP_PRE > 4 + + #if STEP_PRE < 8 + R4_PRE(0); + #else + R8_PRE(0); + #if STEP_PRE == 16 + R8_PRE(8); + #endif + #endif + + #else + + R1_PRE(0); + #if STEP_PRE >= 2 + R1_PRE(1); + #if STEP_PRE >= 4 + R1_PRE(2); + R1_PRE(3); + #endif + #endif + + #endif + } + + for (j = 16; j < 64; j += STEP_MAIN) + { + #if defined(_SHA256_UNROLL) && STEP_MAIN >= 8 + + #if STEP_MAIN < 8 + R4_MAIN(0); + #else + R8_MAIN(0); + #if STEP_MAIN == 16 + R8_MAIN(8); + #endif + #endif + + #else + + R1_MAIN(0); + #if STEP_MAIN >= 2 + R1_MAIN(1); + #if STEP_MAIN >= 4 + R2_MAIN(2); + #if STEP_MAIN >= 8 + R2_MAIN(4); + R2_MAIN(6); + #if STEP_MAIN >= 16 + R2_MAIN(8); + R2_MAIN(10); + R2_MAIN(12); + R2_MAIN(14); + #endif + #endif + #endif + #endif + #endif + } + + a += state[0]; state[0] = a; + b += state[1]; state[1] = b; + c += state[2]; state[2] = c; + d += state[3]; state[3] = d; + e += state[4]; state[4] = e; + f += state[5]; state[5] = f; + g += state[6]; state[6] = g; + h += state[7]; state[7] = h; + + data += 64; + numBlocks--; } - #ifdef _SHA256_UNROLL2 - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - #else - for (j = 0; j < 8; j++) - state[j] += T[j]; - #endif - /* Wipe variables */ /* memset(W, 0, sizeof(W)); */ - /* memset(T, 0, sizeof(T)); */ } #undef S0 #undef S1 #undef s0 #undef s1 +#undef K + +#define Sha256_UpdateBlock(p) UPDATE_BLOCKS(p)(p->state, p->buffer, 1) void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { @@ -193,25 +387,26 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) return; } - size -= num; - memcpy(p->buffer + pos, data, num); - data += num; + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Sha256_UpdateBlock(p); + } } - - for (;;) { - Sha256_WriteByteBlock(p); - if (size < 64) - break; - size -= 64; - memcpy(p->buffer, data, 64); - data += 64; - } - - if (size != 0) + size_t numBlocks = size >> 6; + UPDATE_BLOCKS(p)(p->state, data, numBlocks); + size &= 0x3F; + if (size == 0) + return; + data += (numBlocks << 6); memcpy(p->buffer, data, size); + } } + void Sha256_Final(CSha256 *p, Byte *digest) { unsigned pos = (unsigned)p->count & 0x3F; @@ -219,30 +414,73 @@ void Sha256_Final(CSha256 *p, Byte *digest) p->buffer[pos++] = 0x80; - while (pos != (64 - 8)) + if (pos > (64 - 8)) { - pos &= 0x3F; - if (pos == 0) - Sha256_WriteByteBlock(p); - p->buffer[pos++] = 0; + while (pos != 64) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, 64 - pos); + Sha256_UpdateBlock(p); + pos = 0; } + /* + if (pos & 3) + { + p->buffer[pos] = 0; + p->buffer[pos + 1] = 0; + p->buffer[pos + 2] = 0; + pos += 3; + pos &= ~3; + } + { + for (; pos < 64 - 8; pos += 4) + *(UInt32 *)(&p->buffer[pos]) = 0; + } + */ + + memset(&p->buffer[pos], 0, (64 - 8) - pos); + { UInt64 numBits = (p->count << 3); SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)); SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)); } - Sha256_WriteByteBlock(p); + Sha256_UpdateBlock(p); for (i = 0; i < 8; i += 2) { UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[i + 1]; + UInt32 v1 = p->state[(size_t)i + 1]; SetBe32(digest , v0); SetBe32(digest + 4, v1); digest += 8; } - Sha256_Init(p); + Sha256_InitState(p); +} + + +void Sha256Prepare() +{ + #ifdef _SHA_SUPPORTED + SHA256_FUNC_UPDATE_BLOCKS f, f_hw; + f = Sha256_UpdateBlocks; + f_hw = NULL; + #ifdef MY_CPU_X86_OR_AMD64 + #ifndef USE_MY_MM + if (CPU_IsSupported_SHA() + && CPU_IsSupported_SSSE3() + // && CPU_IsSupported_SSE41() + ) + #endif + #else + if (CPU_IsSupported_SHA2()) + #endif + { + // printf("\n========== HW SHA256 ======== \n"); + f = f_hw = Sha256_UpdateBlocks_HW; + } + g_FUNC_UPDATE_BLOCKS = f; + g_FUNC_UPDATE_BLOCKS_HW = f_hw; + #endif } diff --git a/3rdparty/7z/src/Sha256.h b/3rdparty/7z/src/Sha256.h index 7f17ccf9c9..f529339865 100644 --- a/3rdparty/7z/src/Sha256.h +++ b/3rdparty/7z/src/Sha256.h @@ -1,26 +1,76 @@ /* Sha256.h -- SHA-256 Hash -2013-01-18 : Igor Pavlov : Public domain */ +2021-01-01 : Igor Pavlov : Public domain */ -#ifndef __CRYPTO_SHA256_H -#define __CRYPTO_SHA256_H +#ifndef __7Z_SHA256_H +#define __7Z_SHA256_H #include "7zTypes.h" EXTERN_C_BEGIN -#define SHA256_DIGEST_SIZE 32 +#define SHA256_NUM_BLOCK_WORDS 16 +#define SHA256_NUM_DIGEST_WORDS 8 + +#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) +#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + +typedef void (MY_FAST_CALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* + if (the system supports different SHA256 code implementations) + { + (CSha256::func_UpdateBlocks) will be used + (CSha256::func_UpdateBlocks) can be set by + Sha256_Init() - to default (fastest) + Sha256_SetFunction() - to any algo + } + else + { + (CSha256::func_UpdateBlocks) is ignored. + } +*/ typedef struct { - UInt32 state[8]; + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; UInt64 count; - Byte buffer[64]; + UInt64 __pad_2[2]; + UInt32 state[SHA256_NUM_DIGEST_WORDS]; + + Byte buffer[SHA256_BLOCK_SIZE]; } CSha256; + +#define SHA256_ALGO_DEFAULT 0 +#define SHA256_ALGO_SW 1 +#define SHA256_ALGO_HW 2 + +/* +Sha256_SetFunction() +return: + 0 - (algo) value is not supported, and func_UpdateBlocks was not changed + 1 - func_UpdateBlocks was set according (algo) value. +*/ + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo); + +void Sha256_InitState(CSha256 *p); void Sha256_Init(CSha256 *p); void Sha256_Update(CSha256 *p, const Byte *data, size_t size); void Sha256_Final(CSha256 *p, Byte *digest); + + + +// void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* +call Sha256Prepare() once at program start. +It prepares all supported implementations, and detects the fastest implementation. +*/ + +void Sha256Prepare(void); + EXTERN_C_END #endif diff --git a/3rdparty/7z/src/Sha256Opt.c b/3rdparty/7z/src/Sha256Opt.c new file mode 100644 index 0000000000..cc8c53e1b6 --- /dev/null +++ b/3rdparty/7z/src/Sha256Opt.c @@ -0,0 +1,373 @@ +/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions +2021-04-01 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#if defined(_MSC_VER) +#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) +// #define USE_MY_MM +#endif +#endif + +#include "CpuArch.h" + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_SHA + #ifndef __SHA__ + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + #if defined(_MSC_VER) + // SSSE3: for clang-cl: + #include + #define __SHA__ + #endif + #endif + + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 8) // fix that check + #define USE_HW_SHA + #ifndef __SHA__ + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + // #pragma GCC target("sha,ssse3") + #endif + #endif + #elif defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1800) // fix that check + #define USE_HW_SHA + #endif + #elif defined(_MSC_VER) + #ifdef USE_MY_MM + #define USE_VER_MIN 1300 + #else + #define USE_VER_MIN 1910 + #endif + #if _MSC_VER >= USE_VER_MIN + #define USE_HW_SHA + #endif + #endif +// #endif // MY_CPU_X86_OR_AMD64 + +#ifdef USE_HW_SHA + +// #pragma message("Sha256 HW") +// #include + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +#include +#else +#include + +#if defined(_MSC_VER) && (_MSC_VER >= 1600) +// #include +#endif + +#ifdef USE_MY_MM +#include "My_mm.h" +#endif + +#endif + +/* +SHA256 uses: +SSE2: + _mm_loadu_si128 + _mm_storeu_si128 + _mm_set_epi32 + _mm_add_epi32 + _mm_shuffle_epi32 / pshufd + + + +SSSE3: + _mm_shuffle_epi8 / pshufb + _mm_alignr_epi8 +SHA: + _mm_sha256* +*/ + +// K array must be aligned for 16-bytes at least. +// The compiler can look align attribute and selects +// movdqu - for code without align attribute +// movdqa - for code with align attribute +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; + +#define K SHA256_K_ARRAY + + +#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); +#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); +#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); + + +#define LOAD_SHUFFLE(m, k) \ + m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ + m = _mm_shuffle_epi8(m, mask); \ + +#define SM1(g0, g1, g2, g3) \ + SHA256_MSG1(g3, g0); \ + +#define SM2(g0, g1, g2, g3) \ + tmp = _mm_alignr_epi8(g1, g0, 4); \ + ADD_EPI32(g2, tmp); \ + SHA25G_MSG2(g2, g1); \ + +// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) +// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) + + +#define NNN(g0, g1, g2, g3) + + +#define RND2(t0, t1) \ + t0 = _mm_sha256rnds2_epu32(t0, t1, msg); + +#define RND2_0(m, k) \ + msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ + RND2(state0, state1); \ + msg = _mm_shuffle_epi32(msg, 0x0E); \ + + +#define RND2_1 \ + RND2(state1, state0); \ + + +// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 + +#define R4(k, g0, g1, g2, g3, OP0, OP1) \ + RND2_0(g0, k); \ + OP0(g0, g1, g2, g3); \ + RND2_1; \ + OP1(g0, g1, g2, g3); \ + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ + +#define PREPARE_STATE \ + tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ + state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \ + state1 = state0; \ + state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \ + state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ + + +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); + __m128i tmp; + __m128i state0, state1; + + if (numBlocks == 0) + return; + + state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); + state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]); + + PREPARE_STATE + + do + { + __m128i state0_save, state1_save; + __m128i m0, m1, m2, m3; + __m128i msg; + // #define msg tmp + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); + + ADD_EPI32(state0, state0_save); + ADD_EPI32(state1, state1_save); + + data += 64; + } + while (--numBlocks); + + PREPARE_STATE + + _mm_storeu_si128((__m128i *) (void *) &state[0], state0); + _mm_storeu_si128((__m128i *) (void *) &state[4], state1); +} + +#endif // USE_HW_SHA + +#elif defined(MY_CPU_ARM_OR_ARM64) + + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_SHA + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 6) // fix that check + #define USE_HW_SHA + #endif + #elif defined(_MSC_VER) + #if _MSC_VER >= 1910 + #define USE_HW_SHA + #endif + #endif + +#ifdef USE_HW_SHA + +// #pragma message("=== Sha256 HW === ") + +#if defined(__clang__) || defined(__GNUC__) + #ifdef MY_CPU_ARM64 + #define ATTRIB_SHA __attribute__((__target__("+crypto"))) + #else + #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#if defined(_MSC_VER) && defined(MY_CPU_ARM64) +#include +#else +#include +#endif + +typedef uint32x4_t v128; +// typedef __n128 v128; // MSVC + +#ifdef MY_CPU_BE + #define MY_rev32_for_LE(x) +#else + #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) +#endif + +#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) +#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) + +#define LOAD_SHUFFLE(m, k) \ + m = LOAD_128((data + (k) * 16)); \ + MY_rev32_for_LE(m); \ + +// K array must be aligned for 16-bytes at least. +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; + +#define K SHA256_K_ARRAY + + +#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); +#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); + +#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) +#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) +#define NNN(g0, g1, g2, g3) + + +#define R4(k, g0, g1, g2, g3, OP0, OP1) \ + msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ + tmp = state0; \ + state0 = vsha256hq_u32( state0, state1, msg ); \ + state1 = vsha256h2q_u32( state1, tmp, msg ); \ + OP0(g0, g1, g2, g3); \ + OP1(g0, g1, g2, g3); \ + + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ + + +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + v128 state0, state1; + + if (numBlocks == 0) + return; + + state0 = LOAD_128(&state[0]); + state1 = LOAD_128(&state[4]); + + do + { + v128 state0_save, state1_save; + v128 m0, m1, m2, m3; + v128 msg, tmp; + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); + + state0 = vaddq_u32(state0, state0_save); + state1 = vaddq_u32(state1, state1_save); + + data += 64; + } + while (--numBlocks); + + STORE_128(&state[0], state0); + STORE_128(&state[4], state1); +} + +#endif // USE_HW_SHA + +#endif // MY_CPU_ARM_OR_ARM64 + + +#ifndef USE_HW_SHA + +// #error Stop_Compiling_UNSUPPORTED_SHA +// #include + +// #include "Sha256.h" +void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +#pragma message("Sha256 HW-SW stub was used") + +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + Sha256_UpdateBlocks(state, data, numBlocks); + /* + UNUSED_VAR(state); + UNUSED_VAR(data); + UNUSED_VAR(numBlocks); + exit(1); + return; + */ +} + +#endif diff --git a/3rdparty/7z/src/Threads.c b/3rdparty/7z/src/Threads.c index 8fd86f224b..6eb45b08a8 100644 --- a/3rdparty/7z/src/Threads.c +++ b/3rdparty/7z/src/Threads.c @@ -1,9 +1,11 @@ /* Threads.c -- multithreading library -2017-06-26 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #include "Precomp.h" -#ifndef UNDER_CE +#ifdef _WIN32 + +#ifndef USE_THREADS_CreateThread #include #endif @@ -29,28 +31,103 @@ WRes HandlePtr_Close(HANDLE *p) return 0; } -WRes Handle_WaitObject(HANDLE h) { return (WRes)WaitForSingleObject(h, INFINITE); } +WRes Handle_WaitObject(HANDLE h) +{ + DWORD dw = WaitForSingleObject(h, INFINITE); + /* + (dw) result: + WAIT_OBJECT_0 // 0 + WAIT_ABANDONED // 0x00000080 : is not compatible with Win32 Error space + WAIT_TIMEOUT // 0x00000102 : is compatible with Win32 Error space + WAIT_FAILED // 0xFFFFFFFF + */ + if (dw == WAIT_FAILED) + { + dw = GetLastError(); + if (dw == 0) + return WAIT_FAILED; + } + return (WRes)dw; +} + +#define Thread_Wait(p) Handle_WaitObject(*(p)) + +WRes Thread_Wait_Close(CThread *p) +{ + WRes res = Thread_Wait(p); + WRes res2 = Thread_Close(p); + return (res != 0 ? res : res2); +} WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ - - #ifdef UNDER_CE - + + #ifdef USE_THREADS_CreateThread + DWORD threadId; - *p = CreateThread(0, 0, func, param, 0, &threadId); - + *p = CreateThread(NULL, 0, func, param, 0, &threadId); + #else - + unsigned threadId; - *p = (HANDLE)_beginthreadex(NULL, 0, func, param, 0, &threadId); - + *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); + #endif /* maybe we must use errno here, but probably GetLastError() is also OK. */ return HandleToWRes(*p); } + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + #ifdef USE_THREADS_CreateThread + + UNUSED_VAR(affinity) + return Thread_Create(p, func, param); + + #else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + { + // DWORD_PTR prevMask = + SetThreadAffinityMask(h, (DWORD_PTR)affinity); + /* + if (prevMask == 0) + { + // affinity change is non-critical error, so we can ignore it + // wres = GetError(); + } + */ + } + { + DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. */ + return wres; + + #endif +} + + static WRes Event_Create(CEvent *p, BOOL manualReset, int signaled) { *p = CreateEvent(NULL, manualReset, (signaled ? TRUE : FALSE), NULL); @@ -68,10 +145,22 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) { return AutoResetEven WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) { + // negative ((LONG)maxCount) is not supported in WIN32::CreateSemaphore() *p = CreateSemaphore(NULL, (LONG)initCount, (LONG)maxCount, NULL); return HandleToWRes(*p); } +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + // if (Semaphore_IsCreated(p)) + { + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + } + return Semaphore_Create(p, initCount, maxCount); +} + static WRes Semaphore_Release(CSemaphore *p, LONG releaseCount, LONG *previousCount) { return BOOLToWRes(ReleaseSemaphore(*p, releaseCount, previousCount)); } WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num) @@ -80,7 +169,9 @@ WRes Semaphore_Release1(CSemaphore *p) { return Semaphore_ReleaseN(p, 1); } WRes CriticalSection_Init(CCriticalSection *p) { - /* InitializeCriticalSection can raise only STATUS_NO_MEMORY exception */ + /* InitializeCriticalSection() can raise exception: + Windows XP, 2003 : can raise a STATUS_NO_MEMORY exception + Windows Vista+ : no exceptions */ #ifdef _MSC_VER __try #endif @@ -89,7 +180,361 @@ WRes CriticalSection_Init(CCriticalSection *p) /* InitializeCriticalSectionAndSpinCount(p, 0); */ } #ifdef _MSC_VER - __except (EXCEPTION_EXECUTE_HANDLER) { return 1; } + __except (EXCEPTION_EXECUTE_HANDLER) { return ERROR_NOT_ENOUGH_MEMORY; } #endif return 0; } + + + + +#else // _WIN32 + +// ---------- POSIX ---------- + +#ifndef __APPLE__ +#ifndef _7ZIP_AFFINITY_DISABLE +// _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET +#define _GNU_SOURCE +#endif +#endif + +#include "Threads.h" + +#include +#include +#include +#ifdef _7ZIP_AFFINITY_SUPPORTED +// #include +#endif + + +// #include +// #define PRF(p) p +#define PRF(p) + +#define Print(s) PRF(printf("\n%s\n", s)) + +// #include + +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet) +{ + // new thread in Posix probably inherits affinity from parrent thread + Print("Thread_Create_With_CpuSet"); + + pthread_attr_t attr; + int ret; + // int ret2; + + p->_created = 0; + + RINOK(pthread_attr_init(&attr)); + + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + if (!ret) + { + if (cpuSet) + { + #ifdef _7ZIP_AFFINITY_SUPPORTED + + /* + printf("\n affinity :"); + unsigned i; + for (i = 0; i < sizeof(*cpuSet) && i < 8; i++) + { + Byte b = *((const Byte *)cpuSet + i); + char temp[32]; + #define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10))))) + temp[0] = GET_HEX_CHAR((b & 0xF)); + temp[1] = GET_HEX_CHAR((b >> 4)); + // temp[0] = GET_HEX_CHAR((b >> 4)); // big-endian + // temp[1] = GET_HEX_CHAR((b & 0xF)); // big-endian + temp[2] = 0; + printf("%s", temp); + } + printf("\n"); + */ + + // ret2 = + pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; + #endif + } + + ret = pthread_create(&p->_tid, &attr, func, param); + + if (!ret) + { + p->_created = 1; + /* + if (cpuSet) + { + // ret2 = + pthread_setaffinity_np(p->_tid, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; + } + */ + } + } + // ret2 = + pthread_attr_destroy(&attr); + // if (ret2 != 0) ret = ret2; + return ret; +} + + +WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) +{ + return Thread_Create_With_CpuSet(p, func, param, NULL); +} + + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + Print("Thread_Create_WithAffinity"); + CCpuSet cs; + unsigned i; + CpuSet_Zero(&cs); + for (i = 0; i < sizeof(affinity) * 8; i++) + { + if (affinity == 0) + break; + if (affinity & 1) + { + CpuSet_Set(&cs, i); + } + affinity >>= 1; + } + return Thread_Create_With_CpuSet(p, func, param, &cs); +} + + +WRes Thread_Close(CThread *p) +{ + // Print("Thread_Close"); + int ret; + if (!p->_created) + return 0; + + ret = pthread_detach(p->_tid); + p->_tid = 0; + p->_created = 0; + return ret; +} + + +WRes Thread_Wait_Close(CThread *p) +{ + // Print("Thread_Wait_Close"); + void *thread_return; + int ret; + if (!p->_created) + return EINVAL; + + ret = pthread_join(p->_tid, &thread_return); + // probably we can't use that (_tid) after pthread_join(), so we close thread here + p->_created = 0; + p->_tid = 0; + return ret; +} + + + +static WRes Event_Create(CEvent *p, int manualReset, int signaled) +{ + RINOK(pthread_mutex_init(&p->_mutex, NULL)); + RINOK(pthread_cond_init(&p->_cond, NULL)); + p->_manual_reset = manualReset; + p->_state = (signaled ? True : False); + p->_created = 1; + return 0; +} + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled) + { return Event_Create(p, True, signaled); } +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p) + { return ManualResetEvent_Create(p, 0); } +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled) + { return Event_Create(p, False, signaled); } +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) + { return AutoResetEvent_Create(p, 0); } + + +WRes Event_Set(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)); + p->_state = True; + int res1 = pthread_cond_broadcast(&p->_cond); + int res2 = pthread_mutex_unlock(&p->_mutex); + return (res2 ? res2 : res1); +} + +WRes Event_Reset(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)); + p->_state = False; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Wait(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)); + while (p->_state == False) + { + // ETIMEDOUT + // ret = + pthread_cond_wait(&p->_cond, &p->_mutex); + // if (ret != 0) break; + } + if (p->_manual_reset == False) + { + p->_state = False; + } + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Close(CEvent *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + int res1 = pthread_mutex_destroy(&p->_mutex); + int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? res1 : res2); + } +} + + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + RINOK(pthread_mutex_init(&p->_mutex, NULL)); + RINOK(pthread_cond_init(&p->_cond, NULL)); + p->_count = initCount; + p->_maxCount = maxCount; + p->_created = 1; + return 0; +} + + +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (Semaphore_IsCreated(p)) + { + /* + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + */ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + // return EINVAL; // for debug + p->_count = initCount; + p->_maxCount = maxCount; + return 0; + } + return Semaphore_Create(p, initCount, maxCount); +} + + +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 releaseCount) +{ + UInt32 newCount; + int ret; + + if (releaseCount < 1) + return EINVAL; + + RINOK(pthread_mutex_lock(&p->_mutex)); + + newCount = p->_count + releaseCount; + if (newCount > p->_maxCount) + ret = ERROR_TOO_MANY_POSTS; // EINVAL; + else + { + p->_count = newCount; + ret = pthread_cond_broadcast(&p->_cond); + } + RINOK(pthread_mutex_unlock(&p->_mutex)); + return ret; +} + +WRes Semaphore_Wait(CSemaphore *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)); + while (p->_count < 1) + { + pthread_cond_wait(&p->_cond, &p->_mutex); + } + p->_count--; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Semaphore_Close(CSemaphore *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + int res1 = pthread_mutex_destroy(&p->_mutex); + int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? res1 : res2); + } +} + + + +WRes CriticalSection_Init(CCriticalSection *p) +{ + // Print("CriticalSection_Init"); + if (!p) + return EINTR; + return pthread_mutex_init(&p->_mutex, NULL); +} + +void CriticalSection_Enter(CCriticalSection *p) +{ + // Print("CriticalSection_Enter"); + if (p) + { + // int ret = + pthread_mutex_lock(&p->_mutex); + } +} + +void CriticalSection_Leave(CCriticalSection *p) +{ + // Print("CriticalSection_Leave"); + if (p) + { + // int ret = + pthread_mutex_unlock(&p->_mutex); + } +} + +void CriticalSection_Delete(CCriticalSection *p) +{ + // Print("CriticalSection_Delete"); + if (p) + { + // int ret = + pthread_mutex_destroy(&p->_mutex); + } +} + +LONG InterlockedIncrement(LONG volatile *addend) +{ + // Print("InterlockedIncrement"); + #ifdef USE_HACK_UNSAFE_ATOMIC + LONG val = *addend + 1; + *addend = val; + return val; + #else + return __sync_add_and_fetch(addend, 1); + #endif +} + +#endif // _WIN32 diff --git a/3rdparty/7z/src/Threads.h b/3rdparty/7z/src/Threads.h index f913241aea..e9493afff6 100644 --- a/3rdparty/7z/src/Threads.h +++ b/3rdparty/7z/src/Threads.h @@ -1,38 +1,139 @@ /* Threads.h -- multithreading library -2017-06-18 : Igor Pavlov : Public domain */ +2021-12-21 : Igor Pavlov : Public domain */ #ifndef __7Z_THREADS_H #define __7Z_THREADS_H #ifdef _WIN32 -#include +#include +#else + +#if defined(__linux__) +#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) +#ifndef _7ZIP_AFFINITY_DISABLE +#define _7ZIP_AFFINITY_SUPPORTED +// #pragma message(" ==== _7ZIP_AFFINITY_SUPPORTED") +// #define _GNU_SOURCE +#endif +#endif +#endif + +#include + #endif #include "7zTypes.h" EXTERN_C_BEGIN +#ifdef _WIN32 + WRes HandlePtr_Close(HANDLE *h); WRes Handle_WaitObject(HANDLE h); typedef HANDLE CThread; -#define Thread_Construct(p) *(p) = NULL + +#define Thread_Construct(p) { *(p) = NULL; } #define Thread_WasCreated(p) (*(p) != NULL) #define Thread_Close(p) HandlePtr_Close(p) -#define Thread_Wait(p) Handle_WaitObject(*(p)) +// #define Thread_Wait(p) Handle_WaitObject(*(p)) + +#ifdef UNDER_CE + // if (USE_THREADS_CreateThread is defined), we use _beginthreadex() + // if (USE_THREADS_CreateThread is not definned), we use CreateThread() + #define USE_THREADS_CreateThread +#endif typedef -#ifdef UNDER_CE - DWORD + #ifdef USE_THREADS_CreateThread + DWORD + #else + unsigned + #endif + THREAD_FUNC_RET_TYPE; + +typedef DWORD_PTR CAffinityMask; +typedef DWORD_PTR CCpuSet; + +#define CpuSet_Zero(p) { *(p) = 0; } +#define CpuSet_Set(p, cpu) { *(p) |= ((DWORD_PTR)1 << (cpu)); } + +#else // _WIN32 + +typedef struct _CThread +{ + pthread_t _tid; + int _created; +} CThread; + +#define Thread_Construct(p) { (p)->_tid = 0; (p)->_created = 0; } +#define Thread_WasCreated(p) ((p)->_created != 0) +WRes Thread_Close(CThread *p); +// #define Thread_Wait Thread_Wait_Close + +typedef void * THREAD_FUNC_RET_TYPE; + +typedef UInt64 CAffinityMask; + +#ifdef _7ZIP_AFFINITY_SUPPORTED + +typedef cpu_set_t CCpuSet; +#define CpuSet_Zero(p) CPU_ZERO(p) +#define CpuSet_Set(p, cpu) CPU_SET(cpu, p) +#define CpuSet_IsSet(p, cpu) CPU_ISSET(cpu, p) + #else - unsigned + +typedef UInt64 CCpuSet; +#define CpuSet_Zero(p) { *(p) = 0; } +#define CpuSet_Set(p, cpu) { *(p) |= ((UInt64)1 << (cpu)); } +#define CpuSet_IsSet(p, cpu) ((*(p) & ((UInt64)1 << (cpu))) != 0) + #endif - THREAD_FUNC_RET_TYPE; + + +#endif // _WIN32 + #define THREAD_FUNC_CALL_TYPE MY_STD_CALL -#define THREAD_FUNC_DECL THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + +#if defined(_WIN32) && defined(__GNUC__) +/* GCC compiler for x86 32-bit uses the rule: + the stack is 16-byte aligned before CALL instruction for function calling. + But only root function main() contains instructions that + set 16-byte alignment for stack pointer. And another functions + just keep alignment, if it was set in some parent function. + + The problem: + if we create new thread in MinGW (GCC) 32-bit x86 via _beginthreadex() or CreateThread(), + the root function of thread doesn't set 16-byte alignment. + And stack frames in all child functions also will be unaligned in that case. + + Here we set (force_align_arg_pointer) attribute for root function of new thread. + Do we need (force_align_arg_pointer) also for another systems? */ + + #define THREAD_FUNC_ATTRIB_ALIGN_ARG __attribute__((force_align_arg_pointer)) + // #define THREAD_FUNC_ATTRIB_ALIGN_ARG // for debug : bad alignment in SSE functions +#else + #define THREAD_FUNC_ATTRIB_ALIGN_ARG +#endif + +#define THREAD_FUNC_DECL THREAD_FUNC_ATTRIB_ALIGN_ARG THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + typedef THREAD_FUNC_RET_TYPE (THREAD_FUNC_CALL_TYPE * THREAD_FUNC_TYPE)(void *); WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param); +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity); +WRes Thread_Wait_Close(CThread *p); + +#ifdef _WIN32 +#define Thread_Create_With_CpuSet(p, func, param, cs) \ + Thread_Create_With_Affinity(p, func, param, *cs) +#else +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); +#endif + + +#ifdef _WIN32 typedef HANDLE CEvent; typedef CEvent CAutoResetEvent; @@ -54,6 +155,7 @@ typedef HANDLE CSemaphore; #define Semaphore_Close(p) HandlePtr_Close(p) #define Semaphore_Wait(p) Handle_WaitObject(*(p)) WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); WRes Semaphore_Release1(CSemaphore *p); @@ -63,6 +165,68 @@ WRes CriticalSection_Init(CCriticalSection *p); #define CriticalSection_Enter(p) EnterCriticalSection(p) #define CriticalSection_Leave(p) LeaveCriticalSection(p) + +#else // _WIN32 + +typedef struct _CEvent +{ + int _created; + int _manual_reset; + int _state; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CEvent; + +typedef CEvent CAutoResetEvent; +typedef CEvent CManualResetEvent; + +#define Event_Construct(p) (p)->_created = 0 +#define Event_IsCreated(p) ((p)->_created) + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled); +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p); +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled); +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p); +WRes Event_Set(CEvent *p); +WRes Event_Reset(CEvent *p); +WRes Event_Wait(CEvent *p); +WRes Event_Close(CEvent *p); + + +typedef struct _CSemaphore +{ + int _created; + UInt32 _count; + UInt32 _maxCount; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CSemaphore; + +#define Semaphore_Construct(p) (p)->_created = 0 +#define Semaphore_IsCreated(p) ((p)->_created) + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); +#define Semaphore_Release1(p) Semaphore_ReleaseN(p, 1) +WRes Semaphore_Wait(CSemaphore *p); +WRes Semaphore_Close(CSemaphore *p); + + +typedef struct _CCriticalSection +{ + pthread_mutex_t _mutex; +} CCriticalSection; + +WRes CriticalSection_Init(CCriticalSection *p); +void CriticalSection_Delete(CCriticalSection *cs); +void CriticalSection_Enter(CCriticalSection *cs); +void CriticalSection_Leave(CCriticalSection *cs); + +LONG InterlockedIncrement(LONG volatile *addend); + +#endif // _WIN32 + EXTERN_C_END #endif diff --git a/3rdparty/7z/src/Xz.c b/3rdparty/7z/src/Xz.c index 7e061d6e7d..d6e2596a90 100644 --- a/3rdparty/7z/src/Xz.c +++ b/3rdparty/7z/src/Xz.c @@ -1,5 +1,5 @@ /* Xz.c - Xz -2017-05-12 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -41,7 +41,7 @@ void Xz_Free(CXzStream *p, ISzAllocPtr alloc) unsigned XzFlags_GetCheckSize(CXzStreamFlags f) { unsigned t = XzFlags_GetCheckType(f); - return (t == 0) ? 0 : (4 << ((t - 1) / 3)); + return (t == 0) ? 0 : ((unsigned)4 << ((t - 1) / 3)); } void XzCheck_Init(CXzCheck *p, unsigned mode) diff --git a/3rdparty/7z/src/Xz.h b/3rdparty/7z/src/Xz.h index fad56a3fb7..cf9458e39a 100644 --- a/3rdparty/7z/src/Xz.h +++ b/3rdparty/7z/src/Xz.h @@ -1,5 +1,5 @@ /* Xz.h - Xz interface -2018-07-04 : Igor Pavlov : Public domain */ +2021-04-01 : Igor Pavlov : Public domain */ #ifndef __XZ_H #define __XZ_H @@ -47,7 +47,7 @@ typedef struct CXzFilter filters[XZ_NUM_FILTERS_MAX]; } CXzBlock; -#define XzBlock_GetNumFilters(p) (((p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) +#define XzBlock_GetNumFilters(p) (((unsigned)(p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) #define XzBlock_HasPackSize(p) (((p)->flags & XZ_BF_PACK_SIZE) != 0) #define XzBlock_HasUnpackSize(p) (((p)->flags & XZ_BF_UNPACK_SIZE) != 0) #define XzBlock_HasUnsupportedFlags(p) (((p)->flags & ~(XZ_BF_NUM_FILTERS_MASK | XZ_BF_PACK_SIZE | XZ_BF_UNPACK_SIZE)) != 0) @@ -277,7 +277,10 @@ void XzUnpacker_Free(CXzUnpacker *p); { XzUnpacker_Init() for() + { XzUnpacker_Code(); + } + XzUnpacker_IsStreamWasFinished() } Interface-2 : Direct output buffer: @@ -288,7 +291,10 @@ void XzUnpacker_Free(CXzUnpacker *p); XzUnpacker_Init() XzUnpacker_SetOutBufMode(); // to set output buffer and size for() + { XzUnpacker_Code(); // (dest = NULL) in XzUnpacker_Code() + } + XzUnpacker_IsStreamWasFinished() } Interface-3 : Direct output buffer : One call full decoding @@ -296,6 +302,7 @@ void XzUnpacker_Free(CXzUnpacker *p); It uses Interface-2 internally. { XzUnpacker_CodeFull() + XzUnpacker_IsStreamWasFinished() } */ @@ -309,8 +316,12 @@ Returns: SZ_OK status: CODER_STATUS_NOT_FINISHED, - CODER_STATUS_NEEDS_MORE_INPUT - maybe there are more xz streams, - call XzUnpacker_IsStreamWasFinished to check that current stream was finished + CODER_STATUS_NEEDS_MORE_INPUT - the decoder can return it in two cases: + 1) it needs more input data to finish current xz stream + 2) xz stream was finished successfully. But the decoder supports multiple + concatented xz streams. So it expects more input data for new xz streams. + Call XzUnpacker_IsStreamWasFinished() to check that latest xz stream was finished successfully. + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_DATA - Data error SZ_ERROR_UNSUPPORTED - Unsupported method or method properties @@ -335,12 +346,17 @@ SRes XzUnpacker_CodeFull(CXzUnpacker *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ECoderFinishMode finishMode, ECoderStatus *status); +/* +If you decode full xz stream(s), then you can call XzUnpacker_IsStreamWasFinished() +after successful XzUnpacker_CodeFull() or after last call of XzUnpacker_Code(). +*/ + BoolInt XzUnpacker_IsStreamWasFinished(const CXzUnpacker *p); /* -XzUnpacker_GetExtraSize() returns then number of uncofirmed bytes, +XzUnpacker_GetExtraSize() returns then number of unconfirmed bytes, if it's in (XZ_STATE_STREAM_HEADER) state or in (XZ_STATE_STREAM_PADDING) state. -These bytes can be some bytes after xz archive, or +These bytes can be some data after xz archive, or it can be start of new xz stream. Call XzUnpacker_GetExtraSize() after XzUnpacker_Code() function to detect real size of @@ -371,19 +387,46 @@ BoolInt XzUnpacker_IsBlockFinished(const CXzUnpacker *p); -/* ---------- Multi Threading Decoding ---------- */ + + + +/* ---- Single-Thread and Multi-Thread xz Decoding with Input/Output Streams ---- */ + +/* + if (CXzDecMtProps::numThreads > 1), the decoder can try to use + Multi-Threading. The decoder analyses xz block header, and if + there are pack size and unpack size values stored in xz block header, + the decoder reads compressed data of block to internal buffers, + and then it can start parallel decoding, if there are another blocks. + The decoder can switch back to Single-Thread decoding after some conditions. + + The sequence of calls for xz decoding with in/out Streams: + { + XzDecMt_Create() + XzDecMtProps_Init(XzDecMtProps) to set default values of properties + // then you can change some XzDecMtProps parameters with required values + // here you can set the number of threads and (memUseMax) - the maximum + Memory usage for multithreading decoding. + for() + { + XzDecMt_Decode() // one call per one file + } + XzDecMt_Destroy() + } +*/ typedef struct { - size_t inBufSize_ST; - size_t outStep_ST; - BoolInt ignoreErrors; + size_t inBufSize_ST; // size of input buffer for Single-Thread decoding + size_t outStep_ST; // size of output buffer for Single-Thread decoding + BoolInt ignoreErrors; // if set to 1, the decoder can ignore some errors and it skips broken parts of data. #ifndef _7ZIP_ST - unsigned numThreads; - size_t inBufSize_MT; - size_t memUseMax; + unsigned numThreads; // the number of threads for Multi-Thread decoding. if (umThreads == 1) it will use Single-thread decoding + size_t inBufSize_MT; // size of small input data buffers for Multi-Thread decoding. Big number of such small buffers can be created + size_t memUseMax; // the limit of total memory usage for Multi-Thread decoding. + // it's recommended to set (memUseMax) manually to value that is smaller of total size of RAM in computer. #endif } CXzDecMtProps; @@ -393,7 +436,7 @@ void XzDecMtProps_Init(CXzDecMtProps *p); typedef void * CXzDecMtHandle; /* - alloc : XzDecMt uses CAlignOffsetAlloc for addresses allocated by (alloc). + alloc : XzDecMt uses CAlignOffsetAlloc internally for addresses allocated by (alloc). allocMid : for big allocations, aligned allocation is better */ @@ -407,33 +450,46 @@ typedef struct Byte NumStreams_Defined; Byte NumBlocks_Defined; - Byte DataAfterEnd; + Byte DataAfterEnd; // there are some additional data after good xz streams, and that data is not new xz stream. Byte DecodingTruncated; // Decoding was Truncated, we need only partial output data - UInt64 InSize; // pack size processed + UInt64 InSize; // pack size processed. That value doesn't include the data after + // end of xz stream, if that data was not correct UInt64 OutSize; UInt64 NumStreams; UInt64 NumBlocks; - SRes DecodeRes; - SRes ReadRes; - SRes ProgressRes; - SRes CombinedRes; - SRes CombinedRes_Type; + SRes DecodeRes; // the error code of xz streams data decoding + SRes ReadRes; // error code from ISeqInStream:Read() + SRes ProgressRes; // error code from ICompressProgress:Progress() + SRes CombinedRes; // Combined result error code that shows main rusult + // = S_OK, if there is no error. + // but check also (DataAfterEnd) that can show additional minor errors. + + SRes CombinedRes_Type; // = SZ_ERROR_READ, if error from ISeqInStream + // = SZ_ERROR_PROGRESS, if error from ICompressProgress + // = SZ_ERROR_WRITE, if error from ISeqOutStream + // = SZ_ERROR_* codes for decoding } CXzStatInfo; void XzStatInfo_Clear(CXzStatInfo *p); /* + XzDecMt_Decode() -SRes: - SZ_OK - OK +SRes: it's combined decoding result. It also is equal to stat->CombinedRes. + + SZ_OK - no error + check also output value in (stat->DataAfterEnd) + that can show additional possible error + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_NO_ARCHIVE - is not xz archive SZ_ERROR_ARCHIVE - Headers error SZ_ERROR_DATA - Data Error + SZ_ERROR_UNSUPPORTED - Unsupported method or method properties SZ_ERROR_CRC - CRC Error SZ_ERROR_INPUT_EOF - it needs more input data SZ_ERROR_WRITE - ISeqOutStream error @@ -451,8 +507,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, // Byte *outBuf, size_t *outBufSize, ISeqInStream *inStream, // const Byte *inData, size_t inDataSize, - CXzStatInfo *stat, - int *isMT, // 0 means that ST (Single-Thread) version was used + CXzStatInfo *stat, // out: decoding results and statistics + int *isMT, // out: 0 means that ST (Single-Thread) version was used + // 1 means that MT (Multi-Thread) version was used ICompressProgress *progress); EXTERN_C_END diff --git a/3rdparty/7z/src/XzCrc64Opt.c b/3rdparty/7z/src/XzCrc64Opt.c index 9273465d47..a0637dd222 100644 --- a/3rdparty/7z/src/XzCrc64Opt.c +++ b/3rdparty/7z/src/XzCrc64Opt.c @@ -1,5 +1,5 @@ /* XzCrc64Opt.c -- CRC64 calculation -2017-06-30 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -9,6 +9,7 @@ #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) { const Byte *p = (const Byte *)data; @@ -16,7 +17,7 @@ UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, con v = CRC64_UPDATE_BYTE_2(v, *p); for (; size >= 4; size -= 4, p += 4) { - UInt32 d = (UInt32)v ^ *(const UInt32 *)p; + UInt32 d = (UInt32)v ^ *(const UInt32 *)(const void *)p; v = (v >> 32) ^ (table + 0x300)[((d ) & 0xFF)] ^ (table + 0x200)[((d >> 8) & 0xFF)] @@ -45,6 +46,7 @@ UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, con #define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) +UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) { const Byte *p = (const Byte *)data; @@ -54,7 +56,7 @@ UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size v = CRC64_UPDATE_BYTE_2_BE(v, *p); for (; size >= 4; size -= 4, p += 4) { - UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)p; + UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)(const void *)p; v = (v << 32) ^ (table + 0x000)[((d ) & 0xFF)] ^ (table + 0x100)[((d >> 8) & 0xFF)] diff --git a/3rdparty/7z/src/XzDec.c b/3rdparty/7z/src/XzDec.c index 4f53272078..49329f16a6 100644 --- a/3rdparty/7z/src/XzDec.c +++ b/3rdparty/7z/src/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2019-02-02 : Igor Pavlov : Public domain */ +2021-09-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -240,6 +240,7 @@ static SRes BraState_Code2(void *pp, } +SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc); SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc) { CBraState *decoder; @@ -772,7 +773,8 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) @@ -1038,7 +1040,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, (p->outBuf ? NULL : dest), &destLen2, destFinish, src, &srcLen2, srcFinished2, finishMode2); - + *status = p->decoder.status; XzCheck_Update(&p->check, (p->outBuf ? p->outBuf + p->outDataWritten : dest), destLen2); if (!p->outBuf) @@ -1275,9 +1277,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { + const Byte *ptr = p->buf; p->state = XZ_STATE_STREAM_FOOTER; p->pos = 0; - if (CRC_GET_DIGEST(p->crc) != GetUi32(p->buf)) + if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) return SZ_ERROR_CRC; } break; @@ -1456,7 +1459,6 @@ typedef struct ISeqInStream *inStream; ISeqOutStream *outStream; ICompressProgress *progress; - // CXzStatInfo *stat; BoolInt finishMode; BoolInt outSize_Defined; @@ -1492,8 +1494,9 @@ typedef struct UInt64 numBlocks; // UInt64 numBadBlocks; - SRes mainErrorCode; - + SRes mainErrorCode; // it's set to error code, if the size Code() output doesn't patch the size from Parsing stage + // it can be = SZ_ERROR_INPUT_EOF + // it can be = SZ_ERROR_DATA, in some another cases BoolInt isBlockHeaderState_Parse; BoolInt isBlockHeaderState_Write; UInt64 outProcessed_Parse; @@ -1877,7 +1880,7 @@ static SRes XzDecMt_Callback_PreCode(void *pp, unsigned coderIndex) { // if (res == SZ_ERROR_MEM) return res; if (me->props.ignoreErrors && res != SZ_ERROR_MEM) - return S_OK; + return SZ_OK; return res; } } @@ -1898,15 +1901,18 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, *outCodePos = coder->outCodeSize; *stop = True; + if (srcSize > coder->inPreSize - coder->inCodeSize) + return SZ_ERROR_FAIL; + if (coder->inCodeSize < coder->inPreHeaderSize) { - UInt64 rem = coder->inPreHeaderSize - coder->inCodeSize; - size_t step = srcSize; - if (step > rem) - step = (size_t)rem; + size_t step = coder->inPreHeaderSize - coder->inCodeSize; + if (step > srcSize) + step = srcSize; src += step; srcSize -= step; coder->inCodeSize += step; + *inCodePos = coder->inCodeSize; if (coder->inCodeSize < coder->inPreHeaderSize) { *stop = False; @@ -1956,7 +1962,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, { *inCodePos = coder->inPreSize; *outCodePos = coder->outPreSize; - return S_OK; + return SZ_OK; } return coder->codeRes; } @@ -1966,7 +1972,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode) @@ -1985,7 +1991,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (!coder->dec.headerParsedOk || !coder->outBuf) { if (me->finishedDecoderIndex < 0) - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2077,7 +2083,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (coder->codeRes != SZ_OK) if (!me->props.ignoreErrors) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return res; } @@ -2086,7 +2092,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (coder->inPreSize != coder->inCodeSize || coder->blockPackTotal != coder->inCodeSize) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2125,22 +2131,41 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, return SZ_OK; } + /* + We have processed all xz-blocks of stream, + And xz unpacker is at XZ_STATE_BLOCK_HEADER state, where + (src) is a pointer to xz-Index structure. + We finish reading of current xz-Stream, including Zero padding after xz-Stream. + We exit, if we reach extra byte (first byte of new-Stream or another data). + But we don't update input stream pointer for that new extra byte. + If extra byte is not correct first byte of xz-signature, + we have SZ_ERROR_NO_ARCHIVE error here. + */ + res = XzUnpacker_Code(dec, NULL, &outSizeCur, src, &srcProcessed, me->mtc.readWasFinished, // srcFinished CODER_FINISH_END, // CODER_FINISH_ANY, &status); + + // res = SZ_ERROR_ARCHIVE; // for failure test me->status = status; me->codeRes = res; + if (isCross) + me->mtc.crossStart += srcProcessed; + me->mtc.inProcessed += srcProcessed; me->mtc.mtProgress.totalInSize = me->mtc.inProcessed; + srcSize -= srcProcessed; + src += srcProcessed; + if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2149,20 +2174,26 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, *needContinue = True; me->isBlockHeaderState_Parse = False; me->isBlockHeaderState_Write = False; + + if (!isCross) { Byte *crossBuf = MtDec_GetCrossBuff(&me->mtc); if (!crossBuf) return SZ_ERROR_MEM; - memcpy(crossBuf, src + srcProcessed, srcSize - srcProcessed); + if (srcSize != 0) + memcpy(crossBuf, src, srcSize); + me->mtc.crossStart = 0; + me->mtc.crossEnd = srcSize; } - me->mtc.crossStart = 0; - me->mtc.crossEnd = srcSize - srcProcessed; + + PRF_STR_INT("XZ_STATE_STREAM_HEADER crossEnd = ", (unsigned)me->mtc.crossEnd); + return SZ_OK; } - if (status != CODER_STATUS_NEEDS_MORE_INPUT) + if (status != CODER_STATUS_NEEDS_MORE_INPUT || srcSize != 0) { - return E_FAIL; + return SZ_ERROR_FAIL; } if (me->mtc.readWasFinished) @@ -2174,7 +2205,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { size_t inPos; size_t inLim; - const Byte *inData; + // const Byte *inData; UInt64 inProgressPrev = me->mtc.inProcessed; // XzDecMt_Prepare_InBuf_ST(p); @@ -2184,9 +2215,8 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, inPos = 0; inLim = 0; - // outProcessed = 0; - inData = crossBuf; + // inData = crossBuf; for (;;) { @@ -2201,7 +2231,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { inPos = 0; inLim = me->mtc.inBufSize; - me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)inData, &inLim); + me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)crossBuf, &inLim); me->mtc.readProcessed += inLim; if (inLim == 0 || me->mtc.readRes != SZ_OK) me->mtc.readWasFinished = True; @@ -2213,7 +2243,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, res = XzUnpacker_Code(dec, NULL, &outProcessed, - inData + inPos, &inProcessed, + crossBuf + inPos, &inProcessed, (inProcessed == 0), // srcFinished CODER_FINISH_END, &status); @@ -2225,7 +2255,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2240,7 +2270,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, } if (status != CODER_STATUS_NEEDS_MORE_INPUT) - return E_FAIL; + return SZ_ERROR_FAIL; if (me->mtc.progress) { @@ -2276,13 +2306,6 @@ void XzStatInfo_Clear(CXzStatInfo *p) p->NumStreams_Defined = False; p->NumBlocks_Defined = False; - // p->IsArc = False; - // p->UnexpectedEnd = False; - // p->Unsupported = False; - // p->HeadersError = False; - // p->DataError = False; - // p->CrcError = False; - p->DataAfterEnd = False; p->DecodingTruncated = False; @@ -2296,6 +2319,16 @@ void XzStatInfo_Clear(CXzStatInfo *p) +/* + XzDecMt_Decode_ST() can return SZ_OK or the following errors + - SZ_ERROR_MEM for memory allocation error + - error from XzUnpacker_Code() function + - SZ_ERROR_WRITE for ISeqOutStream::Write(). stat->CombinedRes_Type = SZ_ERROR_WRITE in that case + - ICompressProgress::Progress() error, stat->CombinedRes_Type = SZ_ERROR_PROGRESS. + But XzDecMt_Decode_ST() doesn't return ISeqInStream::Read() errors. + ISeqInStream::Read() result is set to p->readRes. + also it can set stat->CombinedRes_Type to SZ_ERROR_WRITE or SZ_ERROR_PROGRESS. +*/ static SRes XzDecMt_Decode_ST(CXzDecMt *p #ifndef _7ZIP_ST @@ -2384,7 +2417,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)p->inBuf, &inLim); p->readProcessed += inLim; if (inLim == 0 || p->readRes != SZ_OK) p->readWasFinished = True; @@ -2426,8 +2459,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (finished || outProcessed >= outSize) if (outPos != 0) { - size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); - p->outProcessed += written; + const size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); + // p->outProcessed += written; // 21.01: BUG fixed if (written != outPos) { stat->CombinedRes_Type = SZ_ERROR_WRITE; @@ -2438,9 +2471,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (p->progress && res == SZ_OK) { - UInt64 inDelta = p->inProcessed - inPrev; - UInt64 outDelta = p->outProcessed - outPrev; - if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) + if (p->inProcessed - inPrev >= (1 << 22) || + p->outProcessed - outPrev >= (1 << 22)) { res = ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed); if (res != SZ_OK) @@ -2455,14 +2487,31 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p } if (finished) - return res; + { + // p->codeRes is preliminary error from XzUnpacker_Code. + // and it can be corrected later as final result + // so we return SZ_OK here instead of (res); + return SZ_OK; + // return res; + } } } -static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, + + +/* +XzStatInfo_SetStat() transforms + CXzUnpacker return code and status to combined CXzStatInfo results. + it can convert SZ_OK to SZ_ERROR_INPUT_EOF + it can convert SZ_ERROR_NO_ARCHIVE to SZ_OK and (DataAfterEnd = 1) +*/ + +static void XzStatInfo_SetStat(const CXzUnpacker *dec, int finishMode, - UInt64 readProcessed, UInt64 inProcessed, - SRes res, ECoderStatus status, + // UInt64 readProcessed, + UInt64 inProcessed, + SRes res, // it's result from CXzUnpacker unpacker + ECoderStatus status, BoolInt decodingTruncated, CXzStatInfo *stat) { @@ -2484,12 +2533,20 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, if (status == CODER_STATUS_NEEDS_MORE_INPUT) { // CODER_STATUS_NEEDS_MORE_INPUT is expected status for correct xz streams + // any extra data is part of correct data extraSize = 0; + // if xz stream was not finished, then we need more data if (!XzUnpacker_IsStreamWasFinished(dec)) res = SZ_ERROR_INPUT_EOF; } - else if (!decodingTruncated || finishMode) // (status == CODER_STATUS_NOT_FINISHED) - res = SZ_ERROR_DATA; + else + { + // CODER_STATUS_FINISHED_WITH_MARK is not possible for multi stream xz decoding + // so he we have (status == CODER_STATUS_NOT_FINISHED) + // if (status != CODER_STATUS_FINISHED_WITH_MARK) + if (!decodingTruncated || finishMode) + res = SZ_ERROR_DATA; + } } else if (res == SZ_ERROR_NO_ARCHIVE) { @@ -2497,24 +2554,29 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, SZ_ERROR_NO_ARCHIVE is possible for 2 states: XZ_STATE_STREAM_HEADER - if bad signature or bad CRC XZ_STATE_STREAM_PADDING - if non-zero padding data - extraSize / inProcessed don't include "bad" byte + extraSize and inProcessed don't include "bad" byte */ - if (inProcessed != extraSize) // if good streams before error - if (extraSize != 0 || readProcessed != inProcessed) + // if (inProcessed == extraSize), there was no any good xz stream header, and we keep error + if (inProcessed != extraSize) // if there were good xz streams before error + { + // if (extraSize != 0 || readProcessed != inProcessed) { + // he we suppose that all xz streams were finsihed OK, and we have + // some extra data after all streams stat->DataAfterEnd = True; - // there is some good xz stream before. So we set SZ_OK res = SZ_OK; } + } } - stat->DecodeRes = res; + if (stat->DecodeRes == SZ_OK) + stat->DecodeRes = res; stat->InSize -= extraSize; - return res; } + SRes XzDecMt_Decode(CXzDecMtHandle pp, const CXzDecMtProps *props, const UInt64 *outDataSize, int finishMode, @@ -2557,8 +2619,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, p->inProcessed = 0; p->readProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; - p->codeRes = 0; + p->codeRes = SZ_OK; p->status = CODER_STATUS_NOT_SPECIFIED; XzUnpacker_Init(&p->dec); @@ -2589,8 +2652,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; - + IMtDecCallback2 vt; + BoolInt needContinue; + SRes res; // we just free ST buffers here // but we still keep state variables, that was set in XzUnpacker_Init() XzDecMt_FreeSt(p); @@ -2628,45 +2692,45 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, vt.Code = XzDecMt_Callback_Code; vt.Write = XzDecMt_Callback_Write; + + res = MtDec_Code(&p->mtc); + + + stat->InSize = p->mtc.inProcessed; + + p->inProcessed = p->mtc.inProcessed; + p->readRes = p->mtc.readRes; + p->readWasFinished = p->mtc.readWasFinished; + p->readProcessed = p->mtc.readProcessed; + + tMode = True; + needContinue = False; + + if (res == SZ_OK) { - BoolInt needContinue; - - SRes res = MtDec_Code(&p->mtc); - - stat->InSize = p->mtc.inProcessed; - - p->inProcessed = p->mtc.inProcessed; - p->readRes = p->mtc.readRes; - p->readWasFinished = p->mtc.readWasFinished; - p->readProcessed = p->mtc.readProcessed; - - tMode = True; - needContinue = False; - - if (res == SZ_OK) + if (p->mtc.mtProgress.res != SZ_OK) { - if (p->mtc.mtProgress.res != SZ_OK) - { - res = p->mtc.mtProgress.res; - stat->ProgressRes = res; - stat->CombinedRes_Type = SZ_ERROR_PROGRESS; - } - else - needContinue = p->mtc.needContinue; + res = p->mtc.mtProgress.res; + stat->ProgressRes = res; + stat->CombinedRes_Type = SZ_ERROR_PROGRESS; } - - if (!needContinue) + else + needContinue = p->mtc.needContinue; + } + + if (!needContinue) + { { SRes codeRes; BoolInt truncated = False; ECoderStatus status; - CXzUnpacker *dec; + const CXzUnpacker *dec; stat->OutSize = p->outProcessed; if (p->finishedDecoderIndex >= 0) { - CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; + const CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; codeRes = coder->codeRes; dec = &coder->dec; status = coder->status; @@ -2679,41 +2743,46 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, truncated = p->parsing_Truncated; } else - return E_FAIL; + return SZ_ERROR_FAIL; + + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; XzStatInfo_SetStat(dec, p->finishMode, - p->mtc.readProcessed, p->mtc.inProcessed, + // p->mtc.readProcessed, + p->mtc.inProcessed, codeRes, status, truncated, stat); - - if (res == SZ_OK) - { - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else if (p->mtc.readRes != SZ_OK && p->mtc.inProcessed == p->mtc.readProcessed) - { - res = p->mtc.readRes; - stat->ReadRes = res; - stat->CombinedRes_Type = SZ_ERROR_READ; - } - else if (p->mainErrorCode != SZ_OK) - { - res = p->mainErrorCode; - } - } - - stat->CombinedRes = res; - if (stat->CombinedRes_Type == SZ_OK) - stat->CombinedRes_Type = res; - return res; } - PRF_STR("----- decoding ST -----"); + if (res == SZ_OK) + { + stat->ReadRes = p->mtc.readRes; + + if (p->writeRes != SZ_OK) + { + res = p->writeRes; + stat->CombinedRes_Type = SZ_ERROR_WRITE; + } + else if (p->mtc.readRes != SZ_OK + // && p->mtc.inProcessed == p->mtc.readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) + { + res = p->mtc.readRes; + stat->CombinedRes_Type = SZ_ERROR_READ; + } + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; + } + + stat->CombinedRes = res; + if (stat->CombinedRes_Type == SZ_OK) + stat->CombinedRes_Type = res; + return res; } + + PRF_STR("----- decoding ST -----"); } #endif @@ -2729,33 +2798,35 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, , stat ); + #ifndef _7ZIP_ST + // we must set error code from MT decoding at first + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; + #endif + XzStatInfo_SetStat(&p->dec, p->finishMode, - p->readProcessed, p->inProcessed, + // p->readProcessed, + p->inProcessed, p->codeRes, p->status, False, // truncated stat); + stat->ReadRes = p->readRes; + if (res == SZ_OK) { - /* - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else - */ - if (p->readRes != SZ_OK && p->inProcessed == p->readProcessed) + if (p->readRes != SZ_OK + // && p->inProcessed == p->readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) { + // we set read error as combined error, only if that error was the reason + // of decoding problem res = p->readRes; - stat->ReadRes = res; stat->CombinedRes_Type = SZ_ERROR_READ; } - #ifndef _7ZIP_ST - else if (p->mainErrorCode != SZ_OK) - res = p->mainErrorCode; - #endif + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; } stat->CombinedRes = res; diff --git a/3rdparty/7z/src/XzEnc.c b/3rdparty/7z/src/XzEnc.c index 309eca949e..759ba670e7 100644 --- a/3rdparty/7z/src/XzEnc.c +++ b/3rdparty/7z/src/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2019-02-02 : Igor Pavlov : Public domain */ +2021-04-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -36,7 +36,7 @@ #define XzBlock_ClearFlags(p) (p)->flags = 0; -#define XzBlock_SetNumFilters(p, n) (p)->flags |= ((n) - 1); +#define XzBlock_SetNumFilters(p, n) (p)->flags = (Byte)((p)->flags | ((n) - 1)); #define XzBlock_SetHasPackSize(p) (p)->flags |= XZ_BF_PACK_SIZE; #define XzBlock_SetHasUnpackSize(p) (p)->flags |= XZ_BF_UNPACK_SIZE; @@ -552,7 +552,7 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -751,7 +751,8 @@ static SRes Xz_CompressBlock( } else if (fp->ipDefined) { - SetUi32(filter->props, fp->ip); + Byte *ptr = filter->props; + SetUi32(ptr, fp->ip); filter->propsSize = 4; } } @@ -1196,7 +1197,7 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = props->numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; p->mtCoder.expectedDataSize = p->expectedDataSize; RINOK(MtCoder_Code(&p->mtCoder)); diff --git a/3rdparty/7z/src/XzIn.c b/3rdparty/7z/src/XzIn.c index 792a617865..07201b8425 100644 --- a/3rdparty/7z/src/XzIn.c +++ b/3rdparty/7z/src/XzIn.c @@ -1,5 +1,5 @@ /* XzIn.c - Xz input -2018-07-04 : Igor Pavlov : Public domain */ +2021-09-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -26,7 +26,8 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream) #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { @@ -152,7 +153,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOff { UInt64 indexSize; Byte buf[XZ_STREAM_FOOTER_SIZE]; - UInt64 pos = *startOffset; + UInt64 pos = (UInt64)*startOffset; if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; @@ -202,8 +203,13 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOff if (!XzFlags_IsSupported(p->flags)) return SZ_ERROR_UNSUPPORTED; - if (GetUi32(buf) != CrcCalc(buf + 4, 6)) - return SZ_ERROR_ARCHIVE; + { + /* to eliminate GCC 6.3 warning: + dereferencing type-punned pointer will break strict-aliasing rules */ + const Byte *buf_ptr = buf; + if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) + return SZ_ERROR_ARCHIVE; + } indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; @@ -222,7 +228,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOff return SZ_ERROR_ARCHIVE; pos -= (totalSize + XZ_STREAM_HEADER_SIZE); RINOK(LookInStream_SeekTo(stream, pos)); - *startOffset = pos; + *startOffset = (Int64)pos; } { CXzStreamFlags headerFlags; @@ -294,12 +300,12 @@ SRes Xzs_ReadBackward(CXzs *p, ILookInStream *stream, Int64 *startOffset, ICompr SRes res; Xz_Construct(&st); res = Xz_ReadBackward(&st, stream, startOffset, alloc); - st.startOffset = *startOffset; + st.startOffset = (UInt64)*startOffset; RINOK(res); if (p->num == p->numAllocated) { - size_t newNum = p->num + p->num / 4 + 1; - Byte *data = (Byte *)ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); + const size_t newNum = p->num + p->num / 4 + 1; + void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); if (!data) return SZ_ERROR_MEM; p->numAllocated = newNum; @@ -311,8 +317,8 @@ SRes Xzs_ReadBackward(CXzs *p, ILookInStream *stream, Int64 *startOffset, ICompr p->streams[p->num++] = st; if (*startOffset == 0) break; - RINOK(LookInStream_SeekTo(stream, *startOffset)); - if (progress && ICompressProgress_Progress(progress, endOffset - *startOffset, (UInt64)(Int64)-1) != SZ_OK) + RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)); + if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) return SZ_ERROR_PROGRESS; } return SZ_OK;