您现在的位置是:首页 >学无止境 >FIPS202:AVX2 版本的 C/C++ 实现网站首页学无止境
FIPS202:AVX2 版本的 C/C++ 实现
简介FIPS202:AVX2 版本的 C/C++ 实现
文章目录
花了好大力气,终于把 AVX2 版本的 FIPS202 调好了。兼容 VS2022 以及 GCC,另外在 fips202x4.c 中添加了一些函数。
原始版本的源码是从 NIST PQC 实现中扒出来的,其中有许多强制类型转换与 VS2022 不兼容。
Keccak4x
align.h
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#ifndef _align_h_
#define _align_h_

/*
 * ALIGN(x) is a declaration prefix requesting x-byte alignment, spelled in
 * whichever way the detected compiler understands. On unrecognised compilers
 * it expands to nothing.
 *
 * On Mac OS-X and possibly others, ALIGN(x) is already defined in param.h and
 * -Werror chokes on the redefinition, so any earlier definition is dropped
 * first (#undef of an undefined name is a no-op).
 */
#undef ALIGN

#if defined(__GNUC__)
#  define ALIGN(x) __attribute__ ((aligned(x)))
#elif defined(_MSC_VER)
#  define ALIGN(x) __declspec(align(x))
#elif defined(__ARMCC_VERSION)
#  define ALIGN(x) __align(x)
#else
#  define ALIGN(x)
#endif

#endif
brg_endian.h
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
Changes for ARM 9/9/2010
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H

/*
 * Defines PLATFORM_BYTE_ORDER as either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN.
 *
 * Detection order:
 *   1. the endian symbols in their four common spellings
 *      (SYMBOL, _SYMBOL, __SYMBOL, __SYMBOL__);
 *   2. well-known machine/compiler defines;
 *   3. a hard-coded little-endian fallback (the "EDIT HERE" lines).
 *
 * Multi-line #if/#elif conditions need a backslash at the end of every
 * continued line; without them the preprocessor rejects the condition.
 */

#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

#if 0
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
#  include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
#  include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
#  include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#  if !defined( __MINGW32__ ) && !defined( _AIX )
#    include <endian.h>
#    if !defined( __BEOS__ )
#      include <byteswap.h>
#    endif
#  endif
#endif
#endif

/* Now attempt to set the define for platform byte order using any  */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
/* seem to encompass most endian symbol definitions                 */

#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( _BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif defined( __BIG_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif

/*  if the platform byte order could not be determined, then try to */
/*  set this define using common machine defines                    */
#if !defined(PLATFORM_BYTE_ORDER)

#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
      defined( vax )       || defined( vms )     || defined( VMS )        || \
      defined( __VMS )     || defined( _M_X64 )
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN

#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN

#elif defined(__arm__)
#  ifdef __BIG_ENDIAN
#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#  else
#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#  endif
#elif 1     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0     /* **** EDIT HERE IF NECESSARY **** */
#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
#  error Please edit the "EDIT HERE IF NECESSARY" lines in brg_endian.h to set the platform byte order
#endif

#endif

#endif
SIMD256-config.h
/* Human-readable description of this build configuration; it is spliced into
 * the KeccakP1600times4_implementation string in KeccakP-1600-times4-SnP.h. */
#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled"
/* Requests fully unrolled rounds.
 * NOTE(review): the unrolling macros header tests `FullUnrolling`; the mapping
 * from this name to that one is not visible here - confirm in the .c file. */
#define KeccakP1600times4_fullUnrolling
/* Enables the AVX2 helper macros in KeccakP-1600-times4-SIMD256.c, which are
 * guarded by #if defined(KeccakP1600times4_useAVX2). */
#define KeccakP1600times4_useAVX2
KeccakP-1600-times4-SnP.h
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#ifndef _KeccakP_1600_times4_SnP_h_
#define _KeccakP_1600_times4_SnP_h_
/** For the documentation, see PlSnP-documentation.h.
*/
#include "SIMD256-config.h"
/* Description string: fixed prefix + the configuration text from SIMD256-config.h. */
#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")"
/* Size of the buffer holding the four interleaved states: 4 x 200 bytes (1600 bits each). */
#define KeccakP1600times4_statesSizeInBytes 800
/* Required alignment of the states buffer; the implementation accesses it as
 * 256-bit vectors with aligned loads/stores. */
#define KeccakP1600times4_statesAlignment 32
/* Feature flags announcing that the FastLoop_Absorb functions declared below exist. */
#define KeccakF1600times4_FastLoop_supported
#define KeccakP1600times4_12rounds_FastLoop_supported
#include <stddef.h>
/* No static initialization is needed: expands to nothing. */
#define KeccakP1600times4_StaticInitialize()
/* Zeroes all KeccakP1600times4_statesSizeInBytes bytes of `states`. */
void KeccakP1600times4_InitializeAll(void *states);
/*
 * XORs one byte into state `instanceIndex` at byte position `offset` of that
 * state. The four states are interleaved lane by lane: lane L of instance I
 * starts at byte (L*4 + I)*8 of the buffer, hence the index computation
 * instance*8 + lane*32 + byte-in-lane.
 *
 * The definition spans two lines, so the first must end with a backslash;
 * otherwise the macro expands to nothing and the expression is left dangling
 * at file scope.
 */
#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
    ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte)
/* XORs `length` bytes of `data` into state `instanceIndex`, starting at byte `offset` of that state. */
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
/* XORs `laneCount` 8-byte lanes into each of the four states; the input for
 * state k starts at data + k*laneOffset*8 bytes. */
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
/* Like AddBytes, but replaces the state bytes instead of XORing into them. */
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
/* Like AddLanesAll, but replaces the state lanes instead of XORing into them. */
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
/* Sets the first `byteCount` bytes of state `instanceIndex` to zero. */
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
/* NOTE(review): definitions not visible here; presumably apply the 12-round /
 * 24-round permutation to all four states - confirm in the .c file. */
void KeccakP1600times4_PermuteAll_12rounds(void *states);
void KeccakP1600times4_PermuteAll_24rounds(void *states);
/* Copies `length` bytes of state `instanceIndex`, starting at byte `offset`, into `data`. */
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
/* Copies `laneCount` lanes of each state out; the output for state k starts at
 * data + k*laneOffset*8 bytes. */
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
/* Writes input XOR state bytes to `output`, for `length` bytes of state
 * `instanceIndex` starting at byte `offset`. */
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
/* Lane-wise variant of ExtractAndAddBytes for all four states; input and output
 * for state k start at k*laneOffset*8 bytes from their base pointers. */
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
/* NOTE(review): definitions not visible here; presumably absorb as many whole
 * blocks of `data` as possible and return the number of bytes consumed - confirm. */
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
#endif
KeccakP-1600-unrolling.macros.h
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
/*
 * Defines the statement-sequence macros rounds24 and rounds12 according to the
 * unrolling configuration chosen by the including file (FullUnrolling, or
 * Unrolling set to 12, 6, 4, 3, 2 or 1). The including file must provide
 * prepareTheta, thetaRhoPiChiIotaPrepareTheta, thetaRhoPiChiIota and
 * copyStateVariables, plus a loop variable `i` for the non-fully-unrolled
 * variants.
 *
 * rounds24 runs round indices 0..23, rounds12 runs 12..23. Consecutive rounds
 * swap the roles of the A and E state variables; when a loop body holds an odd
 * number of rounds, copyStateVariables(A, E) closes the iteration.
 *
 * Every line of a macro body except the last must end with a backslash.
 */
#if (defined(FullUnrolling))
#define rounds24 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#elif (Unrolling == 12)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=12) { \
        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
    }

#define rounds12 \
    prepareTheta \
    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
    thetaRhoPiChiIota(23, E, A)

#elif (Unrolling == 6)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=6) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    }

#elif (Unrolling == 4)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=4) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
    }

#elif (Unrolling == 3)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=3) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
        copyStateVariables(A, E) \
    }

#elif (Unrolling == 2)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }

#elif (Unrolling == 1)
#define rounds24 \
    prepareTheta \
    for(i=0; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    }

#define rounds12 \
    prepareTheta \
    for(i=12; i<24; i++) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        copyStateVariables(A, E) \
    }

#else
#error "Unrolling is not correctly specified!"
#endif
/*
 * Runs the last `__nrounds` rounds (round indices 24-__nrounds .. 23) using
 * the including file's prepareTheta, thetaRhoPiChiIotaPrepareTheta and
 * copyStateVariables macros and its loop variable `i`.
 *
 * Rounds are executed in A->E / E->A pairs. If the starting index is odd, one
 * single round is run first and copyStateVariables(A, E) is invoked so that
 * the paired loop starts from the A variables again.
 *
 * Every line of the body except the last must end with a backslash.
 */
#define roundsN(__nrounds) \
    prepareTheta \
    i = 24 - (__nrounds); \
    if ((i&1) != 0) { \
        thetaRhoPiChiIotaPrepareTheta(i, A, E) \
        copyStateVariables(A, E) \
        ++i; \
    } \
    for( /* empty */; i<24; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    }
KeccakP-1600-times4-SIMD256.c
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "align.h"
#include "KeccakP-1600-times4-SnP.h"
#include "SIMD256-config.h"
#include "brg_endian.h"
/* The byte-level accessors below copy raw bytes into and out of 64-bit lanes,
 * so this file refuses to build unless brg_endian.h reports little-endian. */
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif
/* Short names for a byte and a 64-bit lane. */
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
/* 128-bit and 256-bit integer SIMD vectors; one V256 holds the same lane of all four states. */
typedef __m128i V128;
typedef __m256i V256;
/*
 * Index, in units of 64-bit words, of lane `lanePosition` of state
 * `instanceIndex` inside the interleaved 4-state buffer. Both arguments are
 * parenthesized so that expression arguments (e.g. `a & 3`) bind correctly.
 */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + (instanceIndex))
#if defined(KeccakP1600times4_useAVX2)
/*
 * Thin wrappers over the AVX/AVX2 intrinsics used by this file.
 *
 * Reinterpretations between integer and floating-point vector types go through
 * the _mm256_cast* intrinsics rather than C-style vector casts, which MSVC
 * does not accept.
 */
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
/* Aligned 256-bit load: the operand must be 32-byte aligned. */
#define CONST256(a) _mm256_load_si256((const V256 *)&(a))
/* Broadcasts one 64-bit constant to all four 64-bit elements. */
#define CONST256_64(a) _mm256_castpd_si256(_mm256_broadcast_sd((const double*)(&(a))))
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
/* Note the argument order of _mm256_set_epi64x: `d` ends up in element 0. */
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
/* Rotates each 64-bit element of `a` left by `o` bits. */
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
/* Rotations by 8 and 56 bits are whole-byte moves, done with one byte shuffle. */
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
/* Byte-shuffle masks for the two rotations above. They are read with the
 * aligned load in CONST256, so they must be 32-byte aligned; a plain UINT64
 * array is only guaranteed 8-byte alignment. */
static ALIGN(32) const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static ALIGN(32) const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
/* Stores the high and low 128-bit halves of an integer vector separately.
 * The integer variant is used because the arguments are V128* / V256. */
#define STORE2_128(ah, al, v) _mm256_storeu2_m128i((V128*)&(ah), (V128*)&(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) _mm256_permute2x128_si256(a, b, c)
#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))

/*
 * 4x4 transposition of 64-bit elements held in lanes0..lanes3, using
 * lanesL01/lanesH01/lanesL23/lanesH23 as scratch (all must be declared by the
 * caller). UNINTLEAVE turns four "one lane of every state" vectors into four
 * "four consecutive lanes of one state" vectors; INTLEAVE is the inverse.
 * Both are single comma expressions.
 */
#define UNINTLEAVE()    lanesL01 = UNPACKL( lanes0, lanes1 ),                   \
                        lanesH01 = UNPACKH( lanes0, lanes1 ),                   \
                        lanesL23 = UNPACKL( lanes2, lanes3 ),                   \
                        lanesH23 = UNPACKH( lanes2, lanes3 ),                   \
                        lanes0 = PERM128( lanesL01, lanesL23, 0x20 ),           \
                        lanes2 = PERM128( lanesL01, lanesL23, 0x31 ),           \
                        lanes1 = PERM128( lanesH01, lanesH23, 0x20 ),           \
                        lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

#define INTLEAVE()      lanesL01 = PERM128( lanes0, lanes2, 0x20 ),             \
                        lanesH01 = PERM128( lanes1, lanes3, 0x20 ),             \
                        lanesL23 = PERM128( lanes0, lanes2, 0x31 ),             \
                        lanesH23 = PERM128( lanes1, lanes3, 0x31 ),             \
                        lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ),         \
                        lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ),         \
                        lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ),         \
                        lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#endif
/* Size in bytes of one 64-bit Keccak lane. */
#define SnP_laneLengthInBytes 8

/*
 * Resets all four interleaved states to zero.
 *
 * states: buffer of KeccakP1600times4_statesSizeInBytes bytes.
 */
void KeccakP1600times4_InitializeAll(void *states)
{
    const size_t stateBytes = KeccakP1600times4_statesSizeInBytes;

    memset(states, 0, stateBytes);
}
/*
 * XORs `length` bytes from `data` into state `instanceIndex`, starting at byte
 * `offset` of that state.
 *
 * The work is split into an optional partial first lane (when `offset` is not
 * a multiple of the lane size), a run of whole lanes, and an optional partial
 * last lane. Partial lanes are assembled in a zeroed 64-bit temporary so that
 * only the addressed bytes of the state change.
 *
 * `data` has no alignment requirement: whole lanes are fetched with memcpy
 * rather than by dereferencing a casted UINT64 pointer, which would be a
 * misaligned, aliasing-violating access.
 */
void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    /* Leading bytes up to the next lane boundary. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = 0;

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane;

        memcpy(&lane, curData, sizeof lane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    /* Trailing bytes that do not fill a lane. */
    if (sizeLeft > 0) {
        UINT64 lane = 0;

        memcpy(&lane, curData, sizeLeft);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
    }
}
/*
 * XORs `laneCount` lanes into each of the four states. The input for state k
 * starts at data + k*laneOffset*SnP_laneLengthInBytes.
 *
 * Xor_In handles one lane position: it gathers that lane from the four inputs
 * into one vector and XORs it into the state.
 * Xor_In4 handles four consecutive lane positions: it loads four lanes from
 * each input (unaligned loads), transposes them with INTLEAVE and XORs the
 * four resulting vectors into the state.
 *
 * With at least 16 lanes, the first 16 (or 20) go through Xor_In4 and the
 * remainder through Xor_In; shorter inputs use Xor_In only.
 *
 * NOTE(review): Xor_In reads the input through UINT64 pointers derived from
 * `data`; callers passing data that is not 8-byte aligned rely on the platform
 * tolerating that.
 */
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define Xor_In( argIndex )  XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),             \
                                lanes1 = LOAD256u( curData1[argIndex]),             \
                                lanes2 = LOAD256u( curData2[argIndex]),             \
                                lanes3 = LOAD256u( curData3[argIndex]),             \
                                INTLEAVE(),                                         \
                                XOReq256( stateAsLanes[argIndex+0], lanes0 ),       \
                                XOReq256( stateAsLanes[argIndex+1], lanes1 ),       \
                                XOReq256( stateAsLanes[argIndex+2], lanes2 ),       \
                                XOReq256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        Xor_In4( 0 );
        Xor_In4( 4 );
        Xor_In4( 8 );
        Xor_In4( 12 );
        if ( laneCount >= 20 ) {
            Xor_In4( 16 );
            for(i=20; i<laneCount; i++)
                Xor_In( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Xor_In( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Xor_In( i );
    }
    #undef  Xor_In
    #undef  Xor_In4
}
/*
 * Replaces `length` bytes of state `instanceIndex`, starting at byte `offset`
 * of that state, with the bytes from `data`.
 *
 * Handled as an optional partial first lane, a run of whole lanes, and an
 * optional partial last lane. Partial lanes are written byte-wise directly
 * into the state so that neighbouring bytes are preserved.
 *
 * `data` has no alignment requirement: whole lanes are fetched with memcpy
 * rather than through a casted UINT64 pointer.
 */
void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    UINT64 *statesAsLanes = (UINT64 *)states;

    /* Leading bytes up to the next lane boundary. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane;

        memcpy(&lane, curData, sizeof lane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    /* Trailing bytes that do not fill a lane. */
    if (sizeLeft > 0) {
        memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
    }
}
/*
 * Replaces the first `laneCount` lanes of each of the four states. The input
 * for state k starts at data + k*laneOffset*SnP_laneLengthInBytes.
 *
 * OverWr stores one lane position gathered from the four inputs.
 * OverWr4 loads four consecutive lanes from each input (unaligned loads),
 * transposes them with INTLEAVE and stores four state vectors (aligned
 * stores, so `states` must be 32-byte aligned).
 *
 * With at least 16 lanes, the first 16 (or 20) go through OverWr4 and the
 * remainder through OverWr; shorter inputs use OverWr only.
 *
 * NOTE(review): OverWr reads the input through UINT64 pointers derived from
 * `data`; callers passing data that is not 8-byte aligned rely on the platform
 * tolerating that.
 */
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;

    #define OverWr( argIndex )  STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))

    #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),             \
                                lanes1 = LOAD256u( curData1[argIndex]),             \
                                lanes2 = LOAD256u( curData2[argIndex]),             \
                                lanes3 = LOAD256u( curData3[argIndex]),             \
                                INTLEAVE(),                                         \
                                STORE256( stateAsLanes[argIndex+0], lanes0 ),       \
                                STORE256( stateAsLanes[argIndex+1], lanes1 ),       \
                                STORE256( stateAsLanes[argIndex+2], lanes2 ),       \
                                STORE256( stateAsLanes[argIndex+3], lanes3 )

    if ( laneCount >= 16 ) {
        OverWr4( 0 );
        OverWr4( 4 );
        OverWr4( 8 );
        OverWr4( 12 );
        if ( laneCount >= 20 ) {
            OverWr4( 16 );
            for(i=20; i<laneCount; i++)
                OverWr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                OverWr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            OverWr( i );
    }
    #undef  OverWr
    #undef  OverWr4
}
/*
 * Clears the first `byteCount` bytes of state `instanceIndex`.
 *
 * Whole lanes are zeroed with 64-bit stores; if `byteCount` is not a multiple
 * of the lane size, the leading bytes of the following lane are cleared with
 * memset and the rest of that lane is left untouched.
 */
void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
    UINT64 *lanes = (UINT64 *)states;
    const unsigned int fullLanes = byteCount / SnP_laneLengthInBytes;
    const unsigned int tailBytes = byteCount % SnP_laneLengthInBytes;
    unsigned int pos;

    for (pos = 0; pos < fullLanes; pos++)
        lanes[laneIndex(instanceIndex, pos)] = 0;

    if (tailBytes != 0)
        memset(&lanes[laneIndex(instanceIndex, fullLanes)], 0, tailBytes);
}
/*
 * Copies `length` bytes of state `instanceIndex`, starting at byte `offset`
 * of that state, into `data`.
 *
 * Handled as an optional partial first lane, a run of whole lanes, and an
 * optional partial last lane.
 *
 * `data` has no alignment requirement: whole lanes are written with memcpy
 * rather than through a casted UINT64 pointer. The state is only read, so it
 * is accessed through const-qualified pointers throughout.
 */
void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    unsigned char *curData = data;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    /* Leading bytes up to the next lane boundary. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
        sizeLeft -= bytesInLane;
        lanePosition++;
        curData += bytesInLane;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        memcpy(curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], SnP_laneLengthInBytes);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curData += SnP_laneLengthInBytes;
    }

    /* Trailing bytes that do not fill a lane. */
    if (sizeLeft > 0) {
        memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
    }
}
/*
 * Copies the first `laneCount` lanes of each of the four states to the output.
 * The output for state k starts at data + k*laneOffset*SnP_laneLengthInBytes.
 *
 * Extr copies one lane position of all four states with 64-bit scalar moves.
 * Extr4 loads four state vectors (aligned loads), transposes them with
 * UNINTLEAVE and writes four lanes to each output (unaligned stores).
 *
 * With at least 16 lanes, the first 16 (or 20) go through Extr4 and the
 * remainder through Extr; shorter requests use Extr only.
 *
 * NOTE(review): Extr writes the output through UINT64 pointers derived from
 * `data`; callers passing data that is not 8-byte aligned rely on the platform
 * tolerating that.
 */
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    UINT64 *curData0 = (UINT64 *)data;
    UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define Extr( argIndex )    curData0[argIndex] = stateAsLanes64[4*(argIndex)],      \
                                curData1[argIndex] = stateAsLanes64[4*(argIndex)+1],    \
                                curData2[argIndex] = stateAsLanes64[4*(argIndex)+2],    \
                                curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]

    #define Extr4( argIndex )   lanes0 = LOAD256( stateAsLanes[argIndex+0] ),           \
                                lanes1 = LOAD256( stateAsLanes[argIndex+1] ),           \
                                lanes2 = LOAD256( stateAsLanes[argIndex+2] ),           \
                                lanes3 = LOAD256( stateAsLanes[argIndex+3] ),           \
                                UNINTLEAVE(),                                           \
                                STORE256u( curData0[argIndex], lanes0 ),                \
                                STORE256u( curData1[argIndex], lanes1 ),                \
                                STORE256u( curData2[argIndex], lanes2 ),                \
                                STORE256u( curData3[argIndex], lanes3 )

    if ( laneCount >= 16 ) {
        Extr4( 0 );
        Extr4( 4 );
        Extr4( 8 );
        Extr4( 12 );
        if ( laneCount >= 20 ) {
            Extr4( 16 );
            for(i=20; i<laneCount; i++)
                Extr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Extr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Extr( i );
    }
    #undef  Extr
    #undef  Extr4
}
/*
 * Writes input XOR state to `output` for `length` bytes of state
 * `instanceIndex`, starting at byte `offset` of that state.
 *
 * Handled as an optional partial first lane, a run of whole lanes, and an
 * optional partial last lane. Partial lanes are processed byte by byte,
 * shifting the state lane right by 8 bits after each byte.
 *
 * `input` and `output` have no alignment requirement: whole lanes are moved
 * with memcpy through a local 64-bit temporary rather than through casted
 * UINT64 pointers (the old casts also dropped `const` from `input`). Each lane
 * is fully read before it is written, as before.
 */
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const UINT64 *statesAsLanes = (const UINT64 *)states;

    /* Leading bytes up to the next lane boundary. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        /* bytesInLane is at least 1 here, so the do-while cannot underflow. */
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        UINT64 lane;

        memcpy(&lane, curInput, sizeof lane);
        lane ^= statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        memcpy(curOutput, &lane, sizeof lane);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }

    /* Trailing bytes that do not fill a lane. */
    if (sizeLeft != 0) {
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];

        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
const UINT64 *curInput0 = (UINT64 *)input;
const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
UINT64 *curOutput0 = (UINT64 *)output;
UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
const V256 *stateAsLanes = (const V256 *)states;
const UINT64 *stateAsLanes64 = (const UINT64*)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
unsigned int i;
#define ExtrXor( argIndex )
curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],
curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],
curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],
curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
#define ExtrXor4( argIndex )
lanes0 = LOAD256( stateAsLanes[argIndex+0] ),
lanes1 = LOAD256( stateAsLanes[argIndex+1] ),
lanes2 = LOAD256( stateAsLanes[argIndex+2] ),
lanes3 = LOAD256( stateAsLanes[argIndex+3] ),
UNINTLEAVE(),
lanesL01 = LOAD256u( curInput0[argIndex]),
lanesH01 = LOAD256u( curInput1[argIndex]),
lanesL23 = LOAD256u( curInput2[argIndex]),
lanesH23 = LOAD256u( curInput3[argIndex]),
XOReq256( lanes0, lanesL01 ),
XOReq256( lanes1, lanesH01 ),
XOReq256( lanes2, lanesL23 ),
XOReq256( lanes3, lanesH23 ),
STORE256u( curOutput0[argIndex], lanes0 ),
STORE256u( curOutput1[argIndex], lanes1 ),
STORE256u( curOutput2[argIndex], lanes2 ),
STORE256u( curOutput3[argIndex], lanes3 )
if ( laneCount >= 16 ) {
ExtrXor4( 0 );
ExtrXor4( 4 );
ExtrXor4( 8 );
ExtrXor4( 12 );
if ( laneCount >= 20 ) {
ExtrXor4( 16 );
for(i=20; i<laneCount; i++)
ExtrXor( i );
}
else {
for(i=16; i<laneCount; i++)
ExtrXor( i );
}
}
else {
for(i=0; i<laneCount; i++)
ExtrXor( i );
}
#undef ExtrXor
#undef ExtrXor4
}
/*
 * Declares the V256 working variables of the round function: the A and E
 * state sets (25 each), the B temporaries (25), the column parities C, their
 * rotated copies C*1, and the D values (5 each).
 *
 * Expands to complete declarations (each line ends in ';'), so it is used
 * without a trailing semicolon. Every line of the body except the last must
 * end with a backslash.
 */
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu;
/*
 * Computes the five column parities Ca..Cu of the A state: each is the XOR of
 * the five A lanes sharing the same last letter (a, e, i, o, u).
 *
 * Expands to complete statements, so it is used without a trailing semicolon.
 * Every line of the body except the last must end with a backslash.
 */
#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
/* One full round from state A into state E.
 *   i : index into KeccakF1600RoundConstants for this round
 *   A : prefix of the input lane variables (modified: theta is applied in place)
 *   E : prefix of the output lane variables
 * On entry Ca..Cu must hold the column parities of A; on exit they hold the
 * column parities of E, ready for the next round.
 * Statement macro: every line carries its own ';', so it is used without one.
 * The line continuations are required for the body to belong to the macro. */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    /* theta: D[x] = C[x-1] ^ rol(C[x+1], 1) */ \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    /* plane b of E (also receives the round constant) */ \
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
    Cu = E##bu; \
\
    /* plane g of E */ \
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(Ca, E##ga); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(Ce, E##ge); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    XOReq256(Ci, E##gi); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    XOReq256(Co, E##go); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
    XOReq256(Cu, E##gu); \
\
    /* plane k of E */ \
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(Ca, E##ka); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(Ce, E##ke); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    XOReq256(Ci, E##ki); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    XOReq256(Co, E##ko); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
    XOReq256(Cu, E##ku); \
\
    /* plane m of E */ \
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(Ca, E##ma); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(Ce, E##me); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    XOReq256(Ci, E##mi); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    XOReq256(Co, E##mo); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
    XOReq256(Cu, E##mu); \
\
    /* plane s of E */ \
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(Ca, E##sa); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(Ce, E##se); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    XOReq256(Ci, E##si); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    XOReq256(Co, E##so); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
    XOReq256(Cu, E##su);
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
/* One full round from state A into state E, without refreshing the column
 * parities afterwards (Ca..Cu are read but not written).
 *   i : index into KeccakF1600RoundConstants for this round
 *   A : prefix of the input lane variables (modified: theta is applied in place)
 *   E : prefix of the output lane variables
 * On entry Ca..Cu must hold the column parities of A.
 * Statement macro: every line carries its own ';', so it is used without one.
 * The line continuations are required for the body to belong to the macro. */
#define thetaRhoPiChiIota(i, A, E) \
    /* theta: D[x] = C[x-1] ^ rol(C[x+1], 1) */ \
    ROL64in256(Ce1, Ce, 1); \
    Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1); \
    De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1); \
    Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1); \
    Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1); \
    Du = XOR256(Co, Ca1); \
\
    /* plane b of E (also receives the round constant) */ \
    XOReq256(A##ba, Da); \
    Bba = A##ba; \
    XOReq256(A##ge, De); \
    ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di); \
    ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    XOReq256(A##mo, Do); \
    ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    XOReq256(A##su, Du); \
    ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
    /* plane g of E */ \
    XOReq256(A##bo, Do); \
    ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du); \
    ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da); \
    ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(A##me, De); \
    ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(A##si, Di); \
    ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
    /* plane k of E */ \
    XOReq256(A##be, De); \
    ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di); \
    ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do); \
    ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(A##mu, Du); \
    ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(A##sa, Da); \
    ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
    /* plane m of E */ \
    XOReq256(A##bu, Du); \
    ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da); \
    ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De); \
    ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(A##mi, Di); \
    ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(A##so, Do); \
    ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
    /* plane s of E */ \
    XOReq256(A##bi, Di); \
    ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do); \
    ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du); \
    ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(A##ma, Da); \
    ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(A##se, De); \
    ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));
/* The 24 round constants. The round macros index this table with their
 * round number and broadcast the selected entry through CONST256_64(). */
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
    /* rounds  0.. 3 */
    0x0000000000000001ULL, 0x0000000000008082ULL,
    0x800000000000808aULL, 0x8000000080008000ULL,
    /* rounds  4.. 7 */
    0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL,
    /* rounds  8..11 */
    0x000000000000008aULL, 0x0000000000000088ULL,
    0x0000000080008009ULL, 0x000000008000000aULL,
    /* rounds 12..15 */
    0x000000008000808bULL, 0x800000000000008bULL,
    0x8000000000008089ULL, 0x8000000000008003ULL,
    /* rounds 16..19 */
    0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL,
    /* rounds 20..23 */
    0x8000000080008081ULL, 0x8000000000008080ULL,
    0x0000000080000001ULL, 0x8000000080008008ULL
};
/* Loads the 25 lanes state[0..24] into the variables X##ba .. X##su
 * (index order: ba, be, bi, bo, bu, ga, ..., su).
 *   X     : prefix of the destination lane variables
 *   state : array or pointer indexable as state[0..24], read through LOAD256
 * Statement macro: ends with its own ';' and is used without one.
 * The line continuations are required for the body to belong to the macro. */
#define copyFromState(X, state) \
    X##ba = LOAD256(state[ 0]); \
    X##be = LOAD256(state[ 1]); \
    X##bi = LOAD256(state[ 2]); \
    X##bo = LOAD256(state[ 3]); \
    X##bu = LOAD256(state[ 4]); \
    X##ga = LOAD256(state[ 5]); \
    X##ge = LOAD256(state[ 6]); \
    X##gi = LOAD256(state[ 7]); \
    X##go = LOAD256(state[ 8]); \
    X##gu = LOAD256(state[ 9]); \
    X##ka = LOAD256(state[10]); \
    X##ke = LOAD256(state[11]); \
    X##ki = LOAD256(state[12]); \
    X##ko = LOAD256(state[13]); \
    X##ku = LOAD256(state[14]); \
    X##ma = LOAD256(state[15]); \
    X##me = LOAD256(state[16]); \
    X##mi = LOAD256(state[17]); \
    X##mo = LOAD256(state[18]); \
    X##mu = LOAD256(state[19]); \
    X##sa = LOAD256(state[20]); \
    X##se = LOAD256(state[21]); \
    X##si = LOAD256(state[22]); \
    X##so = LOAD256(state[23]); \
    X##su = LOAD256(state[24]);
/* Stores the lane variables X##ba .. X##su into state[0..24]
 * (index order: ba, be, bi, bo, bu, ga, ..., su).
 *   state : array or pointer indexable as state[0..24], written through STORE256
 *   X     : prefix of the source lane variables
 * Statement macro: ends with its own ';' and is used without one.
 * The line continuations are required for the body to belong to the macro. */
#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); \
    STORE256(state[ 1], X##be); \
    STORE256(state[ 2], X##bi); \
    STORE256(state[ 3], X##bo); \
    STORE256(state[ 4], X##bu); \
    STORE256(state[ 5], X##ga); \
    STORE256(state[ 6], X##ge); \
    STORE256(state[ 7], X##gi); \
    STORE256(state[ 8], X##go); \
    STORE256(state[ 9], X##gu); \
    STORE256(state[10], X##ka); \
    STORE256(state[11], X##ke); \
    STORE256(state[12], X##ki); \
    STORE256(state[13], X##ko); \
    STORE256(state[14], X##ku); \
    STORE256(state[15], X##ma); \
    STORE256(state[16], X##me); \
    STORE256(state[17], X##mi); \
    STORE256(state[18], X##mo); \
    STORE256(state[19], X##mu); \
    STORE256(state[20], X##sa); \
    STORE256(state[21], X##se); \
    STORE256(state[22], X##si); \
    STORE256(state[23], X##so); \
    STORE256(state[24], X##su);
/* Copies all 25 lane variables from prefix Y to prefix X (X##ba = Y##ba, ...).
 * Statement macro: ends with its own ';' and is used without one.
 * The line continuations are required for the body to belong to the macro. */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su;
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
#include "KeccakP-1600-unrolling.macros.h"
/* Applies the 24-round permutation to the four interleaved states in place.
 * states: pointer to the state block; it is reinterpreted as an array of
 * V256 lanes and read/written through copyFromState / copyToState. */
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
/* The loop counter is only declared when the rounds are not fully unrolled. */
#ifndef KeccakP1600times4_fullUnrolling
unsigned int i;
#endif
/* Statement macros: each carries its own ';', hence none is written here. */
copyFromState(A, statesAsLanes)
/* rounds24 is not defined in this part of the file; presumably it comes
 * from KeccakP-1600-unrolling.macros.h - confirm against that header. */
rounds24
copyToState(statesAsLanes, A)
}
/* Applies the 12-round permutation to the four interleaved states in place.
 * states: pointer to the state block; it is reinterpreted as an array of
 * V256 lanes and read/written through copyFromState / copyToState. */
void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
/* The loop counter is only declared when the rounds are not fully unrolled. */
#ifndef KeccakP1600times4_fullUnrolling
unsigned int i;
#endif
/* Statement macros: each carries its own ';', hence none is written here. */
copyFromState(A, statesAsLanes)
/* rounds12 is not defined in this part of the file; presumably it comes
 * from KeccakP-1600-unrolling.macros.h - confirm against that header. */
rounds12
copyToState(statesAsLanes, A)
}
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
if (laneCount == 21) {
#if 0
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
V256 *stateAsLanes = (V256 *)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex )
XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex )
lanes0 = LOAD256u( curData0[argIndex]),
lanes1 = LOAD256u( curData1[argIndex]),
lanes2 = LOAD256u( curData2[argIndex]),
lanes3 = LOAD256u( curData3[argIndex]),
INTLEAVE(),
XOReq256( stateAsLanes[argIndex+0], lanes0 ),
XOReq256( stateAsLanes[argIndex+1], lanes1 ),
XOReq256( stateAsLanes[argIndex+2], lanes2 ),
XOReq256( stateAsLanes[argIndex+3], lanes3 )
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
Xor_In4( 16 );
Xor_In( 20 );
#undef Xor_In
#undef Xor_In4
KeccakP1600times4_PermuteAll_24rounds(states);
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
return (const unsigned char *)curData0 - dataStart;
#else
// unsigned int i;
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
V256 *statesAsLanes = (V256 *)states;
declareABCDE
copyFromState(A, statesAsLanes)
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
#define XOR_In( Xxx, argIndex )
XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
XOR_In( Aba, 0 );
XOR_In( Abe, 1 );
XOR_In( Abi, 2 );
XOR_In( Abo, 3 );
XOR_In( Abu, 4 );
XOR_In( Aga, 5 );
XOR_In( Age, 6 );
XOR_In( Agi, 7 );
XOR_In( Ago, 8 );
XOR_In( Agu, 9 );
XOR_In( Aka, 10 );
XOR_In( Ake, 11 );
XOR_In( Aki, 12 );
XOR_In( Ako, 13 );
XOR_In( Aku, 14 );
XOR_In( Ama, 15 );
XOR_In( Ame, 16 );
XOR_In( Ami, 17 );
XOR_In( Amo, 18 );
XOR_In( Amu, 19 );
XOR_In( Asa, 20 );
#undef XOR_In
rounds24
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
copyToState(statesAsLanes, A)
return (const unsigned char *)curData0 - dataStart;
#endif
}
else {
// unsigned int i;
const unsigned char *dataStart = data;
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
KeccakP1600times4_PermuteAll_24rounds(states);
data += laneOffsetSerial*8;
dataByteLen -= laneOffsetSerial*8;
}
return data - dataStart;
}
}
size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
if (laneCount == 21) {
#if 0
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
V256 *stateAsLanes = states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex )
XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex )
lanes0 = LOAD256u( curData0[argIndex]),
lanes1 = LOAD256u( curData1[argIndex]),
lanes2 = LOAD256u( curData2[argIndex]),
lanes3 = LOAD256u( curData3[argIndex]),
INTLEAVE(),
XOReq256( stateAsLanes[argIndex+0], lanes0 ),
XOReq256( stateAsLanes[argIndex+1], lanes1 ),
XOReq256( stateAsLanes[argIndex+2], lanes2 ),
XOReq256( stateAsLanes[argIndex+3], lanes3 )
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
Xor_In4( 16 );
Xor_In( 20 );
#undef Xor_In
#undef Xor_In4
KeccakP1600times4_PermuteAll_12rounds(states);
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
return (const unsigned char *)curData0 - dataStart;
#else
// unsigned int i;
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
V256 *statesAsLanes = states;
declareABCDE
copyFromState(A, statesAsLanes)
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
#define XOR_In( Xxx, argIndex )
XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
XOR_In( Aba, 0 );
XOR_In( Abe, 1 );
XOR_In( Abi, 2 );
XOR_In( Abo, 3 );
XOR_In( Abu, 4 );
XOR_In( Aga, 5 );
XOR_In( Age, 6 );
XOR_In( Agi, 7 );
XOR_In( Ago, 8 );
XOR_In( Agu, 9 );
XOR_In( Aka, 10 );
XOR_In( Ake, 11 );
XOR_In( Aki, 12 );
XOR_In( Ako, 13 );
XOR_In( Aku, 14 );
XOR_In( Ama, 15 );
XOR_In( Ame, 16 );
XOR_In( Ami, 17 );
XOR_In( Amo, 18 );
XOR_In( Amu, 19 );
XOR_In( Asa, 20 );
#undef XOR_In
rounds12
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
copyToState(statesAsLanes, A)
return (const unsigned char *)curData0 - dataStart;
#endif
}
else {
// unsigned int i;
const unsigned char *dataStart = data;
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
KeccakP1600times4_PermuteAll_12rounds(states);
data += laneOffsetSerial*8;
dataByteLen -= laneOffsetSerial*8;
}
return data - dataStart;
}
}
fips202
fips202.h
/* Declarations for the FIPS 202 functions: incremental and one-shot
 * SHAKE128 / SHAKE256, plus one-shot SHA3-256 / SHA3-512. */
#ifndef FIPS202_H
#define FIPS202_H
#include <stddef.h>
#include <stdint.h>
/* Rate (block size) in bytes used by each function. */
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
#define SHA3_256_RATE 136
#define SHA3_512_RATE 72
/* Every public name below is mapped to a fips202_ref_-prefixed symbol. */
#define FIPS202_NAMESPACE(s) fips202_ref_##s
/* Sponge state: 25 64-bit lanes plus a byte position within the current block. */
typedef struct {
uint64_t s[25];
unsigned int pos;
} keccak_state;
/* --- SHAKE128, incremental interface --- */
/* Zero the state and reset the position. */
#define shake128_init FIPS202_NAMESPACE(shake128_init)
void shake128_init(keccak_state *state);
/* Absorb inlen bytes from in; may be called repeatedly. */
#define shake128_absorb FIPS202_NAMESPACE(shake128_absorb)
void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
/* End the absorb phase (adds the domain-separation byte and final bit). */
#define shake128_finalize FIPS202_NAMESPACE(shake128_finalize)
void shake128_finalize(keccak_state *state);
/* Write outlen output bytes to out; may be called repeatedly. */
#define shake128_squeeze FIPS202_NAMESPACE(shake128_squeeze)
void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
/* Initialise, absorb the whole input and finalise in one call. */
#define shake128_absorb_once FIPS202_NAMESPACE(shake128_absorb_once)
void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
/* Write nblocks blocks of SHAKE128_RATE bytes each to out. */
#define shake128_squeezeblocks FIPS202_NAMESPACE(shake128_squeezeblocks)
void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state);
/* --- SHAKE256, incremental interface (same shape as SHAKE128) --- */
#define shake256_init FIPS202_NAMESPACE(shake256_init)
void shake256_init(keccak_state *state);
#define shake256_absorb FIPS202_NAMESPACE(shake256_absorb)
void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake256_finalize FIPS202_NAMESPACE(shake256_finalize)
void shake256_finalize(keccak_state *state);
#define shake256_squeeze FIPS202_NAMESPACE(shake256_squeeze)
void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state);
#define shake256_absorb_once FIPS202_NAMESPACE(shake256_absorb_once)
void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen);
#define shake256_squeezeblocks FIPS202_NAMESPACE(shake256_squeezeblocks)
void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state);
/* --- One-shot functions: out/h is the output buffer, in/inlen the input --- */
#define shake128 FIPS202_NAMESPACE(shake128)
void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
#define shake256 FIPS202_NAMESPACE(shake256)
void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen);
/* h is declared as a 32-byte array. */
#define sha3_256 FIPS202_NAMESPACE(sha3_256)
void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen);
/* h is declared as a 64-byte array. */
#define sha3_512 FIPS202_NAMESPACE(sha3_512)
void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen);
#endif
fips202.c
/* Based on the public domain implementation in crypto_hash/keccakc512/simple/ from
* http://bench.cr.yp.to/supercop.html by Ronny Van Keer and the public domain "TweetFips202"
* implementation from https://twitter.com/tweetfips202 by Gilles Van Assche, Daniel J. Bernstein,
* and Peter Schwabe */
#include <stddef.h>
#include <stdint.h>
#include "fips202.h"
#define NROUNDS 24
/* Rotate the 64-bit value a left by offset bits. offset must be in 1..63:
 * 0 would make the right shift a full-width shift, which is undefined.
 * Both arguments are parenthesised so that expression arguments such as
 * ROL(x ^ y, n) or ROL(x, n + 1) expand with the intended precedence. */
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
/*************************************************
* Name:        load64
*
* Description: Assemble a 64-bit unsigned integer from 8 bytes,
*              least-significant byte first (little-endian)
*
* Arguments:   - const uint8_t *x: pointer to the 8 input bytes
*
* Returns the assembled 64-bit unsigned integer
**************************************************/
static uint64_t load64(const uint8_t x[8]) {
  uint64_t value = 0;
  int k;

  /* Walk from the most significant byte down, shifting earlier bytes up. */
  for(k = 7; k >= 0; k--) {
    value = (value << 8) | (uint64_t)x[k];
  }

  return value;
}
/*************************************************
* Name:        store64
*
* Description: Write a 64-bit unsigned integer as 8 bytes,
*              least-significant byte first (little-endian)
*
* Arguments:   - uint8_t *x: pointer to the 8 output bytes (caller-allocated)
*              - uint64_t u: value to write
**************************************************/
static void store64(uint8_t x[8], uint64_t u) {
  unsigned int k;

  /* u is a by-value copy, so it can be consumed one byte at a time. */
  for(k = 0; k < 8; k++) {
    x[k] = (uint8_t)(u & 0xFF);
    u >>= 8;
  }
}
/* Keccak round constants: entry [r] is XORed into the first lane in round r. */
static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
  /* rounds  0.. 3 */
  0x0000000000000001ULL, 0x0000000000008082ULL,
  0x800000000000808aULL, 0x8000000080008000ULL,
  /* rounds  4.. 7 */
  0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL,
  /* rounds  8..11 */
  0x000000000000008aULL, 0x0000000000000088ULL,
  0x0000000080008009ULL, 0x000000008000000aULL,
  /* rounds 12..15 */
  0x000000008000808bULL, 0x800000000000008bULL,
  0x8000000000008089ULL, 0x8000000000008003ULL,
  /* rounds 16..19 */
  0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL,
  /* rounds 20..23 */
  0x8000000080008081ULL, 0x8000000000008080ULL,
  0x0000000080000001ULL, 0x8000000080008008ULL
};
/*************************************************
* Name:        KeccakF1600_StatePermute
*
* Description: The Keccak F1600 Permutation, applied in place.
*              The 25 lanes are copied into local variables, NROUNDS rounds
*              are run two at a time (A -> E, then E -> A), and the result
*              is written back.
*
* Arguments:   - uint64_t *state: pointer to input/output Keccak state
**************************************************/
static void KeccakF1600_StatePermute(uint64_t state[25])
{
int round;
/* A.. and E.. are the two alternating copies of the state; BC.. first holds
 * the column parities and is then reused for the rotated lanes of one row;
 * D.. holds the theta correction terms. */
uint64_t Aba, Abe, Abi, Abo, Abu;
uint64_t Aga, Age, Agi, Ago, Agu;
uint64_t Aka, Ake, Aki, Ako, Aku;
uint64_t Ama, Ame, Ami, Amo, Amu;
uint64_t Asa, Ase, Asi, Aso, Asu;
uint64_t BCa, BCe, BCi, BCo, BCu;
uint64_t Da, De, Di, Do, Du;
uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
uint64_t Ega, Ege, Egi, Ego, Egu;
uint64_t Eka, Eke, Eki, Eko, Eku;
uint64_t Ema, Eme, Emi, Emo, Emu;
uint64_t Esa, Ese, Esi, Eso, Esu;
/* Load the state into the A variables. */
Aba = state[ 0];
Abe = state[ 1];
Abi = state[ 2];
Abo = state[ 3];
Abu = state[ 4];
Aga = state[ 5];
Age = state[ 6];
Agi = state[ 7];
Ago = state[ 8];
Agu = state[ 9];
Aka = state[10];
Ake = state[11];
Aki = state[12];
Ako = state[13];
Aku = state[14];
Ama = state[15];
Ame = state[16];
Ami = state[17];
Amo = state[18];
Amu = state[19];
Asa = state[20];
Ase = state[21];
Asi = state[22];
Aso = state[23];
Asu = state[24];
/* Each iteration performs round `round` (A -> E) and round `round+1` (E -> A). */
for(round = 0; round < NROUNDS; round += 2) {
/* Column parities of A. */
BCa = Aba^Aga^Aka^Ama^Asa;
BCe = Abe^Age^Ake^Ame^Ase;
BCi = Abi^Agi^Aki^Ami^Asi;
BCo = Abo^Ago^Ako^Amo^Aso;
BCu = Abu^Agu^Aku^Amu^Asu;
/* Round `round`, A -> E. Theta correction terms first. */
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
/* Row b of E: apply theta, rotate, combine; the round constant goes into Eba. */
Aba ^= Da;
BCa = Aba;
Age ^= De;
BCe = ROL(Age, 44);
Aki ^= Di;
BCi = ROL(Aki, 43);
Amo ^= Do;
BCo = ROL(Amo, 21);
Asu ^= Du;
BCu = ROL(Asu, 14);
Eba = BCa ^((~BCe)& BCi );
Eba ^= (uint64_t)KeccakF_RoundConstants[round];
Ebe = BCe ^((~BCi)& BCo );
Ebi = BCi ^((~BCo)& BCu );
Ebo = BCo ^((~BCu)& BCa );
Ebu = BCu ^((~BCa)& BCe );
/* Row g of E. */
Abo ^= Do;
BCa = ROL(Abo, 28);
Agu ^= Du;
BCe = ROL(Agu, 20);
Aka ^= Da;
BCi = ROL(Aka, 3);
Ame ^= De;
BCo = ROL(Ame, 45);
Asi ^= Di;
BCu = ROL(Asi, 61);
Ega = BCa ^((~BCe)& BCi );
Ege = BCe ^((~BCi)& BCo );
Egi = BCi ^((~BCo)& BCu );
Ego = BCo ^((~BCu)& BCa );
Egu = BCu ^((~BCa)& BCe );
/* Row k of E. */
Abe ^= De;
BCa = ROL(Abe, 1);
Agi ^= Di;
BCe = ROL(Agi, 6);
Ako ^= Do;
BCi = ROL(Ako, 25);
Amu ^= Du;
BCo = ROL(Amu, 8);
Asa ^= Da;
BCu = ROL(Asa, 18);
Eka = BCa ^((~BCe)& BCi );
Eke = BCe ^((~BCi)& BCo );
Eki = BCi ^((~BCo)& BCu );
Eko = BCo ^((~BCu)& BCa );
Eku = BCu ^((~BCa)& BCe );
/* Row m of E. */
Abu ^= Du;
BCa = ROL(Abu, 27);
Aga ^= Da;
BCe = ROL(Aga, 36);
Ake ^= De;
BCi = ROL(Ake, 10);
Ami ^= Di;
BCo = ROL(Ami, 15);
Aso ^= Do;
BCu = ROL(Aso, 56);
Ema = BCa ^((~BCe)& BCi );
Eme = BCe ^((~BCi)& BCo );
Emi = BCi ^((~BCo)& BCu );
Emo = BCo ^((~BCu)& BCa );
Emu = BCu ^((~BCa)& BCe );
/* Row s of E. */
Abi ^= Di;
BCa = ROL(Abi, 62);
Ago ^= Do;
BCe = ROL(Ago, 55);
Aku ^= Du;
BCi = ROL(Aku, 39);
Ama ^= Da;
BCo = ROL(Ama, 41);
Ase ^= De;
BCu = ROL(Ase, 2);
Esa = BCa ^((~BCe)& BCi );
Ese = BCe ^((~BCi)& BCo );
Esi = BCi ^((~BCo)& BCu );
Eso = BCo ^((~BCu)& BCa );
Esu = BCu ^((~BCa)& BCe );
/* Column parities of E. */
BCa = Eba^Ega^Eka^Ema^Esa;
BCe = Ebe^Ege^Eke^Eme^Ese;
BCi = Ebi^Egi^Eki^Emi^Esi;
BCo = Ebo^Ego^Eko^Emo^Eso;
BCu = Ebu^Egu^Eku^Emu^Esu;
/* Round `round+1`, E -> A. Theta correction terms first. */
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
/* Row b of A: the round constant goes into Aba. */
Eba ^= Da;
BCa = Eba;
Ege ^= De;
BCe = ROL(Ege, 44);
Eki ^= Di;
BCi = ROL(Eki, 43);
Emo ^= Do;
BCo = ROL(Emo, 21);
Esu ^= Du;
BCu = ROL(Esu, 14);
Aba = BCa ^((~BCe)& BCi );
Aba ^= (uint64_t)KeccakF_RoundConstants[round+1];
Abe = BCe ^((~BCi)& BCo );
Abi = BCi ^((~BCo)& BCu );
Abo = BCo ^((~BCu)& BCa );
Abu = BCu ^((~BCa)& BCe );
/* Row g of A. */
Ebo ^= Do;
BCa = ROL(Ebo, 28);
Egu ^= Du;
BCe = ROL(Egu, 20);
Eka ^= Da;
BCi = ROL(Eka, 3);
Eme ^= De;
BCo = ROL(Eme, 45);
Esi ^= Di;
BCu = ROL(Esi, 61);
Aga = BCa ^((~BCe)& BCi );
Age = BCe ^((~BCi)& BCo );
Agi = BCi ^((~BCo)& BCu );
Ago = BCo ^((~BCu)& BCa );
Agu = BCu ^((~BCa)& BCe );
/* Row k of A. */
Ebe ^= De;
BCa = ROL(Ebe, 1);
Egi ^= Di;
BCe = ROL(Egi, 6);
Eko ^= Do;
BCi = ROL(Eko, 25);
Emu ^= Du;
BCo = ROL(Emu, 8);
Esa ^= Da;
BCu = ROL(Esa, 18);
Aka = BCa ^((~BCe)& BCi );
Ake = BCe ^((~BCi)& BCo );
Aki = BCi ^((~BCo)& BCu );
Ako = BCo ^((~BCu)& BCa );
Aku = BCu ^((~BCa)& BCe );
/* Row m of A. */
Ebu ^= Du;
BCa = ROL(Ebu, 27);
Ega ^= Da;
BCe = ROL(Ega, 36);
Eke ^= De;
BCi = ROL(Eke, 10);
Emi ^= Di;
BCo = ROL(Emi, 15);
Eso ^= Do;
BCu = ROL(Eso, 56);
Ama = BCa ^((~BCe)& BCi );
Ame = BCe ^((~BCi)& BCo );
Ami = BCi ^((~BCo)& BCu );
Amo = BCo ^((~BCu)& BCa );
Amu = BCu ^((~BCa)& BCe );
/* Row s of A. */
Ebi ^= Di;
BCa = ROL(Ebi, 62);
Ego ^= Do;
BCe = ROL(Ego, 55);
Eku ^= Du;
BCi = ROL(Eku, 39);
Ema ^= Da;
BCo = ROL(Ema, 41);
Ese ^= De;
BCu = ROL(Ese, 2);
Asa = BCa ^((~BCe)& BCi );
Ase = BCe ^((~BCi)& BCo );
Asi = BCi ^((~BCo)& BCu );
Aso = BCo ^((~BCu)& BCa );
Asu = BCu ^((~BCa)& BCe );
}
/* Write the A variables back to the state. */
state[ 0] = Aba;
state[ 1] = Abe;
state[ 2] = Abi;
state[ 3] = Abo;
state[ 4] = Abu;
state[ 5] = Aga;
state[ 6] = Age;
state[ 7] = Agi;
state[ 8] = Ago;
state[ 9] = Agu;
state[10] = Aka;
state[11] = Ake;
state[12] = Aki;
state[13] = Ako;
state[14] = Aku;
state[15] = Ama;
state[16] = Ame;
state[17] = Ami;
state[18] = Amo;
state[19] = Amu;
state[20] = Asa;
state[21] = Ase;
state[22] = Asi;
state[23] = Aso;
state[24] = Asu;
}
/*************************************************
* Name:        keccak_init
*
* Description: Sets all 25 lanes of the Keccak state to zero.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
**************************************************/
static void keccak_init(uint64_t s[25])
{
  uint64_t *lane = s;
  uint64_t *const end = s + 25;

  while(lane != end) {
    *lane++ = 0;
  }
}
/*************************************************
* Name:        keccak_absorb
*
* Description: Incremental absorb step of Keccak. Input bytes are XORed
*              into the state starting at byte position pos; each time a
*              block of r bytes is complete the state is permuted.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
*              - unsigned int pos: position in current block to be absorbed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
*
* Returns new position pos in current block
**************************************************/
static unsigned int keccak_absorb(uint64_t s[25],
                                  unsigned int pos,
                                  unsigned int r,
                                  const uint8_t *in,
                                  size_t inlen)
{
  /* Complete and permute blocks for as long as the input fills one. */
  while(pos + inlen >= r) {
    const unsigned int take = r - pos;
    unsigned int k;

    for(k = 0; k < take; k++) {
      const unsigned int at = pos + k;
      s[at >> 3] ^= (uint64_t)in[k] << (8 * (at & 7));
    }
    in += take;
    inlen -= take;
    KeccakF1600_StatePermute(s);
    pos = 0;
  }

  /* Fewer than r - pos bytes remain: buffer them in the state. */
  while(inlen > 0) {
    s[pos >> 3] ^= (uint64_t)*in++ << (8 * (pos & 7));
    pos++;
    inlen--;
  }

  return pos;
}
/*************************************************
* Name:        keccak_finalize
*
* Description: Finalize absorb step: XORs the domain separation byte in at
*              byte position pos and sets the top bit of the last lane of
*              the rate.
*
* Arguments:   - uint64_t *s: pointer to Keccak state
*              - unsigned int pos: position in current block to be absorbed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - uint8_t p: domain separation byte
**************************************************/
static void keccak_finalize(uint64_t s[25], unsigned int pos, unsigned int r, uint8_t p)
{
  const unsigned int suffix_lane = pos / 8;
  const unsigned int suffix_shift = 8 * (pos % 8);
  const unsigned int last_rate_lane = r / 8 - 1;
  const uint64_t top_bit = 1ULL << 63;

  s[suffix_lane] ^= (uint64_t)p << suffix_shift;
  s[last_rate_lane] ^= top_bit;
}
/*************************************************
* Name:        keccak_squeeze
*
* Description: Squeeze step of Keccak. Squeezes arbitrarily many bytes.
*              Modifies the state. Can be called multiple times to keep
*              squeezing, i.e., is incremental. The state is permuted
*              whenever the current block has been fully read.
*
* Arguments:   - uint8_t *out: pointer to output
*              - size_t outlen: number of bytes to be squeezed (written to out)
*              - uint64_t *s: pointer to input/output Keccak state
*              - unsigned int pos: number of bytes in current block already squeezed
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*
* Returns new position pos in current block
**************************************************/
static unsigned int keccak_squeeze(uint8_t *out,
                                   size_t outlen,
                                   uint64_t s[25],
                                   unsigned int pos,
                                   unsigned int r)
{
  while(outlen > 0) {
    unsigned int cur;

    if(pos == r) {
      KeccakF1600_StatePermute(s);
      pos = 0;
    }

    /* Copy bytes until the block ends or the request is satisfied. */
    cur = pos;
    while(cur < r && cur < pos + outlen) {
      *out++ = (uint8_t)(s[cur >> 3] >> (8 * (cur & 7)));
      cur++;
    }

    outlen -= cur - pos;
    pos = cur;
  }

  return pos;
}
/*************************************************
* Name:        keccak_absorb_once
*
* Description: Non-incremental absorb step of Keccak: zeroes the state,
*              absorbs the whole input, then adds the domain separation
*              byte and the final bit.
*
* Arguments:   - uint64_t *s: pointer to (uninitialized) output Keccak state
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
*              - uint8_t p: domain-separation byte for different Keccak-derived functions
**************************************************/
static void keccak_absorb_once(uint64_t s[25],
                               unsigned int r,
                               const uint8_t *in,
                               size_t inlen,
                               uint8_t p)
{
  unsigned int k;

  for(k = 0; k < 25; k++) {
    s[k] = 0;
  }

  /* Full blocks: XOR in r/8 lanes, then permute. */
  for(; inlen >= r; in += r, inlen -= r) {
    for(k = 0; k < r/8; k++) {
      s[k] ^= load64(in + 8*k);
    }
    KeccakF1600_StatePermute(s);
  }

  /* Remaining bytes (fewer than r), one byte at a time. */
  k = 0;
  while(k < inlen) {
    s[k >> 3] ^= (uint64_t)in[k] << (8 * (k & 7));
    k++;
  }

  /* k is now the first free byte position: the suffix byte goes there. */
  s[k >> 3] ^= (uint64_t)p << (8 * (k & 7));
  s[(r - 1) >> 3] ^= (uint64_t)1 << 63;
}
/*************************************************
* Name:        keccak_squeezeblocks
*
* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each:
*              for every block the state is permuted first, then r/8 lanes
*              are written out. Modifies the state. Can be called multiple
*              times to keep squeezing, i.e., is incremental. Assumes zero
*              bytes of current block have already been squeezed.
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t nblocks: number of blocks to be squeezed (written to out)
*              - uint64_t *s: pointer to input/output Keccak state
*              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
**************************************************/
static void keccak_squeezeblocks(uint8_t *out,
                                 size_t nblocks,
                                 uint64_t s[25],
                                 unsigned int r)
{
  for(; nblocks > 0; nblocks--, out += r) {
    unsigned int lane;

    KeccakF1600_StatePermute(s);
    for(lane = 0; lane < r/8; lane++) {
      store64(out + 8*lane, s[lane]);
    }
  }
}
/*************************************************
* Name:        shake128_init
*
* Description: Initializes Keccak state for use as SHAKE128 XOF:
*              zeroes the lanes and resets the block position.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) Keccak state
**************************************************/
void shake128_init(keccak_state *state)
{
  state->pos = 0;
  keccak_init(state->s);
}
/*************************************************
* Name:        shake128_absorb
*
* Description: Absorb step of the SHAKE128 XOF; incremental.
*              Stores the updated block position back into the state.
*
* Arguments:   - keccak_state *state: pointer to (initialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
**************************************************/
void shake128_absorb(keccak_state *state, const uint8_t *in, size_t inlen)
{
  const unsigned int next_pos =
    keccak_absorb(state->s, state->pos, SHAKE128_RATE, in, inlen);

  state->pos = next_pos;
}
/*************************************************
* Name:        shake128_finalize
*
* Description: Finalize absorb step of the SHAKE128 XOF, using domain
*              separation byte 0x1F. Afterwards the position is set to
*              SHAKE128_RATE.
*
* Arguments:   - keccak_state *state: pointer to Keccak state
**************************************************/
void shake128_finalize(keccak_state *state)
{
  const uint8_t suffix = 0x1F;

  keccak_finalize(state->s, state->pos, SHAKE128_RATE, suffix);
  state->pos = SHAKE128_RATE;
}
/*************************************************
* Name:        shake128_squeeze
*
* Description: Squeeze step of SHAKE128 XOF. Squeezes arbitrarily many
*              bytes. Can be called multiple times to keep squeezing.
*              Stores the updated block position back into the state.
*
* Arguments:   - uint8_t *out: pointer to output blocks
*              - size_t outlen : number of bytes to be squeezed (written to output)
*              - keccak_state *s: pointer to input/output Keccak state
**************************************************/
void shake128_squeeze(uint8_t *out, size_t outlen, keccak_state *state)
{
  const unsigned int next_pos =
    keccak_squeeze(out, outlen, state->s, state->pos, SHAKE128_RATE);

  state->pos = next_pos;
}
/*************************************************
* Name:        shake128_absorb_once
*
* Description: Initialize, absorb into and finalize SHAKE128 XOF; non-incremental.
*              Uses domain separation byte 0x1F and leaves the position at
*              SHAKE128_RATE.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) output Keccak state
*              - const uint8_t *in: pointer to input to be absorbed into s
*              - size_t inlen: length of input in bytes
**************************************************/
void shake128_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen)
{
  const uint8_t suffix = 0x1F;

  state->pos = SHAKE128_RATE;
  keccak_absorb_once(state->s, SHAKE128_RATE, in, inlen, suffix);
}
/*************************************************
* Name:        shake128_squeezeblocks
*
* Description: Squeeze step of the SHAKE128 XOF producing whole blocks of
*              SHAKE128_RATE bytes each. May be called repeatedly to keep
*              squeezing. Expects that no block has been started yet
*              (state->pos = SHAKE128_RATE); state->pos is neither read
*              nor changed here.
*
* Arguments:   - uint8_t *out:        pointer to output blocks
*              - size_t nblocks:      number of blocks to be squeezed
*                                     (written to out)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake128_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state)
{
  keccak_squeezeblocks(out,
                       nblocks,
                       state->s,
                       SHAKE128_RATE);
}
/*************************************************
* Name:        shake256_init
*
* Description: Initializes a Keccak state for use as SHAKE256 XOF: runs
*              keccak_init on the lane array and sets the position to 0.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) Keccak state
**************************************************/
void shake256_init(keccak_state *state)
{
  /* Hand the lane array to keccak_init, then start absorbing at offset 0. */
  keccak_init(state->s);
  state->pos = 0;
}
/*************************************************
* Name:        shake256_absorb
*
* Description: Incremental absorb step of the SHAKE256 XOF. Passes the
*              input to keccak_absorb with rate SHAKE256_RATE and stores
*              the position it returns back into state->pos.
*
* Arguments:   - keccak_state *state: pointer to (initialized) Keccak state,
*                                     updated in place
*              - const uint8_t *in:   pointer to input to be absorbed
*              - size_t inlen:        length of input in bytes
**************************************************/
void shake256_absorb(keccak_state *state, const uint8_t *in, size_t inlen)
{
  state->pos = keccak_absorb(state->s,
                             state->pos,
                             SHAKE256_RATE,
                             in,
                             inlen);
}
/*************************************************
* Name:        shake256_finalize
*
* Description: Ends the absorb phase of the SHAKE256 XOF. Calls
*              keccak_finalize with the current position, the SHAKE256
*              rate and the byte 0x1F, then sets state->pos to
*              SHAKE256_RATE.
*
* Arguments:   - keccak_state *state: pointer to Keccak state
**************************************************/
void shake256_finalize(keccak_state *state)
{
  /* 0x1F is the padding/domain byte handed to keccak_finalize. */
  keccak_finalize(state->s, state->pos, SHAKE256_RATE, 0x1F);

  /* pos == rate marks "no output block started yet" for the squeeze step. */
  state->pos = SHAKE256_RATE;
}
/*************************************************
* Name:        shake256_squeeze
*
* Description: Squeeze step of the SHAKE256 XOF. Squeezes arbitrarily many
*              bytes and may be called repeatedly to keep squeezing; the
*              position returned by keccak_squeeze is kept in state->pos.
*
* Arguments:   - uint8_t *out:        pointer to output buffer
*              - size_t outlen:       number of bytes to be squeezed
*                                     (written to out)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake256_squeeze(uint8_t *out, size_t outlen, keccak_state *state)
{
  state->pos = keccak_squeeze(out,
                              outlen,
                              state->s,
                              state->pos,
                              SHAKE256_RATE);
}
/*************************************************
* Name:        shake256_absorb_once
*
* Description: Non-incremental SHAKE256 absorb: one call to
*              keccak_absorb_once (rate SHAKE256_RATE, byte 0x1F) handles
*              the whole input, after which state->pos is set to
*              SHAKE256_RATE.
*
* Arguments:   - keccak_state *state: pointer to (uninitialized) output
*                                     Keccak state
*              - const uint8_t *in:   pointer to input to be absorbed
*              - size_t inlen:        length of input in bytes
**************************************************/
void shake256_absorb_once(keccak_state *state, const uint8_t *in, size_t inlen)
{
  keccak_absorb_once(state->s,
                     SHAKE256_RATE,
                     in,
                     inlen,
                     0x1F);

  /* pos == rate marks "no output block started yet" for the squeeze step. */
  state->pos = SHAKE256_RATE;
}
/*************************************************
* Name:        shake256_squeezeblocks
*
* Description: Squeeze step of the SHAKE256 XOF producing whole blocks of
*              SHAKE256_RATE bytes each. May be called repeatedly to keep
*              squeezing. Expects that no block has been started yet
*              (state->pos = SHAKE256_RATE); state->pos is neither read
*              nor changed here.
*
* Arguments:   - uint8_t *out:        pointer to output blocks
*              - size_t nblocks:      number of blocks to be squeezed
*                                     (written to out)
*              - keccak_state *state: pointer to input/output Keccak state
**************************************************/
void shake256_squeezeblocks(uint8_t *out, size_t nblocks, keccak_state *state)
{
  keccak_squeezeblocks(out,
                       nblocks,
                       state->s,
                       SHAKE256_RATE);
}
/*************************************************
* Name:        shake128
*
* Description: One-shot SHAKE128 XOF. Absorbs the whole input, writes as
*              many full SHAKE128_RATE-byte blocks as fit into outlen and
*              then squeezes the remaining bytes.
*
* Arguments:   - uint8_t *out:      pointer to output
*              - size_t outlen:     requested output length in bytes
*              - const uint8_t *in: pointer to input
*              - size_t inlen:      length of input in bytes
**************************************************/
void shake128(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen)
{
  keccak_state state;
  const size_t nblocks = outlen / SHAKE128_RATE;
  /* Number of output bytes covered by the whole-block squeeze. */
  const size_t bulk = nblocks * SHAKE128_RATE;

  shake128_absorb_once(&state, in, inlen);
  shake128_squeezeblocks(out, nblocks, &state);
  /* Tail: fewer than SHAKE128_RATE bytes, possibly zero. */
  shake128_squeeze(out + bulk, outlen - bulk, &state);
}
/*************************************************
* Name:        shake256
*
* Description: One-shot SHAKE256 XOF. Absorbs the whole input, writes as
*              many full SHAKE256_RATE-byte blocks as fit into outlen and
*              then squeezes the remaining bytes.
*
* Arguments:   - uint8_t *out:      pointer to output
*              - size_t outlen:     requested output length in bytes
*              - const uint8_t *in: pointer to input
*              - size_t inlen:      length of input in bytes
**************************************************/
void shake256(uint8_t *out, size_t outlen, const uint8_t *in, size_t inlen)
{
  keccak_state state;
  const size_t nblocks = outlen / SHAKE256_RATE;
  /* Number of output bytes covered by the whole-block squeeze. */
  const size_t bulk = nblocks * SHAKE256_RATE;

  shake256_absorb_once(&state, in, inlen);
  shake256_squeezeblocks(out, nblocks, &state);
  /* Tail: fewer than SHAKE256_RATE bytes, possibly zero. */
  shake256_squeeze(out + bulk, outlen - bulk, &state);
}
/*************************************************
* Name:        sha3_256
*
* Description: SHA3-256 with non-incremental API. Absorbs the input with
*              rate SHA3_256_RATE and byte 0x06, applies one permutation
*              and serializes the first four 64-bit lanes via store64.
*
* Arguments:   - uint8_t *h:        pointer to output (32 bytes)
*              - const uint8_t *in: pointer to input
*              - size_t inlen:      length of input in bytes
**************************************************/
void sha3_256(uint8_t h[32], const uint8_t *in, size_t inlen)
{
  uint64_t s[25];
  uint8_t *dst = h;
  unsigned int lane;

  keccak_absorb_once(s, SHA3_256_RATE, in, inlen, 0x06);
  KeccakF1600_StatePermute(s);

  /* 4 lanes x 8 bytes = 32 bytes of digest */
  for(lane = 0; lane < 4; lane++, dst += 8)
    store64(dst, s[lane]);
}
/*************************************************
* Name:        sha3_512
*
* Description: SHA3-512 with non-incremental API. Absorbs the input with
*              rate SHA3_512_RATE and byte 0x06, applies one permutation
*              and serializes the first eight 64-bit lanes via store64.
*
* Arguments:   - uint8_t *h:        pointer to output (64 bytes)
*              - const uint8_t *in: pointer to input
*              - size_t inlen:      length of input in bytes
**************************************************/
void sha3_512(uint8_t h[64], const uint8_t *in, size_t inlen)
{
  uint64_t s[25];
  uint8_t *dst = h;
  unsigned int lane;

  keccak_absorb_once(s, SHA3_512_RATE, in, inlen, 0x06);
  KeccakF1600_StatePermute(s);

  /* 8 lanes x 8 bytes = 64 bytes of digest */
  for(lane = 0; lane < 8; lane++, dst += 8)
    store64(dst, s[lane]);
}
fips202x4.h
#ifndef FIPS202X4_H
#define FIPS202X4_H

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

/* Every public symbol of this module gets the prefix fips202x4_avx2_. */
#define FIPS202X4_NAMESPACE(s) fips202x4_avx2_##s

/* Four Keccak states side by side: 25 lanes, each a 256-bit vector
 * holding one 64-bit word per parallel instance. */
typedef struct {
    __m256i s[25];
} keccakx4_state;

/* Absorb four equally long inputs (inlen bytes each) into a fresh
 * SHAKE128 state; non-incremental. */
#define shake128x4_absorb_once FIPS202X4_NAMESPACE(shake128x4_absorb_once)
void shake128x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* Write nblocks blocks of SHAKE128_RATE bytes to each of the four outputs. */
#define shake128x4_squeezeblocks FIPS202X4_NAMESPACE(shake128x4_squeezeblocks)
void shake128x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);

/* Write outlen bytes to each of the four outputs. The state keeps no byte
 * position, so bytes of a partially used final block are not carried over
 * to a later call. */
#define shake128x4_squeeze FIPS202X4_NAMESPACE(shake128x4_squeeze)
void shake128x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state);

/* Absorb four equally long inputs (inlen bytes each) into a fresh
 * SHAKE256 state; non-incremental. */
#define shake256x4_absorb_once FIPS202X4_NAMESPACE(shake256x4_absorb_once)
void shake256x4_absorb_once(keccakx4_state *state,
                            const uint8_t *in0,
                            const uint8_t *in1,
                            const uint8_t *in2,
                            const uint8_t *in3,
                            size_t inlen);

/* Write nblocks blocks of SHAKE256_RATE bytes to each of the four outputs. */
#define shake256x4_squeezeblocks FIPS202X4_NAMESPACE(shake256x4_squeezeblocks)
void shake256x4_squeezeblocks(uint8_t *out0,
                              uint8_t *out1,
                              uint8_t *out2,
                              uint8_t *out3,
                              size_t nblocks,
                              keccakx4_state *state);

/* Write outlen bytes to each of the four outputs. The state keeps no byte
 * position, so bytes of a partially used final block are not carried over
 * to a later call. */
#define shake256x4_squeeze FIPS202X4_NAMESPACE(shake256x4_squeeze)
void shake256x4_squeeze(uint8_t *out0,
                        uint8_t *out1,
                        uint8_t *out2,
                        uint8_t *out3,
                        size_t outlen,
                        keccakx4_state *state);

/* One-shot four-way SHAKE128: outlen output bytes per instance from
 * inlen input bytes per instance. */
#define shake128x4 FIPS202X4_NAMESPACE(shake128x4)
void shake128x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);

/* One-shot four-way SHAKE256: outlen output bytes per instance from
 * inlen input bytes per instance. */
#define shake256x4 FIPS202X4_NAMESPACE(shake256x4)
void shake256x4(uint8_t *out0,
                uint8_t *out1,
                uint8_t *out2,
                uint8_t *out3,
                size_t outlen,
                const uint8_t *in0,
                const uint8_t *in1,
                const uint8_t *in2,
                const uint8_t *in3,
                size_t inlen);

#endif
fips202x4.c
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
#include <string.h>
#include "fips202.h"
#include "fips202x4.h"
/* Use implementation from the Keccak Code Package */
/* The four-way permutation is not defined in this file. The local name
 * KeccakF1600_StatePermute4x is an alias for the external symbol
 * KeccakP1600times4_PermuteAll_24rounds, which has to be supplied at link
 * time. It is called here with a pointer to the 25-lane __m256i state. */
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
extern void KeccakF1600_StatePermute4x(__m256i* s);
/*************************************************
* Name:        keccakx4_absorb_once
*
* Description: Four-way absorb step of Keccak; non-incremental. Clears the
*              state, absorbs four inputs of identical length in parallel
*              (instance k lives in 64-bit element k of every lane) and
*              applies the padding, so the state is ready for squeezing.
*              Never reads beyond in0..in3[inlen - 1].
*
* Arguments:   - __m256i s[25]:      output state (overwritten)
*              - unsigned int r:     rate in bytes (e.g. SHAKE128_RATE)
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen:       length of EACH input in bytes
*              - uint8_t p:          domain-separation/padding byte
**************************************************/
static void keccakx4_absorb_once(__m256i s[25],
                                 unsigned int r,
                                 const uint8_t* in0,
                                 const uint8_t* in1,
                                 const uint8_t* in2,
                                 const uint8_t* in3,
                                 size_t inlen,
                                 uint8_t p)
{
    size_t i;
    uint64_t pos = 0;
    __m256i t, idx;

    for (i = 0; i < 25; ++i)
        s[i] = _mm256_setzero_si256();

    /* Gather trick: the four input addresses serve as the 64-bit "indices"
     * and the running byte offset serves as the "base", so one gather
     * fetches the next 8 bytes of all four inputs. */
    idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);

    /* Full rate-sized blocks. */
    while (inlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            t = _mm256_i64gather_epi64((long long*)pos, idx, 1);
            s[i] = _mm256_xor_si256(s[i], t);
            pos += 8;
        }
        inlen -= r;
        KeccakF1600_StatePermute4x(s);
    }

    /* Remaining complete 64-bit words of the last, partial block. */
    for (i = 0; i < inlen / 8; ++i) {
        t = _mm256_i64gather_epi64((long long*)pos, idx, 1);
        s[i] = _mm256_xor_si256(s[i], t);
        pos += 8;
    }
    inlen -= 8 * i;

    /* Trailing 1..7 bytes. A gather would load a full 8 bytes per input and
     * thereby read past the end of each buffer, so copy exactly inlen bytes
     * into zeroed words instead. The target is little-endian x86, hence the
     * copied bytes land in the low-order positions, which is what the
     * former "gather and mask" produced. */
    if (inlen) {
        uint64_t w[4] = { 0, 0, 0, 0 };
        memcpy(&w[0], in0 + pos, inlen);
        memcpy(&w[1], in1 + pos, inlen);
        memcpy(&w[2], in2 + pos, inlen);
        memcpy(&w[3], in3 + pos, inlen);
        t = _mm256_set_epi64x((long long)w[3], (long long)w[2], (long long)w[1], (long long)w[0]);
        s[i] = _mm256_xor_si256(s[i], t);
    }

    /* Padding: byte p right after the message, top bit of the last rate byte. */
    t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen);
    s[i] = _mm256_xor_si256(s[i], t);
    t = _mm256_set1_epi64x(1ULL << 63);
    s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}
/*************************************************
* Name:        keccakx4_squeezeblocks
*
* Description: Four-way squeeze of whole blocks. For every block the state
*              is permuted first, then the first r/8 lanes are split into
*              their four 64-bit elements and element k is stored to
*              output k.
*
* Arguments:   - uint8_t *out0..out3: the four output buffers
*                                     (nblocks * r bytes each)
*              - size_t nblocks:      number of blocks per output
*              - unsigned int r:      rate in bytes
*              - __m256i s[25]:       input/output state
**************************************************/
static void keccakx4_squeezeblocks(uint8_t* out0,
                                   uint8_t* out1,
                                   uint8_t* out2,
                                   uint8_t* out3,
                                   size_t nblocks,
                                   unsigned int r,
                                   __m256i s[25])
{
    const unsigned int lanes = r / 8;
    unsigned int lane;
    size_t blk;

    for (blk = 0; blk < nblocks; ++blk) {
        KeccakF1600_StatePermute4x(s);

        for (lane = 0; lane < lanes; ++lane) {
            const unsigned int off = 8 * lane;
            /* lower 128 bits: instances 0 and 1; upper 128 bits: 2 and 3 */
            const __m128d lo = _mm_castsi128_pd(_mm256_castsi256_si128(s[lane]));
            const __m128d hi = _mm_castsi128_pd(_mm256_extracti128_si256(s[lane], 1));

            _mm_storel_pd((double*)&out0[off], lo);
            _mm_storeh_pd((double*)&out1[off], lo);
            _mm_storel_pd((double*)&out2[off], hi);
            _mm_storeh_pd((double*)&out3[off], hi);
        }

        out0 += r;
        out1 += r;
        out2 += r;
        out3 += r;
    }
}
/*************************************************
* Name:        shake128x4_absorb_once
*
* Description: Four-way SHAKE128 absorb; non-incremental. Forwards to
*              keccakx4_absorb_once with rate SHAKE128_RATE and byte 0x1F.
*
* Arguments:   - keccakx4_state *state:   output state (overwritten)
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen:            length of EACH input in bytes
**************************************************/
void shake128x4_absorb_once(keccakx4_state* state,
                            const uint8_t* in0,
                            const uint8_t* in1,
                            const uint8_t* in2,
                            const uint8_t* in3,
                            size_t inlen)
{
    keccakx4_absorb_once(state->s,
                         SHAKE128_RATE,
                         in0, in1, in2, in3,
                         inlen,
                         0x1F);
}
/*************************************************
* Name:        shake128x4_squeezeblocks
*
* Description: Four-way SHAKE128 squeeze of whole blocks. Forwards to
*              keccakx4_squeezeblocks with rate SHAKE128_RATE.
*
* Arguments:   - uint8_t *out0..out3:   the four output buffers
*                                       (nblocks * SHAKE128_RATE bytes each)
*              - size_t nblocks:        number of blocks per output
*              - keccakx4_state *state: input/output state
**************************************************/
void shake128x4_squeezeblocks(uint8_t* out0,
                              uint8_t* out1,
                              uint8_t* out2,
                              uint8_t* out3,
                              size_t nblocks,
                              keccakx4_state* state)
{
    keccakx4_squeezeblocks(out0, out1, out2, out3,
                           nblocks,
                           SHAKE128_RATE,
                           state->s);
}
/*************************************************
* Name:        shake128x4_squeeze
*
* Description: Four-way SHAKE128 squeeze of an arbitrary number of bytes.
*              Whole blocks go straight to the outputs; a trailing partial
*              block is squeezed into a scratch buffer and only its first
*              bytes are copied out. The unused rest of that block is
*              dropped, so a later call starts with a new permutation.
*
* Arguments:   - uint8_t *out0..out3:   the four output buffers
*                                       (outlen bytes each)
*              - size_t outlen:         number of bytes per output
*              - keccakx4_state *state: input/output state
**************************************************/
void shake128x4_squeeze(uint8_t* out0,
                        uint8_t* out1,
                        uint8_t* out2,
                        uint8_t* out3,
                        size_t outlen,
                        keccakx4_state* state)
{
    uint8_t tail[4][SHAKE128_RATE];
    const size_t full = outlen / SHAKE128_RATE;
    const size_t done = full * SHAKE128_RATE;
    const size_t rest = outlen - done;
    size_t k;

    keccakx4_squeezeblocks(out0, out1, out2, out3, full, SHAKE128_RATE, state->s);
    if (rest == 0)
        return;

    /* One extra block into scratch space, then copy the part that is needed. */
    keccakx4_squeezeblocks(tail[0], tail[1], tail[2], tail[3], 1, SHAKE128_RATE, state->s);
    for (k = 0; k < rest; ++k) {
        out0[done + k] = tail[0][k];
        out1[done + k] = tail[1][k];
        out2[done + k] = tail[2][k];
        out3[done + k] = tail[3][k];
    }
}
/*************************************************
* Name:        shake256x4_absorb_once
*
* Description: Four-way SHAKE256 absorb; non-incremental. Forwards to
*              keccakx4_absorb_once with rate SHAKE256_RATE and byte 0x1F.
*
* Arguments:   - keccakx4_state *state:   output state (overwritten)
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen:            length of EACH input in bytes
**************************************************/
void shake256x4_absorb_once(keccakx4_state* state,
                            const uint8_t* in0,
                            const uint8_t* in1,
                            const uint8_t* in2,
                            const uint8_t* in3,
                            size_t inlen)
{
    keccakx4_absorb_once(state->s,
                         SHAKE256_RATE,
                         in0, in1, in2, in3,
                         inlen,
                         0x1F);
}
/*************************************************
* Name:        shake256x4_squeezeblocks
*
* Description: Four-way SHAKE256 squeeze of whole blocks. Forwards to
*              keccakx4_squeezeblocks with rate SHAKE256_RATE.
*
* Arguments:   - uint8_t *out0..out3:   the four output buffers
*                                       (nblocks * SHAKE256_RATE bytes each)
*              - size_t nblocks:        number of blocks per output
*              - keccakx4_state *state: input/output state
**************************************************/
void shake256x4_squeezeblocks(uint8_t* out0,
                              uint8_t* out1,
                              uint8_t* out2,
                              uint8_t* out3,
                              size_t nblocks,
                              keccakx4_state* state)
{
    keccakx4_squeezeblocks(out0, out1, out2, out3,
                           nblocks,
                           SHAKE256_RATE,
                           state->s);
}
/*************************************************
* Name:        shake256x4_squeeze
*
* Description: Four-way SHAKE256 squeeze of an arbitrary number of bytes.
*              Whole blocks go straight to the outputs; a trailing partial
*              block is squeezed into a scratch buffer and only its first
*              bytes are copied out. The unused rest of that block is
*              dropped, so a later call starts with a new permutation.
*
* Arguments:   - uint8_t *out0..out3:   the four output buffers
*                                       (outlen bytes each)
*              - size_t outlen:         number of bytes per output
*              - keccakx4_state *state: input/output state
**************************************************/
void shake256x4_squeeze(uint8_t* out0,
                        uint8_t* out1,
                        uint8_t* out2,
                        uint8_t* out3,
                        size_t outlen,
                        keccakx4_state* state)
{
    uint8_t tail[4][SHAKE256_RATE];
    const size_t full = outlen / SHAKE256_RATE;
    const size_t done = full * SHAKE256_RATE;
    const size_t rest = outlen - done;
    size_t k;

    keccakx4_squeezeblocks(out0, out1, out2, out3, full, SHAKE256_RATE, state->s);
    if (rest == 0)
        return;

    /* One extra block into scratch space, then copy the part that is needed. */
    keccakx4_squeezeblocks(tail[0], tail[1], tail[2], tail[3], 1, SHAKE256_RATE, state->s);
    for (k = 0; k < rest; ++k) {
        out0[done + k] = tail[0][k];
        out1[done + k] = tail[1][k];
        out2[done + k] = tail[2][k];
        out3[done + k] = tail[3][k];
    }
}
/*************************************************
* Name:        shake128x4
*
* Description: One-shot four-way SHAKE128: absorbs four inputs of equal
*              length and writes outlen bytes to each of the four outputs.
*              The output phase (whole blocks first, then the leading
*              bytes of one extra block) is delegated to
*              shake128x4_squeeze.
*
* Arguments:   - uint8_t *out0..out3:     the four output buffers
*                                         (outlen bytes each)
*              - size_t outlen:           requested output length per
*                                         instance in bytes
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen:            length of EACH input in bytes
**************************************************/
void shake128x4(uint8_t* out0,
                uint8_t* out1,
                uint8_t* out2,
                uint8_t* out3,
                size_t outlen,
                const uint8_t* in0,
                const uint8_t* in1,
                const uint8_t* in2,
                const uint8_t* in3,
                size_t inlen)
{
    keccakx4_state state;

    shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    shake128x4_squeeze(out0, out1, out2, out3, outlen, &state);
}
/*************************************************
* Name:        shake256x4
*
* Description: One-shot four-way SHAKE256: absorbs four inputs of equal
*              length and writes outlen bytes to each of the four outputs.
*              The output phase (whole blocks first, then the leading
*              bytes of one extra block) is delegated to
*              shake256x4_squeeze.
*
* Arguments:   - uint8_t *out0..out3:     the four output buffers
*                                         (outlen bytes each)
*              - size_t outlen:           requested output length per
*                                         instance in bytes
*              - const uint8_t *in0..in3: the four inputs
*              - size_t inlen:            length of EACH input in bytes
**************************************************/
void shake256x4(uint8_t* out0,
                uint8_t* out1,
                uint8_t* out2,
                uint8_t* out3,
                size_t outlen,
                const uint8_t* in0,
                const uint8_t* in1,
                const uint8_t* in2,
                const uint8_t* in3,
                size_t inlen)
{
    keccakx4_state state;

    shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
    shake256x4_squeeze(out0, out1, out2, out3, outlen, &state);
}
风语者!平时喜欢研究各种技术,目前在从事后端开发工作,热爱生活、热爱工作。