src/crc32c/sse42.c - RealtimeRoboticsGroup/test - Gitiles

 /*
  * crc32c.c -- compute CRC-32C using the Intel crc32 instruction
  * Copyright (C) 2013 Mark Adler
  * Version 1.1  1 Aug 2013  Mark Adler
  *
  * Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
  * CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
  * version is provided as a fall-back, as well as for speed comparisons.
  *
  * (Modified for the use in RAWRTC. Software fallback has been stripped.)
  *
  *
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the author be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  *
  * Mark Adler
  * madler@alumni.caltech.edu
  */
 #include "sse42.h"
 #include <re.h>

 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
 #define POLY 0x82f63b78

 /* Multiply a matrix times a vector over the Galois field of two elements,
    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
    least as many entries as the power of two for most significant one bit in
    vec. */
 static inline uint32_t gf2_matrix_times(uint32_t* mat, uint32_t vec) {
     uint32_t sum;

     sum = 0;
     while (vec) {
         if (vec & 1) {
             sum ^= *mat;
         }
         vec >>= 1;
         mat++;
     }
     return sum;
 }

 /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
    rows. */
 static inline void gf2_matrix_square(uint32_t* square, uint32_t* mat) {
     int n;

     for (n = 0; n < 32; n++) {
         square[n] = gf2_matrix_times(mat, mat[n]);
     }
 }

 /* Construct an operator to apply len zeros to a crc.  len must be a power of
    two.  If len is not a power of two, then the result is the same as for the
    largest power of two less than len.  The result for len == 0 is the same as
    for len == 1.  A version of this routine could be easily written for any
    len, but that is not needed for this application. */
 static void crc32c_zeros_op(uint32_t* even, size_t len) {
     int n;
     uint32_t row;
     uint32_t odd[32]; /* odd-power-of-two zeros operator */

     /* put operator for one zero bit in odd */
     odd[0] = POLY; /* CRC-32C polynomial */
     row = 1;
     for (n = 1; n < 32; n++) {
         odd[n] = row;
         row <<= 1;
     }

     /* put operator for two zero bits in even */
     gf2_matrix_square(even, odd);

     /* put operator for four zero bits in odd */
     gf2_matrix_square(odd, even);

     /* first square will put the operator for one zero byte (eight zero bits),
        in even -- next square puts operator for two zero bytes in odd, and so
        on, until len has been rotated down to zero */
     do {
         gf2_matrix_square(even, odd);
         len >>= 1;
         if (len == 0) {
             return;
         }
         gf2_matrix_square(odd, even);
         len >>= 1;
     } while (len);

     /* answer ended up in odd -- copy to even */
     for (n = 0; n < 32; n++) {
         even[n] = odd[n];
     }
 }

 /* Take a length and build four lookup tables for applying the zeros operator
    for that length, byte-by-byte on the operand. */
 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
     uint32_t n;
     uint32_t op[32];

     crc32c_zeros_op(op, len);
     for (n = 0; n < 256; n++) {
         zeros[0][n] = gf2_matrix_times(op, n);
         zeros[1][n] = gf2_matrix_times(op, n << 8);
         zeros[2][n] = gf2_matrix_times(op, n << 16);
         zeros[3][n] = gf2_matrix_times(op, n << 24);
     }
 }

 /* Apply the zeros operator table to crc. */
 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
     // clang-format off
     return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
            zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
     // clang-format on
 }

 /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
    both be powers of two.  The associated string constants must be set
    accordingly, for use in constructing the assembler instructions. */
 #define LONG 8192
 #define LONGx1 "8192"
 #define LONGx2 "16384"
 #define SHORT 256
 #define SHORTx1 "256"
 #define SHORTx2 "512"

 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
 static uint32_t crc32c_long[4][256];
 static uint32_t crc32c_short[4][256];

 /* Compute CRC-32C using the Intel hardware instruction. */
 static uint32_t crc32c_hw(uint32_t crc, const void* buf, size_t len) {
     const unsigned char* next = buf;
     const unsigned char* end;
     uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */

     /* pre-process the crc */
     crc0 = crc ^ 0xffffffff;

     /* compute the crc for up to seven leading bytes to bring the data pointer
        to an eight-byte boundary */
     while (len && ((uintptr_t) next & 7) != 0) {
         __asm__("crc32b\t"
                 "(%1), %0"
                 : "=r"(crc0)
                 : "r"(next), "0"(crc0));
         next++;
         len--;
     }

     /* compute the crc on sets of LONG*3 bytes, executing three independent crc
        instructions, each on LONG bytes -- this is optimized for the Nehalem,
        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
        throughput of one crc per cycle, but a latency of three cycles */
     while (len >= LONG * 3) {
         crc1 = 0;
         crc2 = 0;
         end = next + LONG;
         do {
             __asm__("crc32q\t"
                     "(%3), %0\n\t"
                     "crc32q\t" LONGx1 "(%3), %1\n\t"
                     "crc32q\t" LONGx2 "(%3), %2"
                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
             next += 8;
         } while (next < end);
         crc0 = crc32c_shift(crc32c_long, (uint32_t) crc0) ^ crc1;
         crc0 = crc32c_shift(crc32c_long, (uint32_t) crc0) ^ crc2;
         next += LONG * 2;
         len -= LONG * 3;
     }

     /* do the same thing, but now on SHORT*3 blocks for the remaining data less
        than a LONG*3 block */
     while (len >= SHORT * 3) {
         crc1 = 0;
         crc2 = 0;
         end = next + SHORT;
         do {
             __asm__("crc32q\t"
                     "(%3), %0\n\t"
                     "crc32q\t" SHORTx1 "(%3), %1\n\t"
                     "crc32q\t" SHORTx2 "(%3), %2"
                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
             next += 8;
         } while (next < end);
         crc0 = crc32c_shift(crc32c_short, (uint32_t) crc0) ^ crc1;
         crc0 = crc32c_shift(crc32c_short, (uint32_t) crc0) ^ crc2;
         next += SHORT * 2;
         len -= SHORT * 3;
     }

     /* compute the crc on the remaining eight-byte units less than a SHORT*3
        block */
     end = next + (len - (len & 7));
     while (next < end) {
         __asm__("crc32q\t"
                 "(%1), %0"
                 : "=r"(crc0)
                 : "r"(next), "0"(crc0));
         next += 8;
     }
     len &= 7;

     /* compute the crc for up to seven trailing bytes */
     while (len) {
         __asm__("crc32b\t"
                 "(%1), %0"
                 : "=r"(crc0)
                 : "r"(next), "0"(crc0));
         next++;
         len--;
     }

     /* return a post-processed crc */
     return (uint32_t) crc0 ^ 0xffffffff;
 }

 /*
  * Probe for SSE 4.2 support.
  */
 bool rawrtc_crc32c_sse42_supported(void) {
     uint32_t eax, ecx;
     eax = 1;
     __asm__("cpuid" : "=c"(ecx) : "a"(eax) : "%ebx", "%edx");
     return ((ecx >> 20) & 1) != 0;
 }

 /*
  * Initialize tables for shifting CRCs.
  */
 void rawrtc_crc32c_init_sse42(void) {
     crc32c_zeros(crc32c_long, LONG);
     crc32c_zeros(crc32c_short, SHORT);
 }

 /*
  * Compute CRC-32C using hardware instructions.
  */
 uint32_t rawrtc_crc32c_sse42(void const* buffer, size_t length) {
     return crc32c_hw(0x00000000, buffer, length);
 }
	/*
	* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
	* Copyright (C) 2013 Mark Adler
	* Version 1.1 1 Aug 2013 Mark Adler
	*
	* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
	* CRC-32C, not the CRC-32 used by Ethernet and zip, gzip, etc. A software
	* version is provided as a fall-back, as well as for speed comparisons.
	*
	* (Modified for the use in RAWRTC. Software fallback has been stripped.)
	*
	*
	*
	* This software is provided 'as-is', without any express or implied
	* warranty. In no event will the author be held liable for any damages
	* arising from the use of this software.
	*
	* Permission is granted to anyone to use this software for any purpose,
	* including commercial applications, and to alter it and redistribute it
	* freely, subject to the following restrictions:
	*
	* 1. The origin of this software must not be misrepresented; you must not
	* claim that you wrote the original software. If you use this software
	* in a product, an acknowledgment in the product documentation would be
	* appreciated but is not required.
	* 2. Altered source versions must be plainly marked as such, and must not be
	* misrepresented as being the original software.
	* 3. This notice may not be removed or altered from any source distribution.
	*
	* Mark Adler
	* madler@alumni.caltech.edu
	*/
	#include "sse42.h"
	#include <re.h>

	/* CRC-32C (iSCSI) polynomial in reversed bit order. */
	#define POLY 0x82f63b78

	/* Multiply a matrix times a vector over the Galois field of two elements,
	GF(2). Each element is a bit in an unsigned integer. mat must have at
	least as many entries as the power of two for most significant one bit in
	vec. */
	static inline uint32_t gf2_matrix_times(uint32_t* mat, uint32_t vec) {
	uint32_t sum;

	sum = 0;
	while (vec) {
	if (vec & 1) {
	sum ^= *mat;
	}
	vec >>= 1;
	mat++;
	}
	return sum;
	}

	/* Multiply a matrix by itself over GF(2). Both mat and square must have 32
	rows. */
	static inline void gf2_matrix_square(uint32_t* square, uint32_t* mat) {
	int n;

	for (n = 0; n < 32; n++) {
	square[n] = gf2_matrix_times(mat, mat[n]);
	}
	}

	/* Construct an operator to apply len zeros to a crc. len must be a power of
	two. If len is not a power of two, then the result is the same as for the
	largest power of two less than len. The result for len == 0 is the same as
	for len == 1. A version of this routine could be easily written for any
	len, but that is not needed for this application. */
	static void crc32c_zeros_op(uint32_t* even, size_t len) {
	int n;
	uint32_t row;
	uint32_t odd[32]; /* odd-power-of-two zeros operator */

	/* put operator for one zero bit in odd */
	odd[0] = POLY; /* CRC-32C polynomial */
	row = 1;
	for (n = 1; n < 32; n++) {
	odd[n] = row;
	row <<= 1;
	}

	/* put operator for two zero bits in even */
	gf2_matrix_square(even, odd);

	/* put operator for four zero bits in odd */
	gf2_matrix_square(odd, even);

	/* first square will put the operator for one zero byte (eight zero bits),
	in even -- next square puts operator for two zero bytes in odd, and so
	on, until len has been rotated down to zero */
	do {
	gf2_matrix_square(even, odd);
	len >>= 1;
	if (len == 0) {
	return;
	}
	gf2_matrix_square(odd, even);
	len >>= 1;
	} while (len);

	/* answer ended up in odd -- copy to even */
	for (n = 0; n < 32; n++) {
	even[n] = odd[n];
	}
	}

	/* Take a length and build four lookup tables for applying the zeros operator
	for that length, byte-by-byte on the operand. */
	static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
	uint32_t n;
	uint32_t op[32];

	crc32c_zeros_op(op, len);
	for (n = 0; n < 256; n++) {
	zeros[0][n] = gf2_matrix_times(op, n);
	zeros[1][n] = gf2_matrix_times(op, n << 8);
	zeros[2][n] = gf2_matrix_times(op, n << 16);
	zeros[3][n] = gf2_matrix_times(op, n << 24);
	}
	}

	/* Apply the zeros operator table to crc. */
	static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
	// clang-format off
	return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
	zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
	// clang-format on
	}

	/* Block sizes for three-way parallel crc computation. LONG and SHORT must
	both be powers of two. The associated string constants must be set
	accordingly, for use in constructing the assembler instructions. */
	#define LONG 8192
	#define LONGx1 "8192"
	#define LONGx2 "16384"
	#define SHORT 256
	#define SHORTx1 "256"
	#define SHORTx2 "512"

	/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
	static uint32_t crc32c_long[4][256];
	static uint32_t crc32c_short[4][256];

	/* Compute CRC-32C using the Intel hardware instruction. */
	static uint32_t crc32c_hw(uint32_t crc, const void* buf, size_t len) {
	const unsigned char* next = buf;
	const unsigned char* end;
	uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */

	/* pre-process the crc */
	crc0 = crc ^ 0xffffffff;

	/* compute the crc for up to seven leading bytes to bring the data pointer
	to an eight-byte boundary */
	while (len && ((uintptr_t) next & 7) != 0) {
	__asm__("crc32b\t"
	"(%1), %0"
	: "=r"(crc0)
	: "r"(next), "0"(crc0));
	next++;
	len--;
	}

	/* compute the crc on sets of LONG*3 bytes, executing three independent crc
	instructions, each on LONG bytes -- this is optimized for the Nehalem,
	Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
	throughput of one crc per cycle, but a latency of three cycles */
	while (len >= LONG * 3) {
	crc1 = 0;
	crc2 = 0;
	end = next + LONG;
	do {
	__asm__("crc32q\t"
	"(%3), %0\n\t"
	"crc32q\t" LONGx1 "(%3), %1\n\t"
	"crc32q\t" LONGx2 "(%3), %2"
	: "=r"(crc0), "=r"(crc1), "=r"(crc2)
	: "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
	next += 8;
	} while (next < end);
	crc0 = crc32c_shift(crc32c_long, (uint32_t) crc0) ^ crc1;
	crc0 = crc32c_shift(crc32c_long, (uint32_t) crc0) ^ crc2;
	next += LONG * 2;
	len -= LONG * 3;
	}

	/* do the same thing, but now on SHORT*3 blocks for the remaining data less
	than a LONG3 block /
	while (len >= SHORT * 3) {
	crc1 = 0;
	crc2 = 0;
	end = next + SHORT;
	do {
	__asm__("crc32q\t"
	"(%3), %0\n\t"
	"crc32q\t" SHORTx1 "(%3), %1\n\t"
	"crc32q\t" SHORTx2 "(%3), %2"
	: "=r"(crc0), "=r"(crc1), "=r"(crc2)
	: "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
	next += 8;
	} while (next < end);
	crc0 = crc32c_shift(crc32c_short, (uint32_t) crc0) ^ crc1;
	crc0 = crc32c_shift(crc32c_short, (uint32_t) crc0) ^ crc2;
	next += SHORT * 2;
	len -= SHORT * 3;
	}

	/* compute the crc on the remaining eight-byte units less than a SHORT*3
	block */
	end = next + (len - (len & 7));
	while (next < end) {
	__asm__("crc32q\t"
	"(%1), %0"
	: "=r"(crc0)
	: "r"(next), "0"(crc0));
	next += 8;
	}
	len &= 7;

	/* compute the crc for up to seven trailing bytes */
	while (len) {
	__asm__("crc32b\t"
	"(%1), %0"
	: "=r"(crc0)
	: "r"(next), "0"(crc0));
	next++;
	len--;
	}

	/* return a post-processed crc */
	return (uint32_t) crc0 ^ 0xffffffff;
	}

	/*
	* Probe for SSE 4.2 support.
	*/
	bool rawrtc_crc32c_sse42_supported(void) {
	uint32_t eax, ecx;
	eax = 1;
	__asm__("cpuid" : "=c"(ecx) : "a"(eax) : "%ebx", "%edx");
	return ((ecx >> 20) & 1) != 0;
	}

	/*
	* Initialize tables for shifting CRCs.
	*/
	void rawrtc_crc32c_init_sse42(void) {
	crc32c_zeros(crc32c_long, LONG);
	crc32c_zeros(crc32c_short, SHORT);
	}

	/*
	* Compute CRC-32C using hardware instructions.
	*/
	uint32_t rawrtc_crc32c_sse42(void const* buffer, size_t length) {
	return crc32c_hw(0x00000000, buffer, length);
	}