CRC Calculation Of A Mostly Static Data Stream
Below is example C code for an alternative approach for CRC(A0). Rather than working with a matrix, a CRC can be cycled forward n bits by muliplying (CRC · ((2^n)%POLY)%POLY . So the repeated squaring is performed on an integer rather than a matrix. If n is constant, then (2^n)%POLY can be pre-computed.
/* crcpad.c - crc - data has a large number of trailing zeroes */
#include <stdio.h>
#include <stdlib.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
#define POLY (0x04c11db7u)
static uint32_t crctbl[256];
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLY);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
uint32_t pd = 0;
uint32_t i;
for(i = 0; i < 32; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>31))&POLY);
pd ^= (0-(b>>31))&a;
b <<= 1;
}
return pd;
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
GenTbl(); /* generate crc table */
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}
Example C code using intrinsic for carryless multiply, pclmulqdq == _mm_clmulepi64_si128:
/* crcpadm.c - crc - data has a large number of trailing zeroes */
/* pclmulqdq intrinsic version */
#include <stdio.h>
#include <stdlib.h>
#include <intrin.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define POLY (0x104c11db7ull)
#define POLYM ( 0x04c11db7u)
static uint32_t crctbl[256];
static __m128i poly; /* poly */
static __m128i invpoly; /* 2^64 / POLY */
void GenMPoly(void) /* generate __m12i8 poly info */
{
uint64_t N = 0x100000000ull;
uint64_t Q = 0;
for(size_t i = 0; i < 33; i++){
Q <<= 1;
if(N&0x100000000ull){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
poly.m128i_u64[0] = POLY;
invpoly.m128i_u64[0] = Q;
}
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLYM);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0u;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo crc */
uint32_t MpyModCrc(uint32_t a, uint32_t b) /* (a*b)%crc */
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* p[0] = a*b */
mt = _mm_clmulepi64_si128(mp, invpoly, 0x00); /* t[1] = (p[0]*((2^64)/POLY))>>64 */
mt = _mm_clmulepi64_si128(mt, poly, 0x01); /* t[0] = t[1]*POLY */
return mp.m128i_u32[0] ^ mt.m128i_u32[0]; /* ret = p[0] ^ t[0] */
}
/* exponentiate by repeated squaring modulo crc */
uint32_t PowModCrc(uint32_t p) /* pow(2,p)%crc */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = 0x2u; /* current square */
while(p){
if(p&1)
prd = MpyModCrc(prd, sqr);
sqr = MpyModCrc(sqr, sqr);
p >>= 1;
}
return prd;
}
/* # data bytes */
#define DAT ( 32)
/* # zero bytes */
#define PAD (992)
/* DATA+PAD */
#define CNT (1024)
int main()
{
uint32_t pmc;
uint32_t crc;
uint32_t crf;
uint32_t i;
uint8_t *msg = malloc(CNT);
GenMPoly(); /* generate __m128 polys */
GenTbl(); /* generate crc table */
for(i = 0; i < DAT; i++) /* generate msg */
msg[i] = (uint8_t)rand();
for( ; i < CNT; i++)
msg[i] = 0;
crc = GenCrc(msg, CNT); /* generate crc normally */
crf = GenCrc(msg, DAT); /* generate crc for data */
pmc = PowModCrc(PAD*8); /* pmc = pow(2,PAD*8)%crc */
crf = MpyModCrc(crf, pmc); /* crf = (crf*pmc)%crc */
printf("%08x %08x\n", crc, crf);
free(msg);
return 0;
}
Yes. You can see how in zlib's crc32_combine()
. If you have two sequences A and B, then the pure CRC of AB is the exclusive-or of the CRC of A0 and the CRC of 0B, where the 0's represent a series of zero bytes with the length of the corresponding sequence, i.e. B and A respectively.
For your application, you can pre-compute a single operator that applies 1020 zeros to the CRC of your first four bytes very rapidly. Then you can exclusive-or that with the pre-computed CRC of the 1020 bytes.
Update:
Here is a post of mine from 2008 with a detailed explanation that @ArtemB discovered (that I had forgotten about):
crc32_combine()
in zlib is based on two key tricks. For what follows,
we set aside the fact that the standard 32-bit CRC is pre and post-
conditioned. We can deal with that later. Assume for now a CRC that
has no such conditioning, and so starts with the register filled with
zeros.
Trick #1: CRCs are linear. So if you have stream X and stream Y of the same length and exclusive-or the two streams bit-by-bit to get Z, i.e. Z = X ^ Y (using the C notation for exclusive-or), then CRC(Z) = CRC(X) ^ CRC(Y). For the problem at hand we have two streams A and B of differing length that we want to concatenate into stream Z. What we have available are CRC(A) and CRC(B). What we want is a quick way to compute CRC(Z). The trick is to construct X = A concatenated with length(B) zero bits, and Y = length(A) zero bits concatenated with B. So if we represent concatenation simply by juxtaposition of the symbols, X = A0, Y = 0B, then X^Y = Z = AB. Then we have CRC(Z) = CRC(A0) ^ CRC(0B).
Now we need to know CRC(A0) and CRC(0B). CRC(0B) is easy. If we feed a bunch of zeros to the CRC machine starting with zero, the register is still filled with zeros. So it's as if we did nothing at all. Therefore CRC(0B) = CRC(B).
CRC(A0) requires more work however. Taking a non-zero CRC and feeding zeros to the CRC machine doesn't leave it alone. Every zero changes the register contents. So to get CRC(A0), we need to set the register to CRC(A), and then run length(B) zeros through it. Then we can exclusive-or the result of that with CRC(B) = CRC(0B), and we get what we want, which is CRC(Z) = CRC(AB). Voila!
Well, actually the voila is premature. I wasn't at all satisfied with that answer. I didn't want a calculation that took a time proportional to the length of B. That wouldn't save any time compared to simply setting the register to CRC(A) and running the B stream through. I figured there must be a faster way to compute the effect of feeding n zeros into the CRC machine (where n = length(B)). So that leads us to:
Trick #2: The CRC machine is a linear state machine. If we know the linear transformation that occurs when we feed a zero to the machine, then we can do operations on that transformation to more efficiently find the transformation that results from feeding n zeros into the machine.
The transformation of feeding a single zero bit into the CRC machine is completely represented by a 32x32 binary matrix. To apply the transformation we multiply the matrix by the register, taking the register as a 32 bit column vector. For the matrix multiplication in binary (i.e. over the Galois Field of 2), the role of multiplication is played by and'ing, and the role of addition is played by exclusive- or'ing.
There are a few different ways to construct the magic matrix that represents the transformation caused by feeding the CRC machine a single zero bit. One way is to observe that each column of the matrix is what you get when your register starts off with a single one in it. So the first column is what you get when the register is 100... and then feed a zero, the second column comes from starting with 0100..., etc. (Those are referred to as basis vectors.) You can see this simply by doing the matrix multiplication with those vectors. The matrix multiplication selects the column of the matrix corresponding to the location of the single one.
Now for the trick. Once we have the magic matrix, we can set aside the initial register contents for a while, and instead use the transformation for one zero to compute the transformation for n zeros. We could just multiply n copies of the matrix together to get the matrix for n zeros. But that's even worse than just running the n zeros through the machine. However there's an easy way to avoid most of those matrix multiplications to get the same answer. Suppose we want to know the transformation for running eight zero bits, or one byte through. Let's call the magic matrix that represents running one zero through: M. We could do seven matrix multiplications to get R = MxMxMxMxMxMxMxM. Instead, let's start with MxM and call that P. Then PxP is MxMxMxM. Let's call that Q. Then QxQ is R. So now we've reduced the seven multiplications to three. P = MxM, Q = PxP, and R = QxQ.
Now I'm sure you get the idea for an arbitrary n number of zeros. We can very rapidly generate transformation matrices Mk, where Mk is the transformation for running 2k zeros through. (In the paragraph above M3 is R.) We can make M1 through Mk with only k matrix multiplications, starting with M0 = M. k only has to be as large as the number of bits in the binary representation of n. We can then pick those matrices where there are ones in the binary representation of n and multiply them together to get the transformation of running n zeros through the CRC machine. So if n = 13, compute M0 x M2 x M3.
If j is the number of one's in the binary representation of n, then we just have j - 1 more matrix multiplications. So we have a total of k + j - 1 matrix multiplications, where j <= k = floor(logbase2(n)).
Now we take our rapidly constructed matrix for n zeros, and multiply that by CRC(A) to get CRC(A0). We can compute CRC(A0) in O(log(n)) time, instead of O(n) time. We exclusive or that with CRC(B) and Voila! (really this time), we have CRC(Z).
That's what zlib's crc32_combine()
does.
I will leave it as an exercise for the reader as to how to deal with
the pre and post conditioning of the CRC register. You just need to
apply the linearity observations above. Hint: You don't need to know
length(A). In fact crc32_combine()
only takes three arguments:
CRC(A), CRC(B), and length(B) (in bytes).