XRootD
XrdOucCRC32C.cc
Go to the documentation of this file.
1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013, 2015 Mark Adler
3  * Version 1.3 31 Dec 2015 Mark Adler
4  */
5 
6 /*
7  This software is provided 'as-is', without any express or implied
8  warranty. In no event will the author be held liable for any damages
9  arising from the use of this software.
10 
11  Permission is granted to anyone to use this software for any purpose,
12  including commercial applications, and to alter it and redistribute it
13  freely, subject to the following restrictions:
14 
15  1. The origin of this software must not be misrepresented; you must not
16  claim that you wrote the original software. If you use this software
17  in a product, an acknowledgment in the product documentation would be
18  appreciated but is not required.
19  2. Altered source versions must be plainly marked as such, and must not be
20  misrepresented as being the original software.
21  3. This notice may not be removed or altered from any source distribution.
22 
23  Mark Adler
24  madler@alumni.caltech.edu
25  */
26 
27 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
28  CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
29  version is provided as a fall-back, as well as for speed comparisons. */
30 
31 /* Version history:
32  1.0 10 Feb 2013 First version
33  1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
34  1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
35  Load entire input into memory (test code)
36  Argument gives number of times to repeat (test code)
37  Argument < 0 forces software implementation (test code)
38  1.3 31 Dec 2015 Check for Intel architecture using compiler macro
39  Support big-endian processors in software calculation
40  Add header for external use
41  */
42 
43 /* Modification history:
44  22 Nov 2019 Rename orignal crc32.c to XrdOucCRC32.cc and crc32c.h to
45  XrdOucCRC32C.hh with corresponding change to include
46  statement herein. Add required casts to allow C++
47  compilation.
48  */
49 
50 #include <pthread.h>
51 #include "XrdOuc/XrdOucCRC32C.hh"
52 
53 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
54 #define POLY 0x82f63b78
55 
56 #ifdef __x86_64__
57 
58 /* Hardware CRC-32C for Intel and compatible processors. */
59 
60 /* Multiply a matrix times a vector over the Galois field of two elements,
61  GF(2). Each element is a bit in an unsigned integer. mat must have at
62  least as many entries as the power of two for most significant one bit in
63  vec. */
64 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
65  uint32_t sum = 0;
66  while (vec) {
67  if (vec & 1)
68  sum ^= *mat;
69  vec >>= 1;
70  mat++;
71  }
72  return sum;
73 }
74 
75 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
76  rows. */
77 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
78  for (unsigned n = 0; n < 32; n++)
79  square[n] = gf2_matrix_times(mat, mat[n]);
80 }
81 
82 /* Construct an operator to apply len zeros to a crc. len must be a power of
83  two. If len is not a power of two, then the result is the same as for the
84  largest power of two less than len. The result for len == 0 is the same as
85  for len == 1. A version of this routine could be easily written for any
86  len, but that is not needed for this application. */
87 static void crc32c_zeros_op(uint32_t *even, size_t len) {
88  uint32_t odd[32]; /* odd-power-of-two zeros operator */
89 
90  /* put operator for one zero bit in odd */
91  odd[0] = POLY; /* CRC-32C polynomial */
92  uint32_t row = 1;
93  for (unsigned n = 1; n < 32; n++) {
94  odd[n] = row;
95  row <<= 1;
96  }
97 
98  /* put operator for two zero bits in even */
99  gf2_matrix_square(even, odd);
100 
101  /* put operator for four zero bits in odd */
102  gf2_matrix_square(odd, even);
103 
104  /* first square will put the operator for one zero byte (eight zero bits),
105  in even -- next square puts operator for two zero bytes in odd, and so
106  on, until len has been rotated down to zero */
107  do {
108  gf2_matrix_square(even, odd);
109  len >>= 1;
110  if (len == 0)
111  return;
112  gf2_matrix_square(odd, even);
113  len >>= 1;
114  } while (len);
115 
116  /* answer ended up in odd -- copy to even */
117  for (unsigned n = 0; n < 32; n++)
118  even[n] = odd[n];
119 }
120 
121 /* Take a length and build four lookup tables for applying the zeros operator
122  for that length, byte-by-byte on the operand. */
123 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
124  uint32_t op[32];
125 
126  crc32c_zeros_op(op, len);
127  for (unsigned n = 0; n < 256; n++) {
128  zeros[0][n] = gf2_matrix_times(op, n);
129  zeros[1][n] = gf2_matrix_times(op, n << 8);
130  zeros[2][n] = gf2_matrix_times(op, n << 16);
131  zeros[3][n] = gf2_matrix_times(op, n << 24);
132  }
133 }
134 
135 /* Apply the zeros operator table to crc. */
136 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
137  return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
138  zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
139 }
140 
141 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
142  both be powers of two. The associated string constants must be set
143  accordingly, for use in constructing the assembler instructions. */
144 #define LONG 8192
145 #define LONGx1 "8192"
146 #define LONGx2 "16384"
147 #define SHORT 256
148 #define SHORTx1 "256"
149 #define SHORTx2 "512"
150 
151 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
152 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
153 static uint32_t crc32c_long[4][256];
154 static uint32_t crc32c_short[4][256];
155 
156 /* Initialize tables for shifting crcs. */
157 static void crc32c_init_hw(void) {
158  crc32c_zeros(crc32c_long, LONG);
159  crc32c_zeros(crc32c_short, SHORT);
160 }
161 
162 /* Compute CRC-32C using the Intel hardware instruction. */
163 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
164  /* populate shift tables the first time through */
165  pthread_once(&crc32c_once_hw, crc32c_init_hw);
166 
167  /* pre-process the crc */
168  crc = ~crc;
169  uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
170 
171  /* compute the crc for up to seven leading bytes to bring the data pointer
172  to an eight-byte boundary */
173  unsigned char const *next = (unsigned char const *)buf;
174  while (len && ((uintptr_t)next & 7) != 0) {
175  __asm__("crc32b\t" "(%1), %0"
176  : "=r"(crc0)
177  : "r"(next), "0"(crc0));
178  next++;
179  len--;
180  }
181 
182  /* compute the crc on sets of LONG*3 bytes, executing three independent crc
183  instructions, each on LONG bytes -- this is optimized for the Nehalem,
184  Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
185  throughput of one crc per cycle, but a latency of three cycles */
186  while (len >= LONG*3) {
187  uint64_t crc1 = 0;
188  uint64_t crc2 = 0;
189  unsigned char const * const end = next + LONG;
190  do {
191  __asm__("crc32q\t" "(%3), %0\n\t"
192  "crc32q\t" LONGx1 "(%3), %1\n\t"
193  "crc32q\t" LONGx2 "(%3), %2"
194  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
195  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
196  next += 8;
197  } while (next < end);
198  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
199  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
200  next += LONG*2;
201  len -= LONG*3;
202  }
203 
204  /* do the same thing, but now on SHORT*3 blocks for the remaining data less
205  than a LONG*3 block */
206  while (len >= SHORT*3) {
207  uint64_t crc1 = 0;
208  uint64_t crc2 = 0;
209  unsigned char const * const end = next + SHORT;
210  do {
211  __asm__("crc32q\t" "(%3), %0\n\t"
212  "crc32q\t" SHORTx1 "(%3), %1\n\t"
213  "crc32q\t" SHORTx2 "(%3), %2"
214  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
215  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
216  next += 8;
217  } while (next < end);
218  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
219  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
220  next += SHORT*2;
221  len -= SHORT*3;
222  }
223 
224  /* compute the crc on the remaining eight-byte units less than a SHORT*3
225  block */
226  {
227  unsigned char const * const end = next + (len - (len & 7));
228  while (next < end) {
229  __asm__("crc32q\t" "(%1), %0"
230  : "=r"(crc0)
231  : "r"(next), "0"(crc0));
232  next += 8;
233  }
234  len &= 7;
235  }
236 
237  /* compute the crc for up to seven trailing bytes */
238  while (len) {
239  __asm__("crc32b\t" "(%1), %0"
240  : "=r"(crc0)
241  : "r"(next), "0"(crc0));
242  next++;
243  len--;
244  }
245 
246  /* return a post-processed crc */
247  return ~crc0;
248 }
249 
250 /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
251  introduced in November, 2008. This does not check for the existence of the
252  cpuid instruction itself, which was introduced on the 486SL in 1992, so this
253  will fail on earlier x86 processors. cpuid works on all Pentium and later
254  processors. */
255 #define SSE42(have) \
256  do { \
257  uint32_t eax, ecx; \
258  eax = 1; \
259  __asm__("cpuid" \
260  : "=c"(ecx) \
261  : "a"(eax) \
262  : "%ebx", "%edx"); \
263  (have) = (ecx >> 20) & 1; \
264  } while (0)
265 
266 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
267  version. Otherwise, use the software version. */
268 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
269  int sse42;
270 
271  SSE42(sse42);
272  return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
273 }
274 
275 #else /* !__x86_64__ */
276 
277 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
278  return crc32c_sw(crc, buf, len);
279 }
280 
281 #endif
282 
283 /* Construct table for software CRC-32C little-endian calculation. */
284 static pthread_once_t crc32c_once_little = PTHREAD_ONCE_INIT;
285 static uint32_t crc32c_table_little[8][256];
286 static void crc32c_init_sw_little(void) {
287  for (unsigned n = 0; n < 256; n++) {
288  uint32_t crc = n;
289  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
290  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
291  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
292  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
293  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
294  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
295  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
296  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
297  crc32c_table_little[0][n] = crc;
298  }
299  for (unsigned n = 0; n < 256; n++) {
300  uint32_t crc = crc32c_table_little[0][n];
301  for (unsigned k = 1; k < 8; k++) {
302  crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
303  crc32c_table_little[k][n] = crc;
304  }
305  }
306 }
307 
308 /* Compute a CRC-32C in software assuming a little-endian architecture,
309  constructing the required table if that hasn't already been done. */
310 uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
311  unsigned char const *next = (unsigned char const *)buf;
312 
314  crc = ~crc;
315  while (len && ((uintptr_t)next & 7) != 0) {
316  crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
317  len--;
318  }
319  if (len >= 8) {
320  uint64_t crcw = crc;
321  do {
322  crcw ^= *(uint64_t const *)next;
323  crcw = crc32c_table_little[7][crcw & 0xff] ^
324  crc32c_table_little[6][(crcw >> 8) & 0xff] ^
325  crc32c_table_little[5][(crcw >> 16) & 0xff] ^
326  crc32c_table_little[4][(crcw >> 24) & 0xff] ^
327  crc32c_table_little[3][(crcw >> 32) & 0xff] ^
328  crc32c_table_little[2][(crcw >> 40) & 0xff] ^
329  crc32c_table_little[1][(crcw >> 48) & 0xff] ^
330  crc32c_table_little[0][crcw >> 56];
331  next += 8;
332  len -= 8;
333  } while (len >= 8);
334  crc = crcw;
335  }
336  while (len) {
337  crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
338  len--;
339  }
340  return ~crc;
341 }
342 
343 /* Swap the bytes in a uint64_t. (Only for big-endian.) */
344 #if defined(__has_builtin) || (defined(__GNUC__) && \
345  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
346 # define swap __builtin_bswap64
347 #else
348 static inline uint64_t swap(uint64_t x) {
349  x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
350  x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
351  return (x << 32) | (x >> 32);
352 }
353 #endif
354 
355 /* Construct tables for software CRC-32C big-endian calculation. */
356 static pthread_once_t crc32c_once_big = PTHREAD_ONCE_INIT;
357 static uint32_t crc32c_table_big_byte[256];
358 static uint64_t crc32c_table_big[8][256];
359 static void crc32c_init_sw_big(void) {
360  for (unsigned n = 0; n < 256; n++) {
361  uint32_t crc = n;
362  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
363  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
364  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
365  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
366  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
367  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
368  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
369  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
370  crc32c_table_big_byte[n] = crc;
371  }
372  for (unsigned n = 0; n < 256; n++) {
373  uint32_t crc = crc32c_table_big_byte[n];
374  crc32c_table_big[0][n] = swap(crc);
375  for (unsigned k = 1; k < 8; k++) {
376  crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
377  crc32c_table_big[k][n] = swap(crc);
378  }
379  }
380 }
381 
382 /* Compute a CRC-32C in software assuming a big-endian architecture,
383  constructing the required tables if that hasn't already been done. */
384 uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
385  unsigned char const *next = (unsigned char const *)buf;
386 
387  pthread_once(&crc32c_once_big, crc32c_init_sw_big);
388  crc = ~crc;
389  while (len && ((uintptr_t)next & 7) != 0) {
390  crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
391  len--;
392  }
393  if (len >= 8) {
394  uint64_t crcw = swap(crc);
395  do {
396  crcw ^= *(uint64_t const *)next;
397  crcw = crc32c_table_big[0][crcw & 0xff] ^
398  crc32c_table_big[1][(crcw >> 8) & 0xff] ^
399  crc32c_table_big[2][(crcw >> 16) & 0xff] ^
400  crc32c_table_big[3][(crcw >> 24) & 0xff] ^
401  crc32c_table_big[4][(crcw >> 32) & 0xff] ^
402  crc32c_table_big[5][(crcw >> 40) & 0xff] ^
403  crc32c_table_big[6][(crcw >> 48) & 0xff] ^
404  crc32c_table_big[7][(crcw >> 56)];
405  next += 8;
406  len -= 8;
407  } while (len >= 8);
408  crc = swap(crcw);
409  }
410  while (len) {
411  crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
412  len--;
413  }
414  return ~crc;
415 }
416 
417 /* Table-driven software CRC-32C. This is about 15 times slower than using the
418  hardware instructions. Determine the endianess of the processor and proceed
419  accordingly. Ideally the endianess will be determined at compile time, in
420  which case the unused functions and tables for the other endianess will be
421  removed by the optimizer. If not, then the proper routines and tables will
422  be used, even if the endianess is changed mid-stream. (Yes, there are
423  processors that permit that -- go figure.) */
424 uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
425  static int const little = 1;
426  if (*(char const *)&little)
427  return crc32c_sw_little(crc, buf, len);
428  else
429  return crc32c_sw_big(crc, buf, len);
430 }
431 
432 #ifdef TEST
433 
434 #include <cstdio>
435 #include <cstdlib>
436 #include "load.h"
437 
438 int main(int argc, char **argv) {
439  /* interpret argument */
440  if (argc > 2) {
441  fputs("only one argument permitted\n", stderr);
442  return 1;
443  }
444  long rep = argc == 1 ? 1 : strtol(argv[1], NULL, 10);
445  if (rep == 0) {
446  fputs("usage: crc32c [[-]nnn] < data\n"
447  " where nnn is the number of times to repeat\n"
448  " negative forces the software implementation\n", stderr);
449  return 0;
450  }
451  int sw = rep < 0;
452  if (sw)
453  rep = -rep;
454 
455  /* read input */
456  void *dat = NULL;
457  size_t size, len;
458  int ret = load(stdin, 0, &dat, &size, &len);
459  if (ret) {
460  fputs("error reading from stdin\n", stderr);
461  return 1;
462  }
463 
464  /* compute crc rep times */
465  uint32_t crc;
466  do {
467  crc = sw ? crc32c_sw(0, dat, len) : crc32c(0, dat, len);
468  } while (--rep);
469 
470  /* show crc */
471  printf("0x%08x\n", crc);
472 
473  /* clean up */
474  free(dat);
475  return 0;
476 }
477 
478 #endif /* TEST */
int main(int argc, char *argv[])
Definition: XrdMain.cc:161
static pthread_once_t crc32c_once_little
uint32_t crc32c(uint32_t crc, void const *buf, size_t len)
static uint64_t swap(uint64_t x)
uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len)
uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len)
static uint64_t crc32c_table_big[8][256]
static void crc32c_init_sw_little(void)
static uint32_t crc32c_table_big_byte[256]
static void crc32c_init_sw_big(void)
#define POLY
Definition: XrdOucCRC32C.cc:54
uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len)
static uint32_t crc32c_table_little[8][256]
static pthread_once_t crc32c_once_big