/* Copyright 2024 Marc A. Lehmann */
/* Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands */
/*
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
    claim that you wrote the original software. If you use this software
    in a product, an acknowledgment in the product documentation would be
    appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
    misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Ferry Toth
  ftoth@exalondelft.nl
*/

/* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
  CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. Where efficient
  3 crc32q instructions are used which a single core can execute in parallel.
  This compensates for the latency of a single crc32q instruction. Combining the 
  3 CRC-32C bytes is done using the pclmulqdq instruction, which has overhead of
  its own, and makes this code path only efficient for buffer sizes above 216 bytes. 
  All code requiring a crc32q instruction is done inside a macro, for which alternative
  code is generated in case of a 32 bit platform.
  
  This code is a port of Intels crc_iscsi_v_pcl.asm assembly code (which is part of
  this project as well as in a modified form the linux kernel) and reaches the same 
  throughput on 64bit platforms. The main advantage of this port is that it was
  relatively easy to port to 32bit platforms (like Intel Edison which currently has
  only 32bit support). Being written in C it is of course easier to maintain and possibly
  optimize further */

/* Version history:
  1.0  07 May 2016  Ferry Toth - First version
*/

#include <stddef.h>
#include <stdint.h>
#include "crc32intelc.h"
#include "crc32inteltable.c"
#include <x86intrin.h>

/* Compute CRC-32C using the Intel hardware instruction. */
static uint32_t crc32cIntelC ( uint32_t crc, const void *buf, size_t len )
{
        const unsigned char *next = ( const unsigned char * ) buf;
        unsigned long count;
        CRC_NATIVE crc0, crc1, crc2;
        crc0 = crc;

        if ( len >= 8 ) {
                // if len > 216 then align and use triplets
                if ( len > 216 ) {
                        {
                                uint32_t crc32bit = crc0;                                       // create this block actually prevent 2 asignments
                                unsigned long align = ( 8 - ( uintptr_t ) next ) % 8;           // byte to boundary
                                len -= align;
                                if ( align & 0x04 ) {
                                        crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) next );
                                        next += sizeof(uint32_t);
                                };
                                if ( align & 0x02 ) {
                                        crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) next );
                                        next += sizeof(uint16_t);
                                };

                                if ( align & 0x01 ) {
                                        crc32bit = __builtin_ia32_crc32qi ( crc32bit, * ( next ) );
                                        next++;
                                };
                                crc0 = crc32bit;
                        };

                        // use Duff's device, a for() loop inside a switch() statement. This is Legal
                        // needs to execute at least once, round len down to nearast triplet multiple
                        count = len / 24;			// number of triplets
                        len %= 24;				// bytes remaining
                        unsigned long n = count / 128;		// #blocks = first block + full blocks
                        unsigned long block_size = count % 128;
                        if ( block_size == 0 ) {
                                block_size = 128;
                        } else {
                                n++;
                        };
                        const uint64_t *next0 = ( uint64_t* ) next + block_size; // points to the first byte of the next block
                        const uint64_t *next1 = next0 + block_size;
                        const uint64_t *next2 = next1 + block_size;

                        crc1 = crc2 = 0;
                        switch ( block_size ) {
                        case 128:
                                do {
                                        CRCtriplet ( crc, next, -128 );	// jumps here for a full block of len 128
                                case 127:
                                        CRCtriplet ( crc, next, -127 );	// jumps here or below for the first block smaller
                                case 126:
                                        CRCtriplet ( crc, next, -126 );	// than 128
                                case 125:
                                        CRCtriplet ( crc, next, -125 );
                                case 124:
                                        CRCtriplet ( crc, next, -124 );
                                case 123:
                                        CRCtriplet ( crc, next, -123 );
                                case 122:
                                        CRCtriplet ( crc, next, -122 );
                                case 121:
                                        CRCtriplet ( crc, next, -121 );
                                case 120:
                                        CRCtriplet ( crc, next, -120 );
                                case 119:
                                        CRCtriplet ( crc, next, -119 );
                                case 118:
                                        CRCtriplet ( crc, next, -118 );
                                case 117:
                                        CRCtriplet ( crc, next, -117 );
                                case 116:
                                        CRCtriplet ( crc, next, -116 );
                                case 115:
                                        CRCtriplet ( crc, next, -115 );
                                case 114:
                                        CRCtriplet ( crc, next, -114 );
                                case 113:
                                        CRCtriplet ( crc, next, -113 );
                                case 112:
                                        CRCtriplet ( crc, next, -112 );
                                case 111:
                                        CRCtriplet ( crc, next, -111 );
                                case 110:
                                        CRCtriplet ( crc, next, -110 );
                                case 109:
                                        CRCtriplet ( crc, next, -109 );
                                case 108:
                                        CRCtriplet ( crc, next, -108 );
                                case 107:
                                        CRCtriplet ( crc, next, -107 );
                                case 106:
                                        CRCtriplet ( crc, next, -106 );
                                case 105:
                                        CRCtriplet ( crc, next, -105 );
                                case 104:
                                        CRCtriplet ( crc, next, -104 );
                                case 103:
                                        CRCtriplet ( crc, next, -103 );
                                case 102:
                                        CRCtriplet ( crc, next, -102 );
                                case 101:
                                        CRCtriplet ( crc, next, -101 );
                                case 100:
                                        CRCtriplet ( crc, next, -100 );
                                case 99:
                                        CRCtriplet ( crc, next, -99 );
                                case 98:
                                        CRCtriplet ( crc, next, -98 );
                                case 97:
                                        CRCtriplet ( crc, next, -97 );
                                case 96:
                                        CRCtriplet ( crc, next, -96 );
                                case 95:
                                        CRCtriplet ( crc, next, -95 );
                                case 94:
                                        CRCtriplet ( crc, next, -94 );
                                case 93:
                                        CRCtriplet ( crc, next, -93 );
                                case 92:
                                        CRCtriplet ( crc, next, -92 );
                                case 91:
                                        CRCtriplet ( crc, next, -91 );
                                case 90:
                                        CRCtriplet ( crc, next, -90 );
                                case 89:
                                        CRCtriplet ( crc, next, -89 );
                                case 88:
                                        CRCtriplet ( crc, next, -88 );
                                case 87:
                                        CRCtriplet ( crc, next, -87 );
                                case 86:
                                        CRCtriplet ( crc, next, -86 );
                                case 85:
                                        CRCtriplet ( crc, next, -85 );
                                case 84:
                                        CRCtriplet ( crc, next, -84 );
                                case 83:
                                        CRCtriplet ( crc, next, -83 );
                                case 82:
                                        CRCtriplet ( crc, next, -82 );
                                case 81:
                                        CRCtriplet ( crc, next, -81 );
                                case 80:
                                        CRCtriplet ( crc, next, -80 );
                                case 79:
                                        CRCtriplet ( crc, next, -79 );
                                case 78:
                                        CRCtriplet ( crc, next, -78 );
                                case 77:
                                        CRCtriplet ( crc, next, -77 );
                                case 76:
                                        CRCtriplet ( crc, next, -76 );
                                case 75:
                                        CRCtriplet ( crc, next, -75 );
                                case 74:
                                        CRCtriplet ( crc, next, -74 );
                                case 73:
                                        CRCtriplet ( crc, next, -73 );
                                case 72:
                                        CRCtriplet ( crc, next, -72 );
                                case 71:
                                        CRCtriplet ( crc, next, -71 );
                                case 70:
                                        CRCtriplet ( crc, next, -70 );
                                case 69:
                                        CRCtriplet ( crc, next, -69 );
                                case 68:
                                        CRCtriplet ( crc, next, -68 );
                                case 67:
                                        CRCtriplet ( crc, next, -67 );
                                case 66:
                                        CRCtriplet ( crc, next, -66 );
                                case 65:
                                        CRCtriplet ( crc, next, -65 );
                                case 64:
                                        CRCtriplet ( crc, next, -64 );
                                case 63:
                                        CRCtriplet ( crc, next, -63 );
                                case 62:
                                        CRCtriplet ( crc, next, -62 );
                                case 61:
                                        CRCtriplet ( crc, next, -61 );
                                case 60:
                                        CRCtriplet ( crc, next, -60 );
                                case 59:
                                        CRCtriplet ( crc, next, -59 );
                                case 58:
                                        CRCtriplet ( crc, next, -58 );
                                case 57:
                                        CRCtriplet ( crc, next, -57 );
                                case 56:
                                        CRCtriplet ( crc, next, -56 );
                                case 55:
                                        CRCtriplet ( crc, next, -55 );
                                case 54:
                                        CRCtriplet ( crc, next, -54 );
                                case 53:
                                        CRCtriplet ( crc, next, -53 );
                                case 52:
                                        CRCtriplet ( crc, next, -52 );
                                case 51:
                                        CRCtriplet ( crc, next, -51 );
                                case 50:
                                        CRCtriplet ( crc, next, -50 );
                                case 49:
                                        CRCtriplet ( crc, next, -49 );
                                case 48:
                                        CRCtriplet ( crc, next, -48 );
                                case 47:
                                        CRCtriplet ( crc, next, -47 );
                                case 46:
                                        CRCtriplet ( crc, next, -46 );
                                case 45:
                                        CRCtriplet ( crc, next, -45 );
                                case 44:
                                        CRCtriplet ( crc, next, -44 );
                                case 43:
                                        CRCtriplet ( crc, next, -43 );
                                case 42:
                                        CRCtriplet ( crc, next, -42 );
                                case 41:
                                        CRCtriplet ( crc, next, -41 );
                                case 40:
                                        CRCtriplet ( crc, next, -40 );
                                case 39:
                                        CRCtriplet ( crc, next, -39 );
                                case 38:
                                        CRCtriplet ( crc, next, -38 );
                                case 37:
                                        CRCtriplet ( crc, next, -37 );
                                case 36:
                                        CRCtriplet ( crc, next, -36 );
                                case 35:
                                        CRCtriplet ( crc, next, -35 );
                                case 34:
                                        CRCtriplet ( crc, next, -34 );
                                case 33:
                                        CRCtriplet ( crc, next, -33 );
                                case 32:
                                        CRCtriplet ( crc, next, -32 );
                                case 31:
                                        CRCtriplet ( crc, next, -31 );
                                case 30:
                                        CRCtriplet ( crc, next, -30 );
                                case 29:
                                        CRCtriplet ( crc, next, -29 );
                                case 28:
                                        CRCtriplet ( crc, next, -28 );
                                case 27:
                                        CRCtriplet ( crc, next, -27 );
                                case 26:
                                        CRCtriplet ( crc, next, -26 );
                                case 25:
                                        CRCtriplet ( crc, next, -25 );
                                case 24:
                                        CRCtriplet ( crc, next, -24 );
                                case 23:
                                        CRCtriplet ( crc, next, -23 );
                                case 22:
                                        CRCtriplet ( crc, next, -22 );
                                case 21:
                                        CRCtriplet ( crc, next, -21 );
                                case 20:
                                        CRCtriplet ( crc, next, -20 );
                                case 19:
                                        CRCtriplet ( crc, next, -19 );
                                case 18:
                                        CRCtriplet ( crc, next, -18 );
                                case 17:
                                        CRCtriplet ( crc, next, -17 );
                                case 16:
                                        CRCtriplet ( crc, next, -16 );
                                case 15:
                                        CRCtriplet ( crc, next, -15 );
                                case 14:
                                        CRCtriplet ( crc, next, -14 );
                                case 13:
                                        CRCtriplet ( crc, next, -13 );
                                case 12:
                                        CRCtriplet ( crc, next, -12 );
                                case 11:
                                        CRCtriplet ( crc, next, -11 );
                                case 10:
                                        CRCtriplet ( crc, next, -10 );
                                case 9:
                                        CRCtriplet ( crc, next, -9 );
                                case 8:
                                        CRCtriplet ( crc, next, -8 );
                                case 7:
                                        CRCtriplet ( crc, next, -7 );
                                case 6:
                                        CRCtriplet ( crc, next, -6 );
                                case 5:
                                        CRCtriplet ( crc, next, -5 );
                                case 4:
                                        CRCtriplet ( crc, next, -4 );
                                case 3:
                                        CRCtriplet ( crc, next, -3 );
                                case 2:
                                        CRCtriplet ( crc, next, -2 );
                                case 1:
                                        CRCduplet ( crc, next, -1 );		        // the final triplet is actually only 2
                                        CombineCRC();
                                        if ( --n > 0 ) {
                                                crc1 = crc2 = 0;
                                                block_size = 128;
                                                next0 = next2 + 128;			// points to the first byte of the next block
                                                next1 = next0 + 128;			// from here on all blocks are 128 long
                                                next2 = next1 + 128;
                                        };
                                case 0:
                                        ;
                                } while ( n > 0 );
                        };
                        next = ( const unsigned char* ) next2;
                };
                unsigned count = len / 8;                                               // 216 of less bytes is 27 or less singlets
                len %= 8;
                next += ( count * 8 );
                switch ( count ) {
                case 27:
                        CRCsinglet ( crc0, next, -27 * 8 );
                case 26:
                        CRCsinglet ( crc0, next, -26 * 8 );
                case 25:
                        CRCsinglet ( crc0, next, -25 * 8 );
                case 24:
                        CRCsinglet ( crc0, next, -24 * 8 );
                case 23:
                        CRCsinglet ( crc0, next, -23 * 8 );
                case 22:
                        CRCsinglet ( crc0, next, -22 * 8 );
                case 21:
                        CRCsinglet ( crc0, next, -21 * 8 );
                case 20:
                        CRCsinglet ( crc0, next, -20 * 8 );
                case 19:
                        CRCsinglet ( crc0, next, -19 * 8 );
                case 18:
                        CRCsinglet ( crc0, next, -18 * 8 );
                case 17:
                        CRCsinglet ( crc0, next, -17 * 8 );
                case 16:
                        CRCsinglet ( crc0, next, -16 * 8 );
                case 15:
                        CRCsinglet ( crc0, next, -15 * 8 );
                case 14:
                        CRCsinglet ( crc0, next, -14 * 8 );
                case 13:
                        CRCsinglet ( crc0, next, -13 * 8 );
                case 12:
                        CRCsinglet ( crc0, next, -12 * 8 );
                case 11:
                        CRCsinglet ( crc0, next, -11 * 8 );
                case 10:
                        CRCsinglet ( crc0, next, -10 * 8 );
                case 9:
                        CRCsinglet ( crc0, next, -9 * 8 );
                case 8:
                        CRCsinglet ( crc0, next, -8 * 8 );
                case 7:
                        CRCsinglet ( crc0, next, -7 * 8 );
                case 6:
                        CRCsinglet ( crc0, next, -6 * 8 );
                case 5:
                        CRCsinglet ( crc0, next, -5 * 8 );
                case 4:
                        CRCsinglet ( crc0, next, -4 * 8 );
                case 3:
                        CRCsinglet ( crc0, next, -3 * 8 );
                case 2:
                        CRCsinglet ( crc0, next, -2 * 8 );
                case 1:
                        CRCsinglet ( crc0, next, -1 * 8 );
                case 0:
                        ;
                };

        };
        {
                uint32_t crc32bit = crc0;
                // less than 8 bytes remain
                /* compute the crc for up to seven trailing bytes */
                if ( len & 0x04 ) {
                        crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) next );
                        next += 4;
                };
                if ( len & 0x02 ) {
                        crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) next );
                        next += 2;
                };

                if ( len & 0x01 ) {
                        crc32bit = __builtin_ia32_crc32qi ( crc32bit, * ( next ) );
                };
                return ( uint32_t ) crc32bit;
        };
};