This is a quick write-up about carry-less multiplication (clmul), with an example written in x86 assembly. Carry-less multiplication is a useful building block for cryptographic and coding schemes such as AES-GCM, CRC, and error correction.

alt text

x86 Assembly

This example uses 128-bit SSE instructions and registers:

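  • Move the two 64-bit input values into registers xmm0 and xmm1
  • pclmulqdq performs the carry-less multiplication, leaving the 128-bit product in xmm0
  • Move the low 64 bits of xmm0 to lo
  • Shift the 128-bit xmm0 register right by 8 bytes and store the remaining (high) 64 bits in hi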
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Carry-less (polynomial over GF(2)) multiplication of two 64-bit values,
 * implemented with the x86 PCLMULQDQ instruction.
 *
 * a, b : the two 64-bit operands.
 * hi   : receives the upper 64 bits of the 128-bit product.
 * lo   : receives the lower 64 bits of the 128-bit product.
 *
 * The asm block uses xmm0 and xmm1 as scratch registers and declares them
 * as clobbered. NOTE(review): the 64-bit GPR <-> XMM movq forms and the
 * pclmulqdq instruction presumably restrict this to x86-64 CPUs with
 * PCLMULQDQ support -- confirm for the intended targets.
 */
static inline __attribute__((always_inline)) void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
    uint64_t upper;
    uint64_t lower;

    __asm__ __volatile__ (
        "movq %[lhs], %%xmm0\n\t"               /* load a into the low qword of xmm0 */
        "movq %[rhs], %%xmm1\n\t"               /* load b into the low qword of xmm1 */
        "pclmulqdq $0x00, %%xmm1, %%xmm0\n\t"   /* imm 0x00: multiply the two low qwords */
        "movq %%xmm0, %[lower]\n\t"             /* low 64 bits of the product */
        "psrldq $8, %%xmm0\n\t"                 /* shift xmm0 right by 8 bytes */
        "movq %%xmm0, %[upper]\n\t"             /* former high 64 bits of the product */
        : [upper] "=r" (upper), [lower] "=r" (lower)
        : [lhs] "r" (a), [rhs] "r" (b)
        : "xmm0", "xmm1"
    );

    *hi = upper;
    *lo = lower;
}

/*
 * Demo driver: carry-less multiplies two fixed 64-bit constants with
 * clmul64() and prints the operands and both halves of the 128-bit result
 * as zero-padded, 16-digit uppercase hexadecimal.
 *
 * Returns 0.
 */
int main(void) {
    uint64_t a = 0x0123456789ABCDEF;
    uint64_t b = 0xFEDCBA9876543210;
    uint64_t hi = 0;
    uint64_t lo = 0;

    clmul64(a, b, &hi, &lo);

    /*
     * PRIX64 expands to the conversion specifier that matches uint64_t on
     * the current ABI; a hard-coded "%lX" is a type mismatch wherever
     * uint64_t is unsigned long long rather than unsigned long.
     */
    printf("Input A:  0x%016" PRIX64 "\n", a);
    printf("Input B:  0x%016" PRIX64 "\n", b);
    printf("Result Hi: 0x%016" PRIX64 "\n", hi);
    printf("Result Lo: 0x%016" PRIX64 "\n", lo);

    return 0;
}