This is a quick write-up about carry-less multiplication (clmul)
and an example that runs it with x86 assembly. Carry-less multiplication is a useful building block for cryptographic schemes such as AES-GCM, as well as for CRC computation and error correction.
x86 Assembly
This example uses 128-bit SSE registers and instructions:
- Move the input values into registers xmm0 and xmm1
- pclmulqdq performs the carry-less multiplication
- Move the low 64 bits of the result to lo
- Shift the 128-bit xmm register right by 8 bytes and store the high 64 bits in hi
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
/*
 * clmul64 - carry-less multiplication of two 64-bit values.
 *
 * The operands are treated as polynomials over GF(2): partial products are
 * combined with XOR instead of addition, so no carries propagate.
 *
 * a, b: the 64-bit operands.
 * hi:   receives bits 127..64 of the 128-bit product.
 * lo:   receives bits 63..0 of the 128-bit product.
 *
 * On x86-64 the product is computed with the PCLMULQDQ instruction, so the
 * CPU executing this code must support it. On every other target a portable
 * shift-and-XOR loop produces the same result.
 */
static inline __attribute__((always_inline)) void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
#if defined(__x86_64__)
    uint64_t prod_hi;
    uint64_t prod_lo;

    /*
     * Deliberately not volatile: the statement is a pure function of its
     * inputs and touches no memory, so the compiler is free to drop or
     * merge it like any other computation.
     */
    __asm__ (
        "movq %[a], %%xmm0\n\t"               /* xmm0[63:0] = a */
        "movq %[b], %%xmm1\n\t"               /* xmm1[63:0] = b */
        "pclmulqdq $0x00, %%xmm1, %%xmm0\n\t" /* imm 0x00: low qword x low qword */
        "movq %%xmm0, %[lo]\n\t"              /* low 64 bits of the product */
        "psrldq $8, %%xmm0\n\t"               /* byte-shift right by 8: high half moves down */
        "movq %%xmm0, %[hi]\n\t"              /* high 64 bits of the product */
        : [hi] "=r" (prod_hi), [lo] "=r" (prod_lo)
        : [a] "r" (a), [b] "r" (b)
        : "xmm0", "xmm1"
    );
#else
    uint64_t prod_hi = 0;
    uint64_t prod_lo = 0;

    /* For every set bit of b, XOR in a copy of a shifted left by that bit index. */
    for (unsigned bit = 0; bit < 64; bit++) {
        if ((b >> bit) & 1u) {
            prod_lo ^= a << bit;
            if (bit != 0) {
                /* Guarded because a shift by 64 would be undefined. */
                prod_hi ^= a >> (64 - bit);
            }
        }
    }
#endif

    *hi = prod_hi;
    *lo = prod_lo;
}
/*
 * Demo driver: carry-less multiplies two fixed 64-bit constants with
 * clmul64() and prints the operands followed by the high and low halves of
 * the 128-bit product.
 */
int main(void) {
    const uint64_t a = UINT64_C(0x0123456789ABCDEF);
    const uint64_t b = UINT64_C(0xFEDCBA9876543210);
    uint64_t hi = 0;
    uint64_t lo = 0;

    clmul64(a, b, &hi, &lo);

    /*
     * PRIX64 expands to the correct conversion for uint64_t on every ABI;
     * a bare "%lX" is only right where uint64_t happens to be unsigned long.
     */
    printf("Input A: 0x%016" PRIX64 "\n", a);
    printf("Input B: 0x%016" PRIX64 "\n", b);
    printf("Result Hi: 0x%016" PRIX64 "\n", hi);
    printf("Result Lo: 0x%016" PRIX64 "\n", lo);
    return 0;
}