This is a quick write-up about BLAKE2b, which is defined as follows (from [1]):

The BLAKE2 cryptographic hash function [BLAKE2] was designed by Jean-Philippe Aumasson, Samuel Neves, Zooko Wilcox-O’Hearn, and Christian Winnerlein.

And from [2], we get the definition of BLAKE:

BLAKE is a cryptographic hash function based on Daniel J. Bernstein’s ChaCha stream cipher, but a permuted copy of the input block, XORed with round constants, is added before each ChaCha round. Like SHA-2, there are two variants differing in the word size.

BLAKE2b is faster than MD5, SHA-1, SHA-2, and SHA-3, on 64-bit x86-64 and ARM architectures.[4] BLAKE2 provides better security than SHA-2 and similar to that of SHA-3: immunity to length extension, indifferentiability from a random oracle, etc.[6]

BLAKE2 removes addition of constants to message words from BLAKE round function, changes two rotation constants, simplifies padding, adds parameter block that is XOR’ed with initialization vectors, and reduces the number of rounds from 16 to 12 for BLAKE2b (successor of BLAKE-512), and from 14 to 10 for BLAKE2s (successor of BLAKE-256).

The algorithm is defined as follows.

Algorithm BLAKE2b
   Input:
      M                               Message to be hashed
      cbMessageLen: Number, (0..2^128) Length of the message in bytes
      Key                             Optional 0..64 byte key
      cbKeyLen: Number, (0..64)       Length of optional key in bytes
      cbHashLen: Number, (1..64)      Desired hash length in bytes
   Output:
      Hash                            Hash of cbHashLen bytes

   Initialize State vector h with IV
   h0..7 ← IV0..7

   Mix key size (cbKeyLen) and desired hash length (cbHashLen) into h0
   h0 ← h0 xor 0x0101kknn
         where kk is Key Length (in bytes)
               nn is Desired Hash Length (in bytes)

   Each time we Compress we record how many bytes have been compressed
   cBytesCompressed ← 0
   cBytesRemaining  ← cbMessageLen

   If there was a key supplied (i.e. cbKeyLen > 0) 
   then pad with trailing zeros to make it 128-bytes (i.e. 16 words) 
   and prepend it to the message M
   if (cbKeyLen > 0) then
      M ← Pad(Key, 128) || M
      cBytesRemaining ← cBytesRemaining + 128
   end if

   Compress whole 128-byte chunks of the message, except the last chunk
   while (cBytesRemaining > 128) do
      chunk ← get next 128 bytes of message M
      cBytesCompressed ← cBytesCompressed + 128  increase count of bytes that have been compressed
      cBytesRemaining  ← cBytesRemaining  - 128  decrease count of bytes in M remaining to be processed

      h ← Compress(h, chunk, cBytesCompressed, false)  false ⇒ this is not the last chunk
   end while

   Compress the final bytes from M
   chunk ← get next 128 bytes of message M  We will get cBytesRemaining bytes (i.e. 0..128 bytes)
   cBytesCompressed ← cBytesCompressed+cBytesRemaining  The actual number of bytes leftover in M
   chunk ← Pad(chunk, 128)  If M was empty, then we will still compress a final chunk of zeros

   h ← Compress(h, chunk, cBytesCompressed, true)  true ⇒ this is the last chunk

   Result ← first cbHashLen bytes of little endian state vector h
End Algorithm BLAKE2b

Python Implementation

from hashlib import blake2b
# Reference digest from the standard library. The message is handed straight
# to the constructor, which is equivalent to a single update() call.
h = blake2b(b'The quick brown fox jumps over the lazy dog')
print(h.hexdigest())

# https://datatracker.ietf.org/doc/html/rfc7693

# BLAKE2b initialization vector (RFC 7693, section 2.6): eight 64-bit words.
# Each literal is split into its high and low 32-bit halves for readability.
IV0 = 0x6a09e667_f3bcc908
IV1 = 0xbb67ae85_84caa73b
IV2 = 0x3c6ef372_fe94f82b
IV3 = 0xa54ff53a_5f1d36f1
IV4 = 0x510e527f_ade682d1
IV5 = 0x9b05688c_2b3e6c1f
IV6 = 0x1f83d9ab_fb41bd6b
IV7 = 0x5be0cd19_137e2179


def xor_bytes(v1, v2):
    """Return the element-wise XOR of two byte sequences as ``bytes``.

    The inputs are paired with ``zip``, so the result is as long as the
    shorter input; surplus trailing elements of the longer one are ignored.
    """
    return bytes(left ^ right for left, right in zip(v1, v2))

def mix(Va, Vb, Vc, Vd, x, y):
    """Mix two message words into four state words (the BLAKE2b G function).

    All additions are taken modulo 2**64 and the XOR results are rotated
    right by 32, 24, 16 and 63 bits in turn.

    Args:
        Va, Vb, Vc, Vd: The four 64-bit state words to mix.
        x, y: The two message words injected in the first and second half.

    Returns:
        The updated ``(Va, Vb, Vc, Vd)`` tuple.
    """
    mask = 0xFFFFFFFFFFFFFFFF

    def rotr(word, bits):
        # 64-bit rotate right; the mask drops whatever was shifted past bit 63.
        return (word >> bits | word << (64 - bits)) & mask

    # First half: inject x.
    Va = (Va + Vb + x) & mask
    Vd = rotr(Vd ^ Va, 32)
    Vc = (Vc + Vd) & mask
    Vb = rotr(Vb ^ Vc, 24)

    # Second half: inject y.
    Va = (Va + Vb + y) & mask
    Vd = rotr(Vd ^ Va, 16)
    Vc = (Vc + Vd) & mask
    Vb = rotr(Vb ^ Vc, 63)

    return Va, Vb, Vc, Vd

# Message-word schedule: row r lists, in order, the indices of the sixteen
# message words handed to the eight mix() calls of round r. There are ten
# rows; compress() selects one with ``SIGMA[i % 10]``.
SIGMA = [
    [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
    [14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3],
    [11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4],
    [ 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8],
    [ 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13],
    [ 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9],
    [12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11],
    [13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10],
    [ 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5],
    [10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0],
]

def compress(h, chunk, t, IsLastBlock):
    """Absorb one 128-byte block into the BLAKE2b chaining state.

    Args:
        h: Sequence of the eight 64-bit chaining words. It is left
            untouched; the updated state is returned as a new list.
        chunk: The block to absorb, read as sixteen little-endian 64-bit
            words. Callers are expected to zero-pad it to 128 bytes.
        t: Byte counter to mix in for this block. Its low 64 bits are
            XORed into V[12] and the next 64 bits into V[13].
        IsLastBlock: True for the final block, which inverts V[14].

    Returns:
        A new list with the eight updated chaining words.
    """
    mask = 0xFFFFFFFFFFFFFFFF

    # Work vector: the chaining state followed by the IV. Copying h here is
    # what keeps the caller's list from being modified.
    V = list(h) + [IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7]
    V[12] ^= t & mask
    V[13] ^= (t >> 64) & mask

    if IsLastBlock:
        V[14] ^= mask

    m = [int.from_bytes(chunk[i:i+8], 'little') for i in range(0, 128, 8)]

    for i in range(12):
        # Twelve rounds over a ten-row schedule: the last two reuse rows 0, 1.
        S = SIGMA[i % 10]

        # Column step.
        V[0], V[4], V[8], V[12] = mix(V[0], V[4], V[8], V[12], m[S[0]], m[S[1]])
        V[1], V[5], V[9], V[13] = mix(V[1], V[5], V[9], V[13], m[S[2]], m[S[3]])
        V[2], V[6], V[10], V[14] = mix(V[2], V[6], V[10], V[14], m[S[4]], m[S[5]])
        V[3], V[7], V[11], V[15] = mix(V[3], V[7], V[11], V[15], m[S[6]], m[S[7]])

        # Diagonal step.
        V[0], V[5], V[10], V[15] = mix(V[0], V[5], V[10], V[15], m[S[8]], m[S[9]])
        V[1], V[6], V[11], V[12] = mix(V[1], V[6], V[11], V[12], m[S[10]], m[S[11]])
        V[2], V[7], V[8], V[13] = mix(V[2], V[7], V[8], V[13], m[S[12]], m[S[13]])
        V[3], V[4], V[9], V[14] = mix(V[3], V[4], V[9], V[14], m[S[14]], m[S[15]])

    # Fold both halves of the work vector back into the chaining state.
    return [h[i] ^ V[i] ^ V[i + 8] for i in range(8)]

def my_blake2b(M, Key=b'', cbKeyLen=0, cbHashLen=64):
    """Compute the BLAKE2b digest of ``M``, optionally keyed.

    Args:
        M: Message bytes to hash.
        Key: Optional key bytes. Only the first ``cbKeyLen`` bytes are used.
        cbKeyLen: Key length in bytes (0..64). When left at 0 while ``Key``
            is non-empty, the length of ``Key`` is used, so a supplied key
            is never silently ignored.
        cbHashLen: Desired digest length in bytes (1..64).

    Returns:
        The digest as ``bytes`` of length ``cbHashLen``.

    Raises:
        ValueError: If ``cbKeyLen`` or ``cbHashLen`` is outside its range.
    """
    if cbKeyLen == 0 and Key:
        cbKeyLen = len(Key)
    if not 0 <= cbKeyLen <= 64:
        raise ValueError("cbKeyLen must be in the range 0..64")
    if not 1 <= cbHashLen <= 64:
        raise ValueError("cbHashLen must be in the range 1..64")

    h = [IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7]

    # Parameter block word 0: fanout = depth = 1, key length, digest length.
    h[0] ^= 0x01010000 ^ (cbKeyLen << 8) ^ cbHashLen

    if cbKeyLen > 0:
        # The key is absorbed as a full zero-padded first block. Slicing to
        # cbKeyLen keeps that block at exactly 128 bytes and consistent with
        # the key length mixed into h[0].
        M = Key[:cbKeyLen].ljust(128, b'\x00') + M

    cbTotal = len(M)
    cBytesCompressed = 0

    # Compress every whole block except the last one. Walking an offset
    # avoids re-copying the remaining message on each iteration.
    while cbTotal - cBytesCompressed > 128:
        chunk = M[cBytesCompressed:cBytesCompressed + 128]
        cBytesCompressed += 128
        h = compress(h, chunk, cBytesCompressed, False)

    # Final block: 0..128 leftover bytes, zero-padded. The counter covers
    # only the real bytes, not the padding.
    chunk = M[cBytesCompressed:].ljust(128, b'\x00')
    h = compress(h, chunk, cbTotal, True)

    return b''.join(word.to_bytes(8, 'little') for word in h)[:cbHashLen]

plaintext = b'The quick brown fox jumps over the lazy dog'

# Hash the sample message with the hand-written implementation.
h1 = my_blake2b(plaintext)
print(h1.hex())

# Self-check: the result must match the hashlib digest held in `h`.
assert h1.hex() == h.hexdigest()