Porting a sha256 algorithm from x86 assembly to ARM

Dec 15

The other day I took part in the FE-CTF finals, where one of the challenges required you to supply x86-64 shellcode.

The output of the shellcode had to be the sha256sum of the shellcode itself.

I managed to solve the challenge after a couple of hours, and decided that I'd like to port it to ARMv5, as this is a bit more relevant for ICSRange.

This blog post will explain my journey of porting the sha256 assembly implementation from x86 to ARMv5.

Porting it

The author of the x86-64 assembly implementation also has an implementation in x86.

Since x86 is 32-bit, it seems like a good idea to choose this as a base to work from, to port it for ARMv5.

The original version can be found here

I started off by porting the shellcode from AT&T syntax to Intel Syntax. This is mostly because I'm more comfortable with reading Intel Syntax, but also that x86 Intel Syntax is closer to ARM than AT&T syntax is.

This process took me quite some time, as I'd never seen the way he defines assembly as macros in his assembly before. But after a couple of hours, I managed to port it to Intel Syntax:

/* 
 * SHA-256 hash in x86 assembly
 * 
 * Copyright (c) 2021 Project Nayuki. (MIT License)
 * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 * - The above copyright notice and this permission notice shall be included in
 *   all copies or substantial portions of the Software.
 * - The Software is provided "as is", without warranty of any kind, express or
 *   implied, including but not limited to the warranties of merchantability,
 *   fitness for a particular purpose and noninfringement. In no event shall the
 *   authors or copyright holders be liable for any claim, damages or other
 *   liability, whether in an action of contract, tort or otherwise, arising from,
 *   out of or in connection with the Software or the use or other dealings in the
 *   Software.
 */

/* void sha256_compress(const uint8_t block[static 64], uint32_t state[static 8]) */
.globl sha256_compress
.intel_syntax noprefix
sha256_compress:
    /* 
     * Storage usage:
     *   Bytes  Location   Description
     *       4  eax        Temporary for calculation per round
     *       4  ebx        Temporary for calculation per round
     *       4  ecx        Temporary for calculation per round
     *       4  edx        Temporary for calculation per round
     *       4  ebp        Temporary for calculation per round
     *       4  esi        (During state loading and update) base address of state array argument
     *                     (During hash rounds) temporary for calculation per round
     *       4  edi        Base address of block array argument (during key schedule loading rounds only)
     *       4  esp        x86 stack pointer
     *      32  [esp+  0]  SHA-256 state variables A,B,C,D,E,F,G,H (4 bytes each)
     *      64  [esp+ 32]  Key schedule of 16 * 4 bytes
     *       4  [esp+ 96]  Caller's value of ebx
     *       4  [esp+100]  Caller's value of esi
     *       4  [esp+104]  Caller's value of edi
     *       4  [esp+108]  Caller's value of ebp
     */

    /* Key schedule word i: the index is reduced modulo 16 and the 16 words sit just above the 8 state words on the stack */
    #define SCHED(i)  dword ptr [esp+((((i)&0xF)+8)*4)]
    /* State variable i (0..7 correspond to A..H), stored at the bottom of the scratch area */
    #define STATE(i)  dword ptr [esp+(i*4)]

    /*
     * ROUNDTAIL(i, a, b, c, d, e, f, g, h, k): one SHA-256 compression round.
     *   a..h  Indices (0..7) of the STATE() slots playing the roles A..H in this round
     *   k     32-bit round constant
     *   i     Round number (not referenced by this macro itself)
     * On entry ebp must hold the key schedule word for this round.
     * Writes STATE(d) and STATE(h); clobbers eax, ebx, ecx, edx, esi and the flags.
     */
    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Part 0: esi = h + w + (e ror 6 ^ e ror 11 ^ e ror 25) + (((f ^ g) & e) ^ g) + k */  \
        mov   eax, STATE(e);      \
        mov   ebx, eax;          \
        mov   ecx, eax;          \
        mov   edx, eax;          /* edx keeps the unrotated e for the choice function */  \
        ror   eax, 11;           \
        ror   ebx, 25;           \
        ror   ecx, 6;            \
        mov   esi, STATE(h);      \
        xor   eax, ebx;          \
        xor   ecx, eax;          /* ecx = e ror 6 ^ e ror 11 ^ e ror 25 */  \
        add   esi, ebp;          /* esi = h + schedule word */  \
        mov   ebx, STATE(g);      \
        mov   eax, STATE(f);      \
        xor   eax, ebx;          \
        and   eax, edx;          \
        xor   eax, ebx;          /* eax = ((f ^ g) & e) ^ g */  \
        lea   ecx, [ecx + eax + k];  /* three-way add of both terms and the round constant */  \
        add   esi,ecx;          \
        /* Part 1: d += esi */     \
        add  STATE(d),esi;      \
        /* Part 2: h = esi + (a ror 2 ^ a ror 13 ^ a ror 22) + (((c | b) & a) | (c & b)) */  \
        mov   eax, STATE(a);      \
        mov   ebx, eax;          \
        mov   ecx, eax;          \
        mov   edx, eax;          /* edx keeps the unrotated a for the majority function */  \
        ror   eax, 13;           \
        ror   ebx, 22;           \
        ror   ecx, 2;            \
        xor   eax, ebx;          \
        xor   ecx, eax;          /* ecx = a ror 2 ^ a ror 13 ^ a ror 22 */  \
        mov   eax, STATE(c);      \
        add   esi, ecx;          \
        mov   ecx, eax;          \
        mov   ebx, STATE(b);      \
        or    ecx, ebx;          \
        and   eax, ebx;          \
        and   ecx, edx;          \
        or    ecx, eax;          /* ecx = ((c | b) & a) | (c & b) */  \
        add   esi, ecx;          \
        mov   STATE(h), esi;

    /*
     * ROUNDb: extends the key schedule, then runs ROUNDTAIL.
     * Computes SCHED(i) = SCHED(i-16) + SCHED(i-7) + s0 + s1 into ebp, where
     *   s0 = x ror 7 ^ x ror 18 ^ x shr 3     with x = SCHED(i-15)
     *   s1 = y ror 17 ^ y ror 19 ^ y shr 10   with y = SCHED(i-2)
     * ebp is left holding the new word, which ROUNDTAIL reads.
     */
    #define ROUNDb(i, a, b, c, d, e, f, g, h, k)  \
        mov  eax, SCHED(i-15);  \
        mov  ebp, SCHED(i-16);  \
        mov  ebx, eax;         \
        add  ebp, SCHED(i- 7);  \
        mov  ecx, eax;         \
        ror  ebx, 18;          \
        shr  ecx, 3;           \
        ror  eax, 7;           \
        xor  ebx, ecx;         \
        xor  eax, ebx;         /* eax = s0 */  \
        add  ebp, eax;         \
        mov  eax, SCHED(i- 2);  \
        mov  ebx, eax;         \
        mov  ecx, eax;         \
        ror  ebx, 19;          \
        shr  ecx, 10;          \
        ror  eax, 17;          \
        xor  ebx, ecx;         \
        xor  eax, ebx;         /* eax = s1 */  \
        add  ebp, eax;         \
        mov  SCHED(i), ebp;     /* SCHED() indexes modulo 16, so this overwrites the slot read as SCHED(i-16) */  \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)

    /*
     * ROUNDa: loads 32-bit word i of the block pointed to by edi, reverses its
     * byte order, stores it as SCHED(i), then runs ROUNDTAIL.
     * ebp is left holding the word, which ROUNDTAIL reads.
     */
    #define ROUNDa(i, a, b, c, d, e, f, g, h, k)  \
        mov    ebp, dword ptr [edi+(i*4)];  \
        bswap  ebp;               \
        mov    SCHED(i),ebp;     \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)

    /* Allocate scratch space, save registers */
    /* 112 bytes = 32 (state copy) + 64 (key schedule) + 16 (four saved registers) */
    sub  esp, 112;
    mov  dword ptr [esp+96], ebx;
    mov  dword ptr [esp+100], esi;
    mov  dword ptr [esp+104], edi;
    mov  dword ptr [esp+108], ebp;

    /* Copy state */
    /* [esp+120] is the state argument; the eight words are copied to [esp+0..28] so the rounds work on a local copy */
    mov esi, dword ptr [esp+120];
    mov eax, dword ptr [esi];         mov dword ptr [esp], eax
    mov eax, dword ptr [esi+4];        mov dword ptr [esp+4], eax
    mov eax, dword ptr [esi+8];        mov dword ptr [esp+8], eax
    mov eax, dword ptr [esi+12];    mov dword ptr [esp+12], eax
    mov eax, dword ptr [esi+16];    mov dword ptr [esp+16], eax    
    mov eax, dword ptr [esi+20];    mov dword ptr [esp+20], eax
    mov eax, dword ptr [esi+24];    mov dword ptr [esp+24], eax
    mov eax, dword ptr [esi+28];    mov dword ptr [esp+28], eax

    /* Do 64 rounds of hashing */
    /* edi = base address of the block argument, read by the ROUNDa rounds */
    mov edi, dword ptr [esp+116]

    /*
     * Rounds 0-15 (ROUNDa) take their schedule word straight from the block;
     * rounds 16-63 (ROUNDb) derive it from earlier schedule words.
     * The eight index arguments rotate by one position each round, so the
     * A..H roles move between STATE() slots and no data has to be shuffled.
     * The last argument is the per-round constant.
     */
    ROUNDa( 0, 0, 1, 2, 3, 4, 5, 6, 7, 0x428A2F98)
    ROUNDa( 1, 7, 0, 1, 2, 3, 4, 5, 6, 0x71374491)
    ROUNDa( 2, 6, 7, 0, 1, 2, 3, 4, 5, 0xB5C0FBCF)
    ROUNDa( 3, 5, 6, 7, 0, 1, 2, 3, 4, 0xE9B5DBA5)
    ROUNDa( 4, 4, 5, 6, 7, 0, 1, 2, 3, 0x3956C25B)
    ROUNDa( 5, 3, 4, 5, 6, 7, 0, 1, 2, 0x59F111F1)
    ROUNDa( 6, 2, 3, 4, 5, 6, 7, 0, 1, 0x923F82A4)
    ROUNDa( 7, 1, 2, 3, 4, 5, 6, 7, 0, 0xAB1C5ED5)
    ROUNDa( 8, 0, 1, 2, 3, 4, 5, 6, 7, 0xD807AA98)
    ROUNDa( 9, 7, 0, 1, 2, 3, 4, 5, 6, 0x12835B01)
    ROUNDa(10, 6, 7, 0, 1, 2, 3, 4, 5, 0x243185BE)
    ROUNDa(11, 5, 6, 7, 0, 1, 2, 3, 4, 0x550C7DC3)
    ROUNDa(12, 4, 5, 6, 7, 0, 1, 2, 3, 0x72BE5D74)
    ROUNDa(13, 3, 4, 5, 6, 7, 0, 1, 2, 0x80DEB1FE)
    ROUNDa(14, 2, 3, 4, 5, 6, 7, 0, 1, 0x9BDC06A7)
    ROUNDa(15, 1, 2, 3, 4, 5, 6, 7, 0, 0xC19BF174)
    ROUNDb(16, 0, 1, 2, 3, 4, 5, 6, 7, 0xE49B69C1)
    ROUNDb(17, 7, 0, 1, 2, 3, 4, 5, 6, 0xEFBE4786)
    ROUNDb(18, 6, 7, 0, 1, 2, 3, 4, 5, 0x0FC19DC6)
    ROUNDb(19, 5, 6, 7, 0, 1, 2, 3, 4, 0x240CA1CC)
    ROUNDb(20, 4, 5, 6, 7, 0, 1, 2, 3, 0x2DE92C6F)
    ROUNDb(21, 3, 4, 5, 6, 7, 0, 1, 2, 0x4A7484AA)
    ROUNDb(22, 2, 3, 4, 5, 6, 7, 0, 1, 0x5CB0A9DC)
    ROUNDb(23, 1, 2, 3, 4, 5, 6, 7, 0, 0x76F988DA)
    ROUNDb(24, 0, 1, 2, 3, 4, 5, 6, 7, 0x983E5152)
    ROUNDb(25, 7, 0, 1, 2, 3, 4, 5, 6, 0xA831C66D)
    ROUNDb(26, 6, 7, 0, 1, 2, 3, 4, 5, 0xB00327C8)
    ROUNDb(27, 5, 6, 7, 0, 1, 2, 3, 4, 0xBF597FC7)
    ROUNDb(28, 4, 5, 6, 7, 0, 1, 2, 3, 0xC6E00BF3)
    ROUNDb(29, 3, 4, 5, 6, 7, 0, 1, 2, 0xD5A79147)
    ROUNDb(30, 2, 3, 4, 5, 6, 7, 0, 1, 0x06CA6351)
    ROUNDb(31, 1, 2, 3, 4, 5, 6, 7, 0, 0x14292967)
    ROUNDb(32, 0, 1, 2, 3, 4, 5, 6, 7, 0x27B70A85)
    ROUNDb(33, 7, 0, 1, 2, 3, 4, 5, 6, 0x2E1B2138)
    ROUNDb(34, 6, 7, 0, 1, 2, 3, 4, 5, 0x4D2C6DFC)
    ROUNDb(35, 5, 6, 7, 0, 1, 2, 3, 4, 0x53380D13)
    ROUNDb(36, 4, 5, 6, 7, 0, 1, 2, 3, 0x650A7354)
    ROUNDb(37, 3, 4, 5, 6, 7, 0, 1, 2, 0x766A0ABB)
    ROUNDb(38, 2, 3, 4, 5, 6, 7, 0, 1, 0x81C2C92E)
    ROUNDb(39, 1, 2, 3, 4, 5, 6, 7, 0, 0x92722C85)
    ROUNDb(40, 0, 1, 2, 3, 4, 5, 6, 7, 0xA2BFE8A1)
    ROUNDb(41, 7, 0, 1, 2, 3, 4, 5, 6, 0xA81A664B)
    ROUNDb(42, 6, 7, 0, 1, 2, 3, 4, 5, 0xC24B8B70)
    ROUNDb(43, 5, 6, 7, 0, 1, 2, 3, 4, 0xC76C51A3)
    ROUNDb(44, 4, 5, 6, 7, 0, 1, 2, 3, 0xD192E819)
    ROUNDb(45, 3, 4, 5, 6, 7, 0, 1, 2, 0xD6990624)
    ROUNDb(46, 2, 3, 4, 5, 6, 7, 0, 1, 0xF40E3585)
    ROUNDb(47, 1, 2, 3, 4, 5, 6, 7, 0, 0x106AA070)
    ROUNDb(48, 0, 1, 2, 3, 4, 5, 6, 7, 0x19A4C116)
    ROUNDb(49, 7, 0, 1, 2, 3, 4, 5, 6, 0x1E376C08)
    ROUNDb(50, 6, 7, 0, 1, 2, 3, 4, 5, 0x2748774C)
    ROUNDb(51, 5, 6, 7, 0, 1, 2, 3, 4, 0x34B0BCB5)
    ROUNDb(52, 4, 5, 6, 7, 0, 1, 2, 3, 0x391C0CB3)
    ROUNDb(53, 3, 4, 5, 6, 7, 0, 1, 2, 0x4ED8AA4A)
    ROUNDb(54, 2, 3, 4, 5, 6, 7, 0, 1, 0x5B9CCA4F)
    ROUNDb(55, 1, 2, 3, 4, 5, 6, 7, 0, 0x682E6FF3)
    ROUNDb(56, 0, 1, 2, 3, 4, 5, 6, 7, 0x748F82EE)
    ROUNDb(57, 7, 0, 1, 2, 3, 4, 5, 6, 0x78A5636F)
    ROUNDb(58, 6, 7, 0, 1, 2, 3, 4, 5, 0x84C87814)
    ROUNDb(59, 5, 6, 7, 0, 1, 2, 3, 4, 0x8CC70208)
    ROUNDb(60, 4, 5, 6, 7, 0, 1, 2, 3, 0x90BEFFFA)
    ROUNDb(61, 3, 4, 5, 6, 7, 0, 1, 2, 0xA4506CEB)
    ROUNDb(62, 2, 3, 4, 5, 6, 7, 0, 1, 0xBEF9A3F7)
    ROUNDb(63, 1, 2, 3, 4, 5, 6, 7, 0, 0xC67178F2)

    /* Add to state */
    /* esi was used as a temporary during the rounds, so the state pointer is reloaded before each working word is added into the caller's array */
    mov  esi, dword ptr [esp+120];  /* Argument: state */
    mov  eax, dword ptr [esp];     add dword ptr [esi], eax
    mov  eax, dword ptr [esp+4];   add dword ptr [esi+4], eax
    mov  eax, dword ptr [esp+8];   add dword ptr [esi+8], eax
    mov  eax, dword ptr [esp+12];  add dword ptr [esi+12], eax
    mov  eax, dword ptr [esp+16];  add dword ptr [esi+16], eax
    mov  eax, dword ptr [esp+20];  add dword ptr [esi+20], eax
    mov  eax, dword ptr [esp+24];  add dword ptr [esi+24], eax
    mov  eax, dword ptr [esp+28];  add dword ptr [esi+28], eax

    /* Restore registers */
    /* Reload the four registers saved on entry, then release the 112-byte scratch area */
    mov ebx, dword ptr [esp+96]
    mov esi, dword ptr [esp+100]
    mov edi, dword ptr [esp+104]
    mov ebp, dword ptr [esp+108]
    add esp, 112
    ret

All the conversions were done manually by hand, as I was not able to find a tool which was able to convert between the two syntaxes.

If you know such a tool, please let me know, I'd be very interested in hearing from you.

Once the conversions were done, it was time to figure out the ARMv5 equivalents of the instructions.

mov reg1, dword ptr [reg2] - Has an equivalent alias of ldr reg1, [reg2]
mov dword ptr [reg1], reg2 - Has an equivalent alias of str reg2, [reg1]
Add and sub instructions are basically the same as x86 except it has another operand and immediates are prefixed with a #. So add reg1, reg2 would become add reg1, reg1, reg2. Adding an immediate would go from add reg1, 1 to add reg1, reg1, #1.
ror has an equivalence of mov reg1, reg1, ror 2. ARM has no rol; a left rotation by n bits is written as a right rotation by 32 - n bits instead.
or has an equivalence of orr reg1, reg1, #imm/reg2 so the x86 instruction or reg1, 7 would become orr reg1, reg1, #7
and has an equivalence like or. It's named the same
xor has an equivalence like or, but is called eor instead.
bswap has the equivalence of rev reg1, reg1.
Bitshifting has the equivalence of mov reg1, reg1, lsr 2 where the last letter in lsr specifies whether it's left or right shift.
There is no equivalent of the lea instruction, unfortunately. We will therefore come back to this later.

Converting SCHED and STATE

Now that we have the equivalents of our registers and instructions let's start off by redefining our 2 first defines, SCHED and STATE.

Current:

#define SCHED(i)  dword ptr [esp+((((i)&0xF)+8)*4)]
#define STATE(i)  dword ptr [esp+(i*4)]

As these are loading from an address or to an address, we only need to use ldr or str instead of mov, when relevant.

We can therefore simply define both of them the following way:

#define SCHED(i)  [sp, #((((i)&0xF)+8)*4)]
#define STATE(i)  [sp, #(i*4)]

Converting ROUNDTAIL

SCHED and STATE were the easy ones. The next one is ROUNDTAIL as it's used both by ROUNDa and ROUNDb:

Current:

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Part 0 */               \
        mov   eax, STATE(e);      \
        mov   ebx, eax;          \
        mov   ecx, eax;          \
        mov   edx, eax;          \
        ror   eax, 11;           \
        ror   ebx, 25;           \
        ror   ecx, 6;            \
        mov   esi, STATE(h);      \
        xor   eax, ebx;          \
        xor   ecx, eax;          \
        add   esi, ebp;          \
        mov   ebx, STATE(g);      \
        mov   eax, STATE(f);      \
        xor   eax, ebx;          \
        and   eax, edx;          \
        xor   eax, ebx;          \
        lea   ecx, [ecx + eax + k];  \
        add   esi,ecx;          \
        /* Part 1 */               \
        add  STATE(d),esi;      \
        /* Part 2 */               \
        mov   eax, STATE(a);      \
        mov   ebx, eax;          \
        mov   ecx, eax;          \
        mov   edx, eax;          \
        ror   eax, 13;           \
        ror   ebx, 22;           \
        ror   ecx, 2;            \
        xor   eax, ebx;          \
        xor   ecx, eax;          \
        mov   eax, STATE(c);      \
        add   esi, ecx;          \
        mov   ecx, eax;          \
        mov   ebx, STATE(b);      \
        or    ecx, ebx;          \
        and   eax, ebx;          \
        and   ecx, edx;          \
        or    ecx, eax;          \
        add   esi, ecx;          \
        mov   STATE(h), esi;

Doing a quick search and replace with the following:

eax = r0
ebx = r1
ecx = r2
edx = r3
esi = r4
ebp = fp
esp = sp

Yields us the following result to work with:

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Part 0 */                                   \
        mov   r0, STATE(e);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        ror   r0, 11;                               \
        ror   r1, 25;                               \
        ror   r2, 6;                                \
        mov   r4, STATE(h);                          \
        xor   r0, r1;                                  \
        xor   r2, r0;                                  \
        add   r4, fp;                                  \
        mov   r1, STATE(g);                          \
        mov   r0, STATE(f);                          \
        xor   r0, r1;                                  \
        and   r0, r3;                                  \
        xor   r0, r1;                                  \
        lea   r2, [r2 + r0 + k];                      \
        add   r4,r2;                                  \
        /* Part 1 */                                   \
        add  STATE(d),r4;                              \
        /* Part 2 */                                   \
        mov   r0, STATE(a);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        ror   r0, 13;                               \
        ror   r1, 22;                               \
        ror   r2, 2;                                \
        xor   r0, r1;                                  \
        xor   r2, r0;                                  \
        mov   r0, STATE(c);                          \
        add   r4, r2;                                  \
        mov   r2, r0;                                  \
        mov   r1, STATE(b);                          \
        or    r2, r1;                                  \
        and   r0, r1;                                  \
        and   r2, r3;                                  \
        or    r2, r0;                                  \
        add   r4, r2;                                  \
        mov   STATE(h), r4;

We then need to change the instructions to their equivalents. With the mov instructions coming first.

Anything that does a mov reg, STATE(x) needs to be replaced with an ldr instruction

Anything that does a mov STATE(x) needs to be replaced with an str instruction and the destination and source operands should be flipped.

The result of this conversion can be seen here:

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Done */                                  \
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
                                                    \
        /* TODO */                                  \
        ror   r0, 11;                               \
        ror   r1, 25;                               \
        ror   r2, 6;                                \
                                                    \
        /* Done */                                  \
        ldr   r4, STATE(h);                          \
                                                    \
        /* TODO */                                  \
        xor   r0, r1;                                  \
        xor   r2, r0;                                  \
        add   r4, fp;                                  \
                                                    \
        /* Done */                                  \
        ldr   r1, STATE(g);                          \
        ldr   r0, STATE(f);                          \
                                                    \
        /* TODO */                                  \
        xor   r0, r1;                                  \
        and   r0, r3;                                  \
        xor   r0, r1;                                  \
        lea   r2, [r2 + r0 + k];                      \
        add   r4,r2;                                 \
        add   STATE(d),r4;                          \
                                                    \
        /* Done */                                  \
        ldr   r0, STATE(a);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
                                                    \
        /* TODO */                                  \
        ror   r0, 13;                               \
        ror   r1, 22;                               \
        ror   r2, 2;                                \
        xor   r0, r1;                                  \
        xor   r2, r0;                                  \
                                                    \
        /* Done */                                  \
        ldr   r0, STATE(c);                          \
                                                    \
        /* TODO */                                    \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                    \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                          \
                                                    \
        /* TODO */                                  \
        or    r2, r1;                                  \
        and   r0, r1;                                  \
        and   r2, r3;                                  \
        or    r2, r0;                                  \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                  \
        str   r4, STATE(h);                            \

Next let's fix the ror and xor instructions

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Done */                                  \    
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 11;                       \
        mov   r1, r1, ror 25;                        \
        mov   r2, r2, ror 6;                        \
        ldr   r4, STATE(h);                          \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
                                                    \
        /* TODO */                                  \
        add   r4, fp;                                  \
                                                    \
        /* Done */                                  \
        ldr   r1, STATE(g);                          \
        ldr   r0, STATE(f);                          \
        eor   r0, r0, r1;                              \
        and   r0, r0, r3;                              \
        eor   r0, r0, r1;                              \
                                                    \
        /* TODO */                                     \
        lea   r2, [r2 + r0 + k];                      \
        add   r4,r2;                                  \
        add   STATE(d),r4;                          \
                                                    \
        /* Done */                                  \
        ldr   r0, STATE(a);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 13;                       \
        mov   r1, r1, ror 22;                       \
        mov   r2, r2, ror 2;                        \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        ldr   r0, STATE(c);                          \
                                                    \
        /* TODO */                                  \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                  \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                          \
                                                    \
        /* TODO */                                    \
        or    r2, r2, r1;                              \
        and   r0, r0, r1;                              \
        and   r2, r2, r3;                              \
        or    r2, r2, r0;                              \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                    \
        str   r4, STATE(h);

It's starting to shape up nicely. Let's fix the and and or instructions in the same vein:

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Done */                                  \    
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 11;                       \
        mov   r1, r1, ror 25;                        \
        mov   r2, r2, ror 6;                        \
        ldr   r4, STATE(h);                          \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
                                                    \
        /* TODO */                                  \
        add   r4, fp;                                  \
                                                    \
        /* Done */                                  \
        ldr   r1, STATE(g);                          \
        ldr   r0, STATE(f);                          \
        eor   r0, r0, r1;                              \
        and   r0, r0, r3;                              \
        eor   r0, r0, r1;                              \
                                                    \
        /* TODO */                                     \
        lea   r2, [r2 + r0 + k];                      \
        add   r4,r2;                                  \
        add   STATE(d),r4;                          \
                                                    \
        /* Done */                                  \
        ldr   r0, STATE(a);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 13;                       \
        mov   r1, r1, ror 22;                       \
        mov   r2, r2, ror 2;                        \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        ldr   r0, STATE(c);                          \
                                                    \
        /* TODO */                                  \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                  \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                          \
        orr   r2, r2, r1;                              \
        and   r0, r0, r1;                              \
        and   r2, r2, r3;                              \
        orr   r2, r2, r0;                              \
                                                    \
        /* TODO */                                  \
        add   r4, r2;                                  \
                                                    \
        /* Done */                                    \
        str   r4, STATE(h);

Now the add instruction. The add instruction in ARM does not allow us to use a memory operand as its destination, and we have one instruction which does exactly that: add STATE(d),r4;

We therefore need to find a sequence of instructions equivalent to this.

The instruction basically does the following:

Adds r4 to the contents which are located at STATE(d).

To do this in ARM we will need to do the following

Load the contents of STATE(d) into a register
Add the contents together with r4
Store the result at STATE(d)

We also want to do this without modifying r4. Fortunately for us, the ARM architecture has way more registers available than x86 does.

We will therefore just clobber r6, as it's currently unused in our program.

The instruction equivalent therefore ends up being:

ldr r6, STATE(d)
add r6, r6, r4
str r6, STATE(d)

The rest of the add instructions are done exactly like the or, xor and and instructions:

    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)  \
        /* Done */              \    
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 11;                       \
        mov   r1, r1, ror 25;                        \
        mov   r2, r2, ror 6;                        \
        ldr   r4, STATE(h);                          \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        add   r4, r4, fp;                              \
        ldr   r1, STATE(g);                          \
        ldr   r0, STATE(f);                          \
        eor   r0, r0, r1;                              \
        and   r0, r0, r3;                              \
        eor   r0, r0, r1;                              \
                                                    \
        /* TODO */                                     \
        lea   r2, [r2 + r0 + k];                      \
                                                    \
        /* Done */                                    \
        add   r4, r4, r2;                              \
        ldr   r6, STATE(d);                            \
        add   r6, r6, r4;                              \
        str   r6, STATE(d);                         \
        ldr   r0, STATE(a);                          \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 13;                       \
        mov   r1, r1, ror 22;                       \
        mov   r2, r2, ror 2;                        \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        ldr   r0, STATE(c);                          \
        add   r4, r4, r2;                              \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                          \
        orr   r2, r2, r1;                              \
        and   r0, r0, r1;                              \
        and   r2, r2, r3;                              \
        orr   r2, r2, r0;                              \
        add   r4, r4, r2;                              \
        str   r4, STATE(h);

Lastly we are left with the lea instruction.

The lea instruction basically does a load effective address of the things specified in the brackets.

In this case, the author is using it to add two registers and an immediate value together in a single instruction, without having to perform multiple add operations.

However, since ARM does not have this functionality available, and only offers a PC-relative instruction (adr), we will have to resort to doing all these adds manually.

However, this is more easily said than done. In the beginning I expected that doing a naïve approach like the following would work:

mov r2, #0
add r2, r2, r0
add r2, r2, k

This is not the case though, because of how ARM works as an instruction set.

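ARM does not allow arbitrary 32-bit immediate values in its instructions. A data-processing instruction such as mov or add only has 12 bits available for the immediate: an 8-bit value plus a 4-bit rotation, so the constant must be an 8-bit value rotated right by an even number of bits. You therefore have to follow this encoding scheme, and many numbers are simply not available.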

I don't really understand the full reasoning behind this or the full technical details. But from what I could gather, it has to do with how ARMs instructions are a fixed length (32 bits) and therefore we can't move 32 bit immediate values into registers, as this would take more space than the 32 bits we have available.

If the technical details behind this interest you, you can read a great blog post that I stumbled upon whilst fighting this issue here.

In short, we will have to find a way to move our 32-bit values into the registers where we still follow the instruction encoding.

This is a bit annoying as the 32-bit values we want to move into our register, if broken in half still do not follow ARM's instruction encoding.

Therefore I opted to move the 32-bit values into the register, by moving it a byte at a time then shift the bytes up.

This requires a rewrite of our ROUNDTAIL macro, as we need to be able to supply each of the four bytes separately instead of one big 32-bit blob.

Another approach to this, would be to have a section, in which you have all the constant values needed for the algorithm, saved. Then loading the values using a mix of adr and ldr. I opted to not use this method, as having the values saved somewhere, would create a bigger headache of rewriting the macros.

Therefore I decided to rewrite the ROUNDTAIL macro as follows:

#define ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)

We then need to actually perform our calculations. The lea instruction looks as follows:

lea   r2, [r2 + r0 + k];

Since we are already clobbering r6, we will use r6 to build our 32-bit value:

/* Build the 32-bit constant in r6, most significant byte first. */
/* The first mov overwrites all of r6, so no separate zeroing step is needed. */
mov   r6, #(ktop);          /* r6 = 0x000000<ktop> */
mov   r6, r6, lsl 8;
orr   r6, r6, #(kmid1);     /* r6 = 0x0000<ktop><kmid1> */
mov   r6, r6, lsl 8;
orr   r6, r6, #(kmid2);     /* r6 = 0x00<ktop><kmid1><kmid2> */
mov   r6, r6, lsl 8;
orr   r6, r6, #(kbot);      /* r6 = complete 32-bit value */

These instructions simply build the 32-bit value 1 byte at a time, by moving it into the register, then shifting it appropriately.

Lastly we need to add it to the original value of r2 and add r0 to that:

add   r2, r2, r6;
add   r2, r2, r0;

Wow. A lot of work to just replace one simple instruction huh?.

Well this was the hardest bit and we have now managed to convert ROUNDTAIL to ARM:

    /*
     * ROUNDTAIL: one compression round on the working state kept on the stack.
     *   i                          round index (not referenced in the body)
     *   a..h                       STATE() slot numbers of the eight working variables
     *   ktop, kmid1, kmid2, kbot   the 32-bit round constant, most significant byte first
     * Reads fp as the message schedule word for this round.
     * Writes STATE(d) and STATE(h); uses r0-r4 and r6 as scratch.
     */
    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        /* Part 0: r4 = h + w + k + (e ror 6 ^ e ror 11 ^ e ror 25) + ((f ^ g) & e ^ g) */ \
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 11;                          \
        mov   r1, r1, ror 25;                          \
        mov   r2, r2, ror 6;                           \
        ldr   r4, STATE(h);                            \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        add   r4, r4, fp;                              \
        ldr   r1, STATE(g);                            \
        ldr   r0, STATE(f);                            \
        eor   r0, r0, r1;                              \
        and   r0, r0, r3;                              \
        eor   r0, r0, r1;                              \
        /* Build k in r6 a byte at a time; the first mov overwrites all of r6 */ \
        mov   r6, #(ktop);                             \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kmid1);                        \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kmid2);                        \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kbot);                         \
        /* Replacement for x86 `lea ecx, [ecx + eax + k]` */ \
        add   r2, r2, r6;                              \
        add   r2, r2, r0;                              \
        add   r4, r4, r2;                              \
        /* Part 1: d += r4 (load/add/store, since ARM cannot add to memory directly) */ \
        ldr   r6, STATE(d);                            \
        add   r6, r6, r4;                              \
        str   r6, STATE(d);                            \
        /* Part 2: h = r4 + (a ror 2 ^ a ror 13 ^ a ror 22) + ((c | b) & a | (c & b)) */ \
        ldr   r0, STATE(a);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 13;                          \
        mov   r1, r1, ror 22;                          \
        mov   r2, r2, ror 2;                           \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        ldr   r0, STATE(c);                            \
        add   r4, r4, r2;                              \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                            \
        orr   r2, r2, r1;                              \
        and   r0, r0, r1;                              \
        and   r2, r2, r3;                              \
        orr   r2, r2, r0;                              \
        add   r4, r4, r2;                              \
        str   r4, STATE(h);

Converting ROUNDb

Next up for conversion is ROUNDb

Conversion of this is pretty straight forward, and only really has one issue which is easy to deal with.

x86 version:

    /*
     * ROUNDb (x86, Intel syntax): computes message schedule word i in ebp,
     * stores it to SCHED(i) and then runs ROUNDTAIL for the round.
     *   ebp = SCHED(i-16) + SCHED(i-7) + s0(SCHED(i-15)) + s1(SCHED(i-2))
     *   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x shr 3)
     *   s1(x) = (x ror 17) ^ (x ror 19) ^ (x shr 10)
     * Uses eax, ebx and ecx as scratch.
     */
    #define ROUNDb(i, a, b, c, d, e, f, g, h, k)  \
        mov  eax, SCHED(i-15);  \
        mov  ebp, SCHED(i-16);  \
        mov  ebx, eax;         \
        /* add with a memory operand: this is the one instruction ARM cannot mirror directly */ \
        add  ebp, SCHED(i- 7);  \
        mov  ecx, eax;         \
        ror  ebx, 18;          \
        shr  ecx, 3;           \
        ror  eax, 7;           \
        xor  ebx, ecx;         \
        xor  eax, ebx;         \
        /* eax = s0(SCHED(i-15)) */ \
        add  ebp, eax;         \
        mov  eax, SCHED(i- 2);  \
        mov  ebx, eax;         \
        mov  ecx, eax;         \
        ror  ebx, 19;          \
        shr  ecx, 10;          \
        ror  eax, 17;          \
        xor  ebx, ecx;         \
        xor  eax, ebx;         \
        /* eax = s1(SCHED(i-2)) */ \
        add  ebp, eax;         \
        mov  SCHED(i), ebp;     \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)

The add ebp, SCHED(i-7) instruction can't be implemented directly, since as mentioned earlier ARM's arithmetic instructions cannot take a memory operand, so we first need to load the value into a register.

However an equivalence can easily be made, which looks like this:

ldr  r6, SCHED(i -7);
add  fp, fp, r6;

After that, we have direct instruction equivalents for everything and implementing ROUNDb is straight forward.

Since ROUNDb also takes k as an argument, we will also change the arguments as done in ROUNDTAIL:

    /*
     * ROUNDb (ARM): computes message schedule word i in fp, stores it to
     * SCHED(i) and then runs ROUNDTAIL for the round.
     *   fp = SCHED(i-16) + SCHED(i-7) + s0(SCHED(i-15)) + s1(SCHED(i-2))
     *   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x lsr 3)
     *   s1(x) = (x ror 17) ^ (x ror 19) ^ (x lsr 10)
     * Uses r0, r1, r2 and r6 as scratch. The round constant is passed through
     * to ROUNDTAIL as four separate bytes.
     */
    #define ROUNDb(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        ldr  r0, SCHED(i-15);                          \
        ldr  fp, SCHED(i-16);                          \
        mov  r1, r0;                                 \
        /* x86 `add ebp, SCHED(i-7)` becomes a load into r6 followed by an add */ \
        ldr  r6, SCHED(i -7);                         \
        add  fp, fp, r6;                             \
        mov  r2, r0;                                 \
        mov  r1, r1, ror 18;                          \
        mov  r2, r2, lsr 3;                           \
        mov  r0, r0, ror 7;                           \
        eor  r1, r1, r2;                             \
        eor  r0, r0, r1;                             \
        /* r0 = s0(SCHED(i-15)) */ \
        add  fp, fp, r0;                             \
        ldr  r0, SCHED(i- 2);                          \
        mov  r1, r0;                                 \
        mov  r2, r0;                                 \
        mov  r1, r1, ror 19;                          \
        mov  r2, r2, lsr 10;                          \
        mov  r0, r0, ror 17;                          \
        eor  r1, r1, r2;                             \
        eor  r0, r0, r1;                             \
        /* r0 = s1(SCHED(i-2)) */ \
        add  fp, fp, r0;                             \
        str  fp, SCHED(i);                             \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)

Lastly is the ROUNDa which is up for conversion

There are equivalents for every instruction in ROUNDa (bswap becomes rev, which is only available from ARMv6 onwards), so conversion is straightforward.

Of course we remember to change k, to our new conversion.

x86 version:

    /*
     * ROUNDa (x86, Intel syntax): loads 32-bit word i of the input block at edi,
     * reverses its byte order, stores it to SCHED(i) and runs ROUNDTAIL with the
     * word still in ebp.
     */
    #define ROUNDa(i, a, b, c, d, e, f, g, h, k)  \
        mov    ebp, dword ptr [edi+(i*4)];  \
        bswap  ebp;               \
        mov    SCHED(i),ebp;     \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, k)

ARM version:

    /*
     * ROUNDa (ARM): loads 32-bit word i of the input block at r5, reverses its
     * byte order (rev is the bswap equivalent), stores it to SCHED(i) and runs
     * ROUNDTAIL with the word still in fp.
     */
    #define ROUNDa(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        ldr    fp, [r5,#(i*4)];  \
        rev    fp, fp;               \
        str    fp, SCHED(i);     \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)

Main program functionality

Next is converting the copy state and state setup/entry functionality:

    /* Allocate scratch space, save registers */
    /* 112-byte frame; the four registers below go in its top 16 bytes (esp+96..108) */
    sub  esp, 112;
    mov  dword ptr [esp+96], ebx;
    mov  dword ptr [esp+100], esi;
    mov  dword ptr [esp+104], edi;
    mov  dword ptr [esp+108], ebp;

    /* Copy state */
    mov esi, dword ptr [esp+120]; // esi = state pointer, read from the stack (the ARM port can skip this load)

    /* Copy the eight 32-bit state words from [esi] to esp+0..28 */
    mov eax, dword ptr [esi];
    mov dword ptr [esp], eax;
    mov eax, dword ptr [esi+4];
    mov dword ptr [esp+4], eax;
    mov eax, dword ptr [esi+8];
    mov dword ptr [esp+8], eax;
    mov eax, dword ptr [esi+12];
    mov dword ptr [esp+12], eax;
    mov eax, dword ptr [esi+16];
    mov dword ptr [esp+16], eax;
    mov eax, dword ptr [esi+20];
    mov dword ptr [esp+20], eax;
    mov eax, dword ptr [esi+24];
    mov dword ptr [esp+24], eax;
    mov eax, dword ptr [esi+28];
    mov dword ptr [esp+28], eax;

    /* Do 64 rounds of hashing */
    /* edi = input block pointer, read from the stack */
    mov edi, dword ptr [esp+116]

This part is a bit more complicated as we are dealing with a different calling convention than x86

However it might not be as complicated as you might think

The first mov esi instruction is loading the address of where the state is currently located into esi.

The state we have is going to be loaded into r1 instead, we can therefore basically skip the whole step of moving the ptr from the stack into esi.

After that it copies the state onto the stack, which the rest of our algorithm is set up to do. When it hits the part where it does the 64 rounds of hashing, it moves our input into edi.

Again our input is already located inside of r0 in this case, we can therefore just move it from r0 to our edi equivalent (r5).

Lastly we need to move r1 into a register which is not touched by our algorithm. This is so we can easily find out where the state is located, later.

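Another approach to saving r1 would be to save it on the stack. But since we have an abundance of registers, it kinda just made sense to throw it into r8. (Strictly speaking, r8 is a callee-saved register in the ARM calling convention, so a fully conformant version would also have to preserve r8's old value.)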

The conversion of what's mentioned above looks as follows:

    /* Allocate scratch space, save registers */
    /*
     * The frame is 112 bytes, the same amount the x86 version allocates and the
     * amount released by `add sp, sp, #112` on exit. The two must match, or the
     * caller gets back a shifted stack pointer.
     */
    sub  sp, sp, #112;
    str  r1, [sp,#96];
    str  r4, [sp,#100];
    str  r5, [sp,#104];
    str  fp, [sp,#108];

    /* Copy state */
    /* We need to move r1 into r4 and r0 into r5 for later, because of the different calling convention */
    /* We also need to save r1 inside of r8, as r4 is overwritten during the rounds */
    mov r4, r1
    mov r5, r0
    mov r8, r1

    /* Then we copy the eight state words from [r4] to sp+0..28 */
    ldr r0, [r4];
    str r0, [sp];
    ldr r0, [r4, #4];
    str r0, [sp, #4];
    ldr r0, [r4, #8];
    str r0, [sp, #8];
    ldr r0, [r4, #12];
    str r0, [sp, #12];
    ldr r0, [r4, #16];
    str r0, [sp, #16];
    ldr r0, [r4, #20];
    str r0, [sp, #20];
    ldr r0, [r4, #24];
    str r0, [sp, #24];
    ldr r0, [r4, #28];
    str r0, [sp, #28];

We then perform the 64 rounds of hashing, but have divided the constants up into their byte equivalents:

    /* Do 64 rounds of hashing */
    ROUNDa( 0, 0, 1, 2, 3, 4, 5, 6, 7, 0x42,0x8A,0x2F,0x98)
    ROUNDa( 1, 7, 0, 1, 2, 3, 4, 5, 6, 0x71,0x37,0x44,0x91)
    ROUNDa( 2, 6, 7, 0, 1, 2, 3, 4, 5, 0xB5,0xC0,0xFB,0xCF)
    ROUNDa( 3, 5, 6, 7, 0, 1, 2, 3, 4, 0xE9,0xB5,0xDB,0xA5)
    ....

Lastly we need to implement the version of adding to the state at the end and returning:

    /* Add to state */
    mov  esi, dword ptr [esp+120];  /* Argument: state */

    /* For each of the eight words: state[n] += working copy at esp+4n */
    mov  eax, dword ptr [esp];
    add dword ptr [esi], eax;
    mov  eax, dword ptr [esp+4];
    add dword ptr [esi+4], eax;
    mov  eax, dword ptr [esp+8];
    add dword ptr [esi+8], eax;
    mov  eax, dword ptr [esp+12];
    add dword ptr [esi+12], eax;
    mov  eax, dword ptr [esp+16];
    add dword ptr [esi+16], eax;
    mov  eax, dword ptr [esp+20];
    add dword ptr [esi+20], eax;
    mov  eax, dword ptr [esp+24];
    add dword ptr [esi+24], eax;
    mov  eax, dword ptr [esp+28];
    add dword ptr [esi+28], eax;

    /* Restore registers */
    /* Reload the four registers stored at esp+96..108 on entry, then release the 112-byte frame */
    mov ebx, dword ptr [esp+96]
    mov esi, dword ptr [esp+100]
    mov edi, dword ptr [esp+104]
    mov ebp, dword ptr [esp+108]
    add esp, 112
    ret

We saved the state argument inside of r8 earlier and can therefore just move it straight into r4:

mov  r4, r8;  /* Argument: state */

Next we need to perform versions of the following multiple times, with different offsets to esp:

mov  eax, dword ptr [esp];     
add dword ptr [esi], eax;

Because there is no equivalent to add dword ptr, we will have to load the values first, do the addition and store it afterwards.

A direct conversion of this, looks as follows:

ldr  r0, [sp];
ldr  r6, [r4];
add  r6, r6, r0;
str  r6, [r4];

Then we just copy/paste and offset into r4 and sp as done in the x86 version:

/*
 * For each of the eight words: state[n] += working copy at sp+4n.
 * r4 = state pointer, r0 = working copy word, r6 = old state word / sum.
 * Each x86 `add dword ptr [esi+n], eax` becomes load, add, store.
 */
/* word 0 */
ldr  r0, [sp];
ldr  r6, [r4];
add  r6, r6, r0;
str  r6, [r4];
/* word 1 */
ldr  r0, [sp,#4];
ldr  r6, [r4,#4];
add  r6, r6, r0;
str  r6, [r4,#4];
/* word 2 */
ldr  r0, [sp,#8];
ldr  r6, [r4,#8];
add  r6, r6, r0;
str  r6, [r4,#8];
/* word 3 */
ldr  r0, [sp,#12];
ldr  r6, [r4,#12];
add  r6, r6, r0;
str  r6, [r4,#12];
/* word 4 */
ldr  r0, [sp,#16];
ldr  r6, [r4,#16];
add  r6, r6, r0;
str  r6, [r4,#16];
/* word 5 */
ldr  r0, [sp,#20];
ldr  r6, [r4,#20];
add  r6, r6, r0;
str  r6, [r4,#20];
/* word 6 */
ldr  r0, [sp,#24];
ldr  r6, [r4,#24];
add  r6, r6, r0;
str  r6, [r4,#24];
/* word 7 */
ldr  r0, [sp,#28];
ldr  r6, [r4,#28];
add  r6, r6, r0;
str  r6, [r4,#28];

After this, we just need to restore the registers as well and return. When a call is made in ARM with a bl instruction, the return address is stored in the link register (lr).

We therefore just need to push the link register and pop it into pc:

/* Reload the four registers stored at sp+96..108 on entry */
ldr r1, [sp,#96]
ldr r4, [sp,#100]
ldr r5, [sp,#104]
ldr fp, [sp,#108]
/* Release the 112-byte scratch frame */
add sp, sp, #112
/* Return: lr travels through the stack into pc, which jumps back to the caller */
push {lr}
pop {pc}

We've now converted the x86 sha256 implementation to arm!

The full code can be found at the end of this blog post.

Testing it

This is however only the sha256_compress implemented.

We now need to test whether the compression actually worked.

Fortunately the same author provides a sha256-test.c program, that we can use for testing.

We can then compile for ARM using:

arm-linux-gnueabi-gcc sha256-test.c sha256-arm.S -o sha256-test

And emulate it in qemu to test the implementation:

qemu-arm-static ./sha256-test

The self checks pass. At first I thought a significant amount of speed had been lost compared to the original implementation, which I had run directly on my computer:

However, I forgot that qemu adds an emulation layer. Running the original through qemu-i386-static yields a significantly different result:

So whether the implementation is faster or slower I'd like to leave as an exercise for the reader, as these tests are kind of inconclusive.

Final Notes

That was the journey of porting an x86 sha256 implementation to ARM!

If you have any questions or concerns, please feel free to contact me directly at jan@icsrange.com and I will do my best to get back to you.

Also please don't forget to check out the original authors website:

https://www.nayuki.io/

He does some really cool stuff and has actually implemented more than just sha256 in assembly. Maybe a port for a blogpost another day ;-)

Signing out!

Jens Nielsen - Senior Security Researcher - ICSRange

Full code

.globl sha256_compress
.p2align 2
.syntax unified
.arch armv7-a
.arm

/*
eax = r0
ebx = r1
ecx = r2
edx = r3
esi = r4
edi = r5

Clobbers r6
*/

sha256_compress:    
    /* SCHED(i): slot (i mod 16) of the 16-word message schedule ring, at sp+32 .. sp+92 */
    #define SCHED(i)  [sp, #((((i)&0xF)+8)*4)]
    /* STATE(i): working state word i at sp + 4*i; the argument is parenthesized so an expression expands safely */
    #define STATE(i)  [sp, #((i)*4)]
    /*
     * ROUNDTAIL: one compression round on the working state kept on the stack.
     *   i                          round index (not referenced in the body)
     *   a..h                       STATE() slot numbers of the eight working variables
     *   ktop, kmid1, kmid2, kbot   the 32-bit round constant, most significant byte first
     * Reads fp as the message schedule word for this round.
     * Writes STATE(d) and STATE(h); uses r0-r4 and r6 as scratch.
     */
    #define ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        /* Part 0: r4 = h + w + k + (e ror 6 ^ e ror 11 ^ e ror 25) + ((f ^ g) & e ^ g) */ \
        ldr   r0, STATE(e);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 11;                          \
        mov   r1, r1, ror 25;                          \
        mov   r2, r2, ror 6;                           \
        ldr   r4, STATE(h);                            \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        add   r4, r4, fp;                              \
        ldr   r1, STATE(g);                            \
        ldr   r0, STATE(f);                            \
        eor   r0, r0, r1;                              \
        and   r0, r0, r3;                              \
        eor   r0, r0, r1;                              \
        /* Build k in r6 a byte at a time; the first mov overwrites all of r6 */ \
        mov   r6, #(ktop);                             \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kmid1);                        \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kmid2);                        \
        mov   r6, r6, lsl 8;                           \
        orr   r6, r6, #(kbot);                         \
        /* Replacement for x86 `lea ecx, [ecx + eax + k]` */ \
        add   r2, r2, r6;                              \
        add   r2, r2, r0;                              \
        add   r4, r4, r2;                              \
        /* Part 1: d += r4 (load/add/store, since ARM cannot add to memory directly) */ \
        ldr   r6, STATE(d);                            \
        add   r6, r6, r4;                              \
        str   r6, STATE(d);                            \
        /* Part 2: h = r4 + (a ror 2 ^ a ror 13 ^ a ror 22) + ((c | b) & a | (c & b)) */ \
        ldr   r0, STATE(a);                            \
        mov   r1, r0;                                  \
        mov   r2, r0;                                  \
        mov   r3, r0;                                  \
        mov   r0, r0, ror 13;                          \
        mov   r1, r1, ror 22;                          \
        mov   r2, r2, ror 2;                           \
        eor   r0, r0, r1;                              \
        eor   r2, r2, r0;                              \
        ldr   r0, STATE(c);                            \
        add   r4, r4, r2;                              \
        mov   r2, r0;                                  \
        ldr   r1, STATE(b);                            \
        orr   r2, r2, r1;                              \
        and   r0, r0, r1;                              \
        and   r2, r2, r3;                              \
        orr   r2, r2, r0;                              \
        add   r4, r4, r2;                              \
        str   r4, STATE(h);

    /*
     * ROUNDb: extends the message schedule, then runs the round.
     *   fp = SCHED(i-16) + SCHED(i-7) + s0(SCHED(i-15)) + s1(SCHED(i-2))
     *   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x lsr 3)
     *   s1(x) = (x ror 17) ^ (x ror 19) ^ (x lsr 10)
     * The new word is stored to SCHED(i) and is still in fp when ROUNDTAIL runs.
     * Scratch: r0, r1, r2, r6.
     */
    #define ROUNDb(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        /* fp = SCHED(i-16) + SCHED(i-7) */           \
        ldr  fp, SCHED(i-16);                         \
        ldr  r6, SCHED(i -7);                         \
        add  fp, fp, r6;                              \
        /* fp += s0(SCHED(i-15)) */                   \
        ldr  r0, SCHED(i-15);                         \
        mov  r1, r0, ror 18;                          \
        mov  r2, r0, lsr 3;                           \
        eor  r1, r1, r2;                              \
        mov  r0, r0, ror 7;                           \
        eor  r0, r0, r1;                              \
        add  fp, fp, r0;                              \
        /* fp += s1(SCHED(i-2)) */                    \
        ldr  r0, SCHED(i- 2);                         \
        mov  r1, r0, ror 19;                          \
        mov  r2, r0, lsr 10;                          \
        eor  r1, r1, r2;                              \
        mov  r0, r0, ror 17;                          \
        eor  r0, r0, r1;                              \
        add  fp, fp, r0;                              \
        str  fp, SCHED(i);                            \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)

    /*
     * ROUNDa: loads 32-bit word i of the input block at r5, reverses its byte
     * order, stores it to SCHED(i) and runs ROUNDTAIL with the word still in fp.
     * The index is parenthesized so that an expression argument expands safely.
     */
    #define ROUNDa(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)  \
        ldr    fp, [r5,#((i)*4)];    \
        rev    fp, fp;               \
        str    fp, SCHED(i);         \
        ROUNDTAIL(i, a, b, c, d, e, f, g, h, ktop, kmid1, kmid2, kbot)

    /*
     * Function body.
     * On entry: r0 = pointer to the input block, r1 = pointer to the 8-word state.
     * Frame (112 bytes): STATE words at sp+0..28, SCHED words at sp+32..92,
     * saved r4/r5/r6/fp at sp+96..108.
     *
     * r4, r5, r6 and fp are all overwritten below (by this body or by the round
     * macros) and are callee-saved under the ARM procedure call standard, so all
     * four are saved here and restored before returning. The state pointer is
     * kept in ip (r12), a caller-saved scratch register that the round macros
     * never touch, so it needs neither a save slot nor a callee-saved register.
     */

    /* Allocate scratch space, save registers */
    sub  sp, sp, #112;
    str  r4, [sp,#96];
    str  r5, [sp,#100];
    str  r6, [sp,#104];
    str  fp, [sp,#108];

    /* Move the arguments out of r0/r1, which the rounds use as scratch */
    mov  r5, r0      /* r5 = input block pointer, read by ROUNDa */
    mov  ip, r1      /* ip = state pointer, needed again after the rounds */

    /* Copy the eight state words to STATE(0..7) on the stack */
    ldr r0, [ip];
    str r0, [sp];
    ldr r0, [ip, #4];
    str r0, [sp, #4];
    ldr r0, [ip, #8];
    str r0, [sp, #8];
    ldr r0, [ip, #12];
    str r0, [sp, #12];
    ldr r0, [ip, #16];
    str r0, [sp, #16];
    ldr r0, [ip, #20];
    str r0, [sp, #20];
    ldr r0, [ip, #24];
    str r0, [sp, #24];
    ldr r0, [ip, #28];
    str r0, [sp, #28];

    /* Do 64 rounds of hashing */
    ROUNDa( 0, 0, 1, 2, 3, 4, 5, 6, 7, 0x42,0x8A,0x2F,0x98)
    ROUNDa( 1, 7, 0, 1, 2, 3, 4, 5, 6, 0x71,0x37,0x44,0x91)
    ROUNDa( 2, 6, 7, 0, 1, 2, 3, 4, 5, 0xB5,0xC0,0xFB,0xCF)
    ROUNDa( 3, 5, 6, 7, 0, 1, 2, 3, 4, 0xE9,0xB5,0xDB,0xA5)
    ROUNDa( 4, 4, 5, 6, 7, 0, 1, 2, 3, 0x39,0x56,0xC2,0x5B)
    ROUNDa( 5, 3, 4, 5, 6, 7, 0, 1, 2, 0x59,0xF1,0x11,0xF1)
    ROUNDa( 6, 2, 3, 4, 5, 6, 7, 0, 1, 0x92,0x3F,0x82,0xA4)
    ROUNDa( 7, 1, 2, 3, 4, 5, 6, 7, 0, 0xAB,0x1C,0x5E,0xD5)
    ROUNDa( 8, 0, 1, 2, 3, 4, 5, 6, 7, 0xD8,0x07,0xAA,0x98)
    ROUNDa( 9, 7, 0, 1, 2, 3, 4, 5, 6, 0x12,0x83,0x5B,0x01)
    ROUNDa(10, 6, 7, 0, 1, 2, 3, 4, 5, 0x24,0x31,0x85,0xBE)
    ROUNDa(11, 5, 6, 7, 0, 1, 2, 3, 4, 0x55,0x0C,0x7D,0xC3)
    ROUNDa(12, 4, 5, 6, 7, 0, 1, 2, 3, 0x72,0xBE,0x5D,0x74)
    ROUNDa(13, 3, 4, 5, 6, 7, 0, 1, 2, 0x80,0xDE,0xB1,0xFE)
    ROUNDa(14, 2, 3, 4, 5, 6, 7, 0, 1, 0x9B,0xDC,0x06,0xA7)
    ROUNDa(15, 1, 2, 3, 4, 5, 6, 7, 0, 0xC1,0x9B,0xF1,0x74)
    ROUNDb(16, 0, 1, 2, 3, 4, 5, 6, 7, 0xE4,0x9B,0x69,0xC1)
    ROUNDb(17, 7, 0, 1, 2, 3, 4, 5, 6, 0xEF,0xBE,0x47,0x86)
    ROUNDb(18, 6, 7, 0, 1, 2, 3, 4, 5, 0x0F,0xC1,0x9D,0xC6)
    ROUNDb(19, 5, 6, 7, 0, 1, 2, 3, 4, 0x24,0x0C,0xA1,0xCC)
    ROUNDb(20, 4, 5, 6, 7, 0, 1, 2, 3, 0x2D,0xE9,0x2C,0x6F)
    ROUNDb(21, 3, 4, 5, 6, 7, 0, 1, 2, 0x4A,0x74,0x84,0xAA)
    ROUNDb(22, 2, 3, 4, 5, 6, 7, 0, 1, 0x5C,0xB0,0xA9,0xDC)
    ROUNDb(23, 1, 2, 3, 4, 5, 6, 7, 0, 0x76,0xF9,0x88,0xDA)
    ROUNDb(24, 0, 1, 2, 3, 4, 5, 6, 7, 0x98,0x3E,0x51,0x52)
    ROUNDb(25, 7, 0, 1, 2, 3, 4, 5, 6, 0xA8,0x31,0xC6,0x6D)
    ROUNDb(26, 6, 7, 0, 1, 2, 3, 4, 5, 0xB0,0x03,0x27,0xC8)
    ROUNDb(27, 5, 6, 7, 0, 1, 2, 3, 4, 0xBF,0x59,0x7F,0xC7)
    ROUNDb(28, 4, 5, 6, 7, 0, 1, 2, 3, 0xC6,0xE0,0x0B,0xF3)
    ROUNDb(29, 3, 4, 5, 6, 7, 0, 1, 2, 0xD5,0xA7,0x91,0x47)
    ROUNDb(30, 2, 3, 4, 5, 6, 7, 0, 1, 0x06,0xCA,0x63,0x51)
    ROUNDb(31, 1, 2, 3, 4, 5, 6, 7, 0, 0x14,0x29,0x29,0x67)
    ROUNDb(32, 0, 1, 2, 3, 4, 5, 6, 7, 0x27,0xB7,0x0A,0x85)
    ROUNDb(33, 7, 0, 1, 2, 3, 4, 5, 6, 0x2E,0x1B,0x21,0x38)
    ROUNDb(34, 6, 7, 0, 1, 2, 3, 4, 5, 0x4D,0x2C,0x6D,0xFC)
    ROUNDb(35, 5, 6, 7, 0, 1, 2, 3, 4, 0x53,0x38,0x0D,0x13)
    ROUNDb(36, 4, 5, 6, 7, 0, 1, 2, 3, 0x65,0x0A,0x73,0x54)
    ROUNDb(37, 3, 4, 5, 6, 7, 0, 1, 2, 0x76,0x6A,0x0A,0xBB)
    ROUNDb(38, 2, 3, 4, 5, 6, 7, 0, 1, 0x81,0xC2,0xC9,0x2E)
    ROUNDb(39, 1, 2, 3, 4, 5, 6, 7, 0, 0x92,0x72,0x2C,0x85)
    ROUNDb(40, 0, 1, 2, 3, 4, 5, 6, 7, 0xA2,0xBF,0xE8,0xA1)
    ROUNDb(41, 7, 0, 1, 2, 3, 4, 5, 6, 0xA8,0x1A,0x66,0x4B)
    ROUNDb(42, 6, 7, 0, 1, 2, 3, 4, 5, 0xC2,0x4B,0x8B,0x70)
    ROUNDb(43, 5, 6, 7, 0, 1, 2, 3, 4, 0xC7,0x6C,0x51,0xA3)
    ROUNDb(44, 4, 5, 6, 7, 0, 1, 2, 3, 0xD1,0x92,0xE8,0x19)
    ROUNDb(45, 3, 4, 5, 6, 7, 0, 1, 2, 0xD6,0x99,0x06,0x24)
    ROUNDb(46, 2, 3, 4, 5, 6, 7, 0, 1, 0xF4,0x0E,0x35,0x85)
    ROUNDb(47, 1, 2, 3, 4, 5, 6, 7, 0, 0x10,0x6A,0xA0,0x70)
    ROUNDb(48, 0, 1, 2, 3, 4, 5, 6, 7, 0x19,0xA4,0xC1,0x16)
    ROUNDb(49, 7, 0, 1, 2, 3, 4, 5, 6, 0x1E,0x37,0x6C,0x08)
    ROUNDb(50, 6, 7, 0, 1, 2, 3, 4, 5, 0x27,0x48,0x77,0x4C)
    ROUNDb(51, 5, 6, 7, 0, 1, 2, 3, 4, 0x34,0xB0,0xBC,0xB5)
    ROUNDb(52, 4, 5, 6, 7, 0, 1, 2, 3, 0x39,0x1C,0x0C,0xB3)
    ROUNDb(53, 3, 4, 5, 6, 7, 0, 1, 2, 0x4E,0xD8,0xAA,0x4A)
    ROUNDb(54, 2, 3, 4, 5, 6, 7, 0, 1, 0x5B,0x9C,0xCA,0x4F)
    ROUNDb(55, 1, 2, 3, 4, 5, 6, 7, 0, 0x68,0x2E,0x6F,0xF3)
    ROUNDb(56, 0, 1, 2, 3, 4, 5, 6, 7, 0x74,0x8F,0x82,0xEE)
    ROUNDb(57, 7, 0, 1, 2, 3, 4, 5, 6, 0x78,0xA5,0x63,0x6F)
    ROUNDb(58, 6, 7, 0, 1, 2, 3, 4, 5, 0x84,0xC8,0x78,0x14)
    ROUNDb(59, 5, 6, 7, 0, 1, 2, 3, 4, 0x8C,0xC7,0x02,0x08)
    ROUNDb(60, 4, 5, 6, 7, 0, 1, 2, 3, 0x90,0xBE,0xFF,0xFA)
    ROUNDb(61, 3, 4, 5, 6, 7, 0, 1, 2, 0xA4,0x50,0x6C,0xEB)
    ROUNDb(62, 2, 3, 4, 5, 6, 7, 0, 1, 0xBE,0xF9,0xA3,0xF7)
    ROUNDb(63, 1, 2, 3, 4, 5, 6, 7, 0, 0xC6,0x71,0x78,0xF2)

    /* Add to state: state[n] += STATE(n) for the eight words (ip = state pointer) */
    ldr  r0, [sp];
    ldr  r6, [ip];
    add  r6, r6, r0;
    str  r6, [ip];
    ldr  r0, [sp,#4];
    ldr  r6, [ip,#4];
    add  r6, r6, r0;
    str  r6, [ip,#4];
    ldr  r0, [sp,#8];
    ldr  r6, [ip,#8];
    add  r6, r6, r0;
    str  r6, [ip,#8];
    ldr  r0, [sp,#12];
    ldr  r6, [ip,#12];
    add  r6, r6, r0;
    str  r6, [ip,#12];
    ldr  r0, [sp,#16];
    ldr  r6, [ip,#16];
    add  r6, r6, r0;
    str  r6, [ip,#16];
    ldr  r0, [sp,#20];
    ldr  r6, [ip,#20];
    add  r6, r6, r0;
    str  r6, [ip,#20];
    ldr  r0, [sp,#24];
    ldr  r6, [ip,#24];
    add  r6, r6, r0;
    str  r6, [ip,#24];
    ldr  r0, [sp,#28];
    ldr  r6, [ip,#28];
    add  r6, r6, r0;
    str  r6, [ip,#28];

    /* Restore registers, release the frame and return to the address in lr */
    ldr r4, [sp,#96]
    ldr r5, [sp,#100]
    ldr r6, [sp,#104]
    ldr fp, [sp,#108]
    add sp, sp, #112
    bx  lr

Mikael Vingaard