• Welcome to Valhalla Legends Archive.
 

[RESOLVED] Broken SHA-1

Started by shadypalm88, October 16, 2004, 12:29 PM

Previous topic - Next topic

shadypalm88

Hi everyone.  I'm working on a cross-platform C/C++ library that is based off of the public-domain code from JavaOp.  Eventually it will handle CD-key decoding & hashing, revision checking, and login capabilities for all clients.  CD-key decoding is working great, but there are some issues with the broken SHA-1 algorithm.  I've managed to trace the problem to one portion of the code.

Here is the code from JavaOp (dated September 27, 2004):
// file: src/bot/util/BrokenSHA1.java
private static void doHash(int[] hashBuffer)
{
    int buf[] = new int[0x50];
    int dw, a, b, c, d, e;
    int p;

    int i;

    for(i = 0; i < 0x10 ; i++)
        buf[i] = hashBuffer[i + 5];

   
    for(i = 0x10; i < 0x50; i++)
    {
        dw = buf[i-0x10] ^ buf[i-0x8] ^ buf[i-0xE] ^ buf[i-0x3];
        buf[i] = (1 >>> (0x20 - (byte)dw)) | (1 << (byte) dw);
    }

    a = hashBuffer[0];
    b = hashBuffer[1];
    c = hashBuffer[2];
    d = hashBuffer[3];
    e = hashBuffer[4];

    p = 0;

    i = 0x14;
    do
    {
        dw = ((a << 5) | (a >>> 0x1b)) + ((~b & d) | (c & b)) + e + buf[p++] + 0x5a827999;
        e = d;
        d = c;
        c = (b >>> 2) | (b << 0x1e);
        b = a;
        a = dw;
    }
    while(--i > 0);

    i = 0x14;
    do
    {
        dw = (d ^ c ^ b) + e + ((a << 5) | (a >>> 0x1b)) + buf[p++] + 0x6ED9EBA1;
        e = d;
        d = c;
        c = (b >>> 2) | (b << 0x1e);
        b = a;
        a = dw;
    }
    while(--i > 0);

    i = 0x14;
    do
    {
        dw = ((c & b) | (d & c) | (d & b)) + e + ((a << 5) | (a >>> 0x1b)) + buf[p++] - 0x70E44324;
        e = d;
        d = c;
        c = (b >>> 2) | (b << 0x1e);
        b = a;
        a = dw;
    }
    while(--i > 0);

    i = 0x14;
    do
    {
        dw = ((a << 5) | (a >>> 0x1b)) + e + (d ^ c ^ b) + buf[p++] - 0x359D3E2A;
        e = d;
        d = c;
        c = (b >>> 2) | (b << 0x1e);
        b = a;
        a = dw;
    }
    while(--i > 0);

    hashBuffer[0] += a;
    hashBuffer[1] += b;
    hashBuffer[2] += c;
    hashBuffer[3] += d;
    hashBuffer[4] += e;
}   


The issue in my code lies in the calculation of a, b, c, d, and e (it gets through the first two loops properly).

Now, my C++ code (note the addition of the extra loop; I did this to try and limit code duplication):
// file: bsha1.cpp
void doHash(unsigned int* hashBuffer) {
    unsigned int* buf = new unsigned int[0x50];
    int dw, a, b, c, d, e;
    unsigned int* p;
    int i, j;
   
    memcpy(buf, hashBuffer + 5, 0x40);
   
    for (i = 0x10; i < 0x50; i++) {
        dw = buf[i - 0x10] ^ buf[i - 0x8] ^ buf[i - 0xE] ^ buf[i - 0x3];
        buf[i] = (1 >> (0x20 - (unsigned char) LSB4(dw)) |
            (1 << (unsigned char) LSB4(dw)));
    }
   
    a = hashBuffer[0];
    b = hashBuffer[1];
    c = hashBuffer[2];
    d = hashBuffer[3];
    e = hashBuffer[4];
   
    p = buf;

    for (i = 0; i < 4; i++) {
        j = 0x14;
        do {
            switch (i) {
                case 0:
                    dw = ((a << 5) | (a >> 0x1B)) + ((~b & d) | (c & b)) + e +
                        *p++ + 0x5A827999;
                    break;
                case 1:
                    dw = (d ^ c ^ b) + e + ((a << 5) | (a >> 0x1B)) + *p++ +
                        0x6ED9EBA1;
                    break;
                case 2:
                    dw = ((c & b) | (d & c) | (d & b)) + e +
                        ((a << 5) | (a >> 0x1B)) + *p++ - 0x70E44324;
                    break;
                case 3:
                    dw = ((a << 5) | (a >> 0x1B)) + e + (d ^ c ^ b) + *p++ -
                        0x359D3E2A;
                    break;
            }
            e = d;
            d = c;
            c = (b >> 2) | (b << 0x1E);
            b = a;
            a = dw;
        } while (--j);
    }

    printf("a=0x%X b=0x%X c=0x%X d=0x%X e=0x%X\n", a, b, c, d, e);
   
    hashBuffer[0] += a;
    hashBuffer[1] += b;
    hashBuffer[2] += c;
    hashBuffer[3] += d;
    hashBuffer[4] += e;

    printf("Final: ");
    for (int k = 0; k < 21; k++) {
        printf("%X ", hashBuffer[k]);
    }
    printf("\n");
   
}


By the way, the LSB4 macro is defined as:#define SWAP2(num) ((((num) >> 8) & 0x00FF) | (((num) << 8) & 0xFF00))
#define SWAP4(num) ((((num) >> 24) & 0x000000FF) | (((num) >> 8) & 0x0000FF00) | (((num) << 8) & 0x00FF0000) | (((num) << 24) & 0xFF000000))

#ifdef BIGENDIAN
#define LSB2(num) SWAP2(num)
#define LSB4(num) SWAP4(num)
#define MSB2(num) (num)
#define MSB4(num) (num)
#else // (little endian)
#define LSB2(num) (num)
#define LSB4(num) (num)
#define MSB2(num) SWAP2(num)
#define MSB4(num) SWAP4(num)
#endif // (endianness)


The two testing programs are being run on the same computer, compiled by the same compiler.

Output from Yobgul's bnetauth for an example CD-key hash:a=0x490625EA b=0x10AC7BDA c=0xDE73312D d=0x47C96CC5 e=0x5B873BEF

Final: B04B48EB 7A2763 772E0E2B 57FBC13B 1F5A1DDF 6EE5B 299CF83 6 90B921 0 A455B972 0 0 0 0 0 0 0 0 0 0


And mine:a=0x63B9F483 b=0xB27829CF c=0xE9985F61 d=0x53B16AFF e=0x5F4A79DE

Final: CAFF1784 A245D558 82533C5F 63E3BF75 231D5BCE 6EE5B 299CF83 6 90B921 0 A455B972 0 0 0 0 0 0 0 0 0 0

Any and all help would be greatly appreciated.

P.S. My code generates yet another result when run on a Mac (big-endian) system.  The keys still decode fine on both systems, I'm just not sure where the endian-related issues are in the SHA-1 code.  But I'd like to take care of this first.

Adron

I think your problem is signed vs unsigned.

shadypalm88

Quote from: Adron on October 16, 2004, 12:53 PM
I think your problem is signed vs unsigned.
::) All that and I was missing one "unsigned".  Something I simply overlooked.  Thanks a lot!  Now I just have the endian problem to fix.

Kp

I know you did it to be faithful to the JavaOp implementation, but there's several things in your translation that could be improved now that you're using a good language.

First, you could use a local array rather than dynamically allocating 50 integers (which I don't see you freeing - that's a bug!).
Next, that doubled up loop is a mess and will hurt performance unnecessarily.  You'd be better off to unroll the outer loop, so you have four loops in series, one for each branch of your current switch statement.

As an aesthetic change, you could make the code a little bit more readable by using the rol/ror idea rather than emulating it everywhere it's needed.  Like this:

#define ROL(a,b)  ((a << b) | (a >> 32 - b))

It's the same result as your code, but it might make things a bit prettier. :)
[19:20:23] (BotNet) <[vL]Kp> Any idiot can make a bot with CSB, and many do!

shadypalm88

Quote from: Kp on October 16, 2004, 01:44 PM
I know you did it to be faithful to the JavaOp implementation, but there's several things in your translation that could be improved now that you're using a good language.
Thanks.  This is kind of a learning experience for me... and I assume by "good" you mean "the language won't hold your hand and protect the user from your incompetence now".

Quote from: Kp on October 16, 2004, 01:44 PMFirst, you could use a local array rather than dynamically allocating 50 integers
Right.  For some reason I thought there would be some sort of benefit to the dynamic allocation... I don't know what I thought it would be, though.  Hmm.

Quote from: Kp on October 16, 2004, 01:44 PM(which I don't see you freeing - that's a bug!).
I noticed that right after I copied it in, and put a delete statement at the end.  But thanks.

Quote from: Kp on October 16, 2004, 01:44 PMNext, that doubled up loop is a mess and will hurt performance unnecessarily.  You'd be better off to unroll the outer loop, so you have four loops in series, one for each branch of your current switch statement.
That is how it was originally, but I have this aversion to copying + pasting code, which is why I put the outer loop in.  I didn't realize it would cause a significant performance hit though.

Quote from: Kp on October 16, 2004, 01:44 PM
As an aesthetic change, you could make the code a little bit more readable by using the rol/ror idea rather than emulating it everywhere it's needed.  Like this:

#define ROL(a,b)  ((a << b) | (a >> 32 - b))

It's the same result as your code, but it might make things a bit prettier. :)
Good idea.  There probably would've been something similar in the JavaOp code, I imagine, if Java had macros, or even inlining.

Behold... the code:// file: mutil.h
#ifndef ROL
#define ROL(a,b) (((a) << (b)) | ((a) >> 32 - (b)))
#endif
#ifndef ROR
#define ROR(a,b) (((a) >> (b)) | ((a) << 32 - (b)))
#endif


// file: bsha1.cpp
#include "mutil.h"
#include "bsha1.h"
#include <cstring>

void doHash(unsigned int* hashBuffer) {
    unsigned int buf[0x50];
    unsigned int dw, a, b, c, d, e;
    unsigned int* p;
    int i, j;
   
    memcpy(buf, hashBuffer + 5, 0x40);
   
    for (i = 0x10; i < 0x50; i++) {
        dw = buf[i - 0x10] ^ buf[i - 0x8] ^ buf[i - 0xE] ^ buf[i - 0x3];
        buf[i] = ROL(1, dw);
    }
   
    a = hashBuffer[0];
    b = hashBuffer[1];
    c = hashBuffer[2];
    d = hashBuffer[3];
    e = hashBuffer[4];
   
    p = buf;

    j = 0x14;
    do {
        dw = ROL(a, 5) + ((~b & d) | (c & b)) + e + *p++ + 0x5A827999;

        e = d;
        d = c;
        c = ROL(b, 30);
        b = a;
        a = dw;
    } while (--j);
   
    j = 0x14;
    do {
        dw = (d ^ c ^ b) + e + ROL(a, 5) + *p++ + 0x6ED9EBA1;   
   
        e = d;
        d = c;
        c = ROL(b, 30);
        b = a;
        a = dw;
    } while (--j);
   
    j = 0x14;
    do {
        dw = ((c & b) | (d & c) | (d & b)) + e + ROL(a, 5) + *p++ - 0x70E44324;
       
        e = d;
        d = c;
        c = ROL(b, 30);
        b = a;
        a = dw;
    } while (--j);
   
    j = 0x14;
    do {
        dw = ROL(a, 5) + e + (d ^ c ^ b) + *p++ -
            0x359D3E2A;

        e = d;
        d = c;
        c = ROL(b, 30);
        b = a;
        a = dw;
    } while (--j);
   
    hashBuffer[0] += a;
    hashBuffer[1] += b;
    hashBuffer[2] += c;
    hashBuffer[3] += d;
    hashBuffer[4] += e;
   
}

MEXP(void) calcHashBuf(char* data, unsigned int length, char* hash) {
    unsigned int hashBuffer[21];
    int subLength;
   
    // Fill in default values
    hashBuffer[0] = 0x67452301;
    hashBuffer[1] = 0xEFCDAB89;
    hashBuffer[2] = 0x98BADCFE;
    hashBuffer[3] = 0x10325476;
    hashBuffer[4] = 0xC3D2E1F0;
    /*hashBuffer[0] = LSB4(0x67452301);
    hashBuffer[1] = LSB4(0xEFCDAB89);
    hashBuffer[2] = LSB4(0x98BADCFE);
    hashBuffer[3] = LSB4(0x10325476);
    hashBuffer[4] = LSB4(0xC3D2E1F0);*/
   
    for (int i = 0; i < length; i += 0x40) {
        subLength = length - i;
       
        // subLength must be <= 0x40
        if (subLength > 0x40)
            subLength = 0x40;
       
        // copy active portion of data to buffer
        memcpy(hashBuffer + 5, data + i, subLength);
       
        // if end of buffer is not reached, it must be zero-padded
        if (subLength < 0x40) {
            memset((char*) (hashBuffer + 5) + subLength, 0, 0x40 - subLength);
        }

        doHash(hashBuffer);
    }
    memcpy(hash, hashBuffer, 20);
}


The new code works fine... on x86.  The endian issue still remains, and still has me baffled.

x86 Output: (Microsoft Windows XP, VS .NET 2005 Beta 1)product = 0x6, val1 = 0x90B921, val2 = 0xA455B972
Size of hash: 20
Hash: EB 48 4B B0 63 27 7A 0 2B E 2E 77 3B C1 FB 57 DF 1D 5A 1F


PPC Output: (Mac OS X 10.3.5 "Panther", gcc 3.3)product = 0x6, val1 = 0x90B921, val2 = 0xA455B972
Size of hash: 20
Hash: 87 2A 81 C9 25 1B D5 AF 3B 9C D1 E 81 28 16 40 77 AB EE BC

Skywing

Try stepping through both the x86 and PPC ones side by side and see where stuff starts to differ.

iago

hm, if endianness is changed, ROL are going to be seriously messed up.  Is that where it's blowing up?
This'll make an interesting test for broken AV:
QuoteX5O!P%@AP[4\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*


shadypalm88

Quote from: Skywing on October 16, 2004, 05:26 PM
Try stepping through both the x86 and PPC ones side by side and see where stuff starts to differ.

Quote from: iago on October 16, 2004, 06:53 PM
hm, if endianness is changed, ROL are going to be seriously messed up.  Is that where it's blowing up?
Looks like it, yeah.  I'd been stepping through them side by side (removed that debugging stuff when I realized it was a separate issue).

Here's the first bit of it again:void doHash(unsigned int* hashBuffer) {
    unsigned int buf[0x50];
    unsigned int dw, a, b, c, d, e;
    unsigned int* p;
    int i, j;
   
    dumpbuf((char*) hashBuffer, 0x40);
   
    //memcpy(buf, hashBuffer + 5, 0x40);
   
    for (i = 0; i < 0x10; i++)
        buf[i] = LSB4(hashBuffer[i + 5]);
   
   
    for (i = 0x10; i < 0x50; i++) {
        dw = buf[i - 0x10] ^ buf[i - 0x8] ^ buf[i - 0xE] ^ buf[i - 0x3];
        buf[i] = LSB4(ROL(1, LSB4(dw)));
    }
   
    dumpbuf((char*) buf, 0x50);
// [snip]
}


On the Intel box, this generates:01 23 45 67 89 AB CD EF FE DC BA 98 76 54 32 10 F0 E1 D2 C3 5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 04 00 00 00 40 00 00 00 00 00 08 00

Size of hash: 20
Hash: EB 48 4B B0 63 27 7A 0 2B E 2E 77 3B C1 FB 57 DF 1D 5A 1F


On the Macintosh:
01 23 45 67 89 AB CD EF FE DC BA 98 76 54 32 10 F0 E1 D2 C3 00 06 EE 5B 02 99 CF 83 00 00 00 06 00 90 B9 21 00 00 00 00 A4 55 B9 72 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 00 00 00 40 00 00 00 00 00 08 00

Size of hash: 20
Hash: 5 4 C5 24 EF E9 CB D1 F9 4 96 1D 9A EE 4F 84 C2 11 D AF


I've tried quite a few variations on buf[i] = LSB4(ROL(1, LSB4(dw)));
Including:
buf[i] = ROL(1, LSB4(dw));
buf[i] = LSB4(ROL(1, dw));

but this seems to produce the most accurate result.

Any ideas?

shadypalm88

Oops, small brainfart.  I was dumping 0x50 bytes, not DWORD's.

Intel (x86):01 23 45 67 89 AB CD EF FE DC BA 98 76 54 32 10 F0 E1 D2 C3 5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 04 00 00 00 40 00 00 00 00 00 08 00 10 00 00 00 00 00 04 00 01 00 00 00 00 00 01 00 01 00 00 00 20 00 00 00 01 00 00 00 02 00 00 00 00 00 01 00 02 00 00 00 08 00 00 00 10 00 00 00 08 00 00 00 00 10 00 00 02 00 00 00 00 04 00 00 00 00 02 00 01 00 00 00 00 01 00 00 00 00 01 00 00 02 00 00 04 00 00 00 08 00 00 00 01 00 00 00 00 10 00 00 00 00 00 08 02 00 00 00 00 00 01 00 00 04 00 00 40 00 00 00 00 04 00 00 01 00 00 00 01 00 00 00 02 00 00 00 08 00 00 00 20 00 00 00 00 04 00 00 00 20 00 00 00 01 00 00 01 00 00 00 08 00 00 00 04 00 00 00 00 08 00 00 00 01 00 00 10 00 00 00 02 00 00 00 02 00 00 00 00 00 04 00 08 00 00 00 10 00 00 00 00 01 00 00 00 01 00 00 01 00 00 00 08 00 00 00 00 04 00 00 10 00 00 00 00 01 00 00 00 00 10 00 01 00 00 00 04 00 00 00

Size of hash: 20
Hash: EB 48 4B B0 63 27 7A 0 2B E 2E 77 3B C1 FB 57 DF 1D 5A 1F


Apple Macintosh (PPC):01 23 45 67 89 AB CD EF FE DC BA 98 76 54 32 10 F0 E1 D2 C3 00 06 EE 5B 02 99 CF 83 00 00 00 06 00 90 B9 21 00 00 00 00 A4 55 B9 72 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

5B EE 06 00 83 CF 99 02 06 00 00 00 21 B9 90 00 00 00 00 00 72 B9 55 A4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 00 00 00 40 00 00 00 00 00 08 00 01 00 00 00 00 00 00 00 01 00 00 00 02 00 00 00 01 00 00 00 02 00 00 00 04 00 00 00 02 00 00 00 08 00 00 00 10 00 00 00 08 00 00 00 00 04 00 00 00 00 02 00 00 04 00 00 20 00 00 00 04 00 00 00 00 01 00 00 00 00 00 00 00 10 00 00 01 00 00 00 20 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 02 00 00 00 00 00 01 00 00 01 00 00 08 00 00 00 01 00 00 00 20 00 00 00 00 00 00 00 20 00 00 00 00 00 00 00 02 00 00 00 01 00 00 00 00 01 00 00 00 00 00 00 01 00 00 00 04 00 00 00 01 00 00 00 08 00 00 00 00 40 00 00 02 00 00 00 01 00 00 00 02 00 00 00 08 00 00 00 20 00 00 00 00 00 00 00 02 00 00 00 00 00 00 00 08 00 00 00 04 00 00 00 40 00 00 00 01 00 00 00 00 00 00 00 02 00 00 00 00 02 00 00 02 00 00 00 00 04 00 00 00 20 00 00

Size of hash: 20
Hash: 5 4 C5 24 EF E9 CB D1 F9 4 96 1D 9A EE 4F 84 C2 11 D AF

shadypalm88

:(  I'm still having troubles with this code.  I've even gone so far as to rewrite it, as that forces me to rethink what I'm doing and I often find the bug before I'm even done.  But that wasn't the case.  What has changed is that now I'm checking my code against JavaOp's Broken SHA-1 routine on a big-endian machine (Java is big-endian regardless).  And I've found the specific line that's causing the problem.

Here's the equivalent (working) code from JavaOp's doHash (in full earlier in thread):(....)
        int buf[] = new int[0x50];
        int dw;

        int i;
       
        ( ... snip ... )

        for(i = 0x10; i < 0x50; i++)
        {
            dw = buf[i-0x10] ^ buf[i-0x8] ^ buf[i-0xE] ^ buf[i-0x3];
            buf[i] = (1 >>> (0x20 - (byte)dw)) | (1 << (byte) dw); // <-- this line!
        }


My (not so working) version of that is:f = buf[i - 0x10] ^ buf[i - 0x8] ^ buf[i - 0xE] ^ buf[i - 0x3];
buf[i] = ROL(1, (char) f);

ROL is a macro; that line is preprocessed to:buf[i] = (((1) << ((char) f)) | ((1) >> 32 - ((char) f)));

This is where it gets interesting (and to me, awfully confusing).  The buffers in both JavaOp and my code before the loop are identical.  The first time it loops, the f value equals the dw value, and the ROL operation also produces the same result.  The second time it goes around, again, f = dw = 0x21363630.  But, in JavaOp, rotation produces 0x10000.  My code produces 0.  Replacing my ROL macro with JavaOp's code yields the same result.

I figure I must be overlooking something obvious (probably because I've been looking at it for too long).  Anyone have an idea as to what's wrong?

drivehappy

Could it be because the >>> operator is the unsigned shift right operator in Java?

shadypalm88

Quote from: drivehappy on October 22, 2004, 09:19 PM
Could it be because the >>> operator is the unsigned shift right operator in Java?
Nope.  When I tried the Java I of course changed >>> to >>.  The code compiles and runs without crashing... it just doesn't do what I want it to do.

drivehappy

Yes, but are you certain the ROR operation is equating to an unsigned value?

shadypalm88

Quote from: drivehappy on October 22, 2004, 10:00 PM
Yes, but are you certain the ROR operation is equating to an unsigned value?
I tried making all the int's involved signed, but it didn't have any affect on the ROL operations.  Although it did on the final result (which was still wrong).

shadypalm88

Finally!  The Broken SHA-1 routine now works on Windows (x86), Linux (x86), and Mac OS X (PowerPC).  It's now based more on Yobguls' HashData and other, more optimized, SHA-1 implementations.  Most of the loops have been unrolled.

Thanks to all who helped!

mutil.h (Macros used by Broken SHA-1.)
#ifndef MUTIL_H
#define MUTIL_H

#ifdef _MSC_VER
  #define MEXP(type) __declspec(dllexport) type __stdcall
  #define MCEXP __declspec(dllexport)
#else
  #define MEXP(type) type
  #define MCEXP
#endif
#define MYRIAD_UTIL

#ifdef _MSC_VER
#include <stdlib.h> /* get prototypes for rotation functions */
#pragma intrinsic(_lrotl,_lrotr) /* use intrinsic compiler rotations */
#define ROL(x,n) _lrotl((x),(n))
#define ROR(x,n) _lrotr((x),(n))
#else
#ifndef ROL
#define ROL(a,b) (((a) << (b)) | ((a) >> 32 - (b)))
#endif
#ifndef ROR
#define ROR(a,b) (((a) >> (b)) | ((a) << 32 - (b)))
#endif
#endif

#define SWAP2(num) ((((num) >> 8) & 0x00FF) | (((num) << 8) & 0xFF00))
#define SWAP4(num) ((((num) >> 24) & 0x000000FF) | (((num) >> 8) & 0x0000FF00) | (((num) << 8) & 0x00FF0000) | (((num) << 24) & 0xFF000000))

#ifdef BIGENDIAN
#define LSB2(num) SWAP2(num)
#define LSB4(num) SWAP4(num)
#define MSB2(num) (num)
#define MSB4(num) (num)
#else // (little endian)
#define LSB2(num) (num)
#define LSB4(num) (num)
#define MSB2(num) SWAP2(num)
#define MSB4(num) SWAP4(num)
#endif // (endianness)

#ifndef NULL
#define NULL 0
#endif // NULL

#endif // MUTIL_H


bsha1.cpp
/**
* Broken SHA-1 Implementation
*
* October 23, 2004
*/

#include "mutil.h"
#include "bsha1.h"
#include <cstring>

#define BSHA_IC1 0x67452301lu
#define BSHA_IC2 0xEFCDAB89lu
#define BSHA_IC3 0x98BADCFElu
#define BSHA_IC4 0x10325476lu
#define BSHA_IC5 0xC3D2E1F0lu

#define BSHA_OC1 0x5A827999lu
#define BSHA_OC2 0x6ED9EBA1lu
#define BSHA_OC3 0x70E44324lu
#define BSHA_OC4 0x359D3E2Alu

#define BSHA_COP e = d; d = c; c = ROL(b, 30); b = a; a = g

#define BSHA_OP1(a, b, c, d, e, f, g) g = LSB4(f) + ROL(a, 5) + e + \
    ((b & c) | (~b & d)) + BSHA_OC1; BSHA_COP;
#define BSHA_OP2(a, b, c, d, e, f, g) g = (d ^ c ^ b) + e + ROL(g, 5) + \
    LSB4(f) + BSHA_OC2; BSHA_COP;
#define BSHA_OP3(a, b, c, d, e, f, g) g = LSB4(f) + ROL(g, 5) + e + \
    ((c & b) | (d & c) | (d & b)) - BSHA_OC3; BSHA_COP;
#define BSHA_OP4(a, b, c, d, e, f, g) g = (d ^ c ^ b) + e + ROL(g, 5) + \
    LSB4(f) - BSHA_OC4; BSHA_COP;

MEXP(void) calcHashBuf(char* input, unsigned int length, char* result) {
    int i;
    unsigned long a, b, c, d, e, g, *ldata;
    char data[1024];
    memset(data, 0, 1024);
    memcpy(data, input, length);
    ldata = (unsigned long*) data;
   
    for (i = 0; i < 64; i++) {
        ldata[i + 16] =
    LSB4(ROL(1, LSB4(ldata[i] ^ ldata[i+8] ^ ldata[i+2] ^ ldata[i+13]) % 32));
    }
   
   
    a = BSHA_IC1;
    b = BSHA_IC2;
    c = BSHA_IC3;
    d = BSHA_IC4;
    e = BSHA_IC5;
    g = 0;
   
    // Loops unrolled.
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)
    BSHA_OP1(a, b, c, d, e, *ldata++, g) BSHA_OP1(a, b, c, d, e, *ldata++, g)

    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)
    BSHA_OP2(a, b, c, d, e, *ldata++, g) BSHA_OP2(a, b, c, d, e, *ldata++, g)

    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)
    BSHA_OP3(a, b, c, d, e, *ldata++, g) BSHA_OP3(a, b, c, d, e, *ldata++, g)

    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)
    BSHA_OP4(a, b, c, d, e, *ldata++, g) BSHA_OP4(a, b, c, d, e, *ldata++, g)

    ldata = (unsigned long*) result;
    ldata[0] = LSB4(BSHA_IC1 + a);
    ldata[1] = LSB4(BSHA_IC2 + b);
    ldata[2] = LSB4(BSHA_IC3 + c);
    ldata[3] = LSB4(BSHA_IC4 + d);
    ldata[4] = LSB4(BSHA_IC5 + e);
    ldata = NULL;
}

(bsha1.h only contains the prototype for calcHashBuf().)