#include "uintN_t.h"

#pragma MAIN_MHZ groestl 500.0

// Permutation state: an 8 x 16 matrix of bytes (128 bytes total),
// indexed throughout this file as words[row][column].
typedef struct state_t {
	uint8_t words[8][16];
} state_t;

// One column of the state: 8 bytes, one per row (words[row]).
typedef struct col_t {
	uint8_t words[8];
} col_t;

// Byte substitution: returns the table entry selected by x.
// The 256 entries match the AES (Rijndael) S-box.
uint8_t sbox(uint8_t x) {
  // Lookup table indexed directly by the input byte; each row below
  // covers 16 consecutive input values.
  uint8_t table[256] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // 0x00
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // 0x10
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // 0x20
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // 0x30
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // 0x40
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // 0x50
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // 0x60
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // 0x70
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // 0x80
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // 0x90
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // 0xa0
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // 0xb0
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // 0xc0
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // 0xd0
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // 0xe0
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16  // 0xf0
  };
  uint8_t substituted = table[x];
  return substituted;
}

// GET_COL(ret, state, col): copy column `col` of `state` (8 bytes, one per
// row) into `ret.words[0..7]`.
//
// The expansion is a single brace-enclosed block, so its loop counter is
// local to that block and the macro can be expanded any number of times in
// the same scope without redeclaring the counter. It can be followed by a
// semicolon or not; do not use it as the unbraced body of an if/else.
// Arguments are parenthesized so expression arguments expand safely.
#define GET_COL(ret, state, col) \
{ \
  uint32_t GET_COL_j; \
  for (GET_COL_j = 0; GET_COL_j < 8; GET_COL_j += 1) { \
    (ret).words[GET_COL_j] = (state).words[GET_COL_j][(col)]; \
  } \
}

/*
col_t get_col(state_t state, uint32_t col) {
	col_t ret;
	uint32_t j;
	for (j = 0; j < 8; j += 1) {
		ret.words[j] = state.words[j][col];
	}
	return ret;
}
*/

// Doubles each of the 8 bytes of a column through a lookup table.
// Table entry b holds (2*b) for b < 0x80 and ((2*b) & 0xff) ^ 0x1b for
// b >= 0x80, i.e. multiplication by 2 in GF(2^8) reduced by 0x11b.
col_t mul2(col_t x) {
  // Each row below covers 16 consecutive input values.
  uint8_t doubling[256] = {
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, // 0x00
    0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, // 0x10
    0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, // 0x20
    0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, // 0x30
    0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, // 0x40
    0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, // 0x50
    0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, // 0x60
    0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, // 0x70
    0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, // 0x80
    0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, // 0x90
    0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, // 0xa0
    0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, // 0xb0
    0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, // 0xc0
    0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, // 0xd0
    0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, // 0xe0
    0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5  // 0xf0
  };

	col_t doubled;
	uint32_t row;
	for (row = 0; row < 8; row += 1) {
		uint8_t in_byte = x.words[row];
		doubled.words[row] = doubling[in_byte];
	}

	return doubled;
}

// Mixes each of the 16 state columns independently.
//
// For a column b (8 bytes), with b2 = mul2(b) and b4 = mul2(b2), the XOR
// network below yields, for output row r, the combination of input bytes
// given by row r of the circulant matrix circ(02,02,03,04,05,03,05,07)
// (coefficients 3, 5, 7 arise as b^b2, b^b4, b^b2^b4). The shared terms
// t1..tc only factor out sub-expressions used by several output rows.
//
// Each column is gathered with a plain inner loop, so no identifier is
// declared more than once in this function's scope and no intermediate
// array of columns is needed.
//
// state: input state, read only.
// Returns the mixed state; every byte of the result is written.
state_t mix_bytes(state_t state)
{
	state_t ret;

	uint32_t i;
	uint32_t j;
	for (i = 0; i < 16; i += 1) {
		// Gather column i: one byte from each of the 8 rows.
		col_t b;
		for (j = 0; j < 8; j += 1) {
			b.words[j] = state.words[j][i];
		}
		col_t b2 = mul2(b);
		col_t b4 = mul2(b2);

		// Shared partial sums reused across several output rows.
		uint8_t t1 = b2.words[0] ^ b2.words[2] ^ b.words[5] ^ b4.words[7] ^ b.words[7];
		uint8_t t2 = b2.words[1] ^ b.words[4] ^ b4.words[6] ^ b.words[6] ^ b2.words[7];
		uint8_t t3 = b.words[0] ^ b4.words[2] ^ b.words[2] ^ b2.words[3] ^ b2.words[5];
		uint8_t t4 = b.words[1] ^ b4.words[3] ^ b.words[3] ^ b2.words[4] ^ b2.words[6];
		uint8_t t5 = b4.words[0] ^ b.words[0] ^ b2.words[3] ^ b4.words[5];
		uint8_t t6 = b4.words[1] ^ b4.words[4] ^ b.words[4] ^ b2.words[7];
		uint8_t t7 = b4.words[1] ^ b.words[1] ^ b2.words[4];
		uint8_t t8 = b2.words[0] ^ b4.words[5] ^ b.words[5];
		uint8_t t9 = b.words[2] ^ b2.words[5];
		uint8_t ta = b2.words[1] ^ b.words[6];
		uint8_t tb = b.words[3] ^ b2.words[6];
		uint8_t tc = b2.words[2] ^ b.words[7];

		// Output bytes of column i, rows 0..7.
		ret.words[0][i] = t1 ^ t2 ^ t9 ^ b4.words[3] ^ b4.words[4];
		ret.words[1][i] = t1 ^ t5 ^ ta ^ tb ^ b4.words[4];
		ret.words[2][i] = t2 ^ t5 ^ t7 ^ tc;
		ret.words[3][i] = t1 ^ t3 ^ t7 ^ b4.words[6];
		ret.words[4][i] = t3 ^ t4 ^ ta ^ b4.words[0] ^ b4.words[7];
		ret.words[5][i] = t4 ^ t6 ^ t9 ^ tc ^ b4.words[0];
		ret.words[6][i] = t3 ^ t6 ^ t8 ^ tb;
		ret.words[7][i] = t2 ^ t4 ^ t8 ^ b4.words[2];
	}

	return ret;
}

// Round-constant addition for the P permutation.
// Row 0, column c is XORed with (uint8_t)(c * 16) and with `round`;
// rows 1..7 are copied through unchanged.
state_t add_round_constant_P(state_t state, uint8_t round) {
	state_t ret;

	uint32_t col;
	uint32_t row;
	for (col = 0; col < 16; col += 1) {
		// Only the first row receives a constant.
		ret.words[0][col] = state.words[0][col] ^ (uint8_t) (col * 16) ^ round;

		for (row = 1; row < 8; row += 1) {
			ret.words[row][col] = state.words[row][col];
		}
	}

	return ret;
}

// Round-constant addition for the Q permutation.
// Rows 0..6 are XORed with 0xff in every column; row 7, column c is XORed
// with (uint8_t)(255 - 16 * c) and with `round`.
state_t add_round_constant_Q(state_t state, uint8_t round) {
	state_t ret;

	uint32_t col;
	uint32_t row;
	for (col = 0; col < 16; col += 1) {
		for (row = 0; row < 7; row += 1) {
			ret.words[row][col] = state.words[row][col] ^ 0xff;
		}

		// Last row carries the column- and round-dependent constant.
		ret.words[7][col] = state.words[7][col] ^ (uint8_t) (255 - 16 * col) ^ round;
	}

	return ret;
}

// Applies sbox() to every byte of the state, position by position.
state_t sub_bytes(state_t state) {
	state_t ret;

	uint32_t row;
	uint32_t col;
	for (col = 0; col < 16; col += 1) {
		for (row = 0; row < 8; row += 1) {
			uint8_t in_byte = state.words[row][col];
			ret.words[row][col] = sbox(in_byte);
		}
	}

	return ret;
}

// Row rotation for the P permutation.
// Output column c of row r takes the input byte at column (c - k_r) mod 16,
// with per-row offsets k = 0, 1, 2, 3, 4, 5, 6, 11 for rows 0..7.
// The subtraction is done on uint32_t, so it wraps modulo 2^32 before the
// "% 16"; since 2^32 is a multiple of 16 the result is the expected
// non-negative residue.
// NOTE(review): this moves bytes toward higher column indices; the Groestl
// specification describes ShiftBytes as a rotation to the left - confirm
// the intended direction.
state_t shift_bytes_P(state_t state) {
	state_t ret;

	uint32_t col;
	for (col = 0; col < 16; col += 1) {
		ret.words[0][col] = state.words[0][(col - 0) % 16];
		ret.words[1][col] = state.words[1][(col - 1) % 16];
		ret.words[2][col] = state.words[2][(col - 2) % 16];
		ret.words[3][col] = state.words[3][(col - 3) % 16];
		ret.words[4][col] = state.words[4][(col - 4) % 16];
		ret.words[5][col] = state.words[5][(col - 5) % 16];
		ret.words[6][col] = state.words[6][(col - 6) % 16];
		ret.words[7][col] = state.words[7][(col - 11) % 16];
	}

	return ret;
}

// Row rotation for the Q permutation.
// Output column c of row r takes the input byte at column (c - k_r) mod 16,
// with per-row offsets k = 1, 3, 5, 11, 0, 2, 4, 6 for rows 0..7.
// The subtraction is done on uint32_t, so it wraps modulo 2^32 before the
// "% 16"; since 2^32 is a multiple of 16 the result is the expected
// non-negative residue.
// NOTE(review): this moves bytes toward higher column indices; the Groestl
// specification describes ShiftBytes as a rotation to the left - confirm
// the intended direction.
state_t shift_bytes_Q(state_t state) {
	state_t ret;

	uint32_t col;
	for (col = 0; col < 16; col += 1) {
		ret.words[0][col] = state.words[0][(col - 1) % 16];
		ret.words[1][col] = state.words[1][(col - 3) % 16];
		ret.words[2][col] = state.words[2][(col - 5) % 16];
		ret.words[3][col] = state.words[3][(col - 11) % 16];
		ret.words[4][col] = state.words[4][(col - 0) % 16];
		ret.words[5][col] = state.words[5][(col - 2) % 16];
		ret.words[6][col] = state.words[6][(col - 4) % 16];
		ret.words[7][col] = state.words[7][(col - 6) % 16];
	}

	return ret;
}

// One round of the P permutation: round-constant addition, byte
// substitution, row rotation, then column mixing, in that order.
// `round` is converted to uint8_t when passed to add_round_constant_P.
state_t inner_round_P(state_t state, uint32_t round) {
	state_t with_constant = add_round_constant_P(state, round);
	state_t substituted = sub_bytes(with_constant);
	state_t shifted = shift_bytes_P(substituted);
	state_t mixed = mix_bytes(shifted);

	return mixed;
}

// One round of the Q permutation: round-constant addition, byte
// substitution, row rotation, then column mixing, in that order.
// `round` is converted to uint8_t when passed to add_round_constant_Q.
state_t inner_round_Q(state_t state, uint32_t round) {
	state_t with_constant = add_round_constant_Q(state, round);
	state_t substituted = sub_bytes(with_constant);
	state_t shifted = shift_bytes_Q(substituted);
	state_t mixed = mix_bytes(shifted);

	return mixed;
}

// Full P permutation: applies inner_round_P 14 times, with round
// numbers 0 through 13, each round consuming the previous round's output.
state_t compress_P(state_t state) {
	state_t current = state;

	uint32_t round_idx;
	for (round_idx = 0; round_idx < 14; round_idx += 1) {
		current = inner_round_P(current, round_idx);
	}

	return current;
}

// Full Q permutation: applies inner_round_Q 14 times, with round
// numbers 0 through 13, each round consuming the previous round's output.
state_t compress_Q(state_t state) {
	state_t current = state;

	uint32_t round_idx;
	for (round_idx = 0; round_idx < 14; round_idx += 1) {
		current = inner_round_Q(current, round_idx);
	}

	return current;
}

// Byte-wise XOR of two states: result[r][c] = a[r][c] ^ b[r][c].
state_t xor_state(state_t a, state_t b) {
	state_t ret;

	uint32_t row;
	uint32_t col;
	for (col = 0; col < 16; col += 1) {
		for (row = 0; row < 8; row += 1) {
			uint8_t lhs = a.words[row][col];
			uint8_t rhs = b.words[row][col];
			ret.words[row][col] = lhs ^ rhs;
		}
	}

	return ret;
}

// Groestl-style hash function with the IV set to zero. Intended for
// benchmarking only, not for actual hashing use.
// Computes compress_P(compress_P(state) ^ compress_Q(state)).
state_t groestl(state_t state) {
	state_t p_out = compress_P(state);
	state_t q_out = compress_Q(state);
	state_t mid = xor_state(p_out, q_out);
	state_t result = compress_P(mid);
	return result;
}
