/**
 * Signed elapsed time between two timestamps, in nanoseconds.
 *
 * @param start  Timestamp taken before the measured work.
 * @param end    Timestamp taken after the measured work. If it precedes
 *               @p start the result is negative.
 * @return (end - start) expressed in nanoseconds.
 *
 * The subtraction is carried out in `long long` so the seconds-to-nanoseconds
 * multiplication cannot overflow a 32-bit `time_t`/`long`. No borrow handling
 * is needed: seconds * 1e9 + (end.tv_nsec - start.tv_nsec) already yields the
 * right value when the nanosecond difference is negative.
 */
long diff_in_nanosec(struct timespec start, struct timespec end) {
  const long long nsec_per_sec = 1000000000LL;
  const long long delta_sec = (long long)end.tv_sec - (long long)start.tv_sec;
  const long long delta_nsec = (long long)end.tv_nsec - (long long)start.tv_nsec;

  /* Single explicit narrowing back to the declared return type. */
  return (long)(delta_sec * nsec_per_sec + delta_nsec);
}
run_time : min; + sum += run_time; + } + + printf("Input tryte size: %d\n", tryte_size); + printf(" minimum: %ld nsec\n", min); + printf(" maximum: %ld nsec\n", max); + printf(" average: %ld nsec\n", sum / times); + + free(trytes); + free(trits); +} + +int main(void) { + unsigned int size; + + // Set random seed + srand(time(NULL)); + + printf("trytes_to_trits\n"); + + for (size = MIN_TRYTE_SIZE; size <= MAX_TRYTE_SIZE; size++) { + test_trytes_to_trits(size, TEST_TIMES); + } + + printf("\n"); + printf("trits_to_trytes\n"); + + for (size = MIN_TRYTE_SIZE; size <= MAX_TRYTE_SIZE; size++) { + test_trits_to_trytes(size * 3, TEST_TIMES); + } + + return 0; +} diff --git a/common/trinary/tests/test_trit_tryte.c b/common/trinary/tests/test_trit_tryte.c index 7b9a9fa1ad..7ea0b8174e 100644 --- a/common/trinary/tests/test_trit_tryte.c +++ b/common/trinary/tests/test_trit_tryte.c @@ -9,26 +9,47 @@ #include #include "common/trinary/trit_tryte.h" +#include "common/trinary/trit_tryte_sse42.h" -#define TRYTES_IN "AZN9" -#define EXP 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0 +#define TRYTES_IN "AZN9AZN9AZN9AZN9AZN9" +#define EXP \ + 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, \ + 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0 void test_trits_to_trytes(void) { trit_t trits[] = {EXP}; - tryte_t trytes[4]; + tryte_t trytes[20]; tryte_t exp[] = {TRYTES_IN}; - trits_to_trytes(trits, trytes, 12); - TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 4); + trits_to_trytes(trits, trytes, 60); + TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 20); } void test_trytes_to_trits(void) { tryte_t trytes[] = {TRYTES_IN}; - trit_t trits[12]; + trit_t trits[60]; trit_t exp[] = {EXP}; - trytes_to_trits(trytes, trits, 4); - TEST_ASSERT_EQUAL_MEMORY(exp, trits, 12); + trytes_to_trits(trytes, trits, 20); + TEST_ASSERT_EQUAL_MEMORY(exp, trits, 60); } +#if defined(__SSE4_2__) +void test_trits_to_trytes_sse42(void) { + trit_t 
trits[] = {EXP}; + tryte_t trytes[20]; + tryte_t exp[] = {TRYTES_IN}; + trits_to_trytes_sse42(trits, trytes, 60); + TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 20); +} + +void test_trytes_to_trits_sse42(void) { + tryte_t trytes[] = {TRYTES_IN}; + trit_t trits[60]; + trit_t exp[] = {EXP}; + trytes_to_trits_sse42(trytes, trits, 20); + TEST_ASSERT_EQUAL_MEMORY(exp, trits, 60); +} +#endif + void test_get_trit_at(void) { tryte_t trytes[] = {TRYTES_IN}; trit_t trit; @@ -57,6 +78,10 @@ int main(void) { RUN_TEST(test_trits_to_trytes); RUN_TEST(test_trytes_to_trits); +#if defined(__SSE4_2__) + RUN_TEST(test_trits_to_trytes_sse42); + RUN_TEST(test_trytes_to_trits_sse42); +#endif RUN_TEST(test_get_trit_at); RUN_TEST(test_set_trit_at); diff --git a/common/trinary/trit_tryte.c b/common/trinary/trit_tryte.c index 5e81ecdd46..b1fcb5b1d9 100644 --- a/common/trinary/trit_tryte.c +++ b/common/trinary/trit_tryte.c @@ -8,11 +8,9 @@ #include #include "common/trinary/trit_tryte.h" - -static const trit_t TRYTES_TRITS_LUT[TRYTE_SPACE_SIZE][NUMBER_OF_TRITS_IN_A_TRYTE] = { - {0, 0, 0}, {1, 0, 0}, {-1, 1, 0}, {0, 1, 0}, {1, 1, 0}, {-1, -1, 1}, {0, -1, 1}, {1, -1, 1}, {-1, 0, 1}, - {0, 0, 1}, {1, 0, 1}, {-1, 1, 1}, {0, 1, 1}, {1, 1, 1}, {-1, -1, -1}, {0, -1, -1}, {1, -1, -1}, {-1, 0, -1}, - {0, 0, -1}, {1, 0, -1}, {-1, 1, -1}, {0, 1, -1}, {1, 1, -1}, {-1, -1, 0}, {0, -1, 0}, {1, -1, 0}, {-1, 0, 0}}; +#if defined(__SSE4_2__) +#include "common/trinary/trit_tryte_sse42.h" +#endif trit_t get_trit_at(tryte_t const *const trytes, size_t const length, size_t const index) { size_t tindex = index / 3U; @@ -50,6 +48,16 @@ uint8_t set_trit_at(tryte_t *const trytes, size_t const length, size_t const ind } void trits_to_trytes(trit_t const *const trits, tryte_t *const trytes, size_t const length) { + if (length == 0) { + return; + } + +#if defined(__SSE4_2__) + if (length >= TRITS_TO_TRYTES_THRESHOLD) { + trits_to_trytes_sse42(trits, trytes, length); + return; + } +#endif int k = 0; for (size_t i = 0, j = 0; i 
< length; i += RADIX, j++) { @@ -70,6 +78,12 @@ void trytes_to_trits(tryte_t const *const trytes, trit_t *const trits, size_t co return; } +#if defined(__SSE4_2__) + if (length >= TRYTES_TO_TRITS_THRESHOLD) { + trytes_to_trits_sse42(trytes, trits, length); + return; + } +#endif for (size_t i = 0, j = 0; i < length; i++, j += RADIX) { memcpy(trits + j, TRYTES_TRITS_LUT[INDEX_OF_TRYTE(trytes[i])], NUMBER_OF_TRITS_IN_A_TRYTE); } diff --git a/common/trinary/trit_tryte.h b/common/trinary/trit_tryte.h index a891a6bf5b..f8e67cbe03 100644 --- a/common/trinary/trit_tryte.h +++ b/common/trinary/trit_tryte.h @@ -12,10 +12,18 @@ #include "common/trinary/trits.h" #include "common/trinary/tryte.h" +#define TRITS_TO_TRYTES_THRESHOLD 192 +#define TRYTES_TO_TRITS_THRESHOLD 736 + #ifdef __cplusplus extern "C" { #endif +static const trit_t TRYTES_TRITS_LUT[TRYTE_SPACE_SIZE][NUMBER_OF_TRITS_IN_A_TRYTE] = { + {0, 0, 0}, {1, 0, 0}, {-1, 1, 0}, {0, 1, 0}, {1, 1, 0}, {-1, -1, 1}, {0, -1, 1}, {1, -1, 1}, {-1, 0, 1}, + {0, 0, 1}, {1, 0, 1}, {-1, 1, 1}, {0, 1, 1}, {1, 1, 1}, {-1, -1, -1}, {0, -1, -1}, {1, -1, -1}, {-1, 0, -1}, + {0, 0, -1}, {1, 0, -1}, {-1, 1, -1}, {0, 1, -1}, {1, 1, -1}, {-1, -1, 0}, {0, -1, 0}, {1, -1, 0}, {-1, 0, 0}}; + static inline size_t num_trytes_for_trits(size_t num_trits) { return (num_trits + NUMBER_OF_TRITS_IN_A_TRYTE - 1) / NUMBER_OF_TRITS_IN_A_TRYTE; } diff --git a/common/trinary/trit_tryte_sse42.h b/common/trinary/trit_tryte_sse42.h new file mode 100644 index 0000000000..4a57997dfa --- /dev/null +++ b/common/trinary/trit_tryte_sse42.h @@ -0,0 +1,274 @@ +#ifndef __COMMON_TRIT_TRYTE_SSE42_H_ +#define __COMMON_TRIT_TRYTE_SSE42_H_ + +#include + +#include "common/defs.h" +#include "common/trinary/trit_tryte.h" +#include "common/trinary/tryte.h" + +#define BLOCK_8BIT(type) (sizeof(type) / sizeof(int8_t)) +#define BYTE_OF_128BIT 16 +#define COMMA0 +#define COMMA1 , +#define COMMA(x) COMMA##x +#define INDEX_3DIFF_0F 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F +#define 
INDEX_3DIFF_1D 0x01, 0x04, 0x07, 0x0A, 0x0D +#define INDEX_3DIFF_2E 0x02, 0x05, 0x08, 0x0B, 0x0E +#define REPEAT0(str) +#define REPEAT1(str) str +#define REPEAT2(str) REPEAT1(str), str +#define REPEAT3(str) REPEAT2(str), str +#define REPEAT4(str) REPEAT3(str), str +#define REPEAT5(str) REPEAT4(str), str +#define REPEAT6(str) REPEAT5(str), str +#define REPEAT7(str) REPEAT6(str), str +#define REPEAT8(str) REPEAT7(str), str +#define REPEAT9(str) REPEAT8(str), str +#define REPEAT10(str) REPEAT9(str), str +#define REPEAT11(str) REPEAT10(str), str +#define REPEAT(n, str) REPEAT##n(str) + +static inline void trits_to_trytes_sse42(trit_t const *const trits, tryte_t *const trytes, size_t const length) { + const int block_8bit = BLOCK_8BIT(__m128i); + const int8_t set_msb = 0x80; + const __m128i tryte_alphabet[2] = { + _mm_setr_epi8(TRYTE_ALPHABET[14], TRYTE_ALPHABET[15], TRYTE_ALPHABET[16], TRYTE_ALPHABET[17], TRYTE_ALPHABET[18], + TRYTE_ALPHABET[19], TRYTE_ALPHABET[20], TRYTE_ALPHABET[21], TRYTE_ALPHABET[22], TRYTE_ALPHABET[23], + TRYTE_ALPHABET[24], TRYTE_ALPHABET[25], TRYTE_ALPHABET[26], TRYTE_ALPHABET[0], TRYTE_ALPHABET[1], + TRYTE_ALPHABET[2]), + _mm_setr_epi8(TRYTE_ALPHABET[3], TRYTE_ALPHABET[4], TRYTE_ALPHABET[5], TRYTE_ALPHABET[6], TRYTE_ALPHABET[7], + TRYTE_ALPHABET[8], TRYTE_ALPHABET[9], TRYTE_ALPHABET[10], TRYTE_ALPHABET[11], TRYTE_ALPHABET[12], + TRYTE_ALPHABET[13], 0, 0, 0, 0, 0)}; + /* For shuffling the bytes of the input trits */ + const __m128i shuffle_low[3] = { + _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_0F COMMA(1) REPEAT(10, set_msb)), + _mm_setr_epi8(REPEAT(6, set_msb) COMMA(1) INDEX_3DIFF_2E COMMA(1) REPEAT(5, set_msb)), + _mm_setr_epi8(REPEAT(11, set_msb) COMMA(1) INDEX_3DIFF_1D COMMA(0) REPEAT(0, set_msb))}; + const __m128i shuffle_mid[3] = { + _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_1D COMMA(1) REPEAT(11, set_msb)), + _mm_setr_epi8(REPEAT(5, set_msb) COMMA(1) INDEX_3DIFF_0F COMMA(1) REPEAT(5, set_msb)), + 
_mm_setr_epi8(REPEAT(11, set_msb) COMMA(1) INDEX_3DIFF_2E COMMA(0) REPEAT(0, set_msb))}; + const __m128i shuffle_high[3] = { + _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_2E COMMA(1) REPEAT(11, set_msb)), + _mm_setr_epi8(REPEAT(5, set_msb) COMMA(1) INDEX_3DIFF_1D COMMA(1) REPEAT(6, set_msb)), + _mm_setr_epi8(REPEAT(10, set_msb) COMMA(1) INDEX_3DIFF_0F COMMA(0) REPEAT(0, set_msb))}; + + /* Start converting */ + for (size_t i = 0; i < length / NUMBER_OF_TRITS_IN_A_TRYTE / block_8bit; i++) { + /* Get trit data */ + __m128i data_first = _mm_loadu_si128((__m128i *)(trits) + i * 3); + __m128i data_mid = _mm_loadu_si128((__m128i *)(trits) + i * 3 + 1); + __m128i data_last = _mm_loadu_si128((__m128i *)(trits) + i * 3 + 2); + /* + * Each block represents a trit. + * shuffle + * ---------------- ------ ------ ------ ------ + * data_first = | a1 | a2 | a3 | ...... | f1 | low_trit = | a1 | ... | f1 | ... | p1 | + * ---------------- ------ ------ ------ ------ + * ---------------- ------ ------ ------ ------ + * data_mid = | f2 | f3 | g1 | ...... | k2 | => mid_trit = | a2 | ... | f2 | ... | p2 | + * ---------------- ------ ------ ------ ------ + * ---------------- ------ ------ ------ ------ + * data_last = | k3 | l1 | l2 | ...... | p3 | high_trit = | a3 | ... | f3 | ... 
| p3 | + * ---------------- ------ ------ ------ ------ + */ + __m128i low_trit = _mm_or_si128( + _mm_shuffle_epi8(data_first, shuffle_low[0]), + _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_low[1]), _mm_shuffle_epi8(data_last, shuffle_low[2]))); + __m128i mid_trit = _mm_or_si128( + _mm_shuffle_epi8(data_first, shuffle_mid[0]), + _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_mid[1]), _mm_shuffle_epi8(data_last, shuffle_mid[2]))); + __m128i high_trit = _mm_or_si128( + _mm_shuffle_epi8(data_first, shuffle_high[0]), + _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_high[1]), _mm_shuffle_epi8(data_last, shuffle_high[2]))); + /* low_result = (low_trit) */ + __m128i low_result = low_trit; + /* mid_result = (mid_trit * 3) = (mid_trit + mid_trit + mid_trit) */ + __m128i mid_result = _mm_add_epi8(mid_trit, _mm_add_epi8(mid_trit, mid_trit)); + /* high_result = (high_trit * 9) = (high_trit + high_trit * 4 + high_trit * 4)*/ + __m128i high_trit_2 = _mm_add_epi8(high_trit, high_trit); + __m128i high_trit_4 = _mm_add_epi8(high_trit_2, high_trit_2); + __m128i high_result = _mm_add_epi8(high_trit, _mm_add_epi8(high_trit_4, high_trit_4)); + /* alphabet_offset = (low_result + mid_result + high_result) */ + __m128i alphabet_offset = _mm_add_epi8(low_result, _mm_add_epi8(mid_result, high_result)); + /* Add 0x0D (13) to eliminate negative value */ + alphabet_offset = _mm_add_epi8(alphabet_offset, _mm_set_epi32(REPEAT(4, 0x0D0D0D0D))); + + /* Assign tryte alphabet */ + /* If the offset is >= 16 (> 15), then the compared result byte = 0xFF, + * else = 0x00 */ + __m128i cmp_result = + _mm_cmpgt_epi8(alphabet_offset, _mm_set_epi8(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15)); + /* Use the offset to get the correct tryte alphabet from tryte_alphabet[] + */ + __m128i result_lt = _mm_shuffle_epi8(tryte_alphabet[0], alphabet_offset); + __m128i result_ge = _mm_shuffle_epi8( + tryte_alphabet[1], + /* alphabet_offset - 16 */ + _mm_sub_epi8(alphabet_offset, _mm_set_epi8(16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16))); + __m128i result = _mm_or_si128(_mm_andnot_si128(cmp_result, result_lt), _mm_and_si128(cmp_result, result_ge)); + /* Store the tryte result */ + _mm_storeu_si128((__m128i *)(trytes + i * block_8bit), result); + } + /* The rest of the trits */ + for (size_t i = (length / NUMBER_OF_TRITS_IN_A_TRYTE / block_8bit) * block_8bit; + i * NUMBER_OF_TRITS_IN_A_TRYTE < length; i++) { + int k = 0; + int used_trit = i * NUMBER_OF_TRITS_IN_A_TRYTE; + for (size_t l = length - used_trit < NUMBER_OF_TRITS_IN_A_TRYTE ? length - used_trit : NUMBER_OF_TRITS_IN_A_TRYTE; + l-- > 0;) { + k *= RADIX; + k += trits[used_trit + l]; + } + + if (k < 0) { + k += TRYTE_SPACE_SIZE; + } + trytes[i] = TRYTE_ALPHABET[k]; + } +} + +static inline void trytes_to_trits_sse42(tryte_t const *const trytes, trit_t *const trits, size_t const length) { + const int block_8bit = BLOCK_8BIT(__m128i); + /* For setting the most significant bit of a byte */ + const int8_t set_msb = 0x80; + /* The set and range for indicating the trits value (0, 1, -1) + * of the corresponding trytes */ + /* '9', 'C', 'F', 'I', 'L', 'O', 'R', 'U', 'X' */ + const char set_low_trit_0[BYTE_OF_128BIT] = "9CFILORUX"; + /* 'A', 'D', 'G', 'J', 'M', 'P', 'S', 'V', 'Y' */ + const char set_low_trit_p1[BYTE_OF_128BIT] = "ADGJMPSVY"; + /* 'B', 'E', 'H', 'K', 'N', 'Q', 'T', 'W', 'Z' */ + const char set_low_trit_n1[BYTE_OF_128BIT] = "BEHKNQTWZ"; + /* '9', 'A', 'H', 'I', 'J', 'Q', 'R', 'S', 'Z' */ + const char range_mid_trit_0[BYTE_OF_128BIT] = "99AAHJQSZZ"; + /* 'B', 'C', 'D', 'K', 'L', 'M', 'T', 'U', 'V' */ + const char range_mid_trit_p1[BYTE_OF_128BIT] = "BDKMTV"; + /* 'E', 'F', 'G', 'N', 'O', 'P', 'W', 'X', 'Y' */ + const char range_mid_trit_n1[BYTE_OF_128BIT] = "EGNPWY"; + /* '9', 'A', 'B', 'C', 'D', 'W', 'X', 'Y', 'Z' */ + const char range_high_trit_0[BYTE_OF_128BIT] = "99ADWZ"; + /* 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M' */ + const char range_high_trit_p1[BYTE_OF_128BIT] = 
"EM"; + /* 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V' */ + const char range_high_trit_n1[BYTE_OF_128BIT] = "NV"; + /* Convert the char array to the 128-bit data */ + const __m128i pattern_low_trit_0 = _mm_loadu_si128((__m128i *)(set_low_trit_0)); + const __m128i pattern_low_trit_p1 = _mm_loadu_si128((__m128i *)(set_low_trit_p1)); + const __m128i pattern_low_trit_n1 = _mm_loadu_si128((__m128i *)(set_low_trit_n1)); + const __m128i pattern_mid_trit_0 = _mm_loadu_si128((__m128i *)(range_mid_trit_0)); + const __m128i pattern_mid_trit_p1 = _mm_loadu_si128((__m128i *)(range_mid_trit_p1)); + const __m128i pattern_mid_trit_n1 = _mm_loadu_si128((__m128i *)(range_mid_trit_n1)); + const __m128i pattern_high_trit_0 = _mm_loadu_si128((__m128i *)(range_high_trit_0)); + const __m128i pattern_high_trit_p1 = _mm_loadu_si128((__m128i *)(range_high_trit_p1)); + const __m128i pattern_high_trit_n1 = _mm_loadu_si128((__m128i *)(range_high_trit_n1)); + /* The 128-bit data with the repeated same bytes */ + const __m128i pos_one = _mm_set1_epi8(1); + const __m128i neg_one = _mm_set1_epi8(-1); + const __m128i zero = _mm_set1_epi8(0); + /* For shuffling the bytes of the trits transformed from the input trytes */ + const __m128i shuffle_first[3] = { + _mm_setr_epi8(0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03, REPEAT2(set_msb), + 0x04, REPEAT2(set_msb), 0x05), + _mm_setr_epi8(REPEAT1(set_msb), 0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03, + REPEAT2(set_msb), 0x04, REPEAT2(set_msb)), + _mm_setr_epi8(REPEAT2(set_msb), 0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03, + REPEAT2(set_msb), 0x04, REPEAT1(set_msb))}; + const __m128i shuffle_mid[3] = {_mm_setr_epi8(REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, REPEAT2(set_msb), 0x08, + REPEAT2(set_msb), 0x09, REPEAT2(set_msb), 0x0A, REPEAT1(set_msb)), + _mm_setr_epi8(0x05, REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, REPEAT2(set_msb), + 0x08, 
REPEAT2(set_msb), 0x09, REPEAT2(set_msb), 0x0A), + _mm_setr_epi8(REPEAT1(set_msb), 0x05, REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, + REPEAT2(set_msb), 0x08, REPEAT2(set_msb), 0x09, REPEAT2(set_msb))}; + const __m128i shuffle_last[3] = {_mm_setr_epi8(REPEAT1(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), 0x0D, + REPEAT2(set_msb), 0x0E, REPEAT2(set_msb), 0x0F, REPEAT2(set_msb)), + _mm_setr_epi8(REPEAT2(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), 0x0D, + REPEAT2(set_msb), 0x0E, REPEAT2(set_msb), 0x0F, REPEAT1(set_msb)), + _mm_setr_epi8(0x0A, REPEAT2(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), + 0x0D, REPEAT2(set_msb), 0x0E, REPEAT2(set_msb), 0x0F)}; + + /* Start converting */ + /* The for loop handles the group of the 128-bit characters without the + * end-of-string */ + for (size_t i = 0; i < length / block_8bit; i++) { + /* Get tryte data */ + __m128i data = _mm_loadu_si128((__m128i *)(trytes) + i); + + /* The masks for setting the corresponding trits */ + __m128i mask_low_trit_0 = _mm_cmpistrm(pattern_low_trit_0, data, + /* Signed byte comparison */ + _SIDD_SBYTE_OPS | + /* Compare with the character set */ + _SIDD_CMP_EQUAL_ANY | + /* Expand the corrsponding bit result to byte unit */ + _SIDD_UNIT_MASK); + __m128i mask_low_trit_p1 = + _mm_cmpistrm(pattern_low_trit_p1, data, _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK); + __m128i mask_low_trit_n1 = + _mm_cmpistrm(pattern_low_trit_n1, data, _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK); + __m128i mask_mid_trit_0 = _mm_cmpistrm(pattern_mid_trit_0, data, + /* Signed byte comparison */ + _SIDD_SBYTE_OPS | + /* Compare with the character range */ + _SIDD_CMP_RANGES | + /* Expand the corrsponding bit result to byte unit */ + _SIDD_UNIT_MASK); + __m128i mask_mid_trit_p1 = + _mm_cmpistrm(pattern_mid_trit_p1, data, _SIDD_SBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK); + __m128i mask_mid_trit_n1 = + _mm_cmpistrm(pattern_mid_trit_n1, data, _SIDD_SBYTE_OPS | 
_SIDD_CMP_RANGES | _SIDD_UNIT_MASK); + __m128i mask_high_trit_0 = + _mm_cmpistrm(pattern_high_trit_0, data, _SIDD_SBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK); + __m128i mask_high_trit_p1 = + _mm_cmpistrm(pattern_high_trit_p1, data, _SIDD_SBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK); + __m128i mask_high_trit_n1 = + _mm_cmpistrm(pattern_high_trit_n1, data, _SIDD_SBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK); + + /* + * Each block represents a trit. + * shuffle + * ------ ------ ------ ---------------- ------ + * low_trit = | a1 | ... | f1 | ... | p1 | data_first = | a1 | a2 | a3 | ...... | f1 | + * ------ ------ ------ ---------------- ------ + * ------ ------ ------ ---------------- ------ + * mid_trit = | a2 | ... | f2 | ... | p2 | => data_mid = | f2 | f3 | g1 | ...... | k2 | + * ------ ------ ------ ---------------- ------ + * ------ ------ ------ ---------------- ------ + * high_trit = | a3 | ... | f3 | ... | p3 | data_last = | k3 | l1 | l2 | ...... | p3 | + * ------ ------ ------ ---------------- ------ + */ + __m128i low_trit = + _mm_or_si128(_mm_and_si128(mask_low_trit_0, zero), + _mm_or_si128(_mm_and_si128(mask_low_trit_p1, pos_one), _mm_and_si128(mask_low_trit_n1, neg_one))); + __m128i mid_trit = + _mm_or_si128(_mm_and_si128(mask_mid_trit_0, zero), + _mm_or_si128(_mm_and_si128(mask_mid_trit_p1, pos_one), _mm_and_si128(mask_mid_trit_n1, neg_one))); + __m128i high_trit = _mm_or_si128( + _mm_and_si128(mask_high_trit_0, zero), + _mm_or_si128(_mm_and_si128(mask_high_trit_p1, pos_one), _mm_and_si128(mask_high_trit_n1, neg_one))); + __m128i data_first, data_mid, data_last; + data_first = _mm_or_si128( + _mm_shuffle_epi8(low_trit, shuffle_first[0]), + _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_first[1]), _mm_shuffle_epi8(high_trit, shuffle_first[2]))); + data_mid = _mm_or_si128( + _mm_shuffle_epi8(low_trit, shuffle_mid[0]), + _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_mid[1]), _mm_shuffle_epi8(high_trit, shuffle_mid[2]))); + data_last = 
_mm_or_si128( + _mm_shuffle_epi8(low_trit, shuffle_last[0]), + _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_last[1]), _mm_shuffle_epi8(high_trit, shuffle_last[2]))); + + /* Store the 3 * 128-bit trits converted from trytes */ + _mm_storeu_si128((__m128i *)(trits + (3 * i) * block_8bit), data_first); + _mm_storeu_si128((__m128i *)(trits + (3 * i + 1) * block_8bit), data_mid); + _mm_storeu_si128((__m128i *)(trits + (3 * i + 2) * block_8bit), data_last); + } + /* The rest of the trytes */ + for (size_t i = (length / block_8bit) * block_8bit, + j = (length / block_8bit) * block_8bit * NUMBER_OF_TRITS_IN_A_TRYTE; + i < length; i++, j += RADIX) { + memcpy(trits + j, TRYTES_TRITS_LUT[INDEX_OF_TRYTE(trytes[i])], NUMBER_OF_TRITS_IN_A_TRYTE); + } +} + +#endif // __COMMON_TRIT_TRYTE_SSE42_H_