diff --git a/libraries/FastShiftInOut/CHANGELOG.md b/libraries/FastShiftInOut/CHANGELOG.md index 0fc1bf08..5cb6d255 100644 --- a/libraries/FastShiftInOut/CHANGELOG.md +++ b/libraries/FastShiftInOut/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## [0.2.1] - 2024-10-31 +- fix #9, more optimizations + ## [0.2.0] - 2024-09-10 - fix #7, loop unroll option, improving performance, kudos to nt314p - added flag to select LOOP UNROLL (is optional as it gives larger code size) diff --git a/libraries/FastShiftInOut/FastShiftInOut.cpp b/libraries/FastShiftInOut/FastShiftInOut.cpp index 44fc45f7..5d82fcf1 100644 --- a/libraries/FastShiftInOut/FastShiftInOut.cpp +++ b/libraries/FastShiftInOut/FastShiftInOut.cpp @@ -1,7 +1,7 @@ // // FILE: FastShiftInOut.cpp // AUTHOR: Rob Tillaart -// VERSION: 0.2.0 +// VERSION: 0.2.1 // PURPOSE: Arduino library for (AVR) optimized shiftInOut (simultaneously) // URL: https://github.com/RobTillaart/FastShiftInOut @@ -109,8 +109,12 @@ uint8_t FastShiftInOut::writeLSBFIRST(uint8_t data) uint8_t oldSREG = SREG; noInterrupts(); - if ((value & 0x01) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + // See discussion #17 FastShiftOut + uint8_t d0 = *localDataOutRegister & outmask2; // cache 0 + uint8_t d1 = d0 | outmask1; // cache 1 + + if ((value & 0x01) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; // *localClockRegister |= cbmask1; // if ((*localDataInRegister & inmask1) > 0) rv |= 0x01; // *localClockRegister &= cbmask2; // ~_clockBit; @@ -121,50 +125,50 @@ uint8_t FastShiftInOut::writeLSBFIRST(uint8_t data) if ((*localDataInRegister & inmask1) > 0) rv |= 0x01; *localClockRegister = r; // reset it - if ((value & 0x02) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x02) == 0) *localDataOutRegister = d0; + else 
*localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x02; *localClockRegister = r; // reset it - if ((value & 0x04) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x04) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x04; *localClockRegister = r; // reset it - if ((value & 0x08) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x08) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x08; *localClockRegister = r; // reset it - if ((value & 0x10) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x10) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x10; *localClockRegister = r; // reset it - if ((value & 0x20) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x20) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x20; *localClockRegister = r; // reset it - if ((value & 0x40) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x40) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x40; *localClockRegister = r; // reset it - if 
((value & 0x80) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x80) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x80; @@ -185,16 +189,19 @@ uint8_t FastShiftInOut::writeLSBFIRST(uint8_t data) uint8_t oldSREG = SREG; noInterrupts(); - - uint8_t r = *localClockRegister; - + + uint8_t d0 = *localDataOutRegister & outmask2; // cache 0 + uint8_t d1 = d0 | outmask1; // cache 1 + for (uint8_t m = 1; m > 0; m <<= 1) { // write one bit - if ((value & m) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & m) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; + uint8_t r = *localClockRegister; + // clock pulse HIGH - *localClockRegister |= cbmask1; + *localClockRegister = r | cbmask1; // read one bit if ((*localDataInRegister & inmask1) > 0) rv |= m; // clock pulse LOW @@ -249,8 +256,12 @@ uint8_t FastShiftInOut::writeMSBFIRST(uint8_t data) uint8_t oldSREG = SREG; noInterrupts(); - if ((value & 0x80) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + // See discussion #17 FastShiftOut + uint8_t d0 = *localDataOutRegister & outmask2; // cache 0 + uint8_t d1 = d0 | outmask1; // cache 1 + + if ((value & 0x80) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; // *localClockRegister |= cbmask1; // if ((*localDataInRegister & inmask1) > 0) rv |= 0x80; // *localClockRegister &= cbmask2; // ~_clockBit; @@ -261,50 +272,50 @@ uint8_t FastShiftInOut::writeMSBFIRST(uint8_t data) if ((*localDataInRegister & inmask1) > 0) rv |= 0x80; *localClockRegister = r; // reset it - if ((value & 0x40) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x40) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = 
*localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x40; *localClockRegister = r; // reset it - if ((value & 0x20) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x20) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x20; *localClockRegister = r; // reset it - if ((value & 0x10) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x10) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x10; *localClockRegister = r; // reset it - if ((value & 0x08) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x08) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x08; *localClockRegister = r; // reset it - if ((value & 0x04) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x04) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x04; *localClockRegister = r; // reset it - if ((value & 0x02) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x02) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x02; *localClockRegister = r; // reset it - if ((value & 0x01) == 0) 
*localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & 0x01) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; r = *localClockRegister; *localClockRegister = r | cbmask1; // set one bit if ((*localDataInRegister & inmask1) > 0) rv |= 0x01; @@ -326,14 +337,18 @@ uint8_t FastShiftInOut::writeMSBFIRST(uint8_t data) uint8_t oldSREG = SREG; noInterrupts(); - uint8_t r = *localClockRegister; + // See discussion #17 FastShiftOut + uint8_t d0 = *localDataOutRegister & outmask2; // cache 0 + uint8_t d1 = d0 | outmask1; // cache 1 + for (uint8_t m = 0x80; m > 0; m >>= 1) { // write one bit - if ((value & m) == 0) *localDataOutRegister &= outmask2; - else *localDataOutRegister |= outmask1; + if ((value & m) == 0) *localDataOutRegister = d0; + else *localDataOutRegister = d1; + uint8_t r = *localClockRegister; // clock pulse HIGH - *localClockRegister |= cbmask1; + *localClockRegister = r | cbmask1; // read one bit if ((*localDataInRegister & inmask1) > 0) rv |= m; // clock pulse LOW diff --git a/libraries/FastShiftInOut/FastShiftInOut.h b/libraries/FastShiftInOut/FastShiftInOut.h index b603dd07..88685d68 100644 --- a/libraries/FastShiftInOut/FastShiftInOut.h +++ b/libraries/FastShiftInOut/FastShiftInOut.h @@ -2,7 +2,7 @@ // // FILE: FastShiftInOut.cpp // AUTHOR: Rob Tillaart -// VERSION: 0.2.0 +// VERSION: 0.2.1 // PURPOSE: Arduino library for (AVR) optimized shiftInOut (simultaneously) // URL: https://github.com/RobTillaart/FastShiftInOut @@ -10,10 +10,10 @@ #include "Arduino.h" -#define FASTSHIFTINOUT_LIB_VERSION (F("0.2.0")) +#define FASTSHIFTINOUT_LIB_VERSION (F("0.2.1")) // uncomment next line to get SPEED OPTIMIZED CODE -#define FASTSHIFTINOUT_AVR_LOOP_UNROLLED 1 +// #define FASTSHIFTINOUT_AVR_LOOP_UNROLLED 1 class FastShiftInOut diff --git a/libraries/FastShiftInOut/README.md b/libraries/FastShiftInOut/README.md index 7535851b..e695d481 100644 --- a/libraries/FastShiftInOut/README.md +++ 
b/libraries/FastShiftInOut/README.md @@ -58,22 +58,23 @@ Performance of **write()** #### Measurements -(0.2.0) +(0.2.1) Indicative time in microseconds, Arduino UNO, IDE 1.8.19, measured over 1000 calls. (delta between 2 calls and 1 call to eliminate overhead) -| function | 0.1.3 | 0.2.0 | 0.2.0L | -|:-------------------------|---------:|---------:|----------:| -| write() (reference) | no data | 158.24 | no data | -| write() | 25.52 | 17.61 | 12.26 | -| writeLSBFIRST() | 25.52 | 17.61 | 12.26 | -| writeMSBFIRST() | 25.52 | 17.60 | 12.20 | +| function | 0.1.3 | 0.2.0 | 0.2.0L | 0.2.1 | 0.2.1L | +|:-------------------------|---------:|---------:|----------:|---------:|----------:| +| write() (reference) | no data | 158.24 | no data | 158.24 | no data | +| write() | 25.52 | 17.61 | 12.26 | 16.72 | 11.00 | +| writeLSBFIRST() | 25.52 | 17.61 | 12.26 | 16.72 | 11.00 | +| writeMSBFIRST() | 25.52 | 17.60 | 12.20 | 16.72 | 10.94 | - Note: 0.1.3 added from old table. - Note: reference run on AVR by commenting all optimizations. - Note: 0.2.0 measured with loop unroll flag disabled. - Note: 0.2.0L measured with loop unrolled flag enabled. +- Note: 0.2.1 / 0.2.1L measured the same way as 0.2.0 / 0.2.0L (loop unroll flag disabled / enabled).
### Related @@ -83,6 +84,7 @@ Indicative time in microseconds, Arduino UNO, IDE 1.8.19, measured over 1000 cal - https://github.com/RobTillaart/FastShiftOut - https://github.com/RobTillaart/ShiftInSlow - https://github.com/RobTillaart/ShiftOutSlow +- https://github.com/RobTillaart/SWSPI (experimental) ## Interface diff --git a/libraries/FastShiftInOut/examples/FastShiftInOut_performance/performance_0.2.1.txt b/libraries/FastShiftInOut/examples/FastShiftInOut_performance/performance_0.2.1.txt new file mode 100644 index 00000000..49894aff --- /dev/null +++ b/libraries/FastShiftInOut/examples/FastShiftInOut_performance/performance_0.2.1.txt @@ -0,0 +1,39 @@ +IDE: 1.8.19 +Board: UNO + +FASTSHIFTINOUT_LIB_VERSION: 0.2.1 + +Performance - time in us + write: 17.86 + write: 34.58 + Delta: 16.72 + +writeLSBFIRST: 16.98 +writeLSBFIRST: 33.70 + Delta: 16.72 + +writeMSBFIRST: 16.98 +writeMSBFIRST: 33.70 + Delta: 16.72 + + +# loop unrolled. + +FASTSHIFTINOUT_LIB_VERSION: 0.2.1 + +Performance - time in us + write: 12.14 + write: 23.14 + Delta: 11.00 + +writeLSBFIRST: 11.26 +writeLSBFIRST: 22.26 + Delta: 11.00 + +writeMSBFIRST: 11.19 +writeMSBFIRST: 22.13 + Delta: 10.94 + + +done ... 
+ diff --git a/libraries/FastShiftInOut/library.json b/libraries/FastShiftInOut/library.json index 56293f5a..28ca653e 100644 --- a/libraries/FastShiftInOut/library.json +++ b/libraries/FastShiftInOut/library.json @@ -15,7 +15,7 @@ "type": "git", "url": "https://github.com/RobTillaart/FastShiftInOut.git" }, - "version": "0.2.0", + "version": "0.2.1", "license": "MIT", "frameworks": "*", "platforms": "*", diff --git a/libraries/FastShiftInOut/library.properties b/libraries/FastShiftInOut/library.properties index c6569126..b0f3c451 100644 --- a/libraries/FastShiftInOut/library.properties +++ b/libraries/FastShiftInOut/library.properties @@ -1,5 +1,5 @@ name=FastShiftInOut -version=0.2.0 +version=0.2.1 author=Rob Tillaart maintainer=Rob Tillaart sentence=Arduino library for (AVR) optimized shiftInOut (simultaneously)