16 Bit Store Optimizations

classic Classic list List threaded Threaded
2 messages Options
Reply | Threaded
Open this post in threaded view
|

16 Bit Store Optimizations

Thomas Watson-2
Hello all,

A frequent need in my code is to combine two 8 bit variables into a 16 bit variable. I am trying to determine the optimal way to do this. The naïve way and a more clever way both generate extra instructions that could be optimized away. I include a test case and comments which explain the setup and issue in more detail. It seems this is a missed opportunity for optimization in the compiler.

Thomas

#include <avr/io.h>
#include <inttypes.h>
#include <avr/interrupt.h>
/*
$ avr-gcc -v
Using built-in specs.
COLLECT_GCC=avr-gcc
COLLECT_LTO_WRAPPER=/usr/local/Cellar/avr-gcc/4.9.2/libexec/gcc/avr/4.9.2/lto-wrapper
Target: avr
Configured with: ../configure --enable-languages=c,c++ --target=avr --disable-libssp --disable-nls --with-dwarf2 --prefix=/usr/local/Cellar/avr-gcc/4.9.2 --with-gmp=/usr/local/Cellar/gmp/6.0.0a --with-mpfr=/usr/local/Cellar/mpfr/3.1.2-p10 --with-mpc=/usr/local/Cellar/libmpc/1.0.2 --datarootdir=/usr/local/Cellar/avr-gcc/4.9.2/share --bindir=/usr/local/Cellar/avr-gcc/4.9.2/bin --with-as=/usr/local/bin/avr-as --with-ld=/usr/local/bin/avr-ld
Thread model: single
gcc version 4.9.2 (GCC)
*/

// compile like:
// avr-gcc -mmcu=atmega328p -std=gnu99 -Os -Wall -DF_CPU=16000000 -Wa,-ahlmsd=sixteenbit_test.lst -o sixteenbit_test.elf sixteenbit_test.c

// data storage memory (might be used in ISR for example)
volatile uint16_t data;
// read and return a byte from the serial port
uint8_t read_byte() {
    while (!(UCSR0A & _BV(RXC0)));
    return (uint8_t)UDR0;
}

int main() {
    cli();

    // init serial port
    // etc

    sei();

    uint8_t temp_hi, temp_lo;

    // receive a word with |
    temp_hi = read_byte();
    temp_lo = read_byte();
    cli(); // not okay to get interrupted while assigning
    // in case an ISR comes and tries to read 'data'
    /* compiles to
  47 0012 282F          mov r18,r24
  48 0014 30E0          ldi r19,0
  49 0016 C901          movw r24,r18
  50 0018 9C2B          or r25,r28
  51 001a 9093 0000     sts data+1,r25
  52 001e 8093 0000     sts data,r24
    where r28 is temp_hi and r24 is temp_lo
    8 bytes and 4 cycles worse than the good solution
    */
    data = (temp_hi<<8) | temp_lo;
    sei(); // can get interrupted again

    // receive a word with pointer assignment
    // ugly and still does not compile how I want
    temp_hi = read_byte();
    temp_lo = read_byte();
    cli(); // not okay to get interrupted while assigning
    // in case an ISR comes and tries to read 'data'
    /* compiles to
  66 0030 E0E0          ldi r30,lo8(data)
  67 0032 F0E0          ldi r31,hi8(data)
  68 0034 8083          st Z,r24
  69 0036 C183          std Z+1,r28
    2 cycles worse than the good solution
    */
    *((uint8_t*)&data) = temp_lo;
    *((uint8_t*)&data+1) = temp_hi;
    sei(); // can get interrupted again

    /*
    Why won't the compiler compile it as
    sts data, r24
    sts data+1, r28
    It will, in the second case, if it feels Z and Y are otherwise occupied.
    But I couldn't get to think that for a reasonably short sample.
    It also will work correctly in this case for -O2.
    */
}

_______________________________________________
AVR-GCC-list mailing list
[hidden email]
https://lists.nongnu.org/mailman/listinfo/avr-gcc-list
Reply | Threaded
Open this post in threaded view
|

Re: 16 Bit Store Optimizations

Georg-Johann Lay-2
On 19.12.2016 22:24, Thomas Watson wrote:

> Hello all,
>
> A frequent need in my code is to combine two 8 bit variables into a 16 bit variable. I am trying to determine the optimal way to do this. The naïve way and a more clever way both generate extra instructions that could be optimized away. I include a test case and comments which explain the setup and issue in more detail. It seems this is a missed opportunity for optimization in the compiler.
>
> Thomas
>
> #include <avr/io.h>
> #include <inttypes.h>
> #include <avr/interrupt.h>
> /*
> $ avr-gcc -v
> Using built-in specs.
> COLLECT_GCC=avr-gcc
> COLLECT_LTO_WRAPPER=/usr/local/Cellar/avr-gcc/4.9.2/libexec/gcc/avr/4.9.2/lto-wrapper
> Target: avr
> Configured with: ../configure --enable-languages=c,c++ --target=avr --disable-libssp --disable-nls --with-dwarf2 --prefix=/usr/local/Cellar/avr-gcc/4.9.2 --with-gmp=/usr/local/Cellar/gmp/6.0.0a --with-mpfr=/usr/local/Cellar/mpfr/3.1.2-p10 --with-mpc=/usr/local/Cellar/libmpc/1.0.2 --datarootdir=/usr/local/Cellar/avr-gcc/4.9.2/share --bindir=/usr/local/Cellar/avr-gcc/4.9.2/bin --with-as=/usr/local/bin/avr-as --with-ld=/usr/local/bin/avr-ld
> Thread model: single
> gcc version 4.9.2 (GCC)
> */
>
> // compile like:
> // avr-gcc -mmcu=atmega328p -std=gnu99 -Os -Wall -DF_CPU=16000000 -Wa,-ahlmsd=sixteenbit_test.lst -o sixteenbit_test.elf sixteenbit_test.c
>
> // data storage memory (might be used in ISR for example)
> volatile uint16_t data;
> // read and return a byte from the serial port
> uint8_t read_byte() {
>     while (!(UCSR0A & _BV(RXC0)));
>     return (uint8_t)UDR0;
> }
>
> int main() {
>     cli();
>
>     // init serial port
>     // etc
>
>     sei();
>
>     uint8_t temp_hi, temp_lo;
>
>     // receive a word with |
>     temp_hi = read_byte();
>     temp_lo = read_byte();
>     cli(); // not okay to get interrupted while assigning
>     // in case an ISR comes and tries to read 'data'
>     /* compiles to
>   47 0012 282F          mov r18,r24
>   48 0014 30E0          ldi r19,0
>   49 0016 C901          movw r24,r18
>   50 0018 9C2B          or r25,r28
>   51 001a 9093 0000     sts data+1,r25
>   52 001e 8093 0000     sts data,r24
>     where r28 is temp_hi and r24 is temp_lo
>     8 bytes and 4 cycles worse than the good solution
>     */
>     data = (temp_hi<<8) | temp_lo;

Short answer:  This are around 5 operations:

- promote temp_hi to int
- promote temp_lo to int
- shift int left by 8
- or'ing two int values
- assign 16-bit result to data

Long answer:

This is known for some time now, cf.

http://gcc.gnu.org/PR41076

The problem is that 16-bit operations cannot just be split
into 8-bit operations because that usually leads to code bloat,
and splitting before register allocation will come up with
quite some 8-bit subregs of the 16-bit containers, which
would result in unpleasant register allocation and more
moves all over the place.

So one option is to add ugly post reg-alloc split patterns like
r242907 that split the combined instructions after register
allocation.  This gives some improvement but still no perfect code.

In particular, in order to store to a 16-bit value two registers
which are /not/ a 16-bit value (but 2 free floating 8-bit values)
the stores would also have to be split.  However, such a splitter
would never touch accesses with side effects (volatile) as in
your example.

The latest change is upstream since 2016-11-28 to trunk, which
is future v7 (released around spring 2017).

https://gcc.gnu.org/r242907

Feeding the following test case into respective compiler


#include <stdint.h>
#include <avr/interrupt.h>

extern volatile uint16_t data;

// read and return a byte from the serial port
extern uint8_t read_byte(void);

void test1 (void)
{
     uint8_t temp_hi, temp_lo;

     // receive a word with |
     temp_hi = read_byte();
     temp_lo = read_byte();

     cli();
     data = (temp_hi<<8) | temp_lo;
     sei();
}


and then

$ avr-gcc-7 foo.c -mmcu=atmega328 -dp -save-temps -Os

spits out the following foo.s assembly output:


test1:
        push r28 ;  22 pushqi1/1 [length = 1]
        call read_byte ;  5 call_value_insn/2 [length = 2]
        mov r28,r24 ;  6 movqi_insn/1 [length = 1]
        call read_byte ;  7 call_value_insn/2 [length = 2]
/* #APP */
        cli
/* #NOAPP */
        mov r25,r28 ;  20 movqi_insn/1 [length = 1]
        sts data+1,r25 ;  14 *movhi/4 [length = 4]
        sts data,r24
/* #APP */
        sei
/* #NOAPP */
        pop r28 ;  25 popqi [length = 1]
        ret ;  26 return_from_epilogue [length = 1]


Which is still not optimal due to insn 20 which is the last bit
of composing two 8-bit values into one 16-bit value for storage
(insn 14).

>     sei(); // can get interrupted again
>
>     // receive a word with pointer assignment
>     // ugly and still does not compile how I want
>     temp_hi = read_byte();
>     temp_lo = read_byte();
>     cli(); // not okay to get interrupted while assigning
>     // in case an ISR comes and tries to read 'data'
>     /* compiles to
>   66 0030 E0E0          ldi r30,lo8(data)
>   67 0032 F0E0          ldi r31,hi8(data)
>   68 0034 8083          st Z,r24
>   69 0036 C183          std Z+1,r28
>     2 cycles worse than the good solution
>     */
>     *((uint8_t*)&data) = temp_lo;
>     *((uint8_t*)&data+1) = temp_hi;

This is throwing away the volatile qualification.  And
type-punning through pointers is something that's really
discouraged.

If it's of utmost importance that a specific sequence is
generated, you can use inline assembler any wrap it into
a function for encapsulation and re-usage.  With the same
headers, declarations and compilation as above:


static inline __attribute__((__always_inline__))
void atomic_store16 (uint16_t volatile *addr, uint8_t lo, uint8_t hi)
{
     cli();
     __asm volatile ("sts %3+1,%[hi]" "\n\t"
                     "sts %3,%[lo]" "\n\t"
                     : "=m" (*addr)
                     : [hi] "r" (hi), [lo] "r" (lo), "i" (addr));
     sei();
}


void test4 (void)
{
     uint8_t temp_hi = read_byte();
     atomic_store16 (&data, read_byte(), temp_hi);
}


Which avr-gcc compiles to

test4:
        push r28 ;  15 pushqi1/1 [length = 1]
        call read_byte ;  5 call_value_insn/2 [length = 2]
        mov r28,r24 ;  6 movqi_insn/1 [length = 1]
        call read_byte ;  7 call_value_insn/2 [length = 2]
/* #APP */
        cli
        sts data+1,r28
        sts data,r24
        sei
/* #NOAPP */
        pop r28 ;  18 popqi [length = 1]
        ret ;  19 return_from_epilogue [length = 1]

The problem is the ugly, non-portable code and that it might
hiccup with -O0 and depends on data living in static storage.

Yet another ugly solution is type-punning through a union
(which is explicitly supported by GCC) as follows:

static inline __attribute__((__always_inline__))
void atomic_union16 (uint16_t volatile *addr, uint8_t lo, uint8_t hi)
{
     union { uint16_t word; uint8_t byte[2]; }
     val = { .byte[0] = lo, .byte[1] = hi };

     *addr = val.word;
}

void test5 (void)
{
     uint8_t temp_hi = read_byte();
     atomic_union16 (&data, read_byte(), temp_hi);
}

The code is the same as with test4 from above.

Johann

>     sei(); // can get interrupted again
>
>     /*
>     Why won't the compiler compile it as
>     sts data, r24
>     sts data+1, r28
>     It will, in the second case, if it feels Z and Y are otherwise occupied.
>     But I couldn't get to think that for a reasonably short sample.
>     It also will work correctly in this case for -O2.
>     */
> }


_______________________________________________
AVR-GCC-list mailing list
[hidden email]
https://lists.nongnu.org/mailman/listinfo/avr-gcc-list