git.the-white-hart.net Git - atmega/siggen.git/commitdiff
Annotate with cycle counts and SREG requirements [main]
author: user <none>
Tue, 26 Dec 2023 08:10:36 +0000 (02:10 -0600)
committer: user <none>
Tue, 26 Dec 2023 08:10:36 +0000 (02:10 -0600)
An asterisk after the cycle count indicates that the instruction requires
the SREG value left by a previous instruction.

asm_2/interp.asm

index 423df3ce791c13b48d93a9b6917dc5b7472c8aa6..88b63c98a45aa5014fc162a12a08658b2e9f9342 100644 (file)
@@ -200,818 +200,818 @@ _zreg_done:
 .org INTERPRETER_START
 loop:
        ; Check for countdown events
-       in      r25, TIFR0
-       sbrs    r25, OCF0A
-       rjmp    _countdown_done
+       in      r25, TIFR0               ; 1
+       sbrs    r25, OCF0A               ; 1/2
+       rjmp    _countdown_done          ; 2
 
        ; Clear the timer expire flag
-       out     TIFR0, r25
+       out     TIFR0, r25               ; 1
 
        ; Decrement each of the countdown clocks
-       ldi     ZL, LOW(interp_clocks)
-       ldi     ZH, HIGH(interp_clocks)
-       ldi     r23, 16
+       ldi     ZL, LOW(interp_clocks)   ; 1
+       ldi     ZH, HIGH(interp_clocks)  ; 1
+       ldi     r23, 16                  ; 1
 _countdown_loop:
-       dec     r23
-       brlt    _countdown_done
-       ldd     r24, Z+0
-       ldd     r25, Z+1
-       mov     r22, r24
-       or      r22, r25
-       breq    _countdown_next
-       subi    r24, 1
-       sbci    r25, 0
+       dec     r23                      ; 1
+       brlt    _countdown_done          ; 1/2*
+       ldd     r24, Z+0                 ; 2
+       ldd     r25, Z+1                 ; 2
+       mov     r22, r24                 ; 1
+       or      r22, r25                 ; 1
+       breq    _countdown_next          ; 1/2*
+       subi    r24, 1                   ; 1
+       sbci    r25, 0                   ; 1*
 _countdown_next:
-       st      Z+, r24
-       st      Z+, r25
-       rjmp    _countdown_loop
+       st      Z+, r24                  ; 2
+       st      Z+, r25                  ; 2
+       rjmp    _countdown_loop          ; 2
 _countdown_done:
 
        ; Fetch instruction
-       movw    ZL, r2     ; Put bytecode PC into Z
-       lsl     ZL         ; *2 to get a byte address from a word address
-       rol     ZH
-       lpm     r4, Z+     ; Load a word and increment
-       lpm     r5, Z+
-       lsr     ZH         ; /2 to get a word address from a byte address
-       ror     ZL
-       movw    r2, ZL     ; Save updated bytecode PC
+       movw    ZL, r2                   ; 1  Put bytecode PC into Z
+       lsl     ZL                       ; 1  *2 to get a byte address from a word address
+       rol     ZH                       ; 1*
+       lpm     r4, Z+                   ; 3  Load a word and increment
+       lpm     r5, Z+                   ; 3
+       lsr     ZH                       ; 1  /2 to get a word address from a byte address
+       ror     ZL                       ; 1*
+       movw    r2, ZL                   ; 1  Save updated bytecode PC
 
        ; Decode first operand, always register V[X]
-       mov     r24, r5    ; Extract X field from instruction
-       andi    r24, 0x0f
-       lsl     r24        ; *4 to get an offset from an index
-       lsl     r24
-       clr     r25        ; Add offset to base address of registers
-       ldi     ZL, LOW(interp_regs)
-       ldi     ZH, HIGH(interp_regs)
-       add     ZL, r24
-       adc     ZH, r25
-       movw    r16, ZL    ; Save the reg addr for use as destination later
-       ldd     r6, Z+0    ; Load operand value
-       ldd     r7, Z+1
-       ldd     r8, Z+2
-       ldd     r9, Z+3
+       mov     r24, r5                  ; 1  Extract X field from instruction
+       andi    r24, 0x0f                ; 1
+       lsl     r24                      ; 1  *4 to get an offset from an index
+       lsl     r24                      ; 1
+       clr     r25                      ; 1  Add offset to base address of registers
+       ldi     ZL, LOW(interp_regs)     ; 1
+       ldi     ZH, HIGH(interp_regs)    ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       movw    r16, ZL                  ; 1  Save the reg addr for use as destination later
+       ldd     r6, Z+0                  ; 2  Load operand value
+       ldd     r7, Z+1                  ; 2
+       ldd     r8, Z+2                  ; 2
+       ldd     r9, Z+3                  ; 2
 
        ; Decode second operand based on instruction F field
-       mov     r24, r5    ; Extract F field from instruction
-       lsr     r24
-       lsr     r24
-       lsr     r24
-       lsr     r24
-       clr     r25        ; Add base address of operand-dispatch jumptable
-       ldi     ZL, LOW(operand_jumptable)
-       ldi     ZH, HIGH(operand_jumptable)
-       add     ZL, r24
-       adc     ZH, r25
-       mov     r15, r24   ; Save the F field for decoding instruction later
-       ijmp               ; Jump to whatever code decodes the other operand
+       mov     r24, r5                  ; 1  Extract F field from instruction
+       lsr     r24                      ; 1
+       lsr     r24                      ; 1
+       lsr     r24                      ; 1
+       lsr     r24                      ; 1
+       clr     r25                      ; 1  Add base address of operand-dispatch jumptable
+       ldi     ZL, LOW(operand_jumptable)   ; 1
+       ldi     ZH, HIGH(operand_jumptable)  ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       mov     r15, r24                 ; 1  Save the F field for decoding instruction later
+       ijmp                             ; 2  Jump to whatever code decodes the other operand
 _decode_done:
 
        ; Load flags value
-       ldi     r24, 0x3c  ; Offset of the VF register (0xf * 4)
-       clr     r25        ; Add offset to base address of registers
-       ldi     ZL, LOW(interp_regs)
-       ldi     ZH, HIGH(interp_regs)
-       add     ZL, r24
-       adc     ZH, r25
-       ld      r14, Z
+       ldi     r24, 0x3c                ; 1  Offset of the VF register (0xf * 4)
+       clr     r25                      ; 1  Add offset to base address of registers
+       ldi     ZL, LOW(interp_regs)     ; 1
+       ldi     ZH, HIGH(interp_regs)    ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ld      r14, Z                   ; 2
 
        ; Dispatch based on instruction F field
-       mov     r24, r15   ; Recover saved F field from instruction
-       clr     r25        ; Add base address of instruction-dispatch jumptable
-       ldi     ZL, LOW(f_dispatch_jumptable)
-       ldi     ZH, HIGH(f_dispatch_jumptable)
-       add     ZL, r24
-       adc     ZH, r25
-       ijmp               ; Jump to whatever code runs this type of instruction
+       mov     r24, r15                 ; 1  Recover saved F field from instruction
+       clr     r25                      ; 1  Add base address of instruction-dispatch jumptable
+       ldi     ZL, LOW(f_dispatch_jumptable)   ; 1
+       ldi     ZH, HIGH(f_dispatch_jumptable)  ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ijmp                             ; 2  Jump to whatever code runs this type of instruction
 _dispatch_done_writeback_flags:
        ; Get rid of the S, N, and Z flags, we're making our own
-       ldi     r25, 0xe9
-       and     r14, r25
+       ldi     r25, 0xe9                ; 1
+       and     r14, r25                 ; 1
 
        ; Compute N flag
-       ldi     r25, 0x04
-       sbrc    r9, 7
-       or      r14, r25
+       ldi     r25, 0x04                ; 1
+       sbrc    r9, 7                    ; 1/2
+       or      r14, r25                 ; 1
 
        ; Compute Z flag
-       ldi     r24, 0x02
-       clr     r25
-       or      r25, r6
-       or      r25, r7
-       or      r25, r8
-       or      r25, r9
-       brne    _no_z
-       or      r14, r24
+       ldi     r24, 0x02                ; 1
+       clr     r25                      ; 1
+       or      r25, r6                  ; 1
+       or      r25, r7                  ; 1
+       or      r25, r8                  ; 1
+       or      r25, r9                  ; 1
+       brne    _no_z                    ; 1/2*
+       or      r14, r24                 ; 1
 _no_z:
 
        ; Compute S flag
-       mov     r25, r14
-       lsl     r25        ; Shift N flag up to where V is
-       eor     r25, r14   ; Xor to get the value for the S flag
-       bst     r25, 3     ; Read flag value
-       bld     r14, 4     ; Write into proper spot
+       mov     r25, r14                 ; 1
+       lsl     r25                      ; 1  Shift N flag up to where V is
+       eor     r25, r14                 ; 1  Xor to get the value for the S flag
+       bst     r25, 3                   ; 1  Read flag value into SREG T bit
+       bld     r14, 4                   ; 1* Write T into proper spot (needs T from bst)
 
 _dispatch_done_writeback_fixedflags:
-       ldi     r24, 0x3c  ; Offset of the VF register (0xf * 4)
-       clr     r25        ; Add offset to base address of registers
-       ldi     ZL, LOW(interp_regs)
-       ldi     ZH, HIGH(interp_regs)
-       add     ZL, r24
-       adc     ZH, r25
-       st      Z, r14     ; Store the flag byte generated by the instruction
+       ldi     r24, 0x3c                ; 1  Offset of the VF register (0xf * 4)
+       clr     r25                      ; 1  Add offset to base address of registers
+       ldi     ZL, LOW(interp_regs)     ; 1
+       ldi     ZH, HIGH(interp_regs)    ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       st      Z, r14                   ; 2  Store the flag byte generated by the instruction
 _dispatch_done_writeback_reg:
-       movw    ZL, r16    ; Recover the pointer to V[X]
-       std     Z+0, r6    ; Save the instruction result to the register
-       std     Z+1, r7
-       std     Z+2, r8
-       std     Z+3, r9
+       movw    ZL, r16                  ; 1  Recover the pointer to V[X]
+       std     Z+0, r6                  ; 2  Save the instruction result to the register
+       std     Z+1, r7                  ; 2
+       std     Z+2, r8                  ; 2
+       std     Z+3, r9                  ; 2
 _dispatch_done:
 
-       rjmp    loop
+       rjmp    loop                     ; 2
 
 
 ; ------------------------------------------------------------------------------
 ; Operand decoding
 
 operand_jumptable:
-       rjmp    operand_VY
-       rjmp    operand_imm32
-       rjmp    operand_Y
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_VY_N
-       rjmp    operand_PC_ssNN
-       rjmp    operand_PC_sNNN
-       rjmp    operand_PC_sNNN
-       rjmp    operand_0NNN
-       rjmp    operand_VY_N
+       rjmp    operand_VY               ; 2
+       rjmp    operand_imm32            ; 2
+       rjmp    operand_Y                ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_VY_N             ; 2
+       rjmp    operand_PC_ssNN          ; 2
+       rjmp    operand_PC_sNNN          ; 2
+       rjmp    operand_PC_sNNN          ; 2
+       rjmp    operand_0NNN             ; 2
+       rjmp    operand_VY_N             ; 2
 
 
 ; ----- V[Y]
 operand_VY:
-       mov     r24, r4
-       andi    r24, 0xf0
-       lsr     r24
-       lsr     r24
-       clr     r25
-       ldi     ZL, LOW(interp_regs)
-       ldi     ZH, HIGH(interp_regs)
-       add     ZL, r24
-       adc     ZH, r25
-       ldd     r10, Z+0
-       ldd     r11, Z+1
-       ldd     r12, Z+2
-       ldd     r13, Z+3
-
-       rjmp    _decode_done
+       mov     r24, r4                  ; 1
+       andi    r24, 0xf0                ; 1
+       lsr     r24                      ; 1
+       lsr     r24                      ; 1
+       clr     r25                      ; 1
+       ldi     ZL, LOW(interp_regs)     ; 1
+       ldi     ZH, HIGH(interp_regs)    ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ldd     r10, Z+0                 ; 2
+       ldd     r11, Z+1                 ; 2
+       ldd     r12, Z+2                 ; 2
+       ldd     r13, Z+3                 ; 2
+
+       rjmp    _decode_done             ; 2
 
 
 ; ----- 32-bit immediate following instruction
 operand_imm32:
-       movw    ZL, r2     ; Put bytecode PC into Z
-       lsl     ZL         ; *2 to get a byte address from a word address
-       rol     ZH
-       lpm     r10, Z+    ; Load four bytes and increment
-       lpm     r11, Z+
-       lpm     r12, Z+
-       lpm     r13, Z+
-       lsr     ZH         ; /2 to get a word address from a byte address
-       ror     ZL
-       movw    r2, ZL     ; Save updated bytecode PC
+       movw    ZL, r2                   ; 1  Put bytecode PC into Z
+       lsl     ZL                       ; 1  *2 to get a byte address from a word address
+       rol     ZH                       ; 1*
+       lpm     r10, Z+                  ; 3  Load four bytes and increment
+       lpm     r11, Z+                  ; 3
+       lpm     r12, Z+                  ; 3
+       lpm     r13, Z+                  ; 3
+       lsr     ZH                       ; 1  /2 to get a word address from a byte address
+       ror     ZL                       ; 1*
+       movw    r2, ZL                   ; 1  Save updated bytecode PC
 
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 ; ----- 4-bit zero-extended immediate within instruction
 operand_Y:
-       mov     r10, r4
-       lsr     r10
-       lsr     r10
-       lsr     r10
-       lsr     r10
-       clr     r11
-       clr     r12
-       clr     r13
+       mov     r10, r4                  ; 1
+       lsr     r10                      ; 1
+       lsr     r10                      ; 1
+       lsr     r10                      ; 1
+       lsr     r10                      ; 1
+       clr     r11                      ; 1
+       clr     r12                      ; 1
+       clr     r13                      ; 1
 
        ; Most of these instructions have no use for a zero immediate
        ; Replace zero with a more useful 0x10 value, for range of 0x01-0x10
        ; Instructions that want 0x00-0x0f can mask off the upper nibble
-       tst     r10
-       brne    _operand_Y_done
-       ldi     r25, 0x10
-       mov     r10, r25
+       tst     r10                      ; 1
+       brne    _operand_Y_done          ; 1/2*
+       ldi     r25, 0x10                ; 1
+       mov     r10, r25                 ; 1
 _operand_Y_done:
 
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 ; ----- V[Y] + 4-bit zero-extended immediate within instruction
 operand_VY_N:
        ; Load V[Y]
-       mov     r24, r4
-       andi    r24, 0xf0
-       lsr     r24
-       lsr     r24
-       clr     r25
-       ldi     ZL, LOW(interp_regs)
-       ldi     ZH, HIGH(interp_regs)
-       add     ZL, r24
-       adc     ZH, r25
-       ldd     r10, Z+0
-       ldd     r11, Z+1
-       ldd     r12, Z+2
-       ldd     r13, Z+3
+       mov     r24, r4                  ; 1
+       andi    r24, 0xf0                ; 1
+       lsr     r24                      ; 1
+       lsr     r24                      ; 1
+       clr     r25                      ; 1
+       ldi     ZL, LOW(interp_regs)     ; 1
+       ldi     ZH, HIGH(interp_regs)    ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ldd     r10, Z+0                 ; 2
+       ldd     r11, Z+1                 ; 2
+       ldd     r12, Z+2                 ; 2
+       ldd     r13, Z+3                 ; 2
 
        ; Add N
-       mov     r24, r4
-       andi    r24, 0x0f
-       clr     r25
-       add     r10, r24
-       adc     r11, r25
-       adc     r12, r25
-       adc     r13, r25
+       mov     r24, r4                  ; 1
+       andi    r24, 0x0f                ; 1
+       clr     r25                      ; 1
+       add     r10, r24                 ; 1
+       adc     r11, r25                 ; 1*
+       adc     r12, r25                 ; 1*
+       adc     r13, r25                 ; 1*
 
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 ; ----- Zero-extended 12-bit immediate within instruction
 operand_0NNN:
-       movw    r10, r4
-       ldi     r25, 0x0f
-       and     r11, r25
-       clr     r12
-       clr     r13
-       rjmp    _decode_done
+       movw    r10, r4                  ; 1
+       ldi     r25, 0x0f                ; 1
+       and     r11, r25                 ; 1
+       clr     r12                      ; 1
+       clr     r13                      ; 1
+       rjmp    _decode_done             ; 2
 
 
 ; ----- PC + sign-extended 8-bit immediate within instruction
 operand_PC_ssNN:
        ; Sign-extend 8-bit immediate
-       mov     r10, r4
-       clr     r11
-       clr     r12
-       clr     r13
-       sbrs    r10, 7
-       rjmp    _sext_done_PCssNN
-       com     r11
-       com     r12
-       com     r13
+       mov     r10, r4                  ; 1
+       clr     r11                      ; 1
+       clr     r12                      ; 1
+       clr     r13                      ; 1
+       sbrs    r10, 7                   ; 1/2
+       rjmp    _sext_done_PCssNN        ; 2
+       com     r11                      ; 1
+       com     r12                      ; 1
+       com     r13                      ; 1
 _sext_done_PCssNN:
 
        ; Add PC
-       clr     r25
-       add     r10, r2
-       adc     r11, r3
-       adc     r12, r25
-       adc     r13, r25
+       clr     r25                      ; 1
+       add     r10, r2                  ; 1
+       adc     r11, r3                  ; 1*
+       adc     r12, r25                 ; 1*
+       adc     r13, r25                 ; 1*
 
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 ; ----- PC + sign-extended 12-bit immediate within instruction
 operand_PC_sNNN:
        ; Sign-extend 12-bit immediate
-       movw    r10, r4
-       ldi     r25, 0x0f
-       and     r11, r25
-       clr     r12
-       clr     r13
-       sbrs    r11, 3
-       rjmp    _sext_done_PCsNNN
-       ldi     r25, 0xf0
-       or      r11, r25
-       com     r12
-       com     r13
+       movw    r10, r4                  ; 1
+       ldi     r25, 0x0f                ; 1
+       and     r11, r25                 ; 1
+       clr     r12                      ; 1
+       clr     r13                      ; 1
+       sbrs    r11, 3                   ; 1/2
+       rjmp    _sext_done_PCsNNN        ; 2
+       ldi     r25, 0xf0                ; 1
+       or      r11, r25                 ; 1
+       com     r12                      ; 1
+       com     r13                      ; 1
 _sext_done_PCsNNN:
 
        ; Add PC
-       clr     r25
-       add     r10, r2
-       adc     r11, r3
-       adc     r12, r25
-       adc     r13, r25
+       clr     r25                      ; 1
+       add     r10, r2                  ; 1
+       adc     r11, r3                  ; 1*
+       adc     r12, r25                 ; 1*
+       adc     r13, r25                 ; 1*
 
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 operand_none:
-       rjmp    _decode_done
+       rjmp    _decode_done             ; 2
 
 
 ; ------------------------------------------------------------------------------
 ; Instruction dispatch
 
 f_dispatch_jumptable:
-       rjmp    dispatch_alu
-       rjmp    dispatch_alu
-       rjmp    dispatch_imm4
-       rjmp    exec_ldb
-       rjmp    exec_ldh
-       rjmp    exec_ldw
-       rjmp    exec_stb
-       rjmp    exec_sth
-       rjmp    exec_stw
-       rjmp    exec_lpb
-       rjmp    exec_lph
-       rjmp    dispatch_branch
-       rjmp    exec_jal_with_ve
-       rjmp    exec_jmp
-       rjmp    exec_ext
-       rjmp    exec_lpw
+       rjmp    dispatch_alu             ; 2
+       rjmp    dispatch_alu             ; 2
+       rjmp    dispatch_imm4            ; 2
+       rjmp    exec_ldb                 ; 2
+       rjmp    exec_ldh                 ; 2
+       rjmp    exec_ldw                 ; 2
+       rjmp    exec_stb                 ; 2
+       rjmp    exec_sth                 ; 2
+       rjmp    exec_stw                 ; 2
+       rjmp    exec_lpb                 ; 2
+       rjmp    exec_lph                 ; 2
+       rjmp    dispatch_branch          ; 2
+       rjmp    exec_jal_with_ve         ; 2
+       rjmp    exec_jmp                 ; 2
+       rjmp    exec_ext                 ; 2
+       rjmp    exec_lpw                 ; 2
 
 alu_dispatch_jumptable:
-       rjmp    exec_add
-       rjmp    exec_sub
-       rjmp    exec_and
-       rjmp    exec_or
-       rjmp    exec_xor
-       rjmp    exec_nor
-       rjmp    exec_mov
-       rjmp    exec_mul
-       rjmp    exec_test
-       rjmp    exec_cmp
-       rjmp    exec_udiv
-       rjmp    exec_umod
-       rjmp    exec_sdiv
-       rjmp    exec_smod
-       rjmp    exec_nop
-       rjmp    exec_jal
+       rjmp    exec_add                 ; 2
+       rjmp    exec_sub                 ; 2
+       rjmp    exec_and                 ; 2
+       rjmp    exec_or                  ; 2
+       rjmp    exec_xor                 ; 2
+       rjmp    exec_nor                 ; 2
+       rjmp    exec_mov                 ; 2
+       rjmp    exec_mul                 ; 2
+       rjmp    exec_test                ; 2
+       rjmp    exec_cmp                 ; 2
+       rjmp    exec_udiv                ; 2
+       rjmp    exec_umod                ; 2
+       rjmp    exec_sdiv                ; 2
+       rjmp    exec_smod                ; 2
+       rjmp    exec_nop                 ; 2
+       rjmp    exec_jal                 ; 2
 
 imm4_dispatch_jumptable:
-       rjmp    exec_add
-       rjmp    exec_sub
-       rjmp    exec_mov
-       rjmp    exec_shl
-       rjmp    exec_shrl
-       rjmp    exec_shra
-       rjmp    exec_rol
-       rjmp    exec_ror
-       rjmp    exec_spi
-       rjmp    exec_mft
-       rjmp    exec_mtt
-       rjmp    exec_ddir
-       rjmp    exec_din
-       rjmp    exec_dout
-       rjmp    exec_ain
-       rjmp    exec_aout
+       rjmp    exec_add                 ; 2
+       rjmp    exec_sub                 ; 2
+       rjmp    exec_mov                 ; 2
+       rjmp    exec_shl                 ; 2
+       rjmp    exec_shrl                ; 2
+       rjmp    exec_shra                ; 2
+       rjmp    exec_rol                 ; 2
+       rjmp    exec_ror                 ; 2
+       rjmp    exec_spi                 ; 2
+       rjmp    exec_mft                 ; 2
+       rjmp    exec_mtt                 ; 2
+       rjmp    exec_ddir                ; 2
+       rjmp    exec_din                 ; 2
+       rjmp    exec_dout                ; 2
+       rjmp    exec_ain                 ; 2
+       rjmp    exec_aout                ; 2
 
 branch_dispatch_jumptable:
-       rjmp    exec_jtab
-       rjmp    exec_jtab
-       rjmp    exec_jtab
-       rjmp    exec_jtab
-       rjmp    exec_jtab
-       rjmp    exec_jtab
-       rjmp    exec_blt
-       rjmp    exec_bge
-       rjmp    exec_bv
-       rjmp    exec_bnv
-       rjmp    exec_bmi
-       rjmp    exec_bpl
-       rjmp    exec_bz
-       rjmp    exec_bnz
-       rjmp    exec_c
-       rjmp    exec_nc
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_jtab                ; 2
+       rjmp    exec_blt                 ; 2
+       rjmp    exec_bge                 ; 2
+       rjmp    exec_bv                  ; 2
+       rjmp    exec_bnv                 ; 2
+       rjmp    exec_bmi                 ; 2
+       rjmp    exec_bpl                 ; 2
+       rjmp    exec_bz                  ; 2
+       rjmp    exec_bnz                 ; 2
+       rjmp    exec_c                   ; 2
+       rjmp    exec_nc                  ; 2
 
 
 dispatch_alu:
-       mov     r24, r4
-       andi    r24, 0x0f
-       clr     r25
-       ldi     ZL, LOW(alu_dispatch_jumptable)
-       ldi     ZH, HIGH(alu_dispatch_jumptable)
-       add     ZL, r24
-       adc     ZH, r25
-       ijmp
+       mov     r24, r4                  ; 1
+       andi    r24, 0x0f                ; 1
+       clr     r25                      ; 1
+       ldi     ZL, LOW(alu_dispatch_jumptable)   ; 1
+       ldi     ZH, HIGH(alu_dispatch_jumptable)  ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ijmp                             ; 2
 
 dispatch_imm4:
-       mov     r24, r4
-       andi    r24, 0x0f
-       clr     r25
-       ldi     ZL, LOW(imm4_dispatch_jumptable)
-       ldi     ZH, HIGH(imm4_dispatch_jumptable)
-       add     ZL, r24
-       adc     ZH, r25
-       ijmp
+       mov     r24, r4                  ; 1
+       andi    r24, 0x0f                ; 1
+       clr     r25                      ; 1
+       ldi     ZL, LOW(imm4_dispatch_jumptable)   ; 1
+       ldi     ZH, HIGH(imm4_dispatch_jumptable)  ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ijmp                             ; 2
 
 dispatch_branch:
-       mov     r24, r5
-       andi    r24, 0x0f
-       clr     r25
-       ldi     ZL, LOW(branch_dispatch_jumptable)
-       ldi     ZH, HIGH(branch_dispatch_jumptable)
-       add     ZL, r24
-       adc     ZH, r25
-       ijmp
+       mov     r24, r5                  ; 1
+       andi    r24, 0x0f                ; 1
+       clr     r25                      ; 1
+       ldi     ZL, LOW(branch_dispatch_jumptable)   ; 1
+       ldi     ZH, HIGH(branch_dispatch_jumptable)  ; 1
+       add     ZL, r24                  ; 1
+       adc     ZH, r25                  ; 1*
+       ijmp                             ; 2
 
 
 exec_nop:
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_add:
-       add     r6, r10
-       adc     r7, r11
-       adc     r8, r12
-       adc     r9, r13
+       add     r6, r10                  ; 1
+       adc     r7, r11                  ; 1*
+       adc     r8, r12                  ; 1*
+       adc     r9, r13                  ; 1*
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxC
-       andi    r24, 0x19
-       ldi     r25, 0xe6  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxC
+       andi    r24, 0x19                ; 1
+       ldi     r25, 0xe6                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_sub:
-       sub     r6, r10
-       sbc     r7, r11
-       sbc     r8, r12
-       sbc     r9, r13
+       sub     r6, r10                  ; 1
+       sbc     r7, r11                  ; 1*
+       sbc     r8, r12                  ; 1*
+       sbc     r9, r13                  ; 1*
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxC
-       andi    r24, 0x19
-       ldi     r25, 0xe6  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxC
+       andi    r24, 0x19                ; 1
+       ldi     r25, 0xe6                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_and:
-       and     r6, r10
-       and     r7, r11
-       and     r8, r12
-       and     r9, r13
+       and     r6, r10                  ; 1
+       and     r7, r11                  ; 1
+       and     r8, r12                  ; 1
+       and     r9, r13                  ; 1
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxx
-       andi    r24, 0x18
-       ldi     r25, 0xe7  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxx
+       andi    r24, 0x18                ; 1
+       ldi     r25, 0xe7                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_or:
-       or      r6, r10
-       or      r7, r11
-       or      r8, r12
-       or      r9, r13
+       or      r6, r10                  ; 1
+       or      r7, r11                  ; 1
+       or      r8, r12                  ; 1
+       or      r9, r13                  ; 1
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxx
-       andi    r24, 0x18
-       ldi     r25, 0xe7  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxx
+       andi    r24, 0x18                ; 1
+       ldi     r25, 0xe7                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_xor:
-       eor     r6, r10
-       eor     r7, r11
-       eor     r8, r12
-       eor     r9, r13
+       eor     r6, r10                  ; 1
+       eor     r7, r11                  ; 1
+       eor     r8, r12                  ; 1
+       eor     r9, r13                  ; 1
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxx
-       andi    r24, 0x18
-       ldi     r25, 0xe7  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxx
+       andi    r24, 0x18                ; 1
+       ldi     r25, 0xe7                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_nor:
-       or      r6, r10
-       or      r7, r11
-       or      r8, r12
-       or      r9, r13
-       com     r6
-       com     r7
-       com     r8
-       com     r9
+       or      r6, r10                  ; 1  r6..r9 |= r10..r13 ...
+       or      r7, r11                  ; 1
+       or      r8, r12                  ; 1
+       or      r9, r13                  ; 1
+       com     r6                       ; 1  ... then invert every byte to get NOR
+       com     r7                       ; 1
+       com     r8                       ; 1
+       com     r9                       ; 1  Top byte last, so SREG reflects it for the capture below
 
        ; Flags
-       in      r24, SREG  ; Load real flags and keep xxxSVxxx
-       andi    r24, 0x18
-       ldi     r25, 0xe7  ; Clear old flag bits that we're taking from SREG
-       and     r14, r25
-       or      r14, r24   ; Add flags from SREG into interpreter flags
+       in      r24, SREG                ; 1* Load real flags and keep xxxSVxxx
+       andi    r24, 0x18                ; 1  0x18 selects the S and V bit positions
+       ldi     r25, 0xe7                ; 1  Clear old flag bits that we're taking from SREG
+       and     r14, r25                 ; 1  Mask goes through r25 because r14 cannot take an immediate
+       or      r14, r24                 ; 1  Add flags from SREG into interpreter flags
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_mov:
-       movw    r6, r10
-       movw    r8, r12
-       rjmp    _dispatch_done_writeback_reg
+       movw    r6, r10                  ; 1  r6:r7 = r10:r11 (low word)
+       movw    r8, r12                  ; 1  r8:r9 = r12:r13 (high word)
+       rjmp    _dispatch_done_writeback_reg  ; 2  r14 (flags) is not touched in this routine
 
 
 exec_mul:
-       clr     r0         ; Zero for adding carries
-       clr     r1         ; Carry accumulation
-       ldi     r21, 32    ; Loop counter
-       clr     r22        ; Temporary for result
-       clr     r23
-       clr     r24
-       clr     r25
+       clr     r0                       ; 1  Zero for adding carries
+       clr     r1                       ; 1  Carry accumulation
+       ldi     r21, 32                  ; 1  Loop counter
+       clr     r22                      ; 1  Temporary for result
+       clr     r23                      ; 1
+       clr     r24                      ; 1
+       clr     r25                      ; 1
 
        ; Multiply
 _mul_loop:
-       dec     r21
-       brmi    _mul_done
-
-       lsl     r22        ; Shift result one bit up
-       rol     r23
-       rol     r24
-       rol     r25
-       adc     r1, r0
-
-       lsl     r10        ; Shift multiplier one bit up
-       rol     r11
-       rol     r12
-       rol     r13
-
-       brcc    _mul_loop  ; If the multiplier high bit was 1, add multiplicand
-       add     r22, r6
-       adc     r23, r7
-       adc     r24, r8
-       adc     r25, r9
-       adc     r1, r0
-       rjmp    _mul_loop
+       dec     r21                      ; 1
+       brmi    _mul_done                ; 1/2*
+
+       lsl     r22                      ; 1  Shift result one bit up
+       rol     r23                      ; 1*
+       rol     r24                      ; 1*
+       rol     r25                      ; 1*
+       adc     r1, r0                   ; 1*
+
+       lsl     r10                      ; 1  Shift multiplier one bit up
+       rol     r11                      ; 1*
+       rol     r12                      ; 1*
+       rol     r13                      ; 1*
+
+       brcc    _mul_loop                ; 1/2* If the multiplier high bit was 1, add multiplicand
+       add     r22, r6                  ; 1
+       adc     r23, r7                  ; 1*
+       adc     r24, r8                  ; 1*
+       adc     r25, r9                  ; 1*
+       adc     r1, r0                   ; 1*
+       rjmp    _mul_loop                ; 2
 _mul_done:
 
        ; Copy low half of temporary to result (frees up temp regs for flags)
-       movw    r6, r22
+       movw    r6, r22                  ; 1
 
-       mov     r22, r14   ; Copy flags to temp
-       andi    r22, 0xf6  ; Clear V, and C flags
+       mov     r22, r14                 ; 1  Copy flags to temp
+       andi    r22, 0xf6                ; 1  Clear V, and C flags
 
        ; Set carry flag if any of the upper 32 bits of result would be set
-       tst     r1
-       breq    _mul_no_carry
-       ori     r22, 0x01
+       tst     r1                       ; 1
+       breq    _mul_no_carry            ; 1/2*
+       ori     r22, 0x01                ; 1
 _mul_no_carry:
 
        ; Set overflow flag if sign of result disagrees with signs of inputs
-       mov     r23, r9
-       eor     r23, r13   ; Top bit of r23 is one if result should be negative
-       eor     r23, r25   ; Top bit of r23 is one if result sign is incorrect
-       sbrc    r23, 7
-       ori     r22, 0x08
+       mov     r23, r9                  ; 1
+       eor     r23, r13                 ; 1  Top bit of r23 is one if result should be negative
+       eor     r23, r25                 ; 1  Top bit of r23 is one if result sign is incorrect
+       sbrc    r23, 7                   ; 1/2
+       ori     r22, 0x08                ; 1
 
-       mov     r14, r22   ; Copy temp back into flags
+       mov     r14, r22                 ; 1  Copy temp back into flags
 
        ; Copy high half of temporary to result
-       movw    r8, r24
+       movw    r8, r24                  ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_udiv:
-       mov     r1, r9
-       eor     r1, r13    ; Bit 7 is set if the result needs to be negated
-       clr     r0         ; For adding carries
+       mov     r1, r9                   ; 1  NOTE(review): nothing below negates; r1 only feeds the overflow check
+       eor     r1, r13                  ; 1  Bit 7 is set if the result needs to be negated
+       clr     r0                       ; 1  For adding carries
 
-       ldi     r25, 0xf6  ; Discard overflow and carry flags
-       and     r14, r25
+       ldi     r25, 0xf6                ; 1  Discard overflow and carry flags
+       and     r14, r25                 ; 1
 
        ; Set carry flag if dividing by zero
        ; Then divide anyway, because it's hilarious
-       mov     r25, r10
-       or      r25, r11
-       or      r25, r12
-       or      r25, r13
-       brne    _udiv_no_divz
-       ldi     r25, 0x01
-       or      r14, r25
+       mov     r25, r10                 ; 1  OR the four divisor bytes together
+       or      r25, r11                 ; 1
+       or      r25, r12                 ; 1
+       or      r25, r13                 ; 1
+       brne    _udiv_no_divz            ; 1/2*
+       ldi     r25, 0x01                ; 1  0x01 is the carry bit in r14
+       or      r14, r25                 ; 1
 _udiv_no_divz:
 
        ; Call/ret take more than three clock cycles, so they can't be used
-       ldi     ZL, LOW(_div_done)
-       ldi     ZH, HIGH(_div_done)
-       rjmp    div_subroutine
+       ldi     ZL, LOW(_div_done)       ; 1  Z = return address; div_subroutine ends with ijmp
+       ldi     ZH, HIGH(_div_done)      ; 1
+       rjmp    div_subroutine           ; 2  Leaves the quotient in r6..r9
 _div_done:
 
        ; Set the overflow flag if the sign is unexpected
-       ldi     r25, 0x08
-       eor     r1, r9
-       sbrc    r1, 7
-       or      r14, r25
+       ldi     r25, 0x08                ; 1  0x08 is the overflow bit in r14
+       eor     r1, r9                   ; 1  Bit 7 = expected sign ^ actual top bit of the quotient
+       sbrc    r1, 7                    ; 1/2
+       or      r14, r25                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_umod:
-       mov     r1, r13    ; Bit 7 is set if the result needs to be negated
-       clr     r0         ; For adding carries
+       mov     r1, r13                  ; 1  Bit 7 is set if the result needs to be negated
+       clr     r0                       ; 1  For adding carries
 
-       ldi     r25, 0xf6  ; Discard overflow and carry flags
-       and     r14, r25
+       ldi     r25, 0xf6                ; 1  Discard overflow and carry flags
+       and     r14, r25                 ; 1
 
        ; Set carry flag if dividing by zero
        ; Then divide anyway, because it's hilarious
-       mov     r25, r10
-       or      r25, r11
-       or      r25, r12
-       or      r25, r13
-       brne    _umod_no_divz
-       ldi     r25, 0x01
-       or      r14, r25
+       mov     r25, r10                 ; 1  OR the four divisor bytes together
+       or      r25, r11                 ; 1
+       or      r25, r12                 ; 1
+       or      r25, r13                 ; 1
+       brne    _umod_no_divz            ; 1/2*
+       ldi     r25, 0x01                ; 1  0x01 is the carry bit in r14
+       or      r14, r25                 ; 1
 _umod_no_divz:
 
        ; Call/ret take more than three clock cycles, so they can't be used
-       ldi     ZL, LOW(_mod_done)
-       ldi     ZH, HIGH(_mod_done)
-       rjmp    div_subroutine
+       ldi     ZL, LOW(_mod_done)       ; 1  Z = return address; div_subroutine ends with ijmp
+       ldi     ZH, HIGH(_mod_done)      ; 1
+       rjmp    div_subroutine           ; 2  Leaves the remainder in r22..r25
 _mod_done:
 
-       movw    r6, r22
-       movw    r8, r24
+       movw    r6, r22                  ; 1  Result r6..r9 = remainder r22..r25
+       movw    r8, r24                  ; 1
 
        ; Set the overflow flag if the sign is unexpected
-       ldi     r25, 0x08
-       eor     r1, r9
-       sbrc    r1, 7
-       or      r14, r25
+       ldi     r25, 0x08                ; 1  0x08 is the overflow bit in r14
+       eor     r1, r9                   ; 1  Bit 7 = divisor sign ^ top bit of the remainder
+       sbrc    r1, 7                    ; 1/2
+       or      r14, r25                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_sdiv:
-       mov     r1, r9
-       eor     r1, r13    ; Bit 7 is set if the result needs to be negated
-       clr     r0
-       dec     r0         ; For adding carries during inversion
+       mov     r1, r9                   ; 1
+       eor     r1, r13                  ; 1  Bit 7 is set if the result needs to be negated
+       clr     r0                       ; 1
+       dec     r0                       ; 1  For adding carries during inversion
 
-       ldi     r25, 0xf6  ; Discard overflow and carry flags
-       and     r14, r25
+       ldi     r25, 0xf6                ; 1  Discard overflow and carry flags
+       and     r14, r25                 ; 1
 
        ; Set carry flag if dividing by zero
        ; Then divide anyway, because it's hilarious
-       mov     r25, r10
-       or      r25, r11
-       or      r25, r12
-       or      r25, r13
-       brne    _sdiv_no_divz
-       ldi     r25, 0x01
-       or      r14, r25
+       mov     r25, r10                 ; 1
+       or      r25, r11                 ; 1
+       or      r25, r12                 ; 1
+       or      r25, r13                 ; 1
+       brne    _sdiv_no_divz            ; 1/2*
+       ldi     r25, 0x01                ; 1
+       or      r14, r25                 ; 1
 _sdiv_no_divz:
 
        ; Absolute value of dividend
-       bst     r9, 7
-       brtc    _sdiv_no_inv_a
-       com     r9
-       com     r8
-       com     r7
-       neg     r6
-       sbc     r7, r0
-       sbc     r8, r0
-       sbc     r9, r0
+       bst     r9, 7                    ; 1
+       brtc    _sdiv_no_inv_a           ; 1/2*
+       com     r9                       ; 1
+       com     r8                       ; 1
+       com     r7                       ; 1
+       neg     r6                       ; 1
+       sbc     r7, r0                   ; 1*
+       sbc     r8, r0                   ; 1*
+       sbc     r9, r0                   ; 1*
 _sdiv_no_inv_a:
 
        ; Absolute value of divisor
-       bst     r13, 7
-       brtc    _sdiv_no_inv_b
-       com     r13
-       com     r12
-       com     r11
-       neg     r10
-       sbc     r11, r0
-       sbc     r12, r0
-       sbc     r13, r0
+       bst     r13, 7                   ; 1
+       brtc    _sdiv_no_inv_b           ; 1/2*
+       com     r13                      ; 1
+       com     r12                      ; 1
+       com     r11                      ; 1
+       neg     r10                      ; 1
+       sbc     r11, r0                  ; 1*
+       sbc     r12, r0                  ; 1*
+       sbc     r13, r0                  ; 1*
 _sdiv_no_inv_b:
 
        ; Call/ret take more than three clock cycles, so they can't be used
-       ldi     ZL, LOW(_sdiv_done)
-       ldi     ZH, HIGH(_sdiv_done)
-       rjmp    div_subroutine
+       ldi     ZL, LOW(_sdiv_done)      ; 1
+       ldi     ZH, HIGH(_sdiv_done)     ; 1
+       rjmp    div_subroutine           ; 2
 _sdiv_done:
 
        ; Invert result if necessary
-       bst     r1, 7
-       brtc    _sdiv_no_inv
-       com     r6
-       com     r7
-       com     r8
-       com     r9
-       inc     r6
-       adc     r7, r0
-       adc     r8, r0
-       adc     r9, r0
+       bst     r1, 7                    ; 1
+       brtc    _sdiv_no_inv             ; 1/2*
+       com     r6                       ; 1
+       com     r7                       ; 1
+       com     r8                       ; 1
+       com     r9                       ; 1
+       inc     r6                       ; 1
+       adc     r7, r0                   ; 1*
+       adc     r8, r0                   ; 1*
+       adc     r9, r0                   ; 1*
 _sdiv_no_inv:
 
        ; Set the overflow flag if the sign is unexpected
-       ldi     r25, 0x08
-       eor     r1, r9
-       sbrc    r1, 7
-       or      r14, r25
+       ldi     r25, 0x08                ; 1
+       eor     r1, r9                   ; 1
+       sbrc    r1, 7                    ; 1/2
+       or      r14, r25                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_smod:
-       mov     r1, r9
-       eor     r1, r13    ; Bit 7 is set if the result is negative
-       bst     r13, 7
-       bld     r1, 6      ; Bit 6 is set if the modulo needs to be negated
-       clr     r0
-       dec     r0         ; For adding carries during inversion
+       mov     r1, r9                   ; 1
+       eor     r1, r13                  ; 1  Bit 7 is set if the result is negative
+       bst     r13, 7                   ; 1  Copy the divisor sign through T ...
+       bld     r1, 6                    ; 1  Bit 6 is set if the modulo needs to be negated
+       clr     r0                       ; 1
+       dec     r0                       ; 1  r0 = 0xff: "sbc rX, r0" then adds 1 minus the incoming borrow
 
-       ldi     r25, 0xf6  ; Discard overflow and carry flags
-       and     r14, r25
+       ldi     r25, 0xf6                ; 1  Discard overflow and carry flags
+       and     r14, r25                 ; 1
 
        ; Set carry flag if dividing by zero
        ; Then divide anyway, because it's hilarious
-       mov     r25, r10
-       or      r25, r11
-       or      r25, r12
-       or      r25, r13
-       brne    _smod_no_divz
-       ldi     r25, 0x01
-       or      r14, r25
+       mov     r25, r10                 ; 1  OR the four divisor bytes together
+       or      r25, r11                 ; 1
+       or      r25, r12                 ; 1
+       or      r25, r13                 ; 1
+       brne    _smod_no_divz            ; 1/2*
+       ldi     r25, 0x01                ; 1  0x01 is the carry bit in r14
+       or      r14, r25                 ; 1
 _smod_no_divz:
 
        ; Absolute value of dividend
-       bst     r9, 7
-       brtc    _smod_no_inv_a
-       com     r9
-       com     r8
-       com     r7
-       neg     r6
-       sbc     r7, r0
-       sbc     r8, r0
-       sbc     r9, r0
+       bst     r9, 7                    ; 1
+       brtc    _smod_no_inv_a           ; 1/2*
+       com     r9                       ; 1
+       com     r8                       ; 1
+       com     r7                       ; 1
+       neg     r6                       ; 1  Sets C unless r6 was zero
+       sbc     r7, r0                   ; 1* Adds the +1 only when the byte below wrapped to zero
+       sbc     r8, r0                   ; 1*
+       sbc     r9, r0                   ; 1*
 _smod_no_inv_a:
 
        ; Absolute value of divisor
-       bst     r13, 7
-       brtc    _smod_no_inv_b
-       com     r13
-       com     r12
-       com     r11
-       neg     r10
-       sbc     r11, r0
-       sbc     r12, r0
-       sbc     r13, r0
+       bst     r13, 7                   ; 1
+       brtc    _smod_no_inv_b           ; 1/2*
+       com     r13                      ; 1
+       com     r12                      ; 1
+       com     r11                      ; 1
+       neg     r10                      ; 1
+       sbc     r11, r0                  ; 1*
+       sbc     r12, r0                  ; 1*
+       sbc     r13, r0                  ; 1*
 _smod_no_inv_b:
 
        ; Call/ret take more than three clock cycles, so they can't be used
-       ldi     ZL, LOW(_smod_done)
-       ldi     ZH, HIGH(_smod_done)
-       rjmp    div_subroutine
+       ldi     ZL, LOW(_smod_done)      ; 1  Z = return address; div_subroutine ends with ijmp
+       ldi     ZH, HIGH(_smod_done)     ; 1
+       rjmp    div_subroutine           ; 2  Leaves the unsigned remainder in r22..r25
 _smod_done:
 
        ; Adjust modulo if division result is negative
-       bst     r1, 7
-       brtc    _smod_no_adj
-       sub     r10, r22
-       sbc     r11, r23
-       sbc     r12, r24
-       sbc     r13, r25
+       bst     r1, 7                    ; 1
+       brtc    _smod_no_adj             ; 1/2*
+       sub     r10, r22                 ; 1  r10..r13 = |divisor| - remainder; NOTE(review): a zero remainder yields |divisor|, not 0
+       sbc     r11, r23                 ; 1*
+       sbc     r12, r24                 ; 1*
+       sbc     r13, r25                 ; 1*
        ; Invert modulo if divisor was negative
-       bst     r1, 6
-       brtc    _smod_no_inv
-       com     r13
-       com     r12
-       com     r11
-       neg     r10
-       sbc     r11, r0
-       sbc     r12, r0
-       sbc     r13, r0
+       bst     r1, 6                    ; 1
+       brtc    _smod_no_inv             ; 1/2*
+       com     r13                      ; 1
+       com     r12                      ; 1
+       com     r11                      ; 1
+       neg     r10                      ; 1
+       sbc     r11, r0                  ; 1*
+       sbc     r12, r0                  ; 1*
+       sbc     r13, r0                  ; 1*
 _smod_no_inv:
        ; Copy adjusted modulo
-       movw    r6, r10
-       movw    r8, r12
-       rjmp    _smod_doflags
+       movw    r6, r10                  ; 1
+       movw    r8, r12                  ; 1
+       rjmp    _smod_doflags            ; 2
 _smod_no_adj:
        ; Non-negative, copy modulo as-is
-       movw    r6, r22
-       movw    r8, r24
+       movw    r6, r22                  ; 1  NOTE(review): this path never consults bit 6, so two negative operands give a positive result
+       movw    r8, r24                  ; 1
 _smod_doflags:
 
        ; Set the overflow flag if the sign is unexpected
-       ldi     r25, 0x08
-       eor     r1, r9
-       sbrc    r1, 7
-       or      r14, r25
+       ldi     r25, 0x08                ; 1  0x08 is the overflow bit in r14
+       eor     r1, r9                   ; 1  Bit 7 = expected sign ^ actual sign of the result
+       sbrc    r1, 7                    ; 1/2
+       or      r14, r25                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 div_subroutine:
@@ -1020,721 +1020,721 @@ div_subroutine:
        ; r10:r11:r12:r13 - divisor
        ; r21             - loop counter
 
-       clr     r22
-       clr     r23
-       clr     r24
-       clr     r25
-       ldi     r21, 32
+       clr     r22                      ; 1  r22..r25 = running remainder, starts at zero
+       clr     r23                      ; 1
+       clr     r24                      ; 1
+       clr     r25                      ; 1
+       ldi     r21, 32                  ; 1  One pass per bit of r6..r9
 _div_loop:
-       lsl     r6
-       rol     r7
-       rol     r8
-       rol     r9
-       rol     r22
-       rol     r23
-       rol     r24
-       rol     r25
-
-       cp      r22, r10
-       cpc     r23, r11
-       cpc     r24, r12
-       cpc     r25, r13
-
-       brlo    _div_next
-       sub     r22, r10
-       sbc     r23, r11
-       sbc     r24, r12
-       sbc     r25, r13
-       inc     r6
+       lsl     r6                       ; 1  Shift r6..r9 up into the remainder; bit 0 of r6 becomes 0
+       rol     r7                       ; 1*
+       rol     r8                       ; 1*
+       rol     r9                       ; 1*
+       rol     r22                      ; 1*
+       rol     r23                      ; 1*
+       rol     r24                      ; 1*
+       rol     r25                      ; 1*
+
+       cp      r22, r10                 ; 1  Compare remainder against the divisor
+       cpc     r23, r11                 ; 1*
+       cpc     r24, r12                 ; 1*
+       cpc     r25, r13                 ; 1*
+
+       brlo    _div_next                ; 1/2* Remainder < divisor (unsigned): quotient bit stays 0
+       sub     r22, r10                 ; 1  Otherwise take the divisor out of the remainder ...
+       sbc     r23, r11                 ; 1*
+       sbc     r24, r12                 ; 1*
+       sbc     r25, r13                 ; 1*
+       inc     r6                       ; 1  ... and set the quotient bit that was just shifted in
 _div_next:
 
-       dec     r21
-       brne    _div_loop
+       dec     r21                      ; 1
+       brne    _div_loop                ; 1/2*
 
-       ijmp
+       ijmp                             ; 2  Return to the address the caller loaded into Z
 
 
 exec_cmp:
-       movw    r22, r6
-       movw    r24, r8
-       sub     r22, r10
-       sbc     r23, r11
-       sbc     r24, r12
-       sbc     r25, r13
-       in      r21, SREG
-       andi    r21, 0x1d
-
-       or      r25, r24
-       or      r25, r23
-       or      r25, r22
-       brne    _cmp_nz
-       ori     r21, 0x02
+       movw    r22, r6                  ; 1  Subtract on a copy so r6..r9 are left intact
+       movw    r24, r8                  ; 1
+       sub     r22, r10                 ; 1  r22..r25 = r6..r9 - r10..r13
+       sbc     r23, r11                 ; 1*
+       sbc     r24, r12                 ; 1*
+       sbc     r25, r13                 ; 1*
+       in      r21, SREG                ; 1* Capture the flags of the 32-bit subtract
+       andi    r21, 0x1d                ; 1  Keep bits 4,3,2,0 (S, V, N, C); Z is rebuilt below
+
+       or      r25, r24                 ; 1  Fold the four difference bytes together
+       or      r25, r23                 ; 1
+       or      r25, r22                 ; 1
+       brne    _cmp_nz                  ; 1/2*
+       ori     r21, 0x02                ; 1  Difference is zero: set Z
 _cmp_nz:
-       ldi     r25, 0xe0
-       and     r14, r25
-       or      r14, r21
-       rjmp    _dispatch_done_writeback_fixedflags
+       ldi     r25, 0xe0                ; 1  Clear the low five flag bits of r14 ...
+       and     r14, r25                 ; 1
+       or      r14, r21                 ; 1  ... and replace them with the ones computed here
+       rjmp    _dispatch_done_writeback_fixedflags  ; 2
 
 
 exec_test:
-       clr     r0
-       mov     r25, r14
-       andi    r25, 0xf9
-
-       mov     r24, r6
-       and     r24, r10
-       or      r0, r24
-
-       mov     r24, r7
-       and     r24, r11
-       or      r0, r24
-
-       mov     r24, r8
-       and     r24, r12
-       or      r0, r24
-
-       mov     r24, r9
-       and     r24, r13
-       sbrc    r24, 7
-       ori     r25, 0x04
-       or      r0, r24
-
-       breq    _test_z
-       ori     r25, 0x02
+       clr     r0                       ; 1
+       mov     r25, r14                 ; 1
+       andi    r25, 0xf9                ; 1
+
+       mov     r24, r6                  ; 1
+       and     r24, r10                 ; 1
+       or      r0, r24                  ; 1
+
+       mov     r24, r7                  ; 1
+       and     r24, r11                 ; 1
+       or      r0, r24                  ; 1
+
+       mov     r24, r8                  ; 1
+       and     r24, r12                 ; 1
+       or      r0, r24                  ; 1
+
+       mov     r24, r9                  ; 1
+       and     r24, r13                 ; 1
+       sbrc    r24, 7                   ; 1/2
+       ori     r25, 0x04                ; 1
+       or      r0, r24                  ; 1
+
+       breq    _test_z                  ; 1/2*
+       ori     r25, 0x02                ; 1
 _test_z:
-       mov     r14, r25
-       rjmp    _dispatch_done_writeback_fixedflags
+       mov     r14, r25                 ; 1
+       rjmp    _dispatch_done_writeback_fixedflags  ; 2
 
 
 exec_jal_with_ve:
        ; Change destination pointer to V[E]
-       ldi     r24, 0x0e*4
-       clr     r25
-       ldi     r16, LOW(interp_regs)
-       ldi     r17, HIGH(interp_regs)
-       add     r16, r24
-       adc     r17, r25
+       ldi     r24, 0x0e*4              ; 1  Byte offset of entry 0x0e, four bytes per entry
+       clr     r25                      ; 1  High byte of the 16-bit offset
+       ldi     r16, LOW(interp_regs)    ; 1  r16:r17 = base of interp_regs
+       ldi     r17, HIGH(interp_regs)   ; 1
+       add     r16, r24                 ; 1  r16:r17 += offset
+       adc     r17, r25                 ; 1*
        ; Fall-through to normal jump-and-link code
 exec_jal:
-       movw    r6, r2
-       clr     r8
-       clr     r9
-       movw    r2, r10
-       rjmp    _dispatch_done_writeback_reg
+       movw    r6, r2                   ; 1  Result = old r2:r3 (presumably the interpreter PC -- confirm) ...
+       clr     r8                       ; 1  ... zero-extended to 32 bits
+       clr     r9                       ; 1
+       movw    r2, r10                  ; 1  r2:r3 = low word of the operand (the jump target)
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_shl:
-       clr     r24        ; Zero for adding carries
-       clr     r25        ; To accumulate carries
-       mov     r1, r9     ; For overflow flag
+       clr     r24                      ; 1  Zero for adding carries
+       clr     r25                      ; 1  To accumulate carries
+       mov     r1, r9                   ; 1  For overflow flag
 
 _shl_loop:
-       dec     r10        ; Decrement counter
-       brlt    _shl_done
+       dec     r10                      ; 1  Decrement counter
+       brlt    _shl_done                ; 1/2*
 
-       lsl     r6         ; Shift left by a bit
-       rol     r7
-       rol     r8
-       rol     r9
-       adc     r25, r24   ; Accumulate carries
+       lsl     r6                       ; 1  Shift left by a bit
+       rol     r7                       ; 1*
+       rol     r8                       ; 1*
+       rol     r9                       ; 1*
+       adc     r25, r24                 ; 1* Accumulate carries
 
-       rjmp    _shl_loop
+       rjmp    _shl_loop                ; 2
 _shl_done:
 
-       mov     r24, r14   ; Discard overflow and carry flags
-       andi    r24, 0xf6
+       mov     r24, r14                 ; 1  Discard overflow and carry flags
+       andi    r24, 0xf6                ; 1
 
        ; Set carry flag if any bits were shifted out
-       tst     r25
-       breq    _shl_no_carry
-       ori     r24, 0x01
+       tst     r25                      ; 1
+       breq    _shl_no_carry            ; 1/2*
+       ori     r24, 0x01                ; 1
 _shl_no_carry:
 
        ; Set overflow flag if sign changed
-       eor     r1, r9
-       sbrc    r1, 7
-       ori     r24, 0x04
+       eor     r1, r9                   ; 1
+       sbrc    r1, 7                    ; 1/2
+       ori     r24, 0x04                ; 1
 
-       mov     r14, r24
+       mov     r14, r24                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_shrl:
-       clr     r24        ; Zero for adding carries
-       clr     r25        ; To accumulate carries
-       mov     r1, r9     ; For overflow flag
+       clr     r24                      ; 1  Zero for adding carries
+       clr     r25                      ; 1  To accumulate carries
+       mov     r1, r9                   ; 1  For overflow flag
 
 _shrl_loop:
-       dec     r10        ; Decrement counter
-       brlt    _shrl_done
+       dec     r10                      ; 1  Decrement counter
+       brlt    _shrl_done               ; 1/2*
 
-       lsr     r9
-       ror     r8
-       ror     r7
-       ror     r6
-       adc     r25, r24   ; Accumulate carries
+       lsr     r9                       ; 1
+       ror     r8                       ; 1*
+       ror     r7                       ; 1*
+       ror     r6                       ; 1*
+       adc     r25, r24                 ; 1* Accumulate carries
 
-       rjmp    _shrl_loop
+       rjmp    _shrl_loop               ; 2
 _shrl_done:
 
-       mov     r24, r14   ; Discard overflow and carry flags
-       andi    r24, 0xf6
+       mov     r24, r14                 ; 1  Discard overflow and carry flags
+       andi    r24, 0xf6                ; 1
 
        ; Set carry flag if any bits were shifted out
-       tst     r25
-       breq    _shrl_no_carry
-       ori     r24, 0x01
+       tst     r25                      ; 1
+       breq    _shrl_no_carry           ; 1/2*
+       ori     r24, 0x01                ; 1
 _shrl_no_carry:
 
        ; Set overflow flag if sign changed
-       eor     r1, r9
-       sbrc    r1, 7
-       ori     r24, 0x04
+       eor     r1, r9                   ; 1
+       sbrc    r1, 7                    ; 1/2
+       ori     r24, 0x04                ; 1
 
-       mov     r14, r24
+       mov     r14, r24                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_shra:
-       clr     r24        ; Zero for adding carries
-       clr     r25        ; To accumulate carries
+       clr     r24                      ; 1  Zero for adding carries
+       clr     r25                      ; 1  To accumulate carries
 
 _shra_loop:
-       dec     r10        ; Decrement counter
-       brlt    _shra_done
+       dec     r10                      ; 1  Decrement counter
+       brlt    _shra_done               ; 1/2* Only the low byte (r10) of the operand is used as the count
 
-       asr     r9
-       ror     r8
-       ror     r7
-       ror     r6
-       adc     r25, r24   ; Accumulate carries
+       asr     r9                       ; 1  Arithmetic shift right: bit 7 of r9 is kept
+       ror     r8                       ; 1*
+       ror     r7                       ; 1*
+       ror     r6                       ; 1*
+       adc     r25, r24                 ; 1* Accumulate carries
 
-       rjmp    _shra_loop
+       rjmp    _shra_loop               ; 2
 _shra_done:
 
-       mov     r24, r14   ; Discard overflow and carry flags
-       andi    r24, 0xf6
+       mov     r24, r14                 ; 1  Discard overflow and carry flags
+       andi    r24, 0xf6                ; 1
 
        ; Set carry flag if any bits were shifted out
-       tst     r25
-       breq    _shra_no_carry
-       ori     r24, 0x01
+       tst     r25                      ; 1
+       breq    _shra_no_carry           ; 1/2*
+       ori     r24, 0x01                ; 1  0x01 is the carry bit in r14
 _shra_no_carry:
 
        ; Sign will never change, leave overflow flag clear
 
-       mov     r14, r24
+       mov     r14, r24                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_rol:
-       mov     r1, r9     ; For overflow flag
+       mov     r1, r9                   ; 1  For overflow flag
 
 _rol_loop:
-       dec     r10        ; Decrement counter
-       brlt    _rol_done
-
-       clc                ; Pull highest bit into carry
-       sbrc    r9, 7
-       sec
-       rol     r6
-       rol     r7
-       rol     r8
-       rol     r9
-
-       rjmp    _rol_loop
+       dec     r10                      ; 1  Decrement counter
+       brlt    _rol_done                ; 1/2*
+
+       clc                              ; 1  Pull highest bit into carry
+       sbrc    r9, 7                    ; 1/2
+       sec                              ; 1
+       rol     r6                       ; 1*
+       rol     r7                       ; 1*
+       rol     r8                       ; 1*
+       rol     r9                       ; 1*
+
+       rjmp    _rol_loop                ; 2
 _rol_done:
 
-       mov     r24, r14   ; Discard overflow and carry flags
-       andi    r24, 0xf6
+       mov     r24, r14                 ; 1  Discard overflow and carry flags
+       andi    r24, 0xf6                ; 1
 
        ; No bits will be lost, leave carry flag clear
 
        ; Set overflow flag if sign changed
-       eor     r1, r9
-       sbrc    r1, 7
-       ori     r24, 0x04
+       eor     r1, r9                   ; 1
+       sbrc    r1, 7                    ; 1/2
+       ori     r24, 0x04                ; 1
 
-       mov     r14, r24
+       mov     r14, r24                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_ror:
-       mov     r1, r9     ; For overflow flag
+       mov     r1, r9                   ; 1  For overflow flag
 
 _ror_loop:
-       dec     r10        ; Decrement counter
-       brlt    _ror_done
-
-       clc                ; Pull lowest bit into carry
-       sbrc    r6, 0
-       sec
-       ror     r9
-       ror     r8
-       ror     r7
-       ror     r6
-
-       rjmp    _ror_loop
+       dec     r10                      ; 1  Decrement counter
+       brlt    _ror_done                ; 1/2*
+
+       clc                              ; 1  Pull lowest bit into carry
+       sbrc    r6, 0                    ; 1/2
+       sec                              ; 1
+       ror     r9                       ; 1*
+       ror     r8                       ; 1*
+       ror     r7                       ; 1*
+       ror     r6                       ; 1*
+
+       rjmp    _ror_loop                ; 2
 _ror_done:
 
-       mov     r14, r14   ; Discard overflow and carry flags
-       andi    r24, 0xf6
+       mov     r14, r14                 ; 1  Discard overflow and carry flags
+       andi    r24, 0xf6                ; 1
 
        ; No bits will be lost, leave carry flag clear
 
        ; Set overflow flag if sign changed
-       eor     r1, r9
-       sbrc    r1, 7
-       ori     r24, 0x04
+       eor     r1, r9                   ; 1
+       sbrc    r1, 7                    ; 1/2
+       ori     r24, 0x04                ; 1
 
-       mov     r14, r24
+       mov     r14, r24                 ; 1
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_spi:
-       movw    ZL, r6
+       movw    ZL, r6                   ; 1  Z = buffer address taken from r7:r6
 _spi_byte_loop:
-       dec     r10
-       brlt    _spi_done
-       ld      r25, Z
-       out     SPDR, r25
+       dec     r10                      ; 1  r10 counts the bytes still to transfer
+       brlt    _spi_done                ; 1/2*  Needs flags from dec; exit once the count goes negative
+       ld      r25, Z                   ; 2  Fetch the outgoing byte (Z is not advanced yet)
+       out     SPDR, r25                ; 1  Hand the byte to the SPI data register
 _spi_wait_loop:
-       in      r25, SPSR
-       sbrs    r25, SPIF
-       rjmp    _spi_wait_loop
-       in      r25, SPDR
-       st      Z+, r25
-       rjmp    _spi_byte_loop
+       in      r25, SPSR                ; 1  Poll the SPI status register
+       sbrs    r25, SPIF                ; 1/2  Leave the wait loop once SPIF is set
+       rjmp    _spi_wait_loop           ; 2
+       in      r25, SPDR                ; 1  Read the SPI data register back
+       st      Z+, r25                  ; 2  Overwrite the byte just sent, then advance Z
+       rjmp    _spi_byte_loop           ; 2
 _spi_done:
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_mft:
        ; Restrict operand to 0-f
-       ldi     r25, 0x0f
-       and     r10, r25
+       ldi     r25, 0x0f                ; 1
+       and     r10, r25                 ; 1  r10 = clock index 0-15
 
-       lsl     r10
-       clr     r25
+       lsl     r10                      ; 1  Index * 2: each clock entry is read as two bytes below
+       clr     r25                      ; 1  Zero operand for the carry-propagating adc
 
-       ldi     ZL, LOW(interp_clocks)
-       ldi     ZH, HIGH(interp_clocks)
-       add     ZL, r10
-       adc     ZH, r25
+       ldi     ZL, LOW(interp_clocks)   ; 1
+       ldi     ZH, HIGH(interp_clocks)  ; 1
+       add     ZL, r10                  ; 1  Z = interp_clocks + 2 * index
+       adc     ZH, r25                  ; 1*  Needs carry from the add
 
-       ld      r6, Z+
-       ld      r7, Z+
-       clr     r8
-       clr     r9
+       ld      r6, Z+                   ; 2  First (low) byte of the clock
+       ld      r7, Z+                   ; 2  Second (high) byte of the clock
+       clr     r8                       ; 1  Zero-extend to 32 bits
+       clr     r9                       ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_mtt:
        ; Restrict operand to 0-f
-       ldi     r25, 0x0f
-       and     r10, r25
+       ldi     r25, 0x0f                ; 1
+       and     r10, r25                 ; 1  r10 = clock index 0-15
 
-       lsl     r10
-       clr     r25
+       lsl     r10                      ; 1  Index * 2: each clock entry is written as two bytes below
+       clr     r25                      ; 1  Zero operand for the carry-propagating adc
 
-       ldi     ZL, LOW(interp_clocks)
-       ldi     ZH, HIGH(interp_clocks)
-       add     ZL, r10
-       adc     ZH, r25
+       ldi     ZL, LOW(interp_clocks)   ; 1
+       ldi     ZH, HIGH(interp_clocks)  ; 1
+       add     ZL, r10                  ; 1  Z = interp_clocks + 2 * index
+       adc     ZH, r25                  ; 1*  Needs carry from the add
 
-       st      Z+, r6
-       st      Z+, r7
+       st      Z+, r6                   ; 2  First (low) byte of the new clock value
+       st      Z+, r7                   ; 2  Second (high) byte; r8/r9 are not stored
 
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_ddir:
        ; Restrict operand to 0-f
-       ldi     r25, 0x0f
-       and     r10, r25
+       ldi     r25, 0x0f                ; 1
+       and     r10, r25                 ; 1
 
        ; Extract LSB from first operand
-       ldi     r23, 0x01
-       and     r6, r23
-       clr     r7
+       ldi     r23, 0x01                ; 1
+       and     r6, r23                  ; 1
+       clr     r7                       ; 1
 
        ; Mask of all bits except LSB
-       ldi     r24, 0xfe
-       ldi     r25, 0xff
+       ldi     r24, 0xfe                ; 1
+       ldi     r25, 0xff                ; 1
 
        ; Rotate LSB and mask into position specified by second operand
 _ddir_loop:
-       dec     r10
-       brlt    _din_loop_done
+       dec     r10                      ; 1
+       brlt    _din_loop_done           ; 1/2*
 
-       sec
-       rol     r24
-       rol     r25
+       sec                              ; 1
+       rol     r24                      ; 1*
+       rol     r25                      ; 1*
 
-       clc
-       rol     r6
-       rol     r7
+       clc                              ; 1
+       rol     r6                       ; 1*
+       rol     r7                       ; 1*
 
-       rjmp    _ddir_loop
+       rjmp    _ddir_loop               ; 2
 _ddir_loop_done:
 
        ; Read-modify-write
-       in      r22, DDRB
-       in      r23, DDRC
-       and     r22, r24
-       and     r23, r25
-       or      r22, r6
-       or      r23, r7
-       out     DDRB, r22
-       out     DDRC, r23
+       in      r22, DDRB                ; 1
+       in      r23, DDRC                ; 1
+       and     r22, r24                 ; 1
+       and     r23, r25                 ; 1
+       or      r22, r6                  ; 1
+       or      r23, r7                  ; 1
+       out     DDRB, r22                ; 1
+       out     DDRC, r23                ; 1
 
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_din:
        ; Restrict operand to 0-f
-       ldi     r25, 0x0f
-       and     r10, r25
+       ldi     r25, 0x0f                ; 1
+       and     r10, r25                 ; 1  r10 = pin index 0-15
 
        ; Read port values
-       in      r24, PINB
-       in      r25, PINC
+       in      r24, PINB                ; 1  Low byte of the 16-bit pin space comes from PINB
+       in      r25, PINC                ; 1  High byte comes from PINC
 
        ; Shift desired value into LSB
 _din_loop:
-       dec     r10
-       brlt    _din_loop_done
+       dec     r10                      ; 1
+       brlt    _din_loop_done           ; 1/2*  Needs flags from dec; done after `index` shifts
 
-       clc
-       ror     r25
-       ror     r24
+       clc                              ; 1  Shift a 0 in at the top
+       ror     r25                      ; 1*
+       ror     r24                      ; 1*
 
-       rjmp    _din_loop
+       rjmp    _din_loop                ; 2
 _din_loop_done:
 
        ; Extract port LSB and put into result LSB
-       andi    r24, 0x01
-       ldi     r25, 0xfe
-       and     r6, r25
-       or      r6, r24
+       andi    r24, 0x01                ; 1  Keep only the selected pin's state
+       ldi     r25, 0xfe                ; 1
+       and     r6, r25                  ; 1  Clear bit 0 of the result; other bits of r6 are kept
+       or      r6, r24                  ; 1  Insert the pin state
 
-       rjmp    _dispatch_done_writeback_flags
+       rjmp    _dispatch_done_writeback_flags  ; 2
 
 
 exec_dout:
        ; Restrict operand to 0-f
-       ldi     r25, 0x0f
-       and     r10, r25
+       ldi     r25, 0x0f                ; 1
+       and     r10, r25                 ; 1  r10 = pin index 0-15
 
        ; Extract LSB from first operand
-       ldi     r23, 0x01
-       and     r6, r23
-       clr     r7
+       ldi     r23, 0x01                ; 1
+       and     r6, r23                  ; 1
+       clr     r7                       ; 1  r7:r6 = 16-bit value holding only the new output bit
 
        ; Mask of all bits except LSB
-       ldi     r24, 0xfe
-       ldi     r25, 0xff
+       ldi     r24, 0xfe                ; 1
+       ldi     r25, 0xff                ; 1  r25:r24 = 0xfffe
 
        ; Rotate LSB and mask into position specified by second operand
 _dout_loop:
-       dec     r10
-       brlt    _dout_loop_done
+       dec     r10                      ; 1
+       brlt    _dout_loop_done          ; 1/2*  Needs flags from dec; done after `index` shifts
 
-       sec
-       rol     r24
-       rol     r25
+       sec                              ; 1  Shift a 1 into the mask so exactly one bit stays clear
+       rol     r24                      ; 1*
+       rol     r25                      ; 1*
 
-       clc
-       rol     r6
-       rol     r7
+       clc                              ; 1  Shift a 0 in behind the value bit
+       rol     r6                       ; 1*
+       rol     r7                       ; 1*
 
-       rjmp    _dout_loop
+       rjmp    _dout_loop               ; 2
 _dout_loop_done:
 
        ; Read-modify-write
-       in      r22, PORTB
-       in      r23, PORTC
-       and     r22, r24
-       and     r23, r25
-       or      r22, r6
-       or      r23, r7
-       out     PORTB, r22
-       out     PORTC, r23
+       in      r22, PORTB               ; 1  Low byte of the 16-bit pin space maps to PORTB
+       in      r23, PORTC               ; 1  High byte maps to PORTC
+       and     r22, r24                 ; 1  Clear the selected bit
+       and     r23, r25                 ; 1
+       or      r22, r6                  ; 1  Insert the new output bit
+       or      r23, r7                  ; 1
+       out     PORTB, r22               ; 1
+       out     PORTC, r23               ; 1
 
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_ain:
        ; Set the ADC source
-       lds     r25, ADMUX
-       andi    r25, 0xf0
-       mov     r24, r10
-       andi    r24, 0x0f
-       or      r25, r24
-       sts     ADMUX, r25
+       lds     r25, ADMUX               ; 2
+       andi    r25, 0xf0                ; 1  Keep the upper nibble of ADMUX unchanged
+       mov     r24, r10                 ; 1
+       andi    r24, 0x0f                ; 1  Source select = low nibble of the operand
+       or      r25, r24                 ; 1
+       sts     ADMUX, r25               ; 2
 
        ; Trigger a single conversion
-       lds     r25, ADCSRA
-       ori     r25, (1 << ADSC)
-       sts     ADCSRA, r25
+       lds     r25, ADCSRA              ; 2
+       ori     r25, (1 << ADSC)         ; 1  Set the start-conversion bit
+       sts     ADCSRA, r25              ; 2
 
        ; Wait for conversion to complete
 _ain_wait:
-       lds     r25, ADCSRA
-       sbrs    r25, ADIF
-       rjmp    _ain_wait
-       sts     ADCSRA, r25
+       lds     r25, ADCSRA              ; 2
+       sbrs    r25, ADIF                ; 1/2  Leave the loop once ADIF is set
+       rjmp    _ain_wait                ; 2
+       sts     ADCSRA, r25              ; 2  Write back with ADIF set (write-1-to-clear on AVR - confirm for target part)
 
        ; Read value from ADC
-       lds     r6, ADCL
-       lds     r7, ADCH
-       clr     r8
-       clr     r9
+       lds     r6, ADCL                 ; 2  Low byte is read first, then the high byte
+       lds     r7, ADCH                 ; 2
+       clr     r8                       ; 1  Zero-extend to 32 bits
+       clr     r9                       ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_aout:
        ; Restrict operand to 0-7
-       ldi     r25, 0x07
-       and     r10, r25
+       ldi     r25, 0x07                ; 1
+       and     r10, r25                 ; 1
 
-       clr     r25
+       clr     r25                      ; 1
 
-       ldi     ZL, LOW(_aout_jtab)
-       ldi     ZH, HIGH(_aout_jtab)
-       add     ZL, r10
-       adc     ZH, r25
-       ijmp
+       ldi     ZL, LOW(_aout_jtab)      ; 1
+       ldi     ZH, HIGH(_aout_jtab)     ; 1
+       add     ZL, r10                  ; 1
+       adc     ZH, r25                  ; 1*
+       ijmp                             ; 2
 
 _aout_jtab:
-       rjmp    _aout_ocr0a
-       rjmp    _aout_ocr0b
-       rjmp    _aout_ocr1a
-       rjmp    _aout_ocr1b
-       rjmp    _aout_ocr2a
-       rjmp    _aout_ocr2b
-       rjmp    _aout_done
-       rjmp    _aout_done
+       rjmp    _aout_ocr0a              ; 2
+       rjmp    _aout_ocr0b              ; 2
+       rjmp    _aout_ocr1a              ; 2
+       rjmp    _aout_ocr1b              ; 2
+       rjmp    _aout_ocr2a              ; 2
+       rjmp    _aout_ocr2b              ; 2
+       rjmp    _aout_done               ; 2
+       rjmp    _aout_done               ; 2
 
 _aout_ocr0a:
-       out     OCR0A, r6
-       rjmp    _aout_done
+       out     OCR0A, r6                ; 1
+       rjmp    _aout_done               ; 2
 
 _aout_ocr0b:
-       out     OCR0B, r6
-       rjmp    _aout_done
+       out     OCR0B, r6                ; 1
+       rjmp    _aout_done               ; 2
 
 _aout_ocr1a:
-       sts     OCR1AH, r7
-       sts     OCR1AL, r8
-       rjmp    _aout_done
+       sts     OCR1AH, r7               ; 2
+       sts     OCR1AL, r8               ; 2
+       rjmp    _aout_done               ; 2
 
 _aout_ocr1b:
-       sts     OCR1BH, r7
-       sts     OCR1BL, r8
-       rjmp    _aout_done
+       sts     OCR1BH, r7               ; 2
+       sts     OCR1BL, r8               ; 2
+       rjmp    _aout_done               ; 2
 
 _aout_ocr2a:
-       sts     OCR2A, r6
-       rjmp    _aout_done
+       sts     OCR2A, r6                ; 2
+       rjmp    _aout_done               ; 2
 
 _aout_ocr2b:
-       sts     OCR2B, r6
-       rjmp    _aout_done
+       sts     OCR2B, r6                ; 2
+       rjmp    _aout_done               ; 2
 
 _aout_done:
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_ldb:
        ; Load byte
-       movw    ZL, r10
-       ld      r6, Z+
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       ld      r6, Z+                   ; 2
 
        ; Sign extend
-       clr     r0
-       sbrc    r6, 7
-       com     r0
-       mov     r7, r0
-       mov     r8, r0
-       mov     r9, r0
+       clr     r0                       ; 1  r0 = 0x00, the fill for a non-negative byte
+       sbrc    r6, 7                    ; 1/2  Skip the com when the sign bit is clear
+       com     r0                       ; 1  r0 = 0xff, the fill for a negative byte
+       mov     r7, r0                   ; 1
+       mov     r8, r0                   ; 1
+       mov     r9, r0                   ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_ldh:
        ; Load halfword
-       movw    ZL, r10
-       ld      r6, Z+
-       ld      r7, Z+
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       ld      r6, Z+                   ; 2  Low byte comes first in memory
+       ld      r7, Z+                   ; 2  High byte
 
        ; Sign extend
-       clr     r0
-       sbrc    r7, 7
-       com     r0
-       mov     r8, r0
-       mov     r9, r0
+       clr     r0                       ; 1  r0 = 0x00, the fill for a non-negative halfword
+       sbrc    r7, 7                    ; 1/2  Skip the com when the sign bit is clear
+       com     r0                       ; 1  r0 = 0xff, the fill for a negative halfword
+       mov     r8, r0                   ; 1
+       mov     r9, r0                   ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_ldw:
        ; Load word
-       movw    ZL, r10
-       ld      r6, Z+
-       ld      r7, Z+
-       ld      r8, Z+
-       ld      r9, Z+
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       ld      r6, Z+                   ; 2  Four consecutive bytes fill r6 (lowest address) through r9
+       ld      r7, Z+                   ; 2
+       ld      r8, Z+                   ; 2
+       ld      r9, Z+                   ; 2
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_lpb:
        ; Load byte
-       movw    ZL, r10
-       lpm     r6, Z+
+       movw    ZL, r10                  ; 1  Z = program-memory byte address from r11:r10
+       lpm     r6, Z+                   ; 3
 
        ; Sign extend
-       clr     r0
-       sbrc    r6, 7
-       com     r0
-       mov     r7, r0
-       mov     r8, r0
-       mov     r9, r0
+       clr     r0                       ; 1  r0 = 0x00, the fill for a non-negative byte
+       sbrc    r6, 7                    ; 1/2  Skip the com when the sign bit is clear
+       com     r0                       ; 1  r0 = 0xff, the fill for a negative byte
+       mov     r7, r0                   ; 1
+       mov     r8, r0                   ; 1
+       mov     r9, r0                   ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_lph:
        ; Load halfword
-       movw    ZL, r10
-       lpm     r6, Z+
-       lpm     r7, Z+
+       movw    ZL, r10                  ; 1  Z = program-memory byte address from r11:r10
+       lpm     r6, Z+                   ; 3  Low byte comes first in memory
+       lpm     r7, Z+                   ; 3  High byte
 
        ; Sign extend
-       clr     r0
-       sbrc    r7, 7
-       com     r0
-       mov     r8, r0
-       mov     r9, r0
+       clr     r0                       ; 1  r0 = 0x00, the fill for a non-negative halfword
+       sbrc    r7, 7                    ; 1/2  Skip the com when the sign bit is clear
+       com     r0                       ; 1  r0 = 0xff, the fill for a negative halfword
+       mov     r8, r0                   ; 1
+       mov     r9, r0                   ; 1
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_lpw:
        ; Load word
-       movw    ZL, r10
-       lpm     r6, Z+
-       lpm     r7, Z+
-       lpm     r8, Z+
-       lpm     r9, Z+
+       movw    ZL, r10                  ; 1  Z = program-memory byte address from r11:r10
+       lpm     r6, Z+                   ; 3  Four consecutive bytes fill r6 (lowest address) through r9
+       lpm     r7, Z+                   ; 3
+       lpm     r8, Z+                   ; 3
+       lpm     r9, Z+                   ; 3
 
-       rjmp    _dispatch_done_writeback_reg
+       rjmp    _dispatch_done_writeback_reg  ; 2
 
 
 exec_stb:
-       movw    ZL, r10
-       st      Z+, r6
-       rjmp    _dispatch_done
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       st      Z+, r6                   ; 2  Store only the lowest byte of the value
+       rjmp    _dispatch_done           ; 2
 
 
 exec_sth:
-       movw    ZL, r10
-       st      Z+, r6
-       st      Z+, r7
-       rjmp    _dispatch_done
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       st      Z+, r6                   ; 2  Low byte goes to the lower address
+       st      Z+, r7                   ; 2  High byte follows
+       rjmp    _dispatch_done           ; 2
 
 
 exec_stw:
-       movw    ZL, r10
-       st      Z+, r6
-       st      Z+, r7
-       st      Z+, r8
-       st      Z+, r9
-       rjmp    _dispatch_done
+       movw    ZL, r10                  ; 1  Z = data-space address from r11:r10
+       st      Z+, r6                   ; 2  r6 through r9 go to four consecutive bytes, r6 at the lowest address
+       st      Z+, r7                   ; 2
+       st      Z+, r8                   ; 2
+       st      Z+, r9                   ; 2
+       rjmp    _dispatch_done           ; 2
 
 
 exec_ext:
        ; Can't use regular call/ret instructions, they take more than 3 cycles
 
        ; Put the return address into temporaries
-       ldi     r24, LOW(_ext_done)
-       ldi     r25, HIGH(_ext_done)
+       ldi     r24, LOW(_ext_done)      ; 1  r25:r24 = address of _ext_done
+       ldi     r25, HIGH(_ext_done)     ; 1
 
        ; Execute at the target address
-       movw    ZL, r10
-       ijmp
+       movw    ZL, r10                  ; 1  Z = target address from r11:r10
+       ijmp                             ; 2  Target presumably returns by jumping through r25:r24 - confirm against callees
 _ext_done:
-       rjmp    _dispatch_done
+       rjmp    _dispatch_done           ; 2
 
 
 exec_jtab:
-       add     r10, r6    ; Add V[X] to PC+sext(nn)
-       adc     r11, r7
-       movw    r2, r10
-       rjmp    _dispatch_done
+       add     r10, r6                  ; 1  Add V[X] to PC+sext(nn)
+       adc     r11, r7                  ; 1*  Needs carry from the add; only the low 16 bits of V[X] are used
+       movw    r2, r10                  ; 1  r3:r2 = computed target (presumably the interpreter PC - confirm)
+       rjmp    _dispatch_done           ; 2
 
 
 exec_blt:
-       sbrc    r14, 4
-       movw    r2, r10    ; Branch if S bit is set
-       rjmp    _dispatch_done
+       sbrc    r14, 4                   ; 1/2  Skip the movw when bit 4 of the flags in r14 is clear
+       movw    r2, r10                  ; 1  Branch if S bit is set
+       rjmp    _dispatch_done           ; 2
 exec_bge:
-       sbrs    r14, 4
-       movw    r2, r10    ; Branch if S bit is clear
-       rjmp    _dispatch_done
+       sbrs    r14, 4                   ; 1/2  Skip the movw when bit 4 of the flags in r14 is set
+       movw    r2, r10                  ; 1  Branch if S bit is clear
+       rjmp    _dispatch_done           ; 2
 
 
 exec_bv:
-       sbrc    r14, 3
-       movw    r2, r10    ; Branch if V bit is set
-       rjmp    _dispatch_done
+       sbrc    r14, 3                   ; 1/2  Skip the movw when bit 3 of the flags in r14 is clear
+       movw    r2, r10                  ; 1  Branch if V bit is set
+       rjmp    _dispatch_done           ; 2
 exec_bnv:
-       sbrs    r14, 3
-       movw    r2, r10    ; Branch if V bit is clear
-       rjmp    _dispatch_done
+       sbrs    r14, 3                   ; 1/2  Skip the movw when bit 3 of the flags in r14 is set
+       movw    r2, r10                  ; 1  Branch if V bit is clear
+       rjmp    _dispatch_done           ; 2
 
 
 exec_bmi:
-       sbrc    r14, 2
-       movw    r2, r10    ; Branch if N bit is set
-       rjmp    _dispatch_done
+       sbrc    r14, 2                   ; 1/2  Skip the movw when bit 2 of the flags in r14 is clear
+       movw    r2, r10                  ; 1  Branch if N bit is set
+       rjmp    _dispatch_done           ; 2
 exec_bpl:
-       sbrs    r14, 2
-       movw    r2, r10    ; Branch if N bit is clear
-       rjmp    _dispatch_done
+       sbrs    r14, 2                   ; 1/2  Skip the movw when bit 2 of the flags in r14 is set
+       movw    r2, r10                  ; 1  Branch if N bit is clear
+       rjmp    _dispatch_done           ; 2
 
 
 exec_bz:
-       sbrc    r14, 1
-       movw    r2, r10    ; Branch if Z bit is set
-       rjmp    _dispatch_done
+       sbrc    r14, 1                   ; 1/2  Skip the movw when bit 1 of the flags in r14 is clear
+       movw    r2, r10                  ; 1  Branch if Z bit is set
+       rjmp    _dispatch_done           ; 2
 exec_bnz:
-       sbrs    r14, 1
-       movw    r2, r10    ; Branch if Z bit is clear
-       rjmp    _dispatch_done
+       sbrs    r14, 1                   ; 1/2  Skip the movw when bit 1 of the flags in r14 is set
+       movw    r2, r10                  ; 1  Branch if Z bit is clear
+       rjmp    _dispatch_done           ; 2
 
 
 exec_c:
-       sbrc    r14, 0
-       movw    r2, r10    ; Branch if C bit is set
-       rjmp    _dispatch_done
+       sbrc    r14, 0                   ; 1/2  Skip the movw when bit 0 of the flags in r14 is clear
+       movw    r2, r10                  ; 1  Branch if C bit is set
+       rjmp    _dispatch_done           ; 2
 exec_nc:
-       sbrs    r14, 0
-       movw    r2, r10    ; Branch if C bit is clear
-       rjmp    _dispatch_done
+       sbrs    r14, 0                   ; 1/2  Skip the movw when bit 0 of the flags in r14 is set
+       movw    r2, r10                  ; 1  Branch if C bit is clear
+       rjmp    _dispatch_done           ; 2
 
 
 exec_jmp:
-       movw    r2, r10
-       rjmp    _dispatch_done
+       movw    r2, r10                  ; 1  Unconditional: r3:r2 = target held in r11:r10
+       rjmp    _dispatch_done           ; 2
 
 
 ; ------------------------------------------------------------------------------