// 18 channels of R/C-style PWM input (nominal 1ms to 2ms positive pulses at nominally 50Hz to 100Hz)
// inputs are on D2-D13 and A0-A5 (channels 0-17, in that order);
// outputs the 18 channels via Serial port at 115200 baud.
// use Arduino IDE Serial Monitor to view channels - displays all 18 channels on one row in half-microsecond units
// better, use Arduino IDE Serial Plotter, to view channels graphically
// ceptimus April 2022

// One record in the edge-event FIFO. Each pin-change ISR appends exactly one of these.
// The record is 4 bytes and the ISRs fill it with four consecutive byte stores
// (TCNT1 low, TCNT1 high, pins, id), so the field order and sizes here must not change
// without updating the assembly in all three ISRs.
struct edgeEvent {
  uint16_t tcnt; // a snapshot of TCNT1 when the interrupt occurred (stored low byte first by the ISR)
  uint8_t pins; // a snapshot of the 8 inputs of the interrupting port when the interrupt occurred
  uint8_t id; // 0, 1, or 2 to identify which pinchange interrupt added the event (0 = port B, 1 = port C, 2 = port D)
};

// Shared state between the pin-change ISRs (producers) and loop() (consumer).
// The ISRs refer to 'q' and 'qInput' by symbol name from inline assembly, so these
// two identifiers must keep their names.
edgeEvent q[64];  // a FIFO of edge events.  64 of them, so that the index used by the asm interrupt routines to write to the queue can be simple byte (64 * 4-byte event = 256 bytes)
volatile uint8_t qInput = 0;  // BYTE offset (not element index) of the next free slot; used by the asm interrupt routines.  Each interrupt increases its value by 4, wrapping naturally at 256
uint8_t qOutput = 0; // BYTE offset of the next event to consume; used by the (non-interrupt) code which reads events from the queue. The queue is empty when qOutput == qInput

uint16_t startEdge[18]; // timeStamps (in half-microsecond units) for leading edges of channel pulses
uint16_t pulseWidth[18]; // latest received pulseWidth for each channel in half microsecond units

// Staging area for the text of the channel currently being transmitted.
// loop() fills and sends indices 0..5 (four digits plus a two-character separator).
uint8_t serialBuffer[8];

// One-time hardware initialisation:
//   1. start Timer/Counter1 as the free-running timestamp source,
//   2. configure the 18 input pins with pull-ups and enable their pin-change interrupts,
//   3. preset every channel to a nominal centre pulse width,
//   4. bring up the USART transmitter directly (no Serial library, no serial interrupts),
//   5. switch off the timer0 overflow interrupt to reduce jitter.
void setup() {
  // using 16-bit Timer/Counter1 to provide 16-bit timestamps with half-microsecond resolution
  // (clock select 0x02; half-microsecond ticks assume a 16 MHz system clock)
  TCCR1A = 0x00;
  TCCR1B = 0x02;

  DDRB   &= 0b11000000; // port B (pins D8 - D13) inputs
  PORTB  |= 0b00111111; // input pullup to avoid phantom edges from unused PWM inputs
  PCMSK0 |= 0b00111111; // allow pinchange interrupts for pins D8 - D13 (handled by PCINT0)
  DDRC   &= 0b11000000; // port C (pins A0 - A5) inputs
  PORTC  |= 0b00111111; // input pullup to avoid phantom edges from unused PWM inputs
  PCMSK1 |= 0b00111111; // allow pinchange interrupts for pins A0 - A5 (handled by PCINT1)
  DDRD   &= 0b00000011; // port D (pins D2 - D7) inputs; D0/D1 (serial pins) are left untouched
  PORTD  |= 0b11111100; // input pullup to avoid phantom edges from unused PWM inputs
  PCMSK2 |= 0b11111100; // allow pinchange interrupts for pins D2 - D7 (handled by PCINT2)
  PCICR  |= 0b00000111; // enable pinchange interrupts PCINT0, PCINT1, PCINT2

  delay(50); // settle time for any start-up interrupts to fire before initializing defaults

  // all 18 PWM channels may not be connected: default all channels to nominal 'centre'
  // (3000 half-microseconds = 1.5 ms)
  for (uint8_t channel = 0; channel < 18; channel++) {
    pulseWidth[channel] = 3000;
  }
  // discard anything queued during the settle delay; qOutput has not moved from 0 yet,
  // so the queue is empty again after this store
  qInput = 0;

  // serial port setup for 115200 baud, 8N1. not using Serial.begin() to avoid enabling Serial interrupts which would make PWM edge detection slightly more jittery
#define BAUD 115200
  // Divisor for double-speed mode, rounded to nearest rather than truncated so that
  // clock/baud combinations that do not divide evenly get the closest achievable rate.
  // For 16 MHz and 115200 baud this still evaluates to 16.
#define UBRR_DOUBLESPEED ((F_CPU + 4UL * BAUD) / (8UL * BAUD) - 1)
  // Shift first, then narrow: casting before the shift would always yield 0 for the
  // high byte and silently truncate any divisor above 255.
  UBRR0H = (uint8_t)(UBRR_DOUBLESPEED >> 8);
  UBRR0L = (uint8_t)(UBRR_DOUBLESPEED & 0xFF);
  UCSR0A = (1<<U2X0); // double USART transmission speed
  UCSR0B = (1<<TXEN0); // enable transmit only (no receiver, no USART interrupts)
  UCSR0C = (3<<UCSZ00); // no parity, 1 stop bit, 8 data bits

  // not using millis() so timer0 can be disabled to remove another potential source of jitter on the PWM signals
  // (this must stay after the delay() above, which relies on timer0)
  TIMSK0 &= ~_BV(TOIE0); // disable timer0 overflow interrupt
}

void loop() {
  static uint8_t prevB = 0, prevC = 0, prevD = 0; // state of pins for ports B, C, D on previous event for that port
  static uint8_t serialPrintChannel = 0;
  static uint8_t charsInSerialBuffer = 0;
  
  while (qOutput != qInput) {
    edgeEvent event = q[qOutput >> 2]; // divide by 4 to convert a byte offset to an edgeEvent index
    uint8_t changed; // which bit(s) changed since previous event on same port
    uint8_t mask; // bitMask to scan changed bit(s)
    uint8_t channel;
    switch (event.id) {
    case 0:  // port B: pins D8 - D13
      changed = prevB ^ event.pins;
      prevB = event.pins;
      mask = 0x01;
      for (channel = 6; channel < 12; channel++) {
        if (changed & mask) { // pin [channel - 6] in this group of 6 changed state
          if (prevB & mask) { // pin went high - remember start time
            startEdge[channel] = event.tcnt;
          } else { // pin went low, so store pulseWidth since it went high
            pulseWidth[channel] = event.tcnt - startEdge[channel];
          }
        }
        mask <<= 1;
      }
      break;
    case 1:  // port C: pins A0 - A5
      changed = prevC ^ event.pins;
      prevC = event.pins;
      mask = 0x01;
      for (channel = 12; channel < 18; channel++) {
        if (changed & mask) { // pin [channel - 12] in this group of 6 changed state
          if (prevC & mask) { // pin went high - remember start time
            startEdge[channel] = event.tcnt;
          } else { // pin went low, so store pulseWidth since it went high
            pulseWidth[channel] = event.tcnt - startEdge[channel];
          }
        }
        mask <<= 1;
      }
      break;
    default: // port D: pins D2 - D7
      changed = prevD ^ event.pins;
      prevD = event.pins;
      mask = 0x04;
      for (channel = 0; channel < 6; channel++) {
        if (changed & mask) { // pin [channel] in this group of 6 changed state
          if (prevD & mask) { // pin went high - remember start time
            startEdge[channel] = event.tcnt;
          } else { // pin went low, so store pulseWidth since it went high
            pulseWidth[channel] = event.tcnt - startEdge[channel];
          }
        }
        mask <<= 1;
      }
      break;
    }
    qOutput += 4;
  }
  if (UCSR0A & (1<<UDRE0)) { // serial transmit data register empty - a byte can be written to the serial port
    if (charsInSerialBuffer) { // there are bytes in the buffer for the current channel
      UDR0 = serialBuffer[6 - charsInSerialBuffer]; // write byte to transmit data buffer register
      charsInSerialBuffer--;
    } else { // previous channel transmission complete. move on to next/first channel
      uint16_t pw = pulseWidth[serialPrintChannel];
      if (pw > 9999) {
        pw = 9999;
      }
      serialBuffer[3] = pw % 10 + '0';
      pw /= 10;
      serialBuffer[2] = pw % 10 + '0';
      pw /= 10;
      serialBuffer[1] = pw % 10 + '0';
      pw /= 10;
      serialBuffer[0] = pw + '0';
      
      if (serialPrintChannel < 17) {
        serialBuffer[4] = ',';
        serialBuffer[5] = ' ';
        serialPrintChannel++;
      } else {
        serialBuffer[4] = ' ';
        serialBuffer[5] = '\n';
        serialPrintChannel = 0;
      }
      charsInSerialBuffer = 6;
    }
  }
}

// Pin change interrupt for Port B: pins D8 - D13.
// Declared ISR_NAKED, so the compiler emits no prologue/epilogue: every register used
// (Z = r31:r30, r25, r24) and SREG is saved and restored by hand, and the routine ends
// with its own 'reti'.
// Effect: appends one 4-byte edgeEvent {TCNT1, PINB, id = 0} to q at byte offset qInput
// and advances qInput by 4 (the byte wraps at 256, which is the size of q).
// There is no check against qOutput: if the consumer falls a full queue behind,
// unread events are overwritten.
// NOTE(review): the cycle totals in the comments below are the original author's
// estimates - re-check against the AVR instruction set manual before relying on them.
ISR(PCINT0_vect, ISR_NAKED) { // pin change interrupt for Port B: pins D8 - D13
  asm (
    "push r31           \n" // preserve Z
    "push r30           \n"
    "push r25           \n" // preserve R25
    "push r24           \n" // preserve R24
    "in   r24, __SREG__ \n"
    "push r24           \n" // preserve SREG // 11 cycles of asm to here

    "ldi r30, lo8(q)    \n" // Z points to the start of the queue
    "ldi r31, hi8(q)    \n"
    "lds r24,(qInput)   \n" // r24 = current byte offset into the queue
    "ldi r25, 0         \n" // zero high byte so the 16-bit add below only propagates carry
    "add r30, r24       \n" // Z now points to event within the queue
    "adc r31, r25       \n"
    "ldi r25, 4         \n"
    "add r24, r25       \n" // bump qInput ready for next event
    "sts (qInput), r24  \n" // index is published before the slot is filled; the main code cannot run until this ISR returns
    "lds r24, 0x84      \n" // TCNT1 lo (low byte is read first, then high)
    "lds r25, 0x85      \n" // TCNT1 hi
    "st  Z+, r24        \n"
    "st  Z+, r25        \n" // write the 16-bit TCNT1 timestamp to the event
    "in  r24, 0x03      \n" // read PINB
    "st  Z+, r24        \n" // write snapshot of pins to event
    "ldi r24, 0         \n" // id for Port B events
    "st  Z+, r24        \n" // write id to event  // + 28 cycles of asm to here
    
    "pop  r24           \n"
    "out  __SREG__, r24 \n" // restore SREG
    "pop  r24           \n" // restore R24
    "pop  r25           \n" // restore R25
    "pop  r30           \n" // restore Z
    "pop  r31           \n"
    "reti               \n" // + 15 cycles of asm to here.  Total interrupt cycles 54 + interrupt call overhead (7 cycles?) - so about 61 cycles total or just under 4us with 16MHz clock 
  );
}

// Pin change interrupt for Port C: pins A0 - A5.
// Declared ISR_NAKED, so the compiler emits no prologue/epilogue: every register used
// (Z = r31:r30, r25, r24) and SREG is saved and restored by hand, and the routine ends
// with its own 'reti'.
// Effect: appends one 4-byte edgeEvent {TCNT1, PINC, id = 1} to q at byte offset qInput
// and advances qInput by 4 (the byte wraps at 256, which is the size of q).
// There is no check against qOutput: if the consumer falls a full queue behind,
// unread events are overwritten.
// NOTE(review): the cycle totals in the comments below are the original author's
// estimates - re-check against the AVR instruction set manual before relying on them.
ISR(PCINT1_vect, ISR_NAKED) { // pin change interrupt for Port C: pins A0 - A5
  asm (
    "push r31           \n" // preserve Z
    "push r30           \n"
    "push r25           \n" // preserve R25
    "push r24           \n" // preserve R24
    "in   r24, __SREG__ \n"
    "push r24           \n" // preserve SREG // 11 cycles of asm to here

    "ldi r30, lo8(q)    \n" // Z points to the start of the queue
    "ldi r31, hi8(q)    \n"
    "lds r24,(qInput)   \n" // r24 = current byte offset into the queue
    "ldi r25, 0         \n" // zero high byte so the 16-bit add below only propagates carry
    "add r30, r24       \n" // Z now points to event within the queue
    "adc r31, r25       \n"
    "ldi r25, 4         \n"
    "add r24, r25       \n" // bump qInput ready for next event
    "sts (qInput), r24  \n" // index is published before the slot is filled; the main code cannot run until this ISR returns
    "lds r24, 0x84      \n" // TCNT1 lo (low byte is read first, then high)
    "lds r25, 0x85      \n" // TCNT1 hi
    "st  Z+, r24        \n"
    "st  Z+, r25        \n" // write the 16-bit TCNT1 timestamp to the event
    "in  r24, 0x06      \n" // read PINC
    "st  Z+, r24        \n" // write snapshot of pins to event
    "ldi r24, 1         \n" // id for Port C events
    "st  Z+, r24        \n" // write id to event  // + 28 cycles of asm to here
    
    "pop  r24           \n"
    "out  __SREG__, r24 \n" // restore SREG
    "pop  r24           \n" // restore R24
    "pop  r25           \n" // restore R25
    "pop  r30           \n" // restore Z
    "pop  r31           \n"
    "reti               \n" // + 15 cycles of asm to here.  Total interrupt cycles 54 + interrupt call overhead (7 cycles?) - so about 61 cycles total or just under 4us with 16MHz clock 
  );
}

// Pin change interrupt for Port D: pins D2 - D7.
// Declared ISR_NAKED, so the compiler emits no prologue/epilogue: every register used
// (Z = r31:r30, r25, r24) and SREG is saved and restored by hand, and the routine ends
// with its own 'reti'.
// Effect: appends one 4-byte edgeEvent {TCNT1, PIND, id = 2} to q at byte offset qInput
// and advances qInput by 4 (the byte wraps at 256, which is the size of q).
// The snapshot holds all 8 bits of PIND; the consumer in loop() only examines bits 2..7.
// There is no check against qOutput: if the consumer falls a full queue behind,
// unread events are overwritten.
// NOTE(review): the cycle totals in the comments below are the original author's
// estimates - re-check against the AVR instruction set manual before relying on them.
ISR(PCINT2_vect, ISR_NAKED) { // pin change interrupt for Port D: pins D2 - D7
  asm (
    "push r31           \n" // preserve Z
    "push r30           \n"
    "push r25           \n" // preserve R25
    "push r24           \n" // preserve R24
    "in   r24, __SREG__ \n"
    "push r24           \n" // preserve SREG // 11 cycles of asm to here

    "ldi r30, lo8(q)    \n" // Z points to the start of the queue
    "ldi r31, hi8(q)    \n"
    "lds r24,(qInput)   \n" // r24 = current byte offset into the queue
    "ldi r25, 0         \n" // zero high byte so the 16-bit add below only propagates carry
    "add r30, r24       \n" // Z now points to event within the queue
    "adc r31, r25       \n"
    "ldi r25, 4         \n"
    "add r24, r25       \n" // bump qInput ready for next event
    "sts (qInput), r24  \n" // index is published before the slot is filled; the main code cannot run until this ISR returns
    "lds r24, 0x84      \n" // TCNT1 lo (low byte is read first, then high)
    "lds r25, 0x85      \n" // TCNT1 hi
    "st  Z+, r24        \n"
    "st  Z+, r25        \n" // write the 16-bit TCNT1 timestamp to the event
    "in  r24, 0x09      \n" // read PIND
    "st  Z+, r24        \n" // write snapshot of pins to event
    "ldi r24, 2         \n" // id for Port D events
    "st  Z+, r24        \n" // write id to event  // + 28 cycles of asm to here
    
    "pop  r24           \n"
    "out  __SREG__, r24 \n" // restore SREG
    "pop  r24           \n" // restore R24
    "pop  r25           \n" // restore R25
    "pop  r30           \n" // restore Z
    "pop  r31           \n"
    "reti               \n" // + 15 cycles of asm to here.  Total interrupt cycles 54 + interrupt call overhead (7 cycles?) - so about 61 cycles total or just under 4us with 16MHz clock 
  );
}
