From a86aa65f63ecec9378596c87966b913613ddc3a8 Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Mon, 11 Jul 2022 18:45:10 +1000
Subject: [PATCH 1/6] Fix ADPCM savestate looping bug

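---
Review note (text here is ignored when the patch is applied and is not
part of the commit message):
the do/while added to TestForLoop() to wrap sampcntInt has no ';' after
its while(...) condition, so SPU.cpp does not compile at this commit.
PATCH 3/6 rewrites that statement with the ';' in place. The hunk below
is left untouched because PATCH 3/6 removes the line verbatim.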
 desmume/src/SPU.cpp | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp
index 4045be3c6..6c15e71cc 100644
--- a/desmume/src/SPU.cpp
+++ b/desmume/src/SPU.cpp
@@ -1183,6 +1183,10 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		return;
 	}
 
+	// Wrap sampcnt
+	u32 loopSize = chan->totlength_shifted - (chan->loopstart << format_shift[FORMAT]);
+	do chan->sampcntInt -= loopSize; while(chan->sampcntInt >= chan->totlength_shifted)
+
 	// ADPCM needs special handling
 	if(FORMAT == 2)
 	{
@@ -1192,22 +1196,33 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		// fix: 7th Dragon (JP) - http://sourceforge.net/p/desmume/bugs/1357/
 		if (chan->totlength < 4) return;
 
-		// Stash loop sample and index
+		// Fetch loop sample and index, and get the "new" current decoding position
+		s32 curpos;
+		s16 *pcm16Dst = &chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)];
 		if(chan->loop_index == K_ADPCM_LOOPING_RECOVERY_INDEX)
 		{
-			chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = (s16)read16(chan->addr);
+			// We need to decode from the start until current position,
+			// as the loop sample/index is very likely to be incorrect
+			*pcm16Dst = (s16)read16(chan->addr);
 			chan->index = read08(chan->addr+2) & 0x7F;
+			curpos = 8;
 		}
 		else
 		{
-			chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = chan->loop_pcm16b;
+			*pcm16Dst = chan->loop_pcm16b;
 			chan->index = chan->loop_index;
+			curpos = (chan->loopstart << format_shift[FORMAT]);
 		}
-	}
 
-	// Wrap sampcnt
-	u32 step = chan->totlength_shifted - (chan->loopstart << format_shift[FORMAT]);
-	while (chan->sampcntInt >= chan->totlength_shifted) chan->sampcntInt -= step;
+		// Decode until we reach the target position
+		// This is really only used for fast seeking (ie. SNDDummy
+		// and loop reset), but makes the code much cleaner.
+		while(curpos < chan->sampcntInt)
+		{
+			*pcm16Dst = FetchADPCMData(chan, curpos);
+			curpos++;
+		}
+	}
 }
 
 template<int CHANNELS> FORCEINLINE static void SPU_Mix(SPU_struct* SPU, channel_struct *chan, s32 data)

From b6da15a0482e718574bd622a12f3aaae31db97ea Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Mon, 11 Jul 2022 18:52:57 +1000
Subject: [PATCH 2/6] Add Catmull-Rom interpolation option

---
 desmume/src/frontend/posix/gtk/main.cpp  | 5 +++++
 desmume/src/frontend/posix/gtk/menu.ui   | 5 +++++
 desmume/src/frontend/posix/gtk2/main.cpp | 4 +++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/desmume/src/frontend/posix/gtk/main.cpp b/desmume/src/frontend/posix/gtk/main.cpp
index 8bbb159ad..b65399b98 100644
--- a/desmume/src/frontend/posix/gtk/main.cpp
+++ b/desmume/src/frontend/posix/gtk/main.cpp
@@ -2357,6 +2357,8 @@ static void Modify_SPUInterpolation(GSimpleAction *action, GVariant *parameter,
         mode = SPUInterpolation_Linear;
     else if (strcmp(string, "cosine") == 0)
         mode = SPUInterpolation_Cosine;
+    else if (strcmp(string, "catmullrom") == 0)
+        mode = SPUInterpolation_CatmullRom;
     CommonSettings.spuInterpolationMode = mode;
     config.audio_interpolation = CommonSettings.spuInterpolationMode;
     g_simple_action_set_state(action, parameter);
@@ -3227,6 +3229,9 @@ common_gtk_main(GApplication *app, gpointer user_data)
         case SPUInterpolation_Cosine:
             string = "cosine";
             break;
+        case SPUInterpolation_CatmullRom:
+            string = "catmullrom";
+            break;
     }
     g_simple_action_set_state(G_SIMPLE_ACTION(g_action_map_lookup_action(G_ACTION_MAP(app), "spu_interpolation")), g_variant_new_string(string.c_str()));
 
diff --git a/desmume/src/frontend/posix/gtk/menu.ui b/desmume/src/frontend/posix/gtk/menu.ui
index db28d2ae7..b59d8f237 100644
--- a/desmume/src/frontend/posix/gtk/menu.ui
+++ b/desmume/src/frontend/posix/gtk/menu.ui
@@ -661,6 +661,11 @@
               <attribute name='action'>app.spu_interpolation</attribute>
               <attribute name='target'>cosine</attribute>
             </item>
+            <item>
+              <attribute name='label' translatable='yes'>_CatmullRom</attribute>
+              <attribute name='action'>app.spu_interpolation</attribute>
+              <attribute name='target'>catmullrom</attribute>
+            </item>
           </section>
         </submenu>
         <submenu>
diff --git a/desmume/src/frontend/posix/gtk2/main.cpp b/desmume/src/frontend/posix/gtk2/main.cpp
index 2d60bd13d..6c6be995e 100644
--- a/desmume/src/frontend/posix/gtk2/main.cpp
+++ b/desmume/src/frontend/posix/gtk2/main.cpp
@@ -349,6 +349,7 @@ static const char *ui_description =
 "        <menuitem action='SPUInterpolationNone'/>"
 "        <menuitem action='SPUInterpolationLinear'/>"
 "        <menuitem action='SPUInterpolationCosine'/>"
+"        <menuitem action='SPUInterpolationCatmullRom'/>"
 "      </menu>"
 "      <menu action='CheatMenu'>"
 "        <menuitem action='cheatsearch'/>"
@@ -587,7 +588,8 @@ static const GtkRadioActionEntry spumode_entries[] = {
 static const GtkRadioActionEntry spuinterpolation_entries[] = {
     { "SPUInterpolationNone", NULL, "_None", NULL, NULL, SPUInterpolation_None },
     { "SPUInterpolationLinear", NULL, "_Linear", NULL, NULL, SPUInterpolation_Linear },
-    { "SPUInterpolationCosine", NULL, "_Cosine", NULL, NULL, SPUInterpolation_Cosine }
+    { "SPUInterpolationCosine", NULL, "_Cosine", NULL, NULL, SPUInterpolation_Cosine },
+    { "SPUInterpolationCatmullRom", NULL, "_CatmullRom", NULL, NULL, SPUInterpolation_CatmullRom }
 };
 
 enum frameskip_enum {

From 2bb2802458a4d057b6cda5d0c81df856f78eb2e1 Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Wed, 5 Oct 2022 18:57:09 +1100
Subject: [PATCH 3/6] SPU: Logic re-write

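---
Review notes (text here is ignored when the patch is applied and is not
part of the commit message):
 * __SPU_GenerateChanData(): the hard-panned stores
   "*(u32*)chanbuf = (u32)sampleL" (non-MSB_FIRST branch) and
   "*(u32*)chanbuf = (u32)sampleR" (MSB_FIRST branch) convert a signed
   s16 straight to u32. A negative sample therefore sign-extends and the
   other channel's half is written as 0xFFFF (-1) rather than 0.
   Converting through u16 first, e.g. (u32)(u16)sampleL, keeps that
   half at 0.
 * SPU_struct::SPU_struct() calls reset() while outbuf is NULL and
   bufsize is 0, so reset() executes memset(NULL, 0, 0). The C library
   requires valid pointer arguments even for a zero length; guarding
   the memset on outbuf avoids relying on that call being harmless.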
 desmume/src/NDSSystem.cpp                  |    9 +-
 desmume/src/NDSSystem.h                    |    6 +-
 desmume/src/SPU.cpp                        | 1229 ++++++++++++--------
 desmume/src/SPU.h                          |   49 +-
 desmume/src/frontend/windows/soundView.cpp |   24 +-
 5 files changed, 764 insertions(+), 553 deletions(-)

diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp
index 58cbb57d9..99c8f3cf5 100644
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@@ -1422,9 +1422,12 @@ static void execHardware_hblank()
 
 	//emulation housekeeping. for some reason we always do this at hblank,
 	//even though it sounds more reasonable to do it at hstart
-	SPU_Emulate_core();
-	driver->AVI_SoundUpdate(SPU_core->outbuf,spu_core_samples);
-	WAV_WavSoundUpdate(SPU_core->outbuf,spu_core_samples);
+	int coreSamples = SPU_Emulate_core(355*6); // HLine = 355 dots @ 6c/dot
+	if(coreSamples)
+	{
+		driver->AVI_SoundUpdate(SPU_core->outbuf,coreSamples);
+		WAV_WavSoundUpdate(SPU_core->outbuf,coreSamples);
+	}
 }
 
 static void execHardware_hstart_vblankEnd()
diff --git a/desmume/src/NDSSystem.h b/desmume/src/NDSSystem.h
index f2af566d4..4dbbd779d 100644
--- a/desmume/src/NDSSystem.h
+++ b/desmume/src/NDSSystem.h
@@ -530,6 +530,7 @@ extern struct TCommonSettings
 		, spuInterpolationMode(2)
 		, manualBackupType(0)
 		, autodetectBackupMethod(0)
+		, spu_muteChannels(0)
 		, spu_captureMuted(false)
 		, spu_advanced(true)
 		, StylusPressure(50)
@@ -543,9 +544,6 @@ extern struct TCommonSettings
 		strcpy(ARM7BIOS, "biosnds7.bin");
 		strcpy(ExtFirmwarePath, "firmware.bin");
 
-		for(int i=0;i<16;i++)
-			spu_muteChannels[i] = false;
-
 		for(int g=0;g<2;g++)
 			for(int x=0;x<5;x++)
 				dispLayers[g][x]=true;
@@ -652,7 +650,7 @@ extern struct TCommonSettings
 	int SPU_sync_mode;
 	int SPU_sync_method;
 
-	bool spu_muteChannels[16];
+	u16  spu_muteChannels;
 	bool spu_captureMuted;
 	bool spu_advanced;
 
diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp
index 6c15e71cc..1d03e4296 100644
--- a/desmume/src/SPU.cpp
+++ b/desmume/src/SPU.cpp
@@ -48,7 +48,14 @@ static inline s16 read16(u32 addr) { return (s16)_MMU_read16<ARMCPU_ARM7,MMU_AT_
 static inline u8 read08(u32 addr) { return _MMU_read08<ARMCPU_ARM7,MMU_AT_DEBUG>(addr); }
 static inline s8 read_s8(u32 addr) { return (s8)_MMU_read08<ARMCPU_ARM7,MMU_AT_DEBUG>(addr); }
 
-#define K_ADPCM_LOOPING_RECOVERY_INDEX 99999
+// Disabling capture when _currentSNDCore == SNDDummy can increase
+// performance by disabling all mixing, but could potentially cause
+// problems if the software is relying on the capture output. By
+// default, we disable all mixing only when capture isn't running,
+// as this is guaranteed to be safe.
+#define ENABLE_DUMMY_SPU_CAPTURE 1
+
+#define K_ADPCM_LOOPING_RECOVERY_INDEX 255
 
 #define CATMULLROM_INTERPOLATION_RESOLUTION_BITS 11
 #define CATMULLROM_INTERPOLATION_RESOLUTION (1<<CATMULLROM_INTERPOLATION_RESOLUTION_BITS)
@@ -56,12 +63,8 @@ static inline s8 read_s8(u32 addr) { return (s8)_MMU_read08<ARMCPU_ARM7,MMU_AT_D
 #define COSINE_INTERPOLATION_RESOLUTION_BITS 13
 #define COSINE_INTERPOLATION_RESOLUTION (1<<COSINE_INTERPOLATION_RESOLUTION_BITS)
 
-#define SPUCHAN_PCM16B_AT(x) ((u32)(x) % SPUINTERPOLATION_TAPS)
-
-//#ifdef FASTBUILD
-	#undef FORCEINLINE
-	#define FORCEINLINE
-//#endif
+#define SPUCHAN_PCM16B_AT(x)    ((x) & (SPUCHAN_PCM16B_SIZE -1))
+#define SPUCAPTURE_PCM16B_AT(x) ((x) & (SPUCAPTURE_FIFO_SIZE-1))
 
 //static ISynchronizingAudioBuffer* _currentSynchronizer = metaspu_construct(ESynchMethod_Z);
 static ISynchronizingAudioBuffer* _currentSynchronizer = metaspu_construct(ESynchMethod_N);
@@ -108,12 +111,9 @@ static u8 precalcindextbl[89][8];
 static u16 catmullrom_lut[CATMULLROM_INTERPOLATION_RESOLUTION][4];
 static u16 cos_lut[COSINE_INTERPOLATION_RESOLUTION];
 
-static const double ARM7_CLOCK = 33513982;
+static const u32 ARM7_CLOCK = 33513982;
 
-static const double samples_per_hline = (DESMUME_SAMPLE_RATE / 59.8261f) / 263.0f;
-
-static double _samples = 0;
-int spu_core_samples = 0;
+static u32 _spu_core_cyclesCounter = 0;
 
 template<typename T>
 static FORCEINLINE T MinMax(T val, T min, T max)
@@ -211,19 +211,19 @@ int SPU_Init(int coreid, int newBufferSizeBytes)
 		// If we wanted to, we could stick entirely to integer maths
 		// here, but I doubt it's worth the hassle.
 		double x = i / (double)CATMULLROM_INTERPOLATION_RESOLUTION;
-		double a = x*(x*(-x + 2) - 1);
+		double a = x*(x*(x - 2) + 1);
 		double b = x*x*(3*x - 5) + 2;
 		double c = x*(x*(-3*x + 4) + 1);
-		double d = x*x*(x - 1);
-		catmullrom_lut[i][0] = (u16)floor((1u<<15) * -0.5*a);
-		catmullrom_lut[i][1] = (u16)floor((1u<<15) *  0.5*b);
-		catmullrom_lut[i][2] = (u16)floor((1u<<15) *  0.5*c);
-		catmullrom_lut[i][3] = (u16)floor((1u<<15) * -0.5*d);
+		double d = x*x*(1 - x);
+		catmullrom_lut[i][0] = (u16)floor((double)(1<<15) * 0.5*a);
+		catmullrom_lut[i][1] = (u16)floor((double)(1<<15) * 0.5*b);
+		catmullrom_lut[i][2] = (u16)floor((double)(1<<15) * 0.5*c);
+		catmullrom_lut[i][3] = (u16)floor((double)(1<<15) * 0.5*d);
 	}
 	for (size_t i = 0; i < COSINE_INTERPOLATION_RESOLUTION; i++)
-		cos_lut[i] = (u16)floor((1u<<16) * ((1.0 - cos(((double)i/(double)COSINE_INTERPOLATION_RESOLUTION) * M_PI)) * 0.5));
+		cos_lut[i] = (u16)floor((double)(1<<16) * ((1.0 - cos(((double)i/(double)COSINE_INTERPOLATION_RESOLUTION) * M_PI)) * 0.5));
 
-	SPU_core = new SPU_struct((int)ceil(samples_per_hline));
+	SPU_core = new SPU_struct();
 	SPU_Reset();
 
 	//create adpcm decode accelerator lookups
@@ -285,7 +285,7 @@ void SPU_SetSynchMode(int mode, int method)
 		
 	if (_currentSynchMode == ESynchMode_DualSynchAsynch)
 	{
-		SPU_user = new SPU_struct(_currentBufferSize);
+		SPU_user = new SPU_struct();
 		SPU_CloneUser();
 	}
 }
@@ -327,15 +327,14 @@ void SPU_Reset(void)
 	for (i = 0x400; i < 0x51D; i++)
 		T1WriteByte(MMU.ARM7_REG, i, 0);
 
-	_samples = 0;
+	_spu_core_cyclesCounter = 0;
 }
 
 //------------------------------------------
 
 void SPU_struct::reset()
 {
-	memset(sndbuf,0,bufsize*2*4);
-	memset(outbuf,0,bufsize*2*2);
+	memset(outbuf,0,bufsize*sizeof(s16)*2);
 
 	memset((void *)channels, 0, sizeof(channel_struct) * 16);
 
@@ -347,22 +346,33 @@ void SPU_struct::reset()
 	}
 }
 
-SPU_struct::SPU_struct(int buffersize)
-	: bufpos(0)
-	, buflength(0)
-	, sndbuf(0)
-	, outbuf(0)
-	, bufsize(buffersize)
+void SPU_struct::resizeBuffer(int buffersize)
 {
-	sndbuf = new s32[buffersize*2];
-	outbuf = new s16[buffersize*2];
+	if(outbuf) delete[] outbuf;
+	outbuf = new s16[(size_t)buffersize*2];
+	bufsize = buffersize;
+}
+
+SPU_struct::SPU_struct()
+	: outbuf(NULL)
+	, bufsize(0)
+{
+	// mixdata[] must be able to contain:
+	// struct {
+	//   s32 mixbuf     [N][2]
+	//   s32 mutedmixbuf[N][2]
+	//   s16 capbuf     [N][2]
+	//   s16 chanbuf    [N][2]
+	// };
+	// where N is at most SPUCAPTURE_FIFO_SIZE
+	mixdata = new s32[SPUCAPTURE_FIFO_SIZE * (sizeof(s32)+sizeof(s32)+sizeof(s16)+sizeof(s16))*2 / sizeof(s32)];
 	reset();
 }
 
 SPU_struct::~SPU_struct()
 {
-	if(sndbuf) delete[] sndbuf;
-	if(outbuf) delete[] outbuf;
+	if(mixdata) delete[] mixdata;
+	if(outbuf)  delete[] outbuf;
 }
 
 void SPU_DeInit(void)
@@ -383,12 +393,13 @@ void SPU_struct::ShutUp()
 		 channels[i].status = CHANSTAT_STOPPED;
 }
 
-static FORCEINLINE void adjust_channel_timer(channel_struct *chan)
+/*FORCEINLINE*/ static void adjust_channel_timer(channel_struct *chan)
 {
 	//  ARM7_CLOCK / (DESMUME_SAMPLE_RATE*2) / (2^16 - Timer)
 	// = ARM7_CLOCK / (DESMUME_SAMPLE_RATE*2 * (2^16 - Timer))
-	// ... and then round up for good measure
-	u64 sampinc = ((u32)ARM7_CLOCK*(1ull << 32) - 1) / (DESMUME_SAMPLE_RATE * 2ull * (0x10000 - chan->timer)) + 1;
+	// Make sure to round DOWN, as we'd rather lag behind
+	// than be ahead, as this causes synchronization issues
+	u64 sampinc = (ARM7_CLOCK*(1ull << 32)) / (DESMUME_SAMPLE_RATE * 2ull * (0x10000 - chan->timer));
 	chan->sampincInt = (u32)(sampinc >> 32), chan->sampincFrac = (u32)sampinc;
 }
 
@@ -418,15 +429,14 @@ void SPU_struct::KeyOn(int channel)
 {
 	channel_struct &thischan = channels[channel];
 	thischan.status = CHANSTAT_PLAY;
-
 	thischan.totlength = thischan.length + thischan.loopstart;
+	thischan.totlength_shifted = thischan.totlength << format_shift[thischan.format];
+	thischan.sampcntFrac = 0;
 	adjust_channel_timer(&thischan);
 
 	thischan.pcm16bOffs = 0;
-	for(int i=0;i<SPUINTERPOLATION_TAPS;i++)
-	{
+	for(int i=0; i < SPUCHAN_PCM16B_SIZE; i++)
 		thischan.pcm16b[i] = 0;
-	}
 
 	//printf("keyon %d totlength:%d\n",channel,thischan.totlength);
 
@@ -440,30 +450,28 @@ void SPU_struct::KeyOn(int channel)
 	case 0: // 8-bit
 	//	thischan.loopstart = thischan.loopstart << 2;
 	//	thischan.length = (thischan.length << 2) + thischan.loopstart;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		break;
 	case 1: // 16-bit
 	//	thischan.loopstart = thischan.loopstart << 1;
 	//	thischan.length = (thischan.length << 1) + thischan.loopstart;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		break;
 	case 2: // ADPCM
 		thischan.pcm16b[0] = (s16)read16(thischan.addr);
 		thischan.index = read08(thischan.addr + 2) & 0x7F;
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -3;
+		thischan.sampcntInt = -3;
 		thischan.loop_index = K_ADPCM_LOOPING_RECOVERY_INDEX;
 	//	thischan.loopstart = thischan.loopstart << 3;
 	//	thischan.length = (thischan.length << 3) + thischan.loopstart;
 		break;
 	case 3: // PSG
-		thischan.sampcntFrac = 0, thischan.sampcntInt = -1;
+		thischan.sampcntInt = -1;
 		thischan.x = 0x7FFF;
 		break;
 	default: break;
 	}
 
-	thischan.totlength_shifted = thischan.totlength << format_shift[thischan.format];
-
 	if(thischan.format != 3)
 	{
 		if(thischan.totlength_shifted == 0)
@@ -759,14 +767,25 @@ void SPU_struct::ProbeCapture(int which)
 		return;
 	}
 
+	// Original notes on the reasoning behind a FIFO for capture:
+	//so, this is a little strange. why go through a fifo?
+	//it seems that some games will set up a reverb effect by capturing
+	//to the nearly same address as playback, but ahead by a couple.
+	//So, playback will always end up being what was captured a couple of samples ago.
+	//This system counts on playback always having read ahead 16 samples.
+	//In that case, playback will end up being what was processed at one entire buffer length ago,
+	//since the 16 samples would have read ahead before they got captured over
+
+	//It's actually the source channels which should have a fifo, but we are
+	//not going to take the hit in speed and complexity. Save it for a future rewrite.
+	//Instead, what we do here is delay the capture by 16 samples to create a similar effect.
+	//Subjectively, it seems to be working.
 	REGS::CAP &cap = regs.cap[which];
 	cap.runtime.running = 1;
-	cap.runtime.curdad = cap.dad;
+	cap.runtime.dad = cap.dad;
 	u32 len = cap.len;
 	if(len==0) len=1;
-	cap.runtime.maxdad = cap.dad + len*4;
-	cap.runtime.sampcntFrac = cap.runtime.sampcntInt = 0;
-	cap.runtime.fifo.reset();
+	cap.runtime.sampcntFrac = 0, cap.runtime.sampcntInt = -SPUCAPTURE_FIFO_SIZE;
 }
 
 void SPU_struct::WriteByte(u32 addr, u8 val)
@@ -1043,7 +1062,8 @@ void SPU_struct::WriteLong(u32 addr, u32 val)
 
 //////////////////////////////////////////////////////////////////////////////
 
-template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
+template<SPUInterpolationMode INTERPOLATE_MODE>
+FORCEINLINE static s16 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
 {
 	switch (INTERPOLATE_MODE)
 	{
@@ -1051,12 +1071,20 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 		{
 			// Catmull-Rom spline
 			// Delay: 2 samples, Maximum gain: 1.25
+			// NOTE: Ideally, we would just re-scale the resampling
+			// kernel to have a maximum gain of 1.0. However, this
+			// would mean reducing the output volume, which can then
+			// go on to make feedback capture (ie. echo effects)
+			// decay abnormally quickly. Since Catmull-Rom is more
+			// of a 'luxury' thing, we should be able to use MinMax
+			// since if the user is using this interpolation method,
+			// there's likely enough processing power to handle it.
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 3)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 2)];
 			s32 c = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 d = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			const u16 *w = catmullrom_lut[subPos >> (32 - CATMULLROM_INTERPOLATION_RESOLUTION_BITS)];
-			return (-a*(s32)w[0] + b*(s32)w[1] + c*(s32)w[2] - d*(s32)w[3]) >> 15;
+			return (s16)MinMax((-a*(s32)w[0] + b*(s32)w[1] + c*(s32)w[2] - d*(s32)w[3]) >> 15, -0x8000, +0x7FFF);
 		}
 
 		case SPUInterpolation_Cosine:
@@ -1065,10 +1093,13 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 			// ratio2 = (1 - cos(ratio * M_PI)) / 2
 			// sampleI = sampleA * (1 - ratio2) + sampleB * ratio2
 			// Delay: 1 sample, Maximum gain: 1.0
+			// NOTE: Always cast the result to s16. (b-a) can
+			// overflow, but a+(b-a)*subPos can't. So we might
+			// have garbage in the upper 16 bits.
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			s32 subPos16 = (s32)cos_lut[subPos >> (32 - COSINE_INTERPOLATION_RESOLUTION_BITS)];
-			return a + ((b - a)*subPos16 >> 16);
+			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
 
 		case SPUInterpolation_Linear:
@@ -1076,10 +1107,11 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 			// Linear Interpolation Formula:
 			// sampleI = sampleA * (1 - ratio) + sampleB * ratio
 			// Delay: 1 sample, Maximum gain: 1.0
+			// NOTE: Always cast the result to s16 (see above).
 			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
 			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
 			s32 subPos16 = subPos >> (32 - 16);
-			return a + ((b - a)*subPos16 >> 16);
+			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
 
 		default:
@@ -1088,41 +1120,43 @@ template<SPUInterpolationMode INTERPOLATE_MODE> static FORCEINLINE s32 Interpola
 	}
 }
 
-static FORCEINLINE s32 Fetch8BitData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 Fetch8BitData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0) return 0;
 
-	return read_s8(chan->addr + pos*1) << 8;
+	return (s16)read_s8(chan->addr + pos*1) << 8;
 }
 
-static FORCEINLINE s32 Fetch16BitData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 Fetch16BitData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0) return 0;
 
 	return read16(chan->addr + pos*2);
 }
 
-static FORCEINLINE s32 FetchADPCMData(channel_struct *chan, s32 pos)
+// NOTE: The decoding state is updated during this function call
+FORCEINLINE static s16 FetchADPCMData(channel_struct *chan, s32 pos)
 {
 	if(pos < 8) return 0;
 
 	s16 last = chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)];
 
-	if(pos == (chan->loopstart<<3)) {
-		//if(chan->loop_index != K_ADPCM_LOOPING_RECOVERY_INDEX) printf("over-snagging\n");
+	// Stash loop sample and index
+	// This saves having to decode to the loop point every time
+	if(pos == ((s32)chan->loopstart<<3)) {
 		chan->loop_pcm16b = last;
 		chan->loop_index = chan->index;
 	}
 	
-	const u32 shift    = (pos&1) * 4;
-	const u32 data4bit = ((u32)read08(chan->addr + (pos>>1))) >> shift;
+	const u8 shift    = ((u8)pos&1) * 4;
+	const u8 data4bit = read08(chan->addr + (pos>>1)) >> shift;
 	const s32 diff = precalcdifftbl [chan->index][data4bit & 0xF];
 	chan->index    = precalcindextbl[chan->index][data4bit & 0x7];
 
-	return MinMax(last + diff, -0x8000, 0x7FFF);
+	return (s16)MinMax(last + diff, -0x8000, 0x7FFF);
 }
 
-static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
+FORCEINLINE static s16 FetchPSGData(channel_struct *chan, s32 pos)
 {
 	if(pos < 0 || chan->num < 8) return 0;
 
@@ -1130,7 +1164,7 @@ static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
 	if(chan->num < 14)
 	{
 		// Doing this avoids using a LUT
-		return ((pos%8u) > chan->waveduty) ? (-0x7FFF) : (+0x7FFF);
+		return (((u8)pos%8u) > chan->waveduty) ? (-0x7FFF) : (+0x7FFF);
 	}
 	else
 	{
@@ -1149,43 +1183,22 @@ static FORCEINLINE s32 FetchPSGData(channel_struct *chan, s32 pos)
 
 //////////////////////////////////////////////////////////////////////////////
 
-static FORCEINLINE void MixL(SPU_struct* SPU, channel_struct *chan, s32 data)
-{
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[SPU->bufpos<<1] += data;
-}
-
-static FORCEINLINE void MixR(SPU_struct* SPU, channel_struct *chan, s32 data)
+// Returns false when the channel needs to stop
+// NOTE: Assumes channel has already reached the end of playback
+template<int FORMAT>
+/*FORCEINLINE*/ static bool TestForLoop(channel_struct *chan, s32 *pos, s32 totalLength)
 {
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[(SPU->bufpos<<1)+1] += data;
-}
-
-static FORCEINLINE void MixLR(SPU_struct* SPU, channel_struct *chan, s32 data)
-{
-	data = spumuldiv7(data, chan->vol) >> volume_shift[chan->volumeDiv];
-	SPU->sndbuf[SPU->bufpos<<1] += spumuldiv7(data, 127 - chan->pan);
-	SPU->sndbuf[(SPU->bufpos<<1)+1] += spumuldiv7(data, chan->pan);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-
-template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channel_struct *chan)
-{
-	// Do nothing if we haven't reached the end
-	if(chan->sampcntInt < chan->totlength_shifted) return;
-
 	// Kill the channel if we don't repeat
 	if(chan->repeat != 1)
 	{
-		SPU->KeyOff(chan->num);
-		SPU->bufpos = SPU->buflength;
-		return;
+		//SPU->KeyOff(chan->num); // Inlining this avoids having to pass an SPU pointer around
+		chan->status = CHANSTAT_STOPPED;
+		return false;
 	}
 
 	// Wrap sampcnt
-	u32 loopSize = chan->totlength_shifted - (chan->loopstart << format_shift[FORMAT]);
-	do chan->sampcntInt -= loopSize; while(chan->sampcntInt >= chan->totlength_shifted)
+	u32 loopSize = totalLength - (chan->loopstart << format_shift[FORMAT]);
+	do *pos -= loopSize; while(*pos >= totalLength);
 
 	// ADPCM needs special handling
 	if(FORMAT == 2)
@@ -1194,7 +1207,7 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		// smaller values (0..3 words) are causing hang-ups 
 		// (busy bit remains set infinite, but no sound output occurs).
 		// fix: 7th Dragon (JP) - http://sourceforge.net/p/desmume/bugs/1357/
-		if (chan->totlength < 4) return;
+		if (totalLength < (4 << format_shift[FORMAT])) return true;
 
 		// Fetch loop sample and index, and get the "new" current decoding position
 		s32 curpos;
@@ -1217,314 +1230,257 @@ template<int FORMAT> static FORCEINLINE void TestForLoop(SPU_struct *SPU, channe
 		// Decode until we reach the target position
 		// This is really only used for fast seeking (ie. SNDDummy
 		// and loop reset), but makes the code much cleaner.
-		while(curpos < chan->sampcntInt)
+		while(curpos < *pos)
 		{
 			*pcm16Dst = FetchADPCMData(chan, curpos);
 			curpos++;
 		}
 	}
+	return true;
 }
 
-template<int CHANNELS> FORCEINLINE static void SPU_Mix(SPU_struct* SPU, channel_struct *chan, s32 data)
+//////////////////////////////////////////////////////////////////////////////
+
+//WORK
+template<int CHANNELS, int FORMAT, SPUInterpolationMode INTERPOLATE_MODE> 
+static void __SPU_GenerateChanData(channel_struct* const chan, s16 *chanbuf, int length)
 {
-	switch(CHANNELS)
+	s32 totalLength = chan->totlength_shifted;
+
+	if (!CHANNELS)
 	{
-		case 0: MixL(SPU, chan, data); break;
-		case 1: MixLR(SPU, chan, data); break;
-		case 2: MixR(SPU, chan, data); break;
-		default: break;
+		// When we aren't mixing at all, take a much faster path where
+		// we simply update sampcnt. This can glitch interpolation for
+		// up to SPUCHAN_PCM16B_SIZE source samples (since we're not
+		// updating chan->pcm16b[]), but this glitching should really
+		// only show up when switching from Dual SPU to Sync mode, or
+		// when switching from SNDDummy core to an actual output core,
+		// and only for non-ADPCM sources (ADPCM needs to decode data
+		// all the time, so we keep pcm16b[] filled correctly anyway).
+		s32 cursampcntInt = chan->sampcntInt;
+		s64 newsampcnt  = (chan->sampcntFrac | (s64)   cursampcntInt<<32);
+		    newsampcnt += (chan->sampincFrac | (u64)chan->sampincInt<<32) * length;
+		s32 newsampcntInt = (s32)(newsampcnt >> 32);
+		if(FORMAT == 2 && newsampcntInt <= totalLength)
+		{
+			// We won't go past the end, so decode until reaching the target position
+			while(cursampcntInt < newsampcntInt)
+			{
+				s16 data = FetchADPCMData(chan, cursampcntInt);
+				chan->pcm16bOffs++;
+				chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = data;
+				cursampcntInt++;
+			}
+		}
+		else if(FORMAT != 3 && newsampcntInt >= totalLength) TestForLoop<FORMAT>(chan, &newsampcntInt, totalLength);
+		chan->sampcntFrac = (u32)newsampcnt;
+		chan->sampcntInt  = newsampcntInt;
+		return;
 	}
-	SPU->lastdata = data;
-}
 
-//WORK
-template<int FORMAT, SPUInterpolationMode INTERPOLATE_MODE, int CHANNELS> 
-	FORCEINLINE static void ____SPU_ChanUpdate(SPU_struct* const SPU, channel_struct* const chan)
-{
-	for (; SPU->bufpos < SPU->buflength; SPU->bufpos++)
+	// chan->vol is .7fxp, plus .4fxp for chan->volumeDiv (total .11fxp)
+	// chan->pan is .7fxp
+	// This gives us .18fxp, but we need at most .16fxp, so we shift down.
+	s32 vol_shifted   = spumuladjust7(chan->vol);
+	    vol_shifted <<= 4;
+	    vol_shifted >>= volume_shift[chan->volumeDiv];
+	s32 vol_left      = spumuladjust7(127 - chan->pan);
+	    vol_left     *= vol_shifted;
+	    vol_left    >>= 2; // .16fxp
+	s32 vol_right     = spumuladjust7(chan->pan);
+	    vol_right    *= vol_shifted;
+	    vol_right   >>= 2; // .16fxp
+
+	// Start mixing loop
+	u32 sampcntFrac = chan->sampcntFrac;
+	s32 sampcntInt  = chan->sampcntInt;
+	do
 	{
 		// Advance sampcnt one sample at a time. This is
 		// needed to keep pcm16b[] filled for interpolation.
-		u32 nSamplesToSkip = chan->sampincInt + AddAndReturnCarry(&chan->sampcntFrac, chan->sampincFrac);
+		u32 nSamplesToSkip = chan->sampincInt + AddAndReturnCarry(&sampcntFrac, chan->sampincFrac);
 		while(nSamplesToSkip--)
 		{
+			// If channel stops, fill the rest of the buffer with 0
+			if(FORMAT != 3 && sampcntInt >= totalLength && !TestForLoop<FORMAT>(chan, &sampcntInt, totalLength))
+			{
+				memset(chanbuf, 0, length*sizeof(s16)*2);
+				return;
+			}
+
 			s16 data = 0;
-			s32 pos = chan->sampcntInt;
 			switch(FORMAT)
 			{
-				case 0: data = Fetch8BitData (chan, pos); break;
-				case 1: data = Fetch16BitData(chan, pos); break;
-				case 2: data = FetchADPCMData(chan, pos); break;
-				case 3: data = FetchPSGData  (chan, pos); break;
-				default: break;
+				case 0: data = Fetch8BitData (chan, sampcntInt); break;
+				case 1: data = Fetch16BitData(chan, sampcntInt); break;
+				case 2: data = FetchADPCMData(chan, sampcntInt); break;
+				case 3: data = FetchPSGData  (chan, sampcntInt); break;
 			}
 			chan->pcm16bOffs++;
 			chan->pcm16b[SPUCHAN_PCM16B_AT(chan->pcm16bOffs)] = data;
-
-			chan->sampcntInt++;
-			if (FORMAT != 3) TestForLoop<FORMAT>(SPU, chan);
+			sampcntInt++;
 		}
 
-		if(CHANNELS != -1)
+		// Because chanbuf[] is aligned to at least 32 bits, we can
+		// cheat and store a hard-panned sample by writing as 32bit
+		s16 sample, sampleL = 0, sampleR = 0; // <- Clearing these to 0 shuts the compiler up
+		sample = Interpolate<INTERPOLATE_MODE>(chan->pcm16b, chan->pcm16bOffs, sampcntFrac);
+		if(CHANNELS & (1<<0)) sampleL = (s16)(sample * vol_left  >> 16);
+		if(CHANNELS & (1<<1)) sampleR = (s16)(sample * vol_right >> 16);
+		switch(CHANNELS)
 		{
-			s32 data = Interpolate<INTERPOLATE_MODE>(chan->pcm16b, chan->pcm16bOffs, chan->sampcntFrac);
-			SPU_Mix<CHANNELS>(SPU, chan, data);
+			case (1<<0)|(0<<1):
+#ifdef MSB_FIRST
+				*(u32*)chanbuf = (u32)sampleL << 16;
+#else
+				*(u32*)chanbuf = (u32)sampleL;
+#endif
+				break;
+			case (0<<0)|(1<<1):
+#ifdef MSB_FIRST
+				*(u32*)chanbuf = (u32)sampleR;
+#else
+				*(u32*)chanbuf = (u32)sampleR << 16;
+#endif
+				break;
+			case (1<<0)|(1<<1):
+				chanbuf[0] = sampleL;
+				chanbuf[1] = sampleR;
+				break;
 		}
+		chanbuf += 2;
+	} while(--length);
+	chan->sampcntFrac = sampcntFrac;
+	chan->sampcntInt  = sampcntInt;
+}
+
+// Outputs {L,R} into chanbuf[]
+// Assumes chanbuf[] is always aligned to at least 32 bits
+FORCEINLINE static void _SPU_GenerateChanData(bool actuallyMix, channel_struct* const chan, s16 *chanbuf, int length)
+{
+	typedef void (*_SPU_GenerateChanData_Func_t)(channel_struct* const chan, s16 *chanbuf, int length);
+
+	// This looks insane and pointless, but compilers generate
+	// a massive if/elseif block in place of something like this,
+	// since they don't know the range of values we use.
+	// Note that we use SPUInterpolation_None in the case of PSG
+	// channels, as we don't want to interpolate the raw samples.
+	// We also use SPUInterpolation_None with actuallyMix==false,
+	// so that we avoid instantiating the exact same code under
+	// a new template instance.
+	// The table is accessed as: FuncTable[INTERPOLATE_MODE][FORMAT][CHANNELS]
+#define __GENERATE_FUNCTABLE(CHANNELS, FORMAT, INTERPOLATE_MODE) \
+	__SPU_GenerateChanData<CHANNELS, FORMAT, INTERPOLATE_MODE>
+#define _GENERATE_FUNCTABLE(FORMAT, INTERPOLATE_MODE) \
+	{ \
+		__GENERATE_FUNCTABLE((0<<0 | 0<<1), FORMAT, SPUInterpolation_None), \
+		__GENERATE_FUNCTABLE((1<<0 | 0<<1), FORMAT, INTERPOLATE_MODE), \
+		__GENERATE_FUNCTABLE((0<<0 | 1<<1), FORMAT, INTERPOLATE_MODE), \
+		__GENERATE_FUNCTABLE((1<<0 | 1<<1), FORMAT, INTERPOLATE_MODE), \
 	}
-}
-
-template<int FORMAT, SPUInterpolationMode INTERPOLATE_MODE> 
-	FORCEINLINE static void ___SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	if(!actuallyMix)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,-1>(SPU,chan);
-	else if (chan->pan == 0)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,0>(SPU,chan);
-	else if (chan->pan == 127)
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,2>(SPU,chan);
-	else
-		____SPU_ChanUpdate<FORMAT,INTERPOLATE_MODE,1>(SPU,chan);
-}
-
-template<SPUInterpolationMode INTERPOLATE_MODE> 
-	FORCEINLINE static void __SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	// NOTE: PSG doesn't use interpolation, or it would try to
-	// interpolate between the raw sample points (very bad)
-	switch(chan->format)
-	{
-		case 0: ___SPU_ChanUpdate<0,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 1: ___SPU_ChanUpdate<1,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 2: ___SPU_ChanUpdate<2,INTERPOLATE_MODE>(actuallyMix, SPU, chan); break;
-		case 3: ___SPU_ChanUpdate<3,SPUInterpolation_None>(actuallyMix, SPU, chan); break;
-		default: assert(false);
+#define GENERATE_FUNCTABLE(INTERPOLATE_MODE) \
+	{ \
+		_GENERATE_FUNCTABLE(0, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(1, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(2, INTERPOLATE_MODE), \
+		_GENERATE_FUNCTABLE(3, SPUInterpolation_None), \
 	}
-}
-
-FORCEINLINE static void _SPU_ChanUpdate(const bool actuallyMix, SPU_struct* const SPU, channel_struct* const chan)
-{
-	switch(CommonSettings.spuInterpolationMode)
+	static const _SPU_GenerateChanData_Func_t FuncTable[4][4][4] =
 	{
-	case SPUInterpolation_None:       __SPU_ChanUpdate<SPUInterpolation_None>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_Linear:     __SPU_ChanUpdate<SPUInterpolation_Linear>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_Cosine:     __SPU_ChanUpdate<SPUInterpolation_Cosine>(actuallyMix, SPU, chan); break;
-	case SPUInterpolation_CatmullRom: __SPU_ChanUpdate<SPUInterpolation_CatmullRom>(actuallyMix, SPU, chan); break;
-	default: assert(false);
-	}
-}
-
-//ENTERNEW
-static void SPU_MixAudio_Advanced(bool actuallyMix, SPU_struct *SPU, int length)
-{
-	//the advanced spu function correctly handles all sound control mixing options, as well as capture
-	//this code is not entirely optimal, as it relies on sort of manhandling the core mixing functions
-	//in order to get the results it needs.
-
-	//THIS IS MAX HACKS!!!!
-	//AND NEEDS TO BE REWRITTEN ALONG WITH THE DEEPEST PARTS OF THE SPU
-	//ONCE WE KNOW THAT IT WORKS
-	
-	//BIAS gets ignored since our spu is still not bit perfect,
-	//and it doesnt matter for purposes of capture
-
-	//-----------DEBUG CODE
-	bool skipcap = false;
-	//-----------------
-
-	s32 samp0[2] = {0,0};
-	
-	//believe it or not, we are going to do this one sample at a time.
-	//like i said, it is slower.
-	for (int samp = 0; samp < length; samp++)
+		GENERATE_FUNCTABLE(SPUInterpolation_None),
+		GENERATE_FUNCTABLE(SPUInterpolation_Linear),
+		GENERATE_FUNCTABLE(SPUInterpolation_Cosine),
+		GENERATE_FUNCTABLE(SPUInterpolation_CatmullRom),
+	};
+#undef GENERATE_FUNCTABLE
+#undef _GENERATE_FUNCTABLE
+#undef __GENERATE_FUNCTABLE
+
+	const _SPU_GenerateChanData_Func_t *Funcs = FuncTable[CommonSettings.spuInterpolationMode][chan->format];
+	     if(!actuallyMix)     Funcs[0](chan, chanbuf, length);
+	else if(chan->pan == 0)   Funcs[1](chan, chanbuf, length);
+	else if(chan->pan == 127) Funcs[2](chan, chanbuf, length);
+	else                      Funcs[3](chan, chanbuf, length);
+}
+
+template<int CAP_BITS, bool USE_SRCBUF>
+/*FORCEINLINE*/ static bool __SPU_WriteCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, const s16 *srcBuf, int length)
+{
+	s32 capLen_shifted = cap.len * (32 / CAP_BITS);
+	SPU_struct::REGS::CAP::Runtime& runtime = cap.runtime;
+	s32 pos = runtime.sampcntInt;
+	do
 	{
-		SPU->sndbuf[0] = 0;
-		SPU->sndbuf[1] = 0;
-		SPU->buflength = 1;
-
-		s32 capmix[2] = {0,0};
-		s32 mix[2] = {0,0};
-		s32 chanout[16];
-		s32 submix[32];
-
-		//generate each channel, and helpfully mix it at the same time
-		for (int i = 0; i < 16; i++)
+		s16 sample = USE_SRCBUF ? (*srcBuf) : 0;
+		u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&runtime.sampcntFrac, srcChan.sampincFrac);
+		while(nSamplesToProcess--)
 		{
-			channel_struct *chan = &SPU->channels[i];
-
-			if (chan->status == CHANSTAT_PLAY)
+			if(pos >= capLen_shifted)
 			{
-				SPU->bufpos = 0;
-
-				bool bypass = false;
-				if (i==1 && SPU->regs.ctl_ch1bypass) bypass=true;
-				if (i==3 && SPU->regs.ctl_ch3bypass) bypass=true;
-
-
-				//output to mixer unless we are bypassed.
-				//dont output to mixer if the user muted us
-				bool outputToMix = true;
-				if (CommonSettings.spu_muteChannels[i]) outputToMix = false;
-				if (bypass) outputToMix = false;
-				bool outputToCap = outputToMix;
-				if (CommonSettings.spu_captureMuted && !bypass) outputToCap = true;
-
-				//channels 1 and 3 should probably always generate their audio
-				//internally at least, just in case they get used by the spu output
-				bool domix = outputToCap || outputToMix || i==1 || i==3;
-
-				//clear the output buffer since this is where _SPU_ChanUpdate wants to accumulate things
-				SPU->sndbuf[0] = SPU->sndbuf[1] = 0;
-
-				//get channel's next output sample.
-				_SPU_ChanUpdate(domix, SPU, chan);
-				chanout[i] = SPU->lastdata >> volume_shift[chan->volumeDiv];
-
-				//save the panned results
-				submix[i*2] = SPU->sndbuf[0];
-				submix[i*2+1] = SPU->sndbuf[1];
-
-				//send sample to our capture mix
-				if (outputToCap)
+				if(cap.oneshot)
 				{
-					capmix[0] += submix[i*2];
-					capmix[1] += submix[i*2+1];
+					cap.active = runtime.running = 0;
+					return false;
 				}
+				pos -= capLen_shifted;
+			}
 
-				//send sample to our main mixer
-				if (outputToMix)
+			s16 *data = &runtime.pcm16b[SPUCAPTURE_PCM16B_AT(runtime.pcm16bOffs)];
+			if(pos >= 0)
+			{
+				if (CAP_BITS == 8)
 				{
-					mix[0] += submix[i*2];
-					mix[1] += submix[i*2+1];
+					_MMU_write08<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s8), (u8)(*data >> 8));
+				}
+				else
+				{
+					_MMU_write16<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s16), (u16)(*data));
 				}
 			}
-			else 
-			{
-				chanout[i] = 0;
-				submix[i*2] = 0;
-				submix[i*2+1] = 0;
-			}
-		} //foreach channel
-
-		s32 mixout[2] = {mix[0],mix[1]};
-		s32 capmixout[2] = {capmix[0],capmix[1]};
-		s32 sndout[2];
-		s32 capout[2];
-
-		//create SPU output
-		switch (SPU->regs.ctl_left)
-		{
-			case SPU_struct::REGS::LOM_LEFT_MIXER: sndout[0] = mixout[0]; break;
-			case SPU_struct::REGS::LOM_CH1: sndout[0] = submix[1*2+0]; break;
-			case SPU_struct::REGS::LOM_CH3: sndout[0] = submix[3*2+0]; break;
-			case SPU_struct::REGS::LOM_CH1_PLUS_CH3: sndout[0] = submix[1*2+0] + submix[3*2+0]; break;
-			default: break;
-		}
-		switch (SPU->regs.ctl_right)
-		{
-			case SPU_struct::REGS::ROM_RIGHT_MIXER: sndout[1] = mixout[1]; break;
-			case SPU_struct::REGS::ROM_CH1: sndout[1] = submix[1*2+1]; break;
-			case SPU_struct::REGS::ROM_CH3: sndout[1] = submix[3*2+1]; break;
-			case SPU_struct::REGS::ROM_CH1_PLUS_CH3: sndout[1] = submix[1*2+1] + submix[3*2+1]; break;
-			default: break;
+			*data = sample;
+			runtime.pcm16bOffs++;
+			pos++;
 		}
 
+		// srcBuf[] stores two samples per time unit
+		// Either {Ch0[+Ch1],Ch2[+Ch3]}, or {LMix,RMix}
+		if(USE_SRCBUF) srcBuf += 2;
+	} while(--length);
+	runtime.sampcntInt = pos;
+	return true;
+}
 
-		//generate capture output ("capture bugs" from gbatek are not emulated)
-		if (SPU->regs.cap[0].source == 0)
-			capout[0] = capmixout[0]; //cap0 = L-mix
-		else if (SPU->regs.cap[0].add)
-			capout[0] = chanout[0] + chanout[1]; //cap0 = ch0+ch1
-		else capout[0] = chanout[0]; //cap0 = ch0
-
-		if (SPU->regs.cap[1].source == 0)
-			capout[1] = capmixout[1]; //cap1 = R-mix
-		else if (SPU->regs.cap[1].add)
-			capout[1] = chanout[2] + chanout[3]; //cap1 = ch2+ch3
-		else capout[1] = chanout[2]; //cap1 = ch2
-
-		capout[0] = MinMax(capout[0],-0x8000,0x7FFF);
-		capout[1] = MinMax(capout[1],-0x8000,0x7FFF);
+// Writes capture output to capture unit destination
+// Returns false if capture has stopped
+template<bool USE_SRCBUF>
+FORCEINLINE static bool _SPU_WriteCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, const s16 *srcBuf, int length)
+{
+	if(cap.bits8)
+		return __SPU_WriteCapture< 8,USE_SRCBUF>(cap, srcChan, srcBuf, length);
+	else
+		return __SPU_WriteCapture<16,USE_SRCBUF>(cap, srcChan, srcBuf, length);
+}
 
-		//write the output sample where it is supposed to go
-		if (samp == 0)
+// Advances capture unit destination without writing anything
+/*FORCEINLINE*/ static void _SPU_SeekCapture(SPU_struct::REGS::CAP& cap, const channel_struct& srcChan, int length)
+{
+	s32 capLen_shifted = cap.len * (cap.bits8 ? (32/8) : (32/16));
+	SPU_struct::REGS::CAP::Runtime& runtime = cap.runtime;
+	s64 pos64  = (runtime.sampcntFrac | (s64)runtime.sampcntInt<<32);
+	    pos64 += (srcChan.sampincFrac | (u64)srcChan.sampincInt<<32) * length;
+	runtime.sampcntFrac = (u32)pos64;
+	runtime.sampcntInt  = (s32)(pos64 >> 32);
+	if(runtime.sampcntInt >= capLen_shifted)
+	{
+		if(cap.oneshot)
 		{
-			samp0[0] = sndout[0];
-			samp0[1] = sndout[1];
+			cap.active = runtime.running = 0;
 		}
 		else
 		{
-			SPU->sndbuf[samp*2+0] = sndout[0];
-			SPU->sndbuf[samp*2+1] = sndout[1];
+			do runtime.sampcntInt -= capLen_shifted; while(runtime.sampcntInt >= capLen_shifted);
 		}
-
-		for (int capchan = 0; capchan < 2; capchan++)
-		{
-			SPU_struct::REGS::CAP& cap = SPU->regs.cap[capchan];
-			channel_struct& srcChan = SPU->channels[1 + 2 * capchan];
-			if (SPU->regs.cap[capchan].runtime.running)
-			{
-				u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&cap.runtime.sampcntFrac, srcChan.sampincFrac);
-				cap.runtime.sampcntInt += nSamplesToProcess;
-				while(nSamplesToProcess--)
-				{
-					//so, this is a little strange. why go through a fifo?
-					//it seems that some games will set up a reverb effect by capturing
-					//to the nearly same address as playback, but ahead by a couple.
-					//So, playback will always end up being what was captured a couple of samples ago.
-					//This system counts on playback always having read ahead 16 samples.
-					//In that case, playback will end up being what was processed at one entire buffer length ago,
-					//since the 16 samples would have read ahead before they got captured over
-
-					//It's actually the source channels which should have a fifo, but we are
-					//not going to take the hit in speed and complexity. Save it for a future rewrite.
-					//Instead, what we do here is delay the capture by 16 samples to create a similar effect.
-					//Subjectively, it seems to be working.
-
-					//Don't do anything until the fifo is filled, so as to delay it
-					if (cap.runtime.fifo.size < 16)
-					{
-						cap.runtime.fifo.enqueue(capout[capchan]);
-						continue;
-					}
-
-					//(actually capture sample from fifo instead of most recently generated)
-					u32 multiplier;
-					s32 sample = cap.runtime.fifo.dequeue();
-					cap.runtime.fifo.enqueue(capout[capchan]);
-
-					//static FILE* fp = NULL;
-					//if(!fp) fp = fopen("d:\\capout.raw","wb");
-					//fwrite(&sample,2,1,fp);
-					
-					if (cap.bits8)
-					{
-						s8 sample8 = sample >> 8;
-						if (skipcap) _MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-						else _MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,sample8);
-						cap.runtime.curdad++;
-						multiplier = 4;
-					}
-					else
-					{
-						s16 sample16 = sample;
-						if (skipcap) _MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-						else _MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,sample16);
-						cap.runtime.curdad+=2;
-						multiplier = 2;
-					}
-
-					if (cap.runtime.curdad >= cap.runtime.maxdad)
-					{
-						cap.runtime.curdad = cap.dad;
-						cap.runtime.sampcntInt -= cap.len*multiplier;
-					}
-				} //sampinc loop
-			} //if capchan running
-		} //capchan loop
-	} //main sample loop
-
-	SPU->sndbuf[0] = samp0[0];
-	SPU->sndbuf[1] = samp0[1];
+	}
 }
 
 //ENTER
@@ -1532,8 +1488,8 @@ static void SPU_MixAudio(bool actuallyMix, SPU_struct *SPU, int length)
 {
 	if (actuallyMix)
 	{
-		memset(SPU->sndbuf, 0, length*4*2);
-		memset(SPU->outbuf, 0, length*2*2);
+		if(SPU->bufsize < length) SPU->resizeBuffer(length);
+		memset(SPU->outbuf, 0, length*sizeof(s16)*2);
 	}
 
 	//we used to use master enable here, and do nothing if audio is disabled.
@@ -1542,133 +1498,366 @@ static void SPU_MixAudio(bool actuallyMix, SPU_struct *SPU, int length)
 	//is this still a good idea? zeroing the capture buffers is important...
 	if(!SPU->regs.masteren) return;
 
-	bool advanced = CommonSettings.spu_advanced;
-
-	//branch here so that slow computers don't have to take the advanced (slower) codepath.
-	//it remainds to be seen exactly how much slower it is
-	//if it isnt much slower then we should refactor everything to be simpler, once it is working
-	if (advanced && SPU == SPU_core)
+	// We used to branch into advanced/non-advanced mode here.
+	// Hopefully, the current code is good enough to avoid the need now...
+
+	/************************************************/
+
+	// Overall flow:
+	//  For each channel:
+	//    Generate L/R sample data into chanbuf[]
+	//    If not bypassed:
+	//      If not muted:
+	//        Mix chanbuf[] into mixbuf[]
+	//      Else if capturing muted channels:
+	//        Mix chanbuf[] into mutedmixbuf[]
+	//    If capturing from channels:
+	//      Copy/mix chanbuf[] into capbuf[]
+	//    If not playing from mixer:
+	//      Copy/mix chanbuf[] into outbuf[]
+	//  If capturing from channels:
+	//    Output capbuf[] to capture units
+	//  If playing from mixer:
+	//    Output mixbuf[] to outbuf[]
+	//  If capturing from mixer:
+	//    If capturing muted channels:
+	//      Output mixbuf[]+mutedmixbuf[] to capture units
+	//    Else
+	//      Output mixbuf[] to capture units
+	
+	//we used to bail out if speakers were disabled.
+	//this is technically wrong. sound may still be captured, or something.
+	//in all likelihood, any game doing this probably master disabled the SPU also
+	//so, optimization of this case is probably not necessary.
+	//later, we'll just silence the output
+	bool speakersOn = T1ReadWord(MMU.ARM7_REG, 0x304) & 0x01;
+
+	// Translate the mixer and capture states.
+	// This should improve the code generation so that
+	// it doesn't have to reference a lot of memory and
+	// can instead just bitwise-test as needed.
+	//  -bypassMixer controls whether chanbuf[] should NOT be added to mixbuf[]
+	//  -capbufFlags0/1 controls the following:
+	//    -The least-significant bit enabled will store UN-PANNED chanbuf[] to capbuf[]
+	//    -All other bits will add UN-PANNED chanbuf[] to capbuf[]
+	//    -If all zero, capture is either disabled or comes from the mixer
+	//  -outbufFlagsL/R controls the following:
+	//    -The least-significant bit enabled will store PANNED chanbuf[] to outbuf[]
+	//    -All other bits will add PANNED chanbuf[] to outbuf[]
+	//    -If all zero, output comes from the mixer
+	enum
 	{
-		SPU_MixAudio_Advanced(actuallyMix, SPU, length);
-	}
-	else
+		CAPSRC_NONE,
+		CAPSRC_MIXER, // Capture mixer output
+		CAPSRC_CHAN,  // Capture channel 1/3 output
+		CAPSRC_MIXED, // Capture channel 0+1/2+3 output (buggy on hardware)
+	};
+	u8   bypassMixer   = 0;
+	u16  chanMuteFlags = CommonSettings.spu_muteChannels;
+	u8   capbufFlags0  = 0;
+	u8   capbufFlags1  = 0;
+	u8   outbufFlagsL  = 0;
+	u8   outbufFlagsR  = 0;
+	u8   cap0Src       = CAPSRC_NONE;
+	u8   cap1Src       = CAPSRC_NONE;
+	u8  *mixdata       = (u8*)SPU->mixdata;
+	s32 *mixbuf        = NULL;
+	s32 *mutedmixbuf   = NULL;
+	s16 *capbuf        = NULL;
+	s16 *chanbuf       = NULL;
+	s16  *outbuf       = SPU->outbuf;
+	s32   masterVol    = spumuladjust7(SPU->regs.mastervol);
+	int mixdataClearSizeBytes = 0;
+	if(actuallyMix)
 	{
-		//non-advanced mode
-		for (int i = 0; i < 16; i++)
-		{
-			channel_struct *chan = &SPU->channels[i];
+		if(SPU->regs.ctl_ch1bypass) bypassMixer |= (1 << 1);
+		if(SPU->regs.ctl_ch3bypass) bypassMixer |= (1 << 3);
 
-			if (chan->status != CHANSTAT_PLAY)
-				continue;
+		// Translate capture state
+		if(SPU->regs.cap[0].runtime.running)
+		{
+			if(SPU->regs.cap[0].source == 0) cap0Src = CAPSRC_MIXER;
+			else if(SPU->regs.cap[0].add) cap0Src = CAPSRC_MIXED;
+			else cap0Src = CAPSRC_CHAN;
+		}
+		if(SPU->regs.cap[1].runtime.running)
+		{
+			if(SPU->regs.cap[1].source == 0) cap1Src = CAPSRC_MIXER;
+			else if(SPU->regs.cap[1].add) cap1Src = CAPSRC_MIXED;
+			else cap1Src = CAPSRC_CHAN;
+		}
+		if(cap0Src == CAPSRC_CHAN || cap0Src == CAPSRC_MIXED) capbufFlags0 |= (1 << 0);
+		if(                          cap0Src == CAPSRC_MIXED) capbufFlags0 |= (1 << 1);
+		if(cap1Src == CAPSRC_CHAN || cap1Src == CAPSRC_MIXED) capbufFlags1 |= (1 << 2);
+		if(                          cap1Src == CAPSRC_MIXED) capbufFlags1 |= (1 << 3);
+		bool isCapturing  = (cap0Src != CAPSRC_NONE) || (cap1Src != CAPSRC_NONE);
+		bool captureMuted = isCapturing && CommonSettings.spu_captureMuted;
+		if(!captureMuted)
+		{
+			capbufFlags0 &= ~chanMuteFlags;
+			capbufFlags1 &= ~chanMuteFlags;
+		}
 
-			SPU->bufpos = 0;
-			SPU->buflength = length;
+		// Translate outputs
+		switch(SPU->regs.ctl_left)
+		{
+			case SPU_struct::REGS::LOM_CH1:
+				outbufFlagsL = (1 << 1);
+				break;
+			case SPU_struct::REGS::LOM_CH3:
+				outbufFlagsL = (1 << 3);
+				break;
+			case SPU_struct::REGS::LOM_CH1_PLUS_CH3:
+				outbufFlagsL = (1 << 1) | (1 << 3);
+				break;
+		}
+		switch(SPU->regs.ctl_right)
+		{
+			case SPU_struct::REGS::ROM_CH1:
+				outbufFlagsR = (1 << 1);
+				break;
+			case SPU_struct::REGS::ROM_CH3:
+				outbufFlagsR = (1 << 3);
+				break;
+			case SPU_struct::REGS::ROM_CH1_PLUS_CH3:
+				outbufFlagsR = (1 << 1) | (1 << 3);
+				break;
+		}
 
-			// Mix audio
-			_SPU_ChanUpdate(!CommonSettings.spu_muteChannels[i] && actuallyMix, SPU, chan);
+		// Generate mixing pointers
+		// This setup is so we can clear everything in a single memset() call
+		// PONDER: Can we put these on the stack?
+		// ie. u8 mixdata[FIFO_SIZE * (sizeof(s32)+sizeof(s32)+sizeof(s16)+sizeof(s16))*2]
+		int mixBufSize = MIN(length, SPUCAPTURE_FIFO_SIZE);
+		u8 *nextdata = mixdata;
+		if(actuallyMix)  mixbuf      = (s32*)nextdata, nextdata += mixBufSize * sizeof(s32)*2;
+		if(captureMuted) mutedmixbuf = (s32*)nextdata, nextdata += mixBufSize * sizeof(s32)*2;
+		if(isCapturing)  capbuf      = (s16*)nextdata, nextdata += mixBufSize * sizeof(s16)*2;
+		if(actuallyMix)  chanbuf     = (s16*)nextdata; // <- Do not increment nextdata, so chanbuf[] is excluded from the memset() clear size
+		mixdataClearSizeBytes = nextdata - mixdata;
+	}
+	else
+	{
+		// If we end up here, we're either mixing the core SPU while
+		// in dual SPU mode (meaning we shouldn't output data from
+		// the capture unit, as we'll do this in the user SPU), or
+		// the output core is SNDDummy (with the capture units either
+		// not running, or disabled via ENABLE_DUMMY_SPU_CAPTURE==0).
+		// In the former case, we can just seek the capture position,
+		// but in the latter case, we must output silence to avoid
+		// potentially leaving the capture buffers filled with garbage.
+		bool captureZeros = (SPU_SoundCore() == &SNDDummy);
+		if(SPU->regs.cap[0].runtime.running)
+		{
+			if(captureZeros) _SPU_WriteCapture<false>(SPU->regs.cap[0], SPU->channels[1], NULL, length);
+			else             _SPU_SeekCapture        (SPU->regs.cap[0], SPU->channels[1],       length);
+		}
+		if(SPU->regs.cap[1].runtime.running)
+		{
+			if(captureZeros) _SPU_WriteCapture<false>(SPU->regs.cap[1], SPU->channels[3], NULL, length);
+			else             _SPU_SeekCapture        (SPU->regs.cap[1], SPU->channels[3],       length);
 		}
+	}
 
-		//zero out capture buffers - effectively transform no-advanced-spu-emulation to capturing-zeroes
-		//this is needed so when the option is changed (or a state with a different setting is loaded)
-		//this code is bulkier and slower than it might otherwise be to reduce the chance of bugs 
-		//IDEALLY the non-advanced codepath would be removed (while the advanced codepath was optimized and improved)
-		//and this code would disappear, to be replaced with code more capable of emitting zeroes at the opportune time.
-		for (int capchan = 0; capchan < 2; capchan++)
+	while(length)
+	{
+		if(mixdataClearSizeBytes) memset(mixdata, 0, mixdataClearSizeBytes);
+
+		// We can only process at most SPUCAPTURE_FIFO_SIZE samples
+		// per mixing batch, in case the capture buffers wrap around.
+		// Technically, we could actually check if this is needed at
+		// all, but this should work well enough as is.
+		int thisLength = MIN(length, SPUCAPTURE_FIFO_SIZE);
+		length -= thisLength;
+
+		// Process each channel in turn
+		// Note that we are using unsigned overflow to avoid counting
+		// directly, as we need to keep track of the bit index anyway
+		channel_struct *chan = SPU->channels;
+		for(u16 chanBit=1; chanBit != 0; chan++, chanBit <<= 1)
 		{
-			SPU_struct::REGS::CAP& cap = SPU->regs.cap[capchan];
-			channel_struct& srcChan = SPU->channels[1 + 2 * capchan];
-			if (cap.runtime.running)
+			if (chan->status != CHANSTAT_PLAY) continue;
+		
+			// Generate data into chanbuf[]
+			// NOTE: If actuallyMix==false, the channel is updated but no data is generated.
+			_SPU_GenerateChanData(actuallyMix, chan, chanbuf, thisLength);
+			if(!actuallyMix) continue;
+
+			// Bypass means we must NOT mix this channel into mixbuf[] OR mutedmixbuf[]
+			if((bypassMixer & chanBit) == 0)
 			{
-				for (int samp = 0; samp < length; samp++)
+				s32 *mixtarget = ((chanMuteFlags & chanBit) == 0) ? mixbuf : mutedmixbuf;
+				if(mixtarget)
 				{
-					u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&cap.runtime.sampcntFrac, srcChan.sampincFrac);
-					cap.runtime.sampcntInt += nSamplesToProcess;
-					while (nSamplesToProcess--)
-					{
-						if (cap.bits8)
-						{
-							_MMU_write08<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-							cap.runtime.curdad++;
-						}
-						else
-						{
-							_MMU_write16<1,MMU_AT_DMA>(cap.runtime.curdad,0);
-							cap.runtime.curdad+=2;
-						}
-
-						if (cap.runtime.curdad >= cap.runtime.maxdad)
-						{
-							cap.runtime.curdad = cap.dad;
-							cap.runtime.sampcntInt -= cap.len*(cap.bits8?4:2);
-						}
-					}
+					for(int n=0; n < thisLength*2; n++) mixtarget[n] += chanbuf[n];
 				}
 			}
-		}
-	} //non-advanced branch
 
-	//we used to bail out if speakers were disabled.
-	//this is technically wrong. sound may still be captured, or something.
-	//in all likelihood, any game doing this probably master disabled the SPU also
-	//so, optimization of this case is probably not necessary.
-	//later, we'll just silence the output
-	bool speakers = T1ReadWord(MMU.ARM7_REG, 0x304) & 0x01;
+			// Generate outputs for channel capture
+			// Yes, we have to undo the panning here, but that's fine.
+			// Incidentally, this emulates the ch(a)+ch(b) overflow bug
+			if((capbufFlags0 & chanBit) != 0)
+			{
+				if((capbufFlags0 & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0]  = chanbuf[n*2+0] + chanbuf[n*2+1];
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] += chanbuf[n*2+0] + chanbuf[n*2+1];
+			}
+			if((capbufFlags1 & chanBit) != 0)
+			{
+				if((capbufFlags1 & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1]  = chanbuf[n*2+0] + chanbuf[n*2+1];
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] += chanbuf[n*2+0] + chanbuf[n*2+1];
+			}
 
-	u8 vol = SPU->regs.mastervol;
+			// If speakers are turned off or the channel is muted, we can skip
+			// setting outbuf[]. Note that if the channel is muted and we are
+			// generating outbuf[] from chanbuf[], outbuf[] must already have
+			// been cleared to silence.
+			if(!speakersOn || (chanMuteFlags & chanBit) != 0) continue;
 
-	// convert from 32-bit->16-bit
-	if (actuallyMix && speakers)
-		for (int i = 0; i < length*2; i++)
+			// Set outbuf[] from chanbuf[] when L/R source is not the mixer
+			// Note that Ch1+Ch3 mode clips as intended; only capture has overflow bugs
+			if((outbufFlagsL & chanBit) != 0)
+			{
+				if((outbufFlagsL & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+0] =                        (chanbuf[n*2+0] * masterVol >> 7);
+				else
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+0] = MinMax(outbuf[n*2+0] + (chanbuf[n*2+0] * masterVol >> 7), -0x8000, +0x7FFF);
+			}
+			if((outbufFlagsR & chanBit) != 0)
+			{
+				if((outbufFlagsR & (chanBit-1)) == 0)
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+1] =                        (chanbuf[n*2+1] * masterVol >> 7);
+				else
+					for(int n=0; n < thisLength; n++)
+						outbuf[n*2+1] = MinMax(outbuf[n*2+1] + (chanbuf[n*2+1] * masterVol >> 7), -0x8000, +0x7FFF);
+			}
+		}
+
+		// Generate mixer output to outbuf[]
+		if(mixbuf && speakersOn)
 		{
-			// Apply Master Volume
-			SPU->sndbuf[i] = spumuldiv7(SPU->sndbuf[i], vol);
-			s16 outsample = MinMax(SPU->sndbuf[i],-0x8000,0x7FFF);
-			SPU->outbuf[i] = outsample;
+			if(outbufFlagsL == 0)
+			{
+				for(int n=0; n < thisLength; n++)
+					outbuf[n*2+0] = MinMax(mixbuf[n*2+0] * masterVol >> 7, -0x8000, +0x7FFF);
+			}
+			if(outbufFlagsR == 0)
+			{
+				for(int n=0; n < thisLength; n++)
+					outbuf[n*2+1] = MinMax(mixbuf[n*2+1] * masterVol >> 7, -0x8000, +0x7FFF);
+			}
 		}
 
+		// Generate final capture output
+		if(cap0Src != CAPSRC_NONE)
+		{
+			if(cap0Src == CAPSRC_MIXER)
+			{
+				if(mutedmixbuf)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] = MinMax(mixbuf[n*2+0] + mutedmixbuf[n*2+0], -0x8000, +0x7FFF);
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+0] = MinMax(mixbuf[n*2+0],                      -0x8000, +0x7FFF);
+			}
+			bool run = _SPU_WriteCapture<true>(SPU->regs.cap[0], SPU->channels[1], capbuf, thisLength);
+			if(!run) cap0Src = CAPSRC_NONE, capbufFlags0 = 0;
+		}
+		if(cap1Src != CAPSRC_NONE)
+		{
+			if(cap1Src == CAPSRC_MIXER)
+			{
+				if(mutedmixbuf)
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] = MinMax(mixbuf[n*2+1] + mutedmixbuf[n*2+1], -0x8000, +0x7FFF);
+				else
+					for(int n=0; n < thisLength; n++)
+						capbuf[n*2+1] = MinMax(mixbuf[n*2+1],                      -0x8000, +0x7FFF);
+			}
+			bool run = _SPU_WriteCapture<true>(SPU->regs.cap[1], SPU->channels[3], capbuf+1, thisLength);
+			if(!run) cap1Src = CAPSRC_NONE, capbufFlags1 = 0;
+		}
 
+		// Advance buffer
+		outbuf += (size_t)thisLength*2;
+	}
 }
 
 //////////////////////////////////////////////////////////////////////////////
 
 
-//emulates one hline of the cpu core.
-//this will produce a variable number of samples, calculated to keep a 44100hz output
-//in sync with the emulator framerate
-void SPU_Emulate_core()
+// Emulates the SPU core for the specified number of ARM7 cycles.
+// This will produce a variable number of samples to sync to DESMUME_SAMPLE_RATE, and returns that sample count (0 if mixing was postponed)
+int SPU_Emulate_core(u32 numberOfARM7Cycles)
 {
-	bool needToMix = true;
 	SoundInterface_struct *soundProcessor = SPU_SoundCore();
-	
-	_samples += samples_per_hline;
-	spu_core_samples = (int)(_samples);
-	_samples -= spu_core_samples;
-	
+
 	// We don't need to mix audio for Dual Synch/Asynch mode since we do this
 	// later in SPU_Emulate_user(). Disable mixing here to speed up processing.
-	// However, recording still needs to mix the audio, so make sure we're also
-	// not recording before we disable mixing.
-	if ( _currentSynchMode == ESynchMode_DualSynchAsynch &&
-		!(driver->AVI_IsRecording() || driver->WAV_IsRecording()) )
+	// If we are outputting to the dummy core, we can disable all mixing if
+	// we are not capturing the output at all, increasing performance.
+	bool needToMix = false;
+	if(soundProcessor != &SNDDummy)
+		needToMix = (_currentSynchMode != ESynchMode_DualSynchAsynch);
+#if ENABLE_DUMMY_SPU_CAPTURE
+	else if(SPU_core->regs.cap[0].runtime.running || SPU_core->regs.cap[1].runtime.running)
+		needToMix = true;
+#endif
+
+	// If we are recording, we will need to mix the core SPU regardless of anything else
+	// NOTE: Technically, we should be checking wavWriter.mode==WAVMODE_CORE, but that
+	// is only enabled with DEVELOPER_MENU_ITEMS, and we won't break anything anyway.
+	needToMix = needToMix || driver->AVI_IsRecording() || driver->WAV_IsRecording();
+
+	// NOTE: We used to keep a double-type counter here, and pre-divided by
+	// ARM7_CLOCK. This is probably enough for most cases, but for the sake
+	// of perfect accuracy (at least in regards to this), we use a cycles
+	// counter instead here, and figure out the sample count from there.
+	int samplesToMix;
 	{
-		needToMix = false;
+		// minMixSize controls the mixing latency, which reduces the
+		// overhead of the update routines at the cost of synchronicity.
+		// NOTE: minMixSize must be <= 128, or else _spu_core_cyclesCounter
+		// would cause 32bit overflow if we postpone mixing for long enough.
+		// We could use a 64bit counter instead, but 128 samples should be plenty.
+		static const u32 doMix_minMixSize = 1; // <- Setting this too high can break streams, so keep at minimum
+		static const u32 noMix_minMixSize = 64;
+		u64 minDeltaCycles = (u64)(needToMix ? doMix_minMixSize : noMix_minMixSize) * ARM7_CLOCK;
+		u64 cycles64 = _spu_core_cyclesCounter + (u64)numberOfARM7Cycles*DESMUME_SAMPLE_RATE;
+		if(cycles64 < minDeltaCycles)
+		{
+			_spu_core_cyclesCounter = (u32)cycles64;
+			return 0;
+		}
+		samplesToMix            = (int)(cycles64 / ARM7_CLOCK);
+		_spu_core_cyclesCounter = (u32)(cycles64 % ARM7_CLOCK);
 	}
 	
-	SPU_MixAudio(needToMix, SPU_core, spu_core_samples);
+	SPU_MixAudio(needToMix, SPU_core, samplesToMix);
 	
-	if (soundProcessor == NULL)
+	if (soundProcessor != NULL)
 	{
-		return;
-	}
-	
-	if (soundProcessor->FetchSamples != NULL)
-	{
-		soundProcessor->FetchSamples(SPU_core->outbuf, spu_core_samples, _currentSynchMode, _currentSynchronizer);
-	}
-	else
-	{
-		SPU_DefaultFetchSamples(SPU_core->outbuf, spu_core_samples, _currentSynchMode, _currentSynchronizer);
+		if (soundProcessor->FetchSamples != NULL)
+		{
+			soundProcessor->FetchSamples(SPU_core->outbuf, samplesToMix, _currentSynchMode, _currentSynchronizer);
+		}
+		else
+		{
+			SPU_DefaultFetchSamples(SPU_core->outbuf, samplesToMix, _currentSynchMode, _currentSynchronizer);
+		}
 	}
+
+	return samplesToMix;
 }
 
 void SPU_Emulate_user(bool mix)
@@ -1687,16 +1876,16 @@ void SPU_Emulate_user(bool mix)
 	// Check to see how many free samples are available.
 	// If there are some, fill up the output buffer.
 	freeSampleCount = soundProcessor->GetAudioSpace();
-	if (freeSampleCount == 0)
-	{
-		return;
-	}
 	
 	//printf("mix %i samples\n", audiosize);
 	if (freeSampleCount > _currentBufferSize)
 	{
 		freeSampleCount = _currentBufferSize;
 	}
+	if (freeSampleCount == 0)
+	{
+		return;
+	}
 	
 	// If needed, resize the post-process buffer to guarantee that
 	// we can store all the sound data.
@@ -1877,7 +2066,7 @@ void WavWriter::update(void* soundData, int numSamples)
 {
 	if(!spufp) return;
 	//TODO - big endian for the s16 samples??
-	size_t elems_written = fwrite(soundData, numSamples*2, 2, spufp);
+	size_t elems_written = fwrite(soundData, sizeof(s16)*2, numSamples, spufp);
 }
 
 bool WavWriter::isRecording() const
@@ -1929,14 +2118,14 @@ void WAV_WavSoundUpdate(void* soundData, int numSamples, WAVMode mode)
 void spu_savestate(EMUFILE &os)
 {
 	//version
-	os.write_32LE(7);
+	os.write_32LE(8);
 
 	SPU_struct *spu = SPU_core;
 
+	os.write_u8(SPUCHAN_PCM16B_SIZE);
 	for (int j = 0; j < 16; j++)
 	{
 		channel_struct &chan = spu->channels[j];
-		os.write_32LE(chan.num);
 		os.write_u8(chan.vol);
 		os.write_u8(chan.volumeDiv);
 		os.write_u8(chan.hold);
@@ -1952,15 +2141,13 @@ void spu_savestate(EMUFILE &os)
 		os.write_32LE(chan.length);
 		os.write_32LE(chan.sampcntFrac);
 		os.write_32LE(chan.sampcntInt);
-		os.write_32LE(chan.sampincFrac);
-		os.write_32LE(chan.sampincInt);
-		for (int i = 0; i < SPUINTERPOLATION_TAPS; i++) os.write_16LE(chan.pcm16b[i]);
-		os.write_32LE(chan.index);
+		for (int i = 0; i < SPUCHAN_PCM16B_SIZE; i++) os.write_16LE(chan.pcm16b[i]);
+		os.write_u8(chan.index);
 		os.write_16LE(chan.x);
 		os.write_u8(chan.keyon);
 	}
 
-	os.write_doubleLE(_samples);
+	os.write_32LE(_spu_core_cyclesCounter);
 
 	os.write_u8(spu->regs.mastervol);
 	os.write_u8(spu->regs.ctl_left);
@@ -1980,23 +2167,22 @@ void spu_savestate(EMUFILE &os)
 		os.write_32LE(spu->regs.cap[i].dad);
 		os.write_16LE(spu->regs.cap[i].len);
 		os.write_u8(spu->regs.cap[i].runtime.running);
-		os.write_32LE(spu->regs.cap[i].runtime.curdad);
-		os.write_32LE(spu->regs.cap[i].runtime.maxdad);
+		os.write_32LE(spu->regs.cap[i].runtime.dad);
 		os.write_32LE(spu->regs.cap[i].runtime.sampcntFrac);
 		os.write_32LE(spu->regs.cap[i].runtime.sampcntInt);
 	}
 
+	os.write_u8(SPUCAPTURE_FIFO_SIZE);
 	for (int i = 0; i < 2; i++)
-		spu->regs.cap[i].runtime.fifo.save(os);
+	{
+		os.write_u8(spu->regs.cap[i].runtime.pcm16bOffs);
+		for (int n = 0; n < SPUCAPTURE_FIFO_SIZE; n++)
+			os.write_16LE(spu->regs.cap[i].runtime.pcm16b[n]);
+	}
 }
 
 bool spu_loadstate(EMUFILE &is, int size)
 {
-	//note! if we load a state created with advanced spu logic on a system without it,
-	//there's a high likelihood of captured data existing.
-	//this would get played back forever without being replaced by captured data.
-	//it's been solved by capturing zeroes though even when advanced spu logic is disabled.
-	
 	//read version
 	u32 version;
 	if (is.read_32LE(version) != 1) return false;
@@ -2004,10 +2190,11 @@ bool spu_loadstate(EMUFILE &is, int size)
 	SPU_struct *spu = SPU_core;
 	reconstruct(&SPU_core->regs);
 
+	int pcm16bSz_Chan = (version >= 8) ? (int)is.read_u8() : 4;
 	for (int j = 0; j < 16; j++)
 	{
 		channel_struct &chan = spu->channels[j];
-		is.read_32LE(chan.num);
+		if(version < 8) is.read_32LE(chan.num); else chan.num = j;
 		is.read_u8(chan.vol);
 		is.read_u8(chan.volumeDiv);
 		if (chan.volumeDiv == 4) chan.volumeDiv = 3;
@@ -2017,7 +2204,7 @@ bool spu_loadstate(EMUFILE &is, int size)
 		is.read_u8(chan.repeat);
 		is.read_u8(chan.format);
 		is.read_u8(chan.status);
-		if (version >= 7) is.read_u8(chan.pcm16bOffs); else chan.pcm16bOffs = 0;
+		if (version >= 7) chan.pcm16bOffs = SPUCHAN_PCM16B_AT(is.read_u8()); else chan.pcm16bOffs = 0; // pre-v7 states only carry pcm16b[0]
 		is.read_32LE(chan.addr);
 		is.read_16LE(chan.timer);
 		is.read_16LE(chan.loopstart);
@@ -2027,54 +2214,45 @@ bool spu_loadstate(EMUFILE &is, int size)
 		if(version >= 7) {
 			is.read_32LE(chan.sampcntFrac);
 			is.read_32LE(chan.sampcntInt);
-			is.read_32LE(chan.sampincFrac);
-			is.read_32LE(chan.sampincInt);
+			if(version < 8) is.fseek(8, SEEK_CUR); // chan.sampincFrac (LE32), chan.sampincInt (LE32)
 		}
-		else if (version >= 2)
+		else /*if (version >= 2)*/ // <- This check (and its broken else clause) was never needed
 		{
-			double temp;
-			s64 temp2;
-			is.read_doubleLE(temp); temp2 = (s64)(temp * (1ll << 32));
-			chan.sampcntFrac = (u32)temp2;
-			chan.sampcntInt  = (s32)(temp2 >> 32);
-			is.read_doubleLE(temp); temp2 = (u64)(temp * (1ull << 32)); // Intentionally unsigned
-			chan.sampincFrac = (u32)temp2;
-			chan.sampincInt  = (u32)(temp2 >> 32);
-		}
-		else
-		{
-			// FIXME
-			// What even is supposed to be happening here?
-			// sampcnt and sampinc were double type before
-			// I even made any changes, so this is broken.
-			chan.sampcntFrac = 0;
-			is.read_32LE(chan.sampcntInt);
-			chan.sampincFrac = 0;
-			is.read_32LE(chan.sampincInt);
+			s64 temp = (s64)(is.read_doubleLE() * (double)(1ll << 32));
+			chan.sampcntFrac = (u32)temp;
+			chan.sampcntInt  = (s32)(temp >> 32);
+			is.fseek(8, SEEK_CUR); // chan.sampinc (LEdouble)
 		}
 		if (version >= 7) {
-			for (int i = 0; i < SPUINTERPOLATION_TAPS; i++) is.read_16LE(chan.pcm16b[i]);
+			for (int i = 0; i < pcm16bSz_Chan; i++) is.read_16LE(chan.pcm16b[SPUCHAN_PCM16B_AT(i)]);
 		}
 		else
 		{
 			is.fseek(4, SEEK_CUR);        // chan.lastsampcnt (LE32)
 			is.read_16LE(chan.pcm16b[0]); // chan.pcm16b
-			is.fseek(2, SEEK_CUR);        // chan.pcm16b_last
+			is.fseek(2, SEEK_CUR);        // chan.pcm16b_last (LE16)
 		}
-		is.read_32LE(chan.index);
+		chan.index = (version >= 8) ? is.read_u8() : (u8)is.read_s32LE();
 		is.read_16LE(chan.x);
 		if (version < 7) is.fseek(2, SEEK_CUR); // chan.psgnoise_last (LE16)
 
 		if (version >= 4)
 			is.read_u8(chan.keyon);
 
+		// Because we don't save sampinc, we need to recalculate it
+		adjust_channel_timer(&chan);
+
 		//hopefully trigger a recovery of the adpcm looping system
 		chan.loop_index = K_ADPCM_LOOPING_RECOVERY_INDEX;
 	}
 
-	if (version >= 2)
+	if (version >= 8)
 	{
-		is.read_doubleLE(_samples);
+		is.read_32LE(_spu_core_cyclesCounter);
+	}
+	else if (version >= 2)
+	{
+		_spu_core_cyclesCounter = (u32)(is.read_doubleLE() * ARM7_CLOCK); // _samples (doubleLE)
 	}
 
 	if (version >= 4)
@@ -2100,27 +2278,52 @@ bool spu_loadstate(EMUFILE &is, int size)
 			is.read_32LE(spu->regs.cap[i].dad);
 			is.read_16LE(spu->regs.cap[i].len);
 			is.read_u8(spu->regs.cap[i].runtime.running);
-			is.read_32LE(spu->regs.cap[i].runtime.curdad);
-			is.read_32LE(spu->regs.cap[i].runtime.maxdad);
+			if (version >= 8) is.read_32LE(spu->regs.cap[i].runtime.dad); // mirrors spu_savestate; must not clobber the cap[i].dad register read above
+			else {
+				is.fseek(4, SEEK_CUR); // regs.cap[i].runtime.curdad (LE32)
+				is.read_32LE(spu->regs.cap[i].runtime.dad); // regs.cap[i].runtime.maxdad
+				spu->regs.cap[i].runtime.dad -= spu->regs.cap[i].len*4; // step back from the old end address by the capture length
+			}
 			if (version >= 7) {
 				is.read_32LE(spu->regs.cap[i].runtime.sampcntFrac);
 				is.read_32LE(spu->regs.cap[i].runtime.sampcntInt);
 			}
 			else
 			{
-				double temp;
-				u64 temp2;
-				is.read_doubleLE(temp); temp2 = (u64)(temp * (1ull << 32));
-				spu->regs.cap[i].runtime.sampcntFrac = (u32)temp2;
-				spu->regs.cap[i].runtime.sampcntInt  = (u32)(temp2 >> 32);
+				s64 temp = (s64)(is.read_doubleLE() * (double)(1ll << 32));
+				spu->regs.cap[i].runtime.sampcntFrac = (u32)temp;
+				spu->regs.cap[i].runtime.sampcntInt  = (s32)(temp >> 32);
+			}
+			if(version <= 7)
+			{
+				// Before, sampcnt incremented "as expected" and the FIFO
+				// delay was implemented within the SndFifo construct.
+				// Now, though, we create the delay by setting sampcnt to
+				// -FIFO_SIZE on starting capture, so account for this here.
+				spu->regs.cap[i].runtime.sampcntInt -= SPUCAPTURE_FIFO_SIZE;
 			}
 		}
 	}
 
-	if (version >= 6)
-		for (int i=0;i<2;i++) spu->regs.cap[i].runtime.fifo.load(is);
-	else
-		for (int i=0;i<2;i++) spu->regs.cap[i].runtime.fifo.reset();
+	int pcm16bSz_Capture = (version >= 8) ? (int)is.read_u8() : 16;
+	if (version >= 8)
+		for (int i=0;i<2;i++)
+		{
+			spu->regs.cap[i].runtime.pcm16bOffs = SPUCAPTURE_PCM16B_AT(is.read_u8());
+			for (int n = 0; n < pcm16bSz_Capture; n++)
+				is.read_16LE(spu->regs.cap[i].runtime.pcm16b[SPUCAPTURE_PCM16B_AT(n)]);
+		}
+	else if (version >= 6)
+		for (int i=0;i<2;i++)
+		{
+			// Setting pcm16bOffs to -fifo.size ensures that we always
+			// fill at the correct offset relative to the FIFO queue size
+			SPUFifo fifo;
+			fifo.load(is);
+			spu->regs.cap[i].runtime.pcm16bOffs = (u8)(-fifo.size);
+			for (int n = 0; n < 16; n++)
+				spu->regs.cap[i].runtime.pcm16b[SPUCAPTURE_PCM16B_AT(n)] = fifo.dequeue();
+		}
 
 	//older versions didnt store a mastervol; 
 	//we must reload this or else games will start silent
diff --git a/desmume/src/SPU.h b/desmume/src/SPU.h
index 5ba6d1c96..7ab7a470f 100644
--- a/desmume/src/SPU.h
+++ b/desmume/src/SPU.h
@@ -36,12 +36,16 @@ class EMUFILE;
 #define CHANSTAT_STOPPED          0
 #define CHANSTAT_PLAY             1
 
-#define SPUINTERPOLATION_TAPS 4 // Must be at least 4 for Catmull-Rom interpolation
+#define SPUCHAN_PCM16B_SIZE   4 // Must be 2^n, and at least 4 for Catmull-Rom interpolation
+#define SPUCAPTURE_FIFO_SIZE 16 // Must be 2^n
 
-//who made these static? theyre used in multiple places.
-FORCEINLINE s32 spumuldiv7(s32 val, u8 multiplier) {
-	assert(multiplier <= 127);
-	return (multiplier == 127) ? val : ((val * multiplier) >> 7);
+// This converts a value of 127/128 into 128/128. Needed for volume/pan/etc. calculations
+template<typename T>
+FORCEINLINE T spumuladjust7(T x)
+{
+	// Using >= can result in better code on some platforms
+	assert(x <= 127);
+	return x + (x >= (T)127);
 }
 
 enum SPUInterpolationMode
@@ -95,6 +99,7 @@ struct channel_struct
 						sampcntInt(0),
 						sampincFrac(0),
 						sampincInt(0),
+						pcm16b(),
 						loop_pcm16b(0),
 						index(0),
 						loop_index(0),
@@ -121,11 +126,11 @@ struct channel_struct
    s32 sampcntInt;
    u32 sampincFrac;
    u32 sampincInt;
-   s16 pcm16b[SPUINTERPOLATION_TAPS];
+   s16 pcm16b[SPUCHAN_PCM16B_SIZE];
    // ADPCM specific
    s16 loop_pcm16b;
-   s32 index;
-   int loop_index;
+   u8  index;
+   u8  loop_index;
    // PSG noise
    u16 x;
 };
@@ -146,12 +151,9 @@ class SPUFifo
 class SPU_struct
 {
 public:
-	SPU_struct(int buffersize);
-   u32 bufpos;
-   u32 buflength;
-   s32 *sndbuf;
-   s32 lastdata; //the last sample that a channel generated
-   s16 *outbuf;
+	SPU_struct();
+   s32 *mixdata; // Mixing buffers
+   s16 *outbuf;  // Device output source (L,R)
    u32 bufsize;
    channel_struct channels[16];
 
@@ -192,19 +194,22 @@ class SPU_struct
 		   u16 len;
 		   struct Runtime {
 			   Runtime()
-				   : running(0), curdad(0), maxdad(0)
+				   : running(0), pcm16bOffs(0), dad(0), len(0), sampcntFrac(0), sampcntInt(0), pcm16b()
 			   {}
+
 			   u8 running;
-			   u32 curdad;
-			   u32 maxdad;
+			   u8 pcm16bOffs;
+			   u32 dad;
+			   u32 len;
 			   u32 sampcntFrac;
-			   u32 sampcntInt;
-			   SPUFifo fifo;
+			   s32 sampcntInt;
+			   s16 pcm16b[SPUCAPTURE_FIFO_SIZE];
 		   } runtime;
 	   } cap[2];
    } regs;
 
    void reset();
+   void resizeBuffer(int buffersize);
    ~SPU_struct();
    void KeyOff(int channel);
    void KeyOn(int channel);
@@ -223,7 +228,6 @@ class SPU_struct
 };
 
 extern SPU_struct *SPU_core, *SPU_user;
-extern int spu_core_samples;
 
 int SPU_ChangeSoundCore(int coreid, int newBufferSizeBytes);
 SoundInterface_struct *SPU_SoundCore();
@@ -236,7 +240,7 @@ void SPU_SetSynchMode(int mode, int method);
 void SPU_ClearOutputBuffer(void);
 void SPU_Reset(void);
 void SPU_DeInit(void);
-void SPU_KeyOn(int channel);
+
 static FORCEINLINE void SPU_WriteByte(u32 addr, u8 val)
 {
 	addr &= 0xFFF;
@@ -264,7 +268,8 @@ static FORCEINLINE void SPU_WriteLong(u32 addr, u32 val)
 static FORCEINLINE u8 SPU_ReadByte(u32 addr) { return SPU_core->ReadByte(addr & 0x0FFF); }
 static FORCEINLINE u16 SPU_ReadWord(u32 addr) { return SPU_core->ReadWord(addr & 0x0FFF); }
 static FORCEINLINE u32 SPU_ReadLong(u32 addr) { return SPU_core->ReadLong(addr & 0x0FFF); }
-void SPU_Emulate_core(void);
+
+int SPU_Emulate_core(u32 numberOfARM7Cycles);
 void SPU_Emulate_user(bool mix = true);
 void SPU_DefaultFetchSamples(s16 *sampleBuffer, size_t sampleCount, ESynchMode synchMode, ISynchronizingAudioBuffer *theSynchronizer);
 size_t SPU_DefaultPostProcessSamples(s16 *postProcessBuffer, size_t requestedSampleCount, ESynchMode synchMode, ISynchronizingAudioBuffer *theSynchronizer);
diff --git a/desmume/src/frontend/windows/soundView.cpp b/desmume/src/frontend/windows/soundView.cpp
index 786b789a1..d0e9a1424 100644
--- a/desmume/src/frontend/windows/soundView.cpp
+++ b/desmume/src/frontend/windows/soundView.cpp
@@ -141,7 +141,7 @@ void SoundView_Refresh(bool forceRedraw)
 		InvalidateRect(GetDlgItem(hDlg, IDC_SOUND0PANBAR+chanId), NULL, FALSE);
 		if(thischan.status != CHANSTAT_STOPPED)
 		{
-			volBar[chan] = spumuldiv7(128, thischan.vol) >> volume_shift[thischan.volumeDiv];
+			volBar[chan] = spumuladjust7(thischan.vol) >> volume_shift[thischan.volumeDiv];
 			InvalidateRect(GetDlgItem(hDlg, IDC_SOUND0VOLBAR+chanId), NULL, FALSE);
 
 			if(SoundView_Data->volModeAlternate) 
@@ -274,7 +274,7 @@ void SoundView_Refresh(bool forceRedraw)
 		sprintf(buf,"%08X",cap0.len);
 		SetDlgItemText(hDlg,IDC_CAP0_LEN,buf);
 
-		sprintf(buf,"%08X",cap0.runtime.curdad);
+		sprintf(buf,"%08X",cap0.runtime.dad+cap0.runtime.sampcntInt*(cap0.bits8 ? 1 : 2));
 		SetDlgItemText(hDlg,IDC_CAP0_CURDAD,buf);
 
 		memcpy(&oldCap[0], &cap0, sizeof(SPU_struct::REGS::CAP));
@@ -306,7 +306,7 @@ void SoundView_Refresh(bool forceRedraw)
 		sprintf(buf,"%08X",cap1.len);
 		SetDlgItemText(hDlg,IDC_CAP1_LEN,buf);
 
-		sprintf(buf,"%08X",cap1.runtime.curdad);
+		sprintf(buf,"%08X",cap1.runtime.dad+cap1.runtime.sampcntInt*(cap1.bits8 ? 1 : 2));
 		SetDlgItemText(hDlg,IDC_CAP1_CURDAD,buf);
 
 		memcpy(&oldCap[1], &cap1, sizeof(SPU_struct::REGS::CAP));
@@ -319,19 +319,23 @@ void SoundView_Refresh(bool forceRedraw)
 static void updateMute_toSettings(HWND hDlg, int chan)
 {
 	for(int chanId = 0; chanId < 8; chanId++)
-		CommonSettings.spu_muteChannels[chanId+chanOfs()] = IsDlgButtonChecked(hDlg, IDC_SOUND0MUTE+chanId) == BST_CHECKED;
+	{
+		u16 bit = 1 << (chanId+chanOfs());
+		CommonSettings.spu_muteChannels &= ~bit;
+		CommonSettings.spu_muteChannels |=  bit * (IsDlgButtonChecked(hDlg, IDC_SOUND0MUTE+chanId) == BST_CHECKED);
+	}
 }
 
 static void updateMute_allFromSettings(HWND hDlg)
 {
 	for(int chanId = 0; chanId < 16; chanId++)
-		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,CommonSettings.spu_muteChannels[chanId]);
+		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,(CommonSettings.spu_muteChannels & (1 << chanId)) != 0);
 }
 
 static void updateMute_fromSettings(HWND hDlg)
 {
 	for(int chanId = 0; chanId < 8; chanId++)
-		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,CommonSettings.spu_muteChannels[chanId+chanOfs()]);
+		CheckDlgItem(hDlg,IDC_SOUND0MUTE+chanId,(CommonSettings.spu_muteChannels & (1 << (chanId+chanOfs()))) != 0);
 }
 static void SoundView_SwitchChanOfs(SoundView_DataStruct *data)
 {
@@ -435,7 +439,7 @@ static INT_PTR CALLBACK SoundView_DlgProc(HWND hDlg, UINT uMsg, WPARAM wParam, L
 			}
 
 			for(int chanId = 0; chanId < 8; chanId++) {
-				if(CommonSettings.spu_muteChannels[chanId])
+				if((CommonSettings.spu_muteChannels & (1<<chanId)) != 0)
 					SendDlgItemMessage(hDlg, IDC_SOUND0MUTE+chanId, BM_SETCHECK, TRUE, 0);
 			}
 
@@ -483,14 +487,12 @@ static INT_PTR CALLBACK SoundView_DlgProc(HWND hDlg, UINT uMsg, WPARAM wParam, L
 			CommonSettings.spu_captureMuted = IsDlgButtonChecked(hDlg,IDC_SOUND_CAPTURE_MUTED) != 0;
 			return 1;
 		case IDC_SOUND_UNMUTE_ALL:
-			for(int i=0;i<16;i++) CommonSettings.spu_muteChannels[i] = false;
+			CommonSettings.spu_muteChannels = 0;
 			updateMute_allFromSettings(hDlg);
 			return 1;
 		case IDC_SOUND_ANALYZE_CAP:
 			printf("WTF\n");
-			for(int i=0;i<16;i++) CommonSettings.spu_muteChannels[i] = true;
-			CommonSettings.spu_muteChannels[1] = false;
-			CommonSettings.spu_muteChannels[3] = false;
+			CommonSettings.spu_muteChannels = (u16)(~0) &~ ((1 << 1) | (1 << 3));
 			CommonSettings.spu_captureMuted = true;
 			updateMute_allFromSettings(hDlg);
 			CheckDlgItem(hDlg,IDC_SOUND_CAPTURE_MUTED,CommonSettings.spu_captureMuted);

From ac17a7a8c467331229d8d41e3cb156ba4146460f Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Wed, 5 Oct 2022 19:05:54 +1100
Subject: [PATCH 4/6] SPU: Remove advanced SPU toggle

---
 desmume/src/NDSSystem.h                           | 2 --
 desmume/src/commandline.cpp                       | 4 ----
 desmume/src/commandline.h                         | 1 -
 desmume/src/frontend/cocoa/openemu/NDSGameCore.mm | 1 -
 desmume/src/frontend/windows/main.cpp             | 8 --------
 desmume/src/frontend/windows/resources.rc         | 4 ----
 6 files changed, 20 deletions(-)

diff --git a/desmume/src/NDSSystem.h b/desmume/src/NDSSystem.h
index 4dbbd779d..f8cd54f1f 100644
--- a/desmume/src/NDSSystem.h
+++ b/desmume/src/NDSSystem.h
@@ -532,7 +532,6 @@ extern struct TCommonSettings
 		, autodetectBackupMethod(0)
 		, spu_muteChannels(0)
 		, spu_captureMuted(false)
-		, spu_advanced(true)
 		, StylusPressure(50)
 		, ConsoleType(NDS_CONSOLE_TYPE_FAT)
 		, backupSave(false)
@@ -652,7 +651,6 @@ extern struct TCommonSettings
 
 	u16  spu_muteChannels;
 	bool spu_captureMuted;
-	bool spu_advanced;
 
 	struct _ShowGpu {
 		_ShowGpu() : main(true), sub(true) {}
diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp
index cb6c23ac4..98f3b6caf 100644
--- a/desmume/src/commandline.cpp
+++ b/desmume/src/commandline.cpp
@@ -51,7 +51,6 @@ CommandLine::CommandLine()
 , _fw_boot(0)
 , _spu_sync_mode(-1)
 , _spu_sync_method(-1)
-, _spu_advanced(0)
 , _num_cores(-1)
 , _rigorous_timing(0)
 , _advanced_timing(-1)
@@ -143,7 +142,6 @@ ENDL
 " --advanced-timing          Use advanced bus-level timing; default ON" ENDL
 " --rigorous-timing          Use more realistic component timings; default OFF" ENDL
 " --gamehacks                Use game-specific hacks; default ON" ENDL
-" --spu-advanced             Enable advanced SPU capture functions (reverb)" ENDL
 " --backupmem-db             Use DB for autodetecting backup memory type" ENDL
 ENDL
 "Arguments affecting the emulated requipment:" ENDL
@@ -283,7 +281,6 @@ bool CommandLine::parse(int argc,char **argv)
 			{ "rigorous-timing", no_argument, &_rigorous_timing, 1},
 			{ "advanced-timing", no_argument, &_advanced_timing, 1},
 			{ "gamehacks", no_argument, &_gamehacks, 1},
-			{ "spu-advanced", no_argument, &_spu_advanced, 1},
 			{ "backupmem-db", no_argument, &autodetect_method, 1},
 
 			//system equipment
@@ -458,7 +455,6 @@ bool CommandLine::parse(int argc,char **argv)
 	if(_slot1_no8000prot) CommonSettings.RetailCardProtection8000 = false;
 	if(_spu_sync_mode != -1) CommonSettings.SPU_sync_mode = _spu_sync_mode;
 	if(_spu_sync_method != -1) CommonSettings.SPU_sync_method = _spu_sync_method;
-	if(_spu_advanced) CommonSettings.spu_advanced = true;
 
 	free(_bios_arm9);
 	free(_bios_arm7);
diff --git a/desmume/src/commandline.h b/desmume/src/commandline.h
index 47a28aca6..60a447193 100644
--- a/desmume/src/commandline.h
+++ b/desmume/src/commandline.h
@@ -99,7 +99,6 @@ class CommandLine
 	int _fw_boot;
 	int _load_to_memory;
 	int _bios_swi;
-	int _spu_advanced;
 	int _num_cores;
 	int _rigorous_timing;
 	int _advanced_timing;
diff --git a/desmume/src/frontend/cocoa/openemu/NDSGameCore.mm b/desmume/src/frontend/cocoa/openemu/NDSGameCore.mm
index 19d4992d0..ed8cc8f6e 100644
--- a/desmume/src/frontend/cocoa/openemu/NDSGameCore.mm
+++ b/desmume/src/frontend/cocoa/openemu/NDSGameCore.mm
@@ -203,7 +203,6 @@ - (id)init
 	[cdsFirmware applySettings];
 	
 	// Set up the sound core
-	CommonSettings.spu_advanced = true;
 	CommonSettings.spuInterpolationMode = SPUInterpolation_Cosine;
 	CommonSettings.SPU_sync_mode = SPU_SYNC_MODE_SYNCHRONOUS;
 	CommonSettings.SPU_sync_method = SPU_SYNC_METHOD_N;
diff --git a/desmume/src/frontend/windows/main.cpp b/desmume/src/frontend/windows/main.cpp
index 5a3cb1608..0d5ed259e 100644
--- a/desmume/src/frontend/windows/main.cpp
+++ b/desmume/src/frontend/windows/main.cpp
@@ -2036,7 +2036,6 @@ int _main()
 	ScreenGapColor = GetPrivateProfileInt("Display", "ScreenGapColor", 0xFFFFFF, IniName);
 	CommonSettings.showGpu.main = GetPrivateProfileInt("Display", "MainGpu", 1, IniName) != 0;
 	CommonSettings.showGpu.sub = GetPrivateProfileInt("Display", "SubGpu", 1, IniName) != 0;
-	CommonSettings.spu_advanced = GetPrivateProfileBool("Sound", "SpuAdvanced", true, IniName);
 	CommonSettings.advanced_timing = GetPrivateProfileBool("Emulation", "AdvancedTiming", true, IniName);
 	CommonSettings.gamehacks.en = GetPrivateProfileBool("Emulation", "GameHacks", true, IniName);
 	CommonSettings.GFX3D_Renderer_TextureDeposterize =  GetPrivateProfileBool("3D", "TextureDeposterize", 0, IniName);
@@ -6546,9 +6545,6 @@ static LRESULT CALLBACK SoundSettingsDlgProc(HWND hDlg, UINT uMsg, WPARAM wParam
 			SendDlgItemMessage(hDlg, IDC_SLVOLUME, TBM_SETPOS, TRUE, sndvolume);
 			SoundSettings_updateVolumeReadout(hDlg);
 
-			// Set spu advanced
-			CheckDlgItem(hDlg,IDC_SPU_ADVANCED,CommonSettings.spu_advanced);
-
 			timerid = SetTimer(hDlg, 1, 500, NULL);
 			return TRUE;
 		}
@@ -6625,10 +6621,6 @@ static LRESULT CALLBACK SoundSettingsDlgProc(HWND hDlg, UINT uMsg, WPARAM wParam
 					CommonSettings.spuInterpolationMode = (SPUInterpolationMode)SendDlgItemMessage(hDlg, IDC_SPU_INTERPOLATION_CB, CB_GETCURSEL, 0, 0);
 					WritePrivateProfileInt("Sound","SPUInterpolation",(int)CommonSettings.spuInterpolationMode, IniName);
 
-					//write spu advanced
-					CommonSettings.spu_advanced = IsDlgCheckboxChecked(hDlg,IDC_SPU_ADVANCED);
-					WritePrivateProfileBool("Sound","SpuAdvanced",CommonSettings.spu_advanced,IniName);
-
 					return TRUE;
 				}
 			case IDCANCEL:
diff --git a/desmume/src/frontend/windows/resources.rc b/desmume/src/frontend/windows/resources.rc
index 2325fe3c3..6581a799f 100644
--- a/desmume/src/frontend/windows/resources.rc
+++ b/desmume/src/frontend/windows/resources.rc
@@ -1104,10 +1104,6 @@ BEGIN
     LTEXT           "Different ways to handle too-slow or too-fast",IDC_STATIC,24,138,138,8
     LTEXT           "in synch mode. Hopefully these will improve.",IDC_STATIC,24,146,139,8
     LTEXT           "Synchronous is more 'accurate' but you may prefer the sound of dual mode if you are running far from 60fps.  Beware that dual mode can clip the ends of sound effects and one-shot instruments.",IDC_STATIC,18,39,158,34
-    CONTROL         "Advanced SPU Logic (reboot game after changing)",IDC_SPU_ADVANCED,
-                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,197,74,177,10
-    LTEXT           "This will be necessary to emulate sound capture (reverb and music visualization). This is costly, few games use it, and the effect is unnecessary.  Enable it if your system has power to spare; check the Sound Viewer to see if the game is using capture.",IDC_STATIC,196,86,177,41
-    LTEXT           "Can cause deterministically visible changes to emulation in principle!!!",IDC_STATIC,196,127,177,17
 END
 
 IDD_SOUND_VIEW DIALOGEX 0, 0, 568, 269

From 017c8eb0be0b6d35206945ece9995033a6716e17 Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Wed, 5 Oct 2022 19:09:06 +1100
Subject: [PATCH 5/6] SPU: Disable advanced SPU toggle

The UI still presents the option, but does nothing
---
 desmume/src/frontend/cocoa/cocoa_output.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/desmume/src/frontend/cocoa/cocoa_output.mm b/desmume/src/frontend/cocoa/cocoa_output.mm
index 554f353aa..ebe721207 100644
--- a/desmume/src/frontend/cocoa/cocoa_output.mm
+++ b/desmume/src/frontend/cocoa/cocoa_output.mm
@@ -325,7 +325,7 @@ - (void) setSpuAdvancedLogic:(BOOL)state
 	apple_unfairlock_unlock(_unfairlockSpuAdvancedLogic);
 	
 	pthread_rwlock_wrlock(self.rwlockProducer);
-	CommonSettings.spu_advanced = state;
+	//CommonSettings.spu_advanced = state;
 	pthread_rwlock_unlock(self.rwlockProducer);
 }
 

From 07c580c234927c6a361e6355041abeee7d4eaaf8 Mon Sep 17 00:00:00 2001
From: Aikku93 <aik@aol.com.au>
Date: Sun, 4 Sep 2022 11:55:01 +1000
Subject: [PATCH 6/6] Fix for #589

Enforce delay for channel playback and adjust capture delay prior to FIFO buffering.
---
 desmume/src/SPU.cpp | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp
index 1d03e4296..f6ae9ad8a 100644
--- a/desmume/src/SPU.cpp
+++ b/desmume/src/SPU.cpp
@@ -55,6 +55,19 @@ static inline s8 read_s8(u32 addr) { return (s8)_MMU_read08<ARMCPU_ARM7,MMU_AT_D
 // as this is guaranteed to be safe.
 #define ENABLE_DUMMY_SPU_CAPTURE 1
 
+// This specifies how many samples to buffer for the channel FIFO
+// If this is too low, then badly-synchronized streams can cause
+// buffer overrun.
+#define SPUCHAN_FIFO_DELAY 3
+#if SPUCHAN_FIFO_DELAY >= SPUCHAN_PCM16B_SIZE
+# error "Channel FIFO delay must be less than SPUCHAN_PCM16B_SIZE"
+#endif
+
+// This controls the delay for the capture unit (how many output
+// samples to stall for before actually writing anything).
+// This seems to need matching to the channel playback delay (see KeyOn())?
+#define SPUCAPTURE_FIFO_DELAY 3
+
 #define K_ADPCM_LOOPING_RECOVERY_INDEX 255
 
 #define CATMULLROM_INTERPOLATION_RESOLUTION_BITS 11
@@ -785,7 +798,7 @@ void SPU_struct::ProbeCapture(int which)
 	cap.runtime.dad = cap.dad;
 	u32 len = cap.len;
 	if(len==0) len=1;
-	cap.runtime.sampcntFrac = 0, cap.runtime.sampcntInt = -SPUCAPTURE_FIFO_SIZE;
+	cap.runtime.sampcntFrac = 0, cap.runtime.sampcntInt = -SPUCAPTURE_FIFO_DELAY;
 }
 
 void SPU_struct::WriteByte(u32 addr, u8 val)
@@ -1079,10 +1092,10 @@ FORCEINLINE static s16 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
 			// of a 'luxury' thing, we should be able to use MinMax
 			// since if the user is using this interpolation method,
 			// there's likely enough processing power to handle it.
-			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 3)];
-			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 2)];
-			s32 c = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
-			s32 d = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
+			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+0)];
+			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+1)];
+			s32 c = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+2)];
+			s32 d = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+3)];
 			const u16 *w = catmullrom_lut[subPos >> (32 - CATMULLROM_INTERPOLATION_RESOLUTION_BITS)];
 			return (s16)MinMax((-a*(s32)w[0] + b*(s32)w[1] + c*(s32)w[2] - d*(s32)w[3]) >> 15, -0x8000, +0x7FFF);
 		}
@@ -1096,8 +1109,8 @@ FORCEINLINE static s16 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
 			// NOTE: Always cast the result to s16. (b-a) can
 			// overflow, but a+(b-a)*subPos can't. So we might
 			// have garbage in the upper 16 bits.
-			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
-			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
+			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+0)];
+			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+1)];
 			s32 subPos16 = (s32)cos_lut[subPos >> (32 - COSINE_INTERPOLATION_RESOLUTION_BITS)];
 			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
@@ -1108,15 +1121,15 @@ FORCEINLINE static s16 Interpolate(const s16 *pcm16b, u8 pcm16bOffs, u32 subPos)
 			// sampleI = sampleA * (1 - ratio) + sampleB * ratio
 			// Delay: 1 sample, Maximum gain: 1.0
 			// NOTE: Always cast the result to s16 (see above).
-			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 1)];
-			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs - 0)];
+			s32 a = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+0)];
+			s32 b = pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+1)];
 			s32 subPos16 = subPos >> (32 - 16);
 			return (s16)(a + (((b - a)*subPos16) >> 16));
 		}
 
 		default:
 			// Delay: 0 samples, Maximum gain: 1.0
-			return pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs)];
+			return pcm16b[SPUCHAN_PCM16B_AT(pcm16bOffs-SPUCHAN_FIFO_DELAY+0)];
 	}
 }
 
@@ -1411,7 +1424,7 @@ template<int CAP_BITS, bool USE_SRCBUF>
 	s32 pos = runtime.sampcntInt;
 	do
 	{
-		s16 sample = USE_SRCBUF ? (*srcBuf) : 0;
+		s16 *data = &runtime.pcm16b[SPUCAPTURE_PCM16B_AT(runtime.pcm16bOffs)];
 		u32 nSamplesToProcess = srcChan.sampincInt + AddAndReturnCarry(&runtime.sampcntFrac, srcChan.sampincFrac);
 		while(nSamplesToProcess--)
 		{
@@ -1425,22 +1438,22 @@ template<int CAP_BITS, bool USE_SRCBUF>
 				pos -= capLen_shifted;
 			}
 
-			s16 *data = &runtime.pcm16b[SPUCAPTURE_PCM16B_AT(runtime.pcm16bOffs)];
 			if(pos >= 0)
 			{
+				s16 sample = *data;
 				if (CAP_BITS == 8)
 				{
-					_MMU_write08<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s8), (u8)(*data >> 8));
+					_MMU_write08<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s8), (u8)(sample >> 8));
 				}
 				else
 				{
-					_MMU_write16<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s16), (u16)(*data));
+					_MMU_write16<ARMCPU_ARM7,MMU_AT_DMA>(runtime.dad + pos*sizeof(s16), (u16)(sample));
 				}
 			}
-			*data = sample;
-			runtime.pcm16bOffs++;
 			pos++;
 		}
+		*data = USE_SRCBUF ? (*srcBuf) : 0;
+		runtime.pcm16bOffs++;
 
 		// srcBuf[] stores two samples per time unit
 		// Either {Ch0[+Ch1],Ch2[+Ch3]}, or {LMix,RMix}