Commit 9bb14eda authored by Thomas Guillem's avatar Thomas Guillem

coreaudio: replace TPCircularBuffer by os_unfair_lock and a block chain

Remove the usage of TPCircularBuffer and multiple atomic variables that start
to make this code way too complicated. Replace it by os_unfair_lock and a block
chain.

os_unfair_lock is a safe spinlock that waits in the kernel in case of thread
contention.

Fall back to pthread_mutex_t if os_unfair_lock is not available (before macOS
10.12 / iOS 10.0).

The unfairness of this new lock is not an issue here since both locking threads
(the render callback and the VLC DecoderThread calling aout_DecPlay) will be
automatically paced (and will let the other thread take the lock). Indeed, the
render thread needs a sample every 22 or 88 ms, and the DecoderThread will wait
for the decoder, wait on the decoder lock, or wait for the aout if the FIFO is
full.
parent 4a4ecf29
......@@ -101,16 +101,14 @@ aout_LTLIBRARIES += libwaveout_plugin.la
endif
libauhal_plugin_la_SOURCES = audio_output/auhal.c \
audio_output/coreaudio_common.c audio_output/coreaudio_common.h \
audio_output/TPCircularBuffer.h audio_output/TPCircularBuffer.c
audio_output/coreaudio_common.c audio_output/coreaudio_common.h
libauhal_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(aoutdir)' \
-Wl,-framework,CoreAudio,-framework,AudioUnit,-framework,AudioToolbox,-framework,CoreServices
if HAVE_OSX
aout_LTLIBRARIES += libauhal_plugin.la
endif
libaudiounit_ios_plugin_la_SOURCES = audio_output/audiounit_ios.m \
audio_output/coreaudio_common.c audio_output/coreaudio_common.h \
audio_output/TPCircularBuffer.h audio_output/TPCircularBuffer.c
audio_output/coreaudio_common.c audio_output/coreaudio_common.h
libaudiounit_ios_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(aoutdir)' \
-Wl,-framework,CoreAudio,-framework,AudioUnit,-framework,AudioToolbox,-framework,CoreServices,-framework,UIKit,-framework,AVFoundation
if HAVE_IOS
......
//
// TPCircularBuffer.c
// Circular/Ring buffer implementation
//
// Created by Michael Tyson on 10/12/2011.
// Copyright 2011-2012 A Tasty Pixel. All rights reserved.
#include "audio_output/TPCircularBuffer.h"
#include <mach/mach.h>
#include <stdio.h>
#define reportResult(result,operation) (_reportResult((result),(operation),strrchr(__FILE__, '/')+1,__LINE__))
/* Log a failed Mach call with its file/line origin and a human-readable
 * error string.  Returns true when `result` indicates success. */
static inline bool _reportResult(kern_return_t result, const char *operation, const char* file, int line) {
    if ( result == ERR_SUCCESS ) {
        return true;
    }
    printf("%s:%d: %s: %s\n", file, line, operation, mach_error_string(result));
    return false;
}
/*
 * Allocate the ring buffer using the Darwin virtual-memory mirroring trick:
 * a region of `length` bytes is allocated, then the pages immediately after
 * it are remapped onto the same physical pages, so the buffer contents appear
 * twice back-to-back and callers never need wrap-around copy logic.
 *
 * Returns true on success.  The usable length is round_page(length), so it
 * may be larger than requested.
 *
 * NOTE(review): the header declares this parameter as int32_t while the
 * definition uses int — harmless where int is 32-bit, but the prototypes
 * should match; confirm against the header.
 */
bool TPCircularBufferInit(TPCircularBuffer *buffer, int length) {
// Keep trying until we get our buffer, needed to handle race conditions
int retries = 3;
while ( true ) {
buffer->length = round_page(length); // We need whole page sizes
// Temporarily allocate twice the length, so we have the contiguous address space to
// support a second instance of the buffer directly after
vm_address_t bufferAddress;
kern_return_t result = vm_allocate(mach_task_self(),
&bufferAddress,
buffer->length * 2,
VM_FLAGS_ANYWHERE); // allocate anywhere it'll fit
if ( result != ERR_SUCCESS ) {
if ( retries-- == 0 ) {
reportResult(result, "Buffer allocation");
return false;
}
// Try again if we fail
continue;
}
// Now replace the second half of the allocation with a virtual copy of the first half. Deallocate the second half...
result = vm_deallocate(mach_task_self(),
bufferAddress + buffer->length,
buffer->length);
if ( result != ERR_SUCCESS ) {
if ( retries-- == 0 ) {
reportResult(result, "Buffer deallocation");
return false;
}
// If this fails somehow, deallocate the whole region and try again
vm_deallocate(mach_task_self(), bufferAddress, buffer->length);
continue;
}
// Re-map the buffer to the address space immediately after the buffer.
// Another thread may have grabbed the freed range in the meantime, in
// which case vm_remap lands elsewhere and we retry (see below).
vm_address_t virtualAddress = bufferAddress + buffer->length;
vm_prot_t cur_prot, max_prot;
result = vm_remap(mach_task_self(),
&virtualAddress, // mirror target
buffer->length, // size of mirror
0, // auto alignment
0, // force remapping to virtualAddress
mach_task_self(), // same task
bufferAddress, // mirror source
0, // MAP READ-WRITE, NOT COPY
&cur_prot, // unused protection struct
&max_prot, // unused protection struct
VM_INHERIT_DEFAULT);
if ( result != ERR_SUCCESS ) {
if ( retries-- == 0 ) {
reportResult(result, "Remap buffer memory");
return false;
}
// If this remap failed, we hit a race condition, so deallocate and try again
vm_deallocate(mach_task_self(), bufferAddress, buffer->length);
continue;
}
if ( virtualAddress != bufferAddress+buffer->length ) {
// If the memory is not contiguous, clean up both allocated buffers and try again
if ( retries-- == 0 ) {
printf("Couldn't map buffer memory to end of buffer\n");
return false;
}
vm_deallocate(mach_task_self(), virtualAddress, buffer->length);
vm_deallocate(mach_task_self(), bufferAddress, buffer->length);
continue;
}
// Success: publish the mapping and start with an empty buffer.
buffer->buffer = (void*)bufferAddress;
buffer->fillCount = 0;
buffer->head = buffer->tail = 0;
return true;
}
return false;
}
/* Release both halves of the mirrored mapping in one call (the region is
 * length * 2 bytes of address space) and scrub the descriptor so stale
 * pointers cannot be reused. */
void TPCircularBufferCleanup(TPCircularBuffer *buffer) {
    const vm_address_t address = (vm_address_t)buffer->buffer;
    vm_deallocate(mach_task_self(), address, buffer->length * 2);
    memset(buffer, 0, sizeof(*buffer));
}
/* Empty the buffer by consuming every readable byte.  Safe for the consumer
 * to call while the producer is concurrently writing. */
void TPCircularBufferClear(TPCircularBuffer *buffer) {
    int32_t bytesQueued;
    void *tail = TPCircularBufferTail(buffer, &bytesQueued);
    if (tail != NULL) {
        TPCircularBufferConsume(buffer, bytesQueued);
    }
}
//
// TPCircularBuffer.h
// Circular/Ring buffer implementation
//
// https://github.com/michaeltyson/TPCircularBuffer
//
// Created by Michael Tyson on 10/12/2011.
// Copyright 2011-2012 A Tasty Pixel. All rights reserved.
//
//
// This implementation makes use of a virtual memory mapping technique that inserts a virtual copy
// of the buffer memory directly after the buffer's end, negating the need for any buffer wrap-around
// logic. Clients can simply use the returned memory address as if it were contiguous space.
//
// The implementation is thread-safe in the case of a single producer and single consumer.
//
// Virtual memory technique originally proposed by Philip Howard (http://vrb.slashusr.org/), and
// adapted to Darwin by Kurt Revis (http://www.snoize.com,
// http://www.snoize.com/Code/PlayBufferedSoundFile.tar.gz)
//
#ifndef TPCircularBuffer_h
#define TPCircularBuffer_h
#include <libkern/OSAtomic.h>
#include <string.h>
#include <assert.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Ring-buffer descriptor.  The memory at `buffer` is virtually mirrored so
 * that buffer[i] and buffer[i + length] alias the same byte; reads and
 * writes therefore never have to split at the wrap point.  Thread-safe for
 * one producer plus one consumer. */
typedef struct {
void *buffer; /* base of the mirrored region (2 * length of address space) */
int32_t length; /* usable capacity in bytes, rounded to whole pages */
int32_t tail; /* read offset, advanced by the consumer */
int32_t head; /* write offset, advanced by the producer */
volatile int32_t fillCount; /* readable bytes; updated with atomic add+barrier */
} TPCircularBuffer;
/*!
* Initialise buffer
*
* Note that the length is advisory only: Because of the way the
* memory mirroring technique works, the true buffer length will
* be multiples of the device page size (e.g. 4096 bytes)
*
* @param buffer Circular buffer
* @param length Length of buffer
*/
bool TPCircularBufferInit(TPCircularBuffer *buffer, int32_t length);
/*!
* Cleanup buffer
*
* Releases buffer resources.
*/
void TPCircularBufferCleanup(TPCircularBuffer *buffer);
/*!
* Clear buffer
*
* Resets buffer to original, empty state.
*
* This is safe for use by consumer while producer is accessing
* buffer.
*/
void TPCircularBufferClear(TPCircularBuffer *buffer);
// Reading (consuming)
/*!
* Access end of buffer
*
* This gives you a pointer to the end of the buffer, ready
* for reading, and the number of available bytes to read.
*
* @param buffer Circular buffer
* @param availableBytes On output, the number of bytes ready for reading
* @return Pointer to the first bytes ready for reading, or NULL if buffer is empty
*/
/* Return a pointer to the readable region and store the number of readable
 * bytes in *availableBytes; returns NULL when the buffer is empty. */
static __inline__ __attribute__((always_inline)) void* TPCircularBufferTail(TPCircularBuffer *buffer, int32_t* availableBytes) {
    int32_t readable = buffer->fillCount;
    *availableBytes = readable;
    if ( readable != 0 ) {
        return (char*)buffer->buffer + buffer->tail;
    }
    return NULL;
}
/*!
* Consume bytes in buffer
*
* This frees up the just-read bytes, ready for writing again.
*
* @param buffer Circular buffer
* @param amount Number of bytes to consume
*/
/* Free up `amount` just-read bytes for the producer.  The tail advance is
 * deliberately performed before the OSAtomicAdd32Barrier decrement: the
 * barrier publishes the new tail before the producer can observe the extra
 * free space via fillCount. */
static __inline__ __attribute__((always_inline)) void TPCircularBufferConsume(TPCircularBuffer *buffer, int32_t amount) {
buffer->tail = (buffer->tail + amount) % buffer->length;
OSAtomicAdd32Barrier(-amount, &buffer->fillCount);
assert(buffer->fillCount >= 0);
}
/*!
* Version of TPCircularBufferConsume without the memory barrier, for more optimal use in single-threaded contexts
*/
/* Same as TPCircularBufferConsume but without the memory barrier; only
 * valid when producer and consumer run on the same thread. */
static __inline__ __attribute__((always_inline)) void TPCircularBufferConsumeNoBarrier(TPCircularBuffer *buffer, int32_t amount) {
    int32_t advanced = buffer->tail + amount;
    buffer->tail = advanced % buffer->length;
    buffer->fillCount = buffer->fillCount - amount;
    assert(buffer->fillCount >= 0);
}
/*!
* Access front of buffer
*
* This gives you a pointer to the front of the buffer, ready
* for writing, and the number of available bytes to write.
*
* @param buffer Circular buffer
* @param availableBytes On output, the number of bytes ready for writing
* @return Pointer to the first bytes ready for writing, or NULL if buffer is full
*/
/* Return a pointer to the writable region and store the number of free
 * bytes in *availableBytes; returns NULL when the buffer is full. */
static __inline__ __attribute__((always_inline)) void* TPCircularBufferHead(TPCircularBuffer *buffer, int32_t* availableBytes) {
    int32_t writable = buffer->length - buffer->fillCount;
    *availableBytes = writable;
    if ( writable != 0 ) {
        return (char*)buffer->buffer + buffer->head;
    }
    return NULL;
}
// Writing (producing)
/*!
* Produce bytes in buffer
*
* This marks the given section of the buffer ready for reading.
*
* @param buffer Circular buffer
* @param amount Number of bytes to produce
*/
/* Publish `amount` freshly written bytes to the consumer.  The head advance
 * happens before the OSAtomicAdd32Barrier increment, so the barrier makes
 * the written data and the new head visible before the consumer sees the
 * larger fillCount. */
static __inline__ __attribute__((always_inline)) void TPCircularBufferProduce(TPCircularBuffer *buffer, int amount) {
buffer->head = (buffer->head + amount) % buffer->length;
OSAtomicAdd32Barrier(amount, &buffer->fillCount);
assert(buffer->fillCount <= buffer->length);
}
/*!
* Version of TPCircularBufferProduce without the memory barrier, for more optimal use in single-threaded contexts
*/
/* Same as TPCircularBufferProduce but without the memory barrier; only
 * valid when producer and consumer run on the same thread. */
static __inline__ __attribute__((always_inline)) void TPCircularBufferProduceNoBarrier(TPCircularBuffer *buffer, int amount) {
    int advanced = buffer->head + amount;
    buffer->head = advanced % buffer->length;
    buffer->fillCount += amount;
    assert(buffer->fillCount <= buffer->length);
}
/*!
* Helper routine to copy bytes to buffer
*
* This copies the given bytes to the buffer, and marks them ready for writing.
*
* @param buffer Circular buffer
* @param src Source buffer
* @param len Number of bytes in source buffer
* @return true if bytes copied, false if there was insufficient space
*/
/* Copy `len` bytes from `src` into the buffer and mark them readable.
 * Returns true on success, false (nothing copied) when free space is
 * insufficient. */
static __inline__ __attribute__((always_inline)) bool TPCircularBufferProduceBytes(TPCircularBuffer *buffer, const void* src, int32_t len) {
    int32_t space;
    void *ptr = TPCircularBufferHead(buffer, &space);
    if ( space < len ) return false;
    /* Skip the copy for len == 0: when the buffer is full, ptr is NULL and
     * passing NULL to memcpy is undefined behavior even with a zero size. */
    if ( len > 0 ) {
        memcpy(ptr, src, len);
    }
    TPCircularBufferProduce(buffer, len);
    return true;
}
#ifdef __cplusplus
}
#endif
#endif
......@@ -25,10 +25,13 @@
#import "coreaudio_common.h"
#import <CoreAudio/CoreAudioTypes.h>
#if !TARGET_OS_IPHONE
#import <CoreServices/CoreServices.h>
#import <vlc_dialog.h>
#endif
#import <dlfcn.h>
static struct
{
void (*lock)(os_unfair_lock *lock);
void (*unlock)(os_unfair_lock *lock);
} unfair_lock;
static inline uint64_t
BytesToFrames(struct aout_sys_common *p_sys, size_t i_bytes)
......@@ -42,17 +45,74 @@ FramesToUs(struct aout_sys_common *p_sys, uint64_t i_nb_frames)
return i_nb_frames * CLOCK_FREQ / p_sys->i_rate;
}
/* Release every queued output block and reset the chain to the empty state
 * (chain head NULL, append pointer back at the head, queued size zero).
 * NOTE(review): every caller in this file holds the output lock when calling
 * this — keep that invariant. */
static void
ca_ClearOutBuffers(audio_output_t *p_aout)
{
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
block_ChainRelease(p_sys->p_out_chain);
p_sys->p_out_chain = NULL;
p_sys->pp_out_last = &p_sys->p_out_chain;
p_sys->i_out_size = 0;
}
/* Resolve os_unfair_lock_lock/os_unfair_lock_unlock at runtime with dlsym,
 * so the plugin still loads on OS versions that predate os_unfair_lock
 * (before macOS 10.12 / iOS 10).  Either both symbols resolve or both
 * function pointers are left NULL, which makes the lock_* helpers fall back
 * to the vlc_mutex path. */
static void
ca_init_once(void)
{
unfair_lock.lock = dlsym(RTLD_DEFAULT, "os_unfair_lock_lock");
if (!unfair_lock.lock)
return;
unfair_lock.unlock = dlsym(RTLD_DEFAULT, "os_unfair_lock_unlock");
if (!unfair_lock.unlock)
unfair_lock.lock = NULL;
}
/* Initialize the output lock: os_unfair_lock when the symbols were resolved
 * by ca_init_once(), otherwise a vlc_mutex_t fallback. */
static void
lock_init(struct aout_sys_common *p_sys)
{
if (unfair_lock.lock)
p_sys->lock.unfair = OS_UNFAIR_LOCK_INIT;
else
vlc_mutex_init(&p_sys->lock.mutex);
}
/* Destroy the output lock.  os_unfair_lock needs no teardown, so only the
 * vlc_mutex fallback has anything to release. */
static void
lock_destroy(struct aout_sys_common *p_sys)
{
if (!unfair_lock.lock)
vlc_mutex_destroy(&p_sys->lock.mutex);
}
/* Acquire the output lock through whichever backend lock_init() selected. */
static void
lock_lock(struct aout_sys_common *p_sys)
{
if (unfair_lock.lock)
unfair_lock.lock(&p_sys->lock.unfair);
else
vlc_mutex_lock(&p_sys->lock.mutex);
}
/* Release the output lock through whichever backend lock_init() selected. */
static void
lock_unlock(struct aout_sys_common *p_sys)
{
if (unfair_lock.lock)
unfair_lock.unlock(&p_sys->lock.unfair);
else
vlc_mutex_unlock(&p_sys->lock.mutex);
}
void
ca_Open(audio_output_t *p_aout)
{
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
atomic_init(&p_sys->i_underrun_size, 0);
atomic_init(&p_sys->b_paused, false);
atomic_init(&p_sys->b_do_flush, false);
atomic_init(&p_sys->b_highlatency, true);
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, ca_init_once);
vlc_sem_init(&p_sys->flush_sem, 0);
vlc_mutex_init(&p_sys->lock);
lock_init(p_sys);
p_sys->p_out_chain = NULL;
p_aout->play = ca_Play;
p_aout->pause = ca_Pause;
......@@ -66,7 +126,7 @@ ca_Close(audio_output_t *p_aout)
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
vlc_sem_destroy(&p_sys->flush_sem);
vlc_mutex_destroy(&p_sys->lock);
lock_destroy(p_sys);
}
/* Called from render callbacks. No lock, wait, and IO here */
......@@ -76,52 +136,63 @@ ca_Render(audio_output_t *p_aout, uint32_t i_nb_samples, uint8_t *p_output,
{
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
const bool b_highlatency = CLOCK_FREQ * (uint64_t)i_nb_samples / p_sys->i_rate
> AOUT_MAX_PTS_DELAY;
lock_lock(p_sys);
if (b_highlatency)
atomic_store(&p_sys->b_highlatency, true);
p_sys->b_highlatency = CLOCK_FREQ * (uint64_t)i_nb_samples / p_sys->i_rate
> AOUT_MAX_PTS_DELAY;
bool expected = true;
if (atomic_compare_exchange_weak(&p_sys->b_do_flush, &expected, false))
if (p_sys->b_do_flush)
{
TPCircularBufferClear(&p_sys->circular_buffer);
ca_ClearOutBuffers(p_aout);
/* Signal that the renderer is flushed */
p_sys->b_do_flush = false;
vlc_sem_post(&p_sys->flush_sem);
}
if (atomic_load_explicit(&p_sys->b_paused, memory_order_relaxed))
{
memset(p_output, 0, i_requested);
return;
}
/* Pull audio from buffer */
int32_t i_available;
void *p_data = TPCircularBufferTail(&p_sys->circular_buffer,
&i_available);
if (i_available < 0)
i_available = 0;
if (p_sys->b_paused)
goto drop;
size_t i_tocopy = __MIN(i_requested, (size_t) i_available);
if (i_tocopy > 0)
size_t i_copied = 0;
block_t *p_block = p_sys->p_out_chain;
while (p_block != NULL && i_requested != 0)
{
memcpy(p_output, p_data, i_tocopy);
TPCircularBufferConsume(&p_sys->circular_buffer, i_tocopy);
size_t i_tocopy = __MIN(i_requested, p_block->i_buffer);
memcpy(&p_output[i_copied], p_block->p_buffer, i_tocopy);
i_requested -= i_tocopy;
i_copied += i_tocopy;
if (i_tocopy == p_block->i_buffer)
{
block_t *p_release = p_block;
p_block = p_block->p_next;
block_Release(p_release);
}
else
{
assert(i_requested == 0);
p_block->p_buffer += i_tocopy;
p_block->i_buffer -= i_tocopy;
}
}
p_sys->p_out_chain = p_block;
if (!p_sys->p_out_chain)
p_sys->pp_out_last = &p_sys->p_out_chain;
p_sys->i_out_size -= i_copied;
/* Pad with 0 */
if (i_requested > i_tocopy)
if (i_requested > 0)
{
atomic_fetch_add(&p_sys->i_underrun_size, i_requested - i_tocopy);
memset(&p_output[i_tocopy], 0, i_requested - i_tocopy);
assert(p_sys->i_out_size == 0);
p_sys->i_underrun_size += i_requested;
memset(&p_output[i_copied], 0, i_requested);
}
/* Set high delay to false (re-enabling ca_Timeget) after consuming the
* circular buffer */
if (!b_highlatency)
atomic_store(&p_sys->b_highlatency, false);
lock_unlock(p_sys);
return;
drop:
memset(p_output, 0, i_requested);
lock_unlock(p_sys);
}
int
......@@ -129,16 +200,17 @@ ca_TimeGet(audio_output_t *p_aout, mtime_t *delay)
{
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
/* Too high delay: TimeGet will be too imprecise */
if (atomic_load(&p_sys->b_highlatency))
lock_lock(p_sys);
if (p_sys->b_highlatency)
{
lock_unlock(p_sys);
return -1;
}
int32_t i_bytes;
TPCircularBufferTail(&p_sys->circular_buffer, &i_bytes);
int64_t i_frames = BytesToFrames(p_sys, i_bytes);
int64_t i_frames = BytesToFrames(p_sys, p_sys->i_out_size);
*delay = FramesToUs(p_sys, i_frames) + p_sys->i_dev_latency_us;
lock_unlock(p_sys);
return 0;
}
......@@ -147,47 +219,41 @@ ca_Flush(audio_output_t *p_aout, bool wait)
{
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
lock_lock(p_sys);
if (wait)
{
int32_t i_bytes;
while (TPCircularBufferTail(&p_sys->circular_buffer, &i_bytes) != NULL)
while (p_sys->i_out_size > 0)
{
if (atomic_load(&p_sys->b_paused))
if (p_sys->b_paused)
{
TPCircularBufferClear(&p_sys->circular_buffer);
return;
ca_ClearOutBuffers(p_aout);
break;
}
/* Calculate the duration of the circular buffer, in order to wait
* for the render thread to play it all */
const mtime_t i_frame_us =
FramesToUs(p_sys, BytesToFrames(p_sys, i_bytes)) + 10000;
msleep(i_frame_us / 2);
FramesToUs(p_sys, BytesToFrames(p_sys, p_sys->i_out_size)) + 10000;
lock_unlock(p_sys);
msleep(i_frame_us);
lock_lock(p_sys);
}
}
else
{
/* Request the renderer to flush, and wait for an ACK.
* b_do_flush and b_paused need to be locked together in order to not
* get stuck here when b_paused is being set after reading. This can
* happen when setAliveState() is called from any thread through an
* interrupt notification */
vlc_mutex_lock(&p_sys->lock);
assert(!atomic_load(&p_sys->b_do_flush));
if (atomic_load(&p_sys->b_paused))
assert(!p_sys->b_do_flush);
if (p_sys->b_paused)
ca_ClearOutBuffers(p_aout);
else
{
vlc_mutex_unlock(&p_sys->lock);
TPCircularBufferClear(&p_sys->circular_buffer);
p_sys->b_do_flush = true;
lock_unlock(p_sys);
vlc_sem_wait(&p_sys->flush_sem);
return;
}
atomic_store_explicit(&p_sys->b_do_flush, true, memory_order_release);
vlc_mutex_unlock(&p_sys->lock);
vlc_sem_wait(&p_sys->flush_sem);
}
lock_unlock(p_sys);
}
void
......@@ -196,7 +262,9 @@ ca_Pause(audio_output_t * p_aout, bool pause, mtime_t date)
struct aout_sys_common *p_sys = (struct aout_sys_common *) p_aout->sys;
VLC_UNUSED(date);
atomic_store_explicit(&p_sys->b_paused, pause, memory_order_relaxed);
lock_lock(p_sys);
p_sys->b_paused = pause;
lock_unlock(p_sys);
}
void
......@@ -210,43 +278,65 @@ ca_Play(audio_output_t * p_aout, block_t * p_block)
p_sys->chans_to_reorder, p_sys->chan_table,
VLC_CODEC_FL32);
/* move data to buffer */
while (!TPCircularBufferProduceBytes(&p_sys->circular_buffer,
p_block->p_buffer, p_block->i_buffer))
lock_lock(p_sys);
do
{
if (atomic_load_explicit(&p_sys->b_paused, memory_order_relaxed))
const size_t i_avalaible_bytes =
__MIN(p_block->i_buffer, p_sys->i_out_max_size - p_sys->i_out_size);
if (unlikely(i_avalaible_bytes != p_block->i_buffer))
{
msg_Warn(p_aout, "dropping block because the circular buffer is "
"full and paused");
break;
/* Not optimal but unlikely code path. */
lock_unlock(p_sys);
block_t *p_new = block_Alloc(i_avalaible_bytes);
if (!p_new)