Commit f49a1b2e authored by Steve Borho's avatar Steve Borho Committed by Fiona Glaser

OpenCL lookahead

OpenCL support is compiled in by default, but must be enabled at runtime by an
--opencl command line flag. Compiling OpenCL support requires perl. To avoid
the perl requirement use: configure --disable-opencl.

When enabled, the lookahead thread is mostly off-loaded to an OpenCL capable GPU
device.  Lowres intra cost prediction, lowres motion search (including subpel)
and bidir cost predictions are all done on the GPU.  MB-tree and final slice
decisions are still done by the CPU.  Presets which do not use a threaded
lookahead will not use OpenCL at all (superfast, ultrafast).

Because of data dependencies, the GPU must use an iterative motion search which
performs more total work than the CPU would do, so this is not work efficient
or power efficient. But if there are GPU cycles to spare, it can often
speed up the encode. Output quality when OpenCL lookahead is enabled is often
very slightly worse than with the CPU lookahead (because of the same data
dependencies).

x264 must compile its OpenCL kernels for your device before running them, and in
order to avoid doing this every run it caches the compiled kernel binary in a
file named x264_lookahead.clbin (--opencl-clbin FNAME to override).  The cache
file will be ignored if the device, driver, or OpenCL source are changed.

x264 will use the first GPU device which supports the cl_image
features required by its kernels. Most modern discrete GPUs and all AMD
integrated GPUs will work.  Intel integrated GPUs (up to IvyBridge) do not
support those necessary features. Use --opencl-device N to specify a number of
capable GPUs to skip during device detection.

Switchable graphics environments (e.g. AMD Enduro) are currently not supported,
as some have bugs in their OpenCL drivers that cause output to be silently
incorrect.

Developed by MulticoreWare with support from AMD and Telestream.
parent 2d0c47a5
......@@ -43,3 +43,5 @@ checkasm
.digress_x264
dataDec.txt
log.dec
common/oclobj.h
x264_lookahead.clbin
......@@ -8,6 +8,8 @@ vpath %.S $(SRCPATH)
vpath %.asm $(SRCPATH)
vpath %.rc $(SRCPATH)
GENERATED =
all: default
default:
......@@ -145,6 +147,35 @@ OBJSO += $(if $(RC), x264res.dll.o)
endif
endif
QUOTED_CFLAGS := $(CFLAGS)
ifeq ($(HAVE_OPENCL),yes)
empty:=
space:=$(empty) $(empty)
escaped:=\ $(empty)
open:=(
escopen:=\(
close:=)
escclose:=\)
SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
# make gcc happy
CFLAGS += -I$(SAFE_INC_DIR) -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
# For the CFLAGS used by the .depend rule, we must add quotes because
# the rule does an extra level of shell expansions
QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)" -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
GENERATED += common/oclobj.h
SRCS += common/opencl.c encoder/slicetype-cl.c
endif
OBJS += $(SRCS:%.c=%.o)
OBJCLI += $(SRCCLI:%.c=%.o)
OBJSO += $(SRCSO:%.c=%.o)
......@@ -155,12 +186,12 @@ cli: x264$(EXE)
lib-static: $(LIBX264)
lib-shared: $(SONAME)
$(LIBX264): .depend $(OBJS) $(OBJASM)
$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
rm -f $(LIBX264)
$(AR)$@ $(OBJS) $(OBJASM)
$(if $(RANLIB), $(RANLIB) $@)
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
ifneq ($(EXE),)
......@@ -169,10 +200,10 @@ x264: x264$(EXE)
checkasm: checkasm$(EXE)
endif
x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
......@@ -193,7 +224,7 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
.depend: config.mak
@rm -f .depend
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
config.mak:
./configure
......@@ -231,7 +262,7 @@ endif
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
rm -f checkasm checkasm.exe $(OBJCHK)
rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
distclean: clean
......
......@@ -171,6 +171,10 @@ void x264_param_default( x264_param_t *param )
param->b_pic_struct = 0;
param->b_fake_interlaced = 0;
param->i_frame_packing = -1;
param->b_opencl = 0;
param->i_opencl_device = 0;
param->opencl_device_id = NULL;
param->psz_clbin_file = NULL;
}
static int x264_param_apply_preset( x264_param_t *param, const char *preset )
......@@ -1033,6 +1037,12 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->b_fake_interlaced = atobool(value);
OPT("frame-packing")
p->i_frame_packing = atoi(value);
OPT("opencl")
p->b_opencl = atobool( value );
OPT("opencl-clbin")
p->psz_clbin_file = strdup( value );
OPT("opencl-device")
p->i_opencl_device = atoi( value );
else
return X264_PARAM_BAD_NAME;
#undef OPT
......@@ -1285,7 +1295,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
}
s += sprintf( s, "cabac=%d", p->b_cabac );
if( p->b_opencl )
s += sprintf( s, "opencl=%d", p->b_opencl );
s += sprintf( s, " cabac=%d", p->b_cabac );
s += sprintf( s, " ref=%d", p->i_frame_reference );
s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta );
......
......@@ -54,6 +54,8 @@ do {\
memset( var, 0, size );\
} while( 0 )
#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
......@@ -94,6 +96,10 @@ do {\
#include <assert.h>
#include <limits.h>
#if HAVE_OPENCL
#include "opencl.h"
#endif
#if HAVE_INTERLACED
# define MB_INTERLACED h->mb.b_interlaced
# define SLICE_MBAFF h->sh.b_mbaff
......@@ -936,6 +942,10 @@ struct x264_t
struct visualize_t *visualize;
#endif
x264_lookahead_t *lookahead;
#if HAVE_OPENCL
x264_opencl_t opencl;
#endif
};
// included at the end because it needs x264_t
......
......@@ -316,6 +316,9 @@ void x264_frame_delete( x264_frame_t *frame )
}
x264_pthread_mutex_destroy( &frame->mutex );
x264_pthread_cond_destroy( &frame->cv );
#if HAVE_OPENCL
x264_opencl_frame_delete( frame );
#endif
}
x264_free( frame );
}
......
......@@ -172,6 +172,10 @@ typedef struct x264_frame
/* user frame properties */
uint8_t *mb_info;
void (*mb_info_free)( void* );
#if HAVE_OPENCL
x264_frame_opencl_t opencl;
#endif
} x264_frame_t;
/* synchronized frame list */
......
/*****************************************************************************
* opencl.c: OpenCL initialization and kernel compilation
*****************************************************************************
* Copyright (C) 2012-2013 x264 project
*
* Authors: Steve Borho <sborho@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
#if _WIN32
#include <windows.h>
#else
#include <dlfcn.h> //dlopen, dlsym, dlclose
#endif
/* define from recent cl_ext.h, copied here in case headers are old */
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
/* Requires full include path in case of out-of-tree builds */
#include "common/oclobj.h"
/* Forward declaration: the definition lives near the end of this file, but
 * x264_opencl_compile() needs to call it earlier. */
static int x264_detect_switchable_graphics();
/* Try to load the cached compiled program binary from h->param.psz_clbin_file.
 *
 * The cache file starts with four newline-terminated strings (device name,
 * device vendor, driver version and the hash of the OpenCL source) followed by
 * the raw program binary.  Every header string must match the current
 * environment exactly, otherwise the cache is considered stale.
 *
 * Returns a program object created from the cached binary, or NULL when the
 * file is missing, unreadable, stale, or rejected by the OpenCL runtime. */
static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
{
    cl_program program = NULL;
    cl_int status;
    uint8_t *binary = NULL;
    const uint8_t *ptr;
    size_t size;
    long file_size;

    /* try to load cached program binary */
    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
    if( !fp )
        return NULL;

    /* ftell() reports errors as -1; never let that become a huge size_t */
    if( fseek( fp, 0L, SEEK_END ) )
        goto fail;
    file_size = ftell( fp );
    if( file_size <= 0 )
        goto fail;
    rewind( fp );

    size = (size_t)file_size;
    CHECKED_MALLOC( binary, size );
    /* a short read would leave part of the buffer uninitialized */
    if( fread( binary, 1, size, fp ) != size )
        goto fail;
    ptr = binary;

    /* Each header string must be followed by the '\n' written by the save
     * routine, so that a cached string which merely starts with STR does not
     * pass as a match. */
#define CHECK_STRING( STR )\
    do {\
        size_t len = strlen( STR );\
        if( size <= len || strncmp( (const char*)ptr, STR, len ) || ptr[len] != '\n' )\
            goto fail;\
        else {\
            size -= (len+1); ptr += (len+1);\
        }\
    } while( 0 )

    CHECK_STRING( devname );
    CHECK_STRING( devvendor );
    CHECK_STRING( driverversion );
    CHECK_STRING( x264_opencl_source_hash );
#undef CHECK_STRING

    /* whatever remains after the header is the device binary itself */
    program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
    if( status != CL_SUCCESS )
        program = NULL;

fail:
    fclose( fp );
    x264_free( binary );
    return program;
}
/* Save the compiled program binary to h->param.psz_clbin_file for later reuse.
 *
 * The device name, vendor, driver version and OpenCL source hash are written
 * as newline-terminated strings ahead of the binary so that a later load can
 * reject a stale cache.  Failures are only logged; the encode continues
 * without a cache file. */
static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
{
    uint8_t *binary = NULL;
    size_t size = 0;

    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
    if( !fp )
    {
        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
        return;
    }

    cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
    if( status != CL_SUCCESS || !size )
    {
        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
        goto fail;
    }

    /* CHECKED_MALLOC jumps to fail on allocation failure, which still closes fp */
    CHECKED_MALLOC( binary, size );

    /* CL_PROGRAM_BINARIES expects an array of buffer pointers (one per device) */
    status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
        goto fail;
    }

    fputs( devname, fp );
    fputc( '\n', fp );
    fputs( devvendor, fp );
    fputc( '\n', fp );
    fputs( driverversion, fp );
    fputc( '\n', fp );
    fputs( x264_opencl_source_hash, fp );
    fputc( '\n', fp );
    fwrite( binary, 1, size, fp );

fail:
    fclose( fp );
    x264_free( binary );
}
/* The OpenCL source under common/opencl will be merged into common/oclobj.h by
 * the Makefile.  It defines a x264_opencl_source byte array which we will pass
 * to clCreateProgramWithSource().  We also attempt to use a cache file for the
 * compiled binary, stored in the current working folder.
 *
 * Returns the built program on success.  Returns NULL if the device cannot be
 * queried, switchable graphics are detected on an AMD device, or the program
 * cannot be created or built; in the build-failure case the build log is
 * written to x264_kernel_build_log.txt when it can be retrieved.  Any program
 * object created along the way is released before returning NULL. */
static cl_program x264_opencl_compile( x264_t *h )
{
    cl_program program = NULL;
    cl_int status;
    char *build_log = NULL;
    char devname[64];
    char devvendor[64];
    char driverversion[64];

    status  = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
    status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(devvendor), devvendor, NULL );
    status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
    if( status != CL_SUCCESS )
        return NULL;

    // Most AMD GPUs have vector registers
    int vectorize = !strcmp( devvendor, "Advanced Micro Devices, Inc." );
    h->opencl.b_device_AMD_SI = 0;

    if( vectorize )
    {
        /* Disable OpenCL on Intel/AMD switchable graphics devices */
        if( x264_detect_switchable_graphics() )
        {
            x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
            return NULL;
        }

        /* Detect AMD SouthernIsland or newer device (single-width registers) */
        cl_uint simdwidth = 4;
        status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
        if( status == CL_SUCCESS && simdwidth == 1 )
        {
            vectorize = 0;
            h->opencl.b_device_AMD_SI = 1;
        }
    }

    x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );

    program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
    if( !program )
    {
        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
        const char *strptr = (const char*)x264_opencl_source;
        size_t size = sizeof(x264_opencl_source);
        program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
        if( status != CL_SUCCESS || !program )
        {
            x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
            goto fail;
        }
    }

    /* Build the program binary for the OpenCL device */
    const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
    status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
    if( status == CL_SUCCESS )
    {
        x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
        return program;
    }

    /* Compile failure, should not happen with production code. */

    /* first call only asks for the length of the build log */
    size_t build_log_len = 0;
    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
        goto fail;
    }

    /* plain x264_malloc (not CHECKED_MALLOC) so the failure can be reported */
    build_log = x264_malloc( build_log_len );
    if( !build_log )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
        goto fail;
    }

    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
        goto fail;
    }

    FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
    if( lg )
    {
        fwrite( build_log, 1, build_log_len, lg );
        fclose( lg );
        x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
    }

fail:
    x264_free( build_log );
    /* the caller only ever sees NULL from here, so nobody else can release it */
    if( program )
        clReleaseProgram( program );
    return NULL;
}
/* Release all lookahead kernels, the lookahead program and the shared device
 * buffers held in h->opencl.
 *
 * Every handle is reset to NULL after it is released, which makes the function
 * safe to call more than once: it is invoked both from the failure paths of
 * x264_opencl_init_lookahead() and from x264_opencl_free(). */
static void x264_opencl_free_lookahead( x264_t *h )
{
#define RELEASE( a, f ) do { if( a ) { f( a ); a = NULL; } } while( 0 )
    RELEASE( h->opencl.intra_kernel, clReleaseKernel );
    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel );
    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel );
    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel );
    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel );
    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel );
    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel );
    RELEASE( h->opencl.memset_kernel, clReleaseKernel );
    RELEASE( h->opencl.hme_kernel, clReleaseKernel );
    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel );
    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel );
    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel );

    RELEASE( h->opencl.lookahead_program, clReleaseProgram );

    RELEASE( h->opencl.row_satds[0], clReleaseMemObject );
    RELEASE( h->opencl.row_satds[1], clReleaseMemObject );
    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject );
    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject );
    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject );
    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject );
    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject );
    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject );
    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject );
    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject );
    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject );
    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject );
    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject );
    /* the mapping obtained from page_locked_buffer is no longer usable */
    h->opencl.page_locked_ptr = NULL;
    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject );
    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject );
#undef RELEASE
}
/* Compile the lookahead program, create one kernel object per entry point and
 * allocate and map the page-locked transfer buffer.
 *
 * Returns 0 on success.  Returns -1 when lookahead is disabled
 * (rc.i_lookahead == 0) or when any OpenCL call fails; on OpenCL failure all
 * lookahead resources acquired so far are released again. */
int x264_opencl_init_lookahead( x264_t *h )
{
    if( !h->param.rc.i_lookahead )
        return -1;

    /* kernelnames[i] is the entry point stored into *kernels[i]; the two
     * tables must stay in the same order */
    static const char * const kernelnames[] = {
        "mb_intra_cost_satd_8x8",
        "sum_intra_cost",
        "downscale_hpel",
        "downscale1",
        "downscale2",
        "memset_int16",
        "weightp_scaled_images",
        "weightp_hpel",
        "hierarchical_motion",
        "subpel_refine",
        "mode_selection",
        "sum_inter_cost"
    };
    cl_kernel *kernels[] = {
        &h->opencl.intra_kernel,
        &h->opencl.rowsum_intra_kernel,
        &h->opencl.downscale_hpel_kernel,
        &h->opencl.downscale_kernel1,
        &h->opencl.downscale_kernel2,
        &h->opencl.memset_kernel,
        &h->opencl.weightp_scaled_images_kernel,
        &h->opencl.weightp_hpel_kernel,
        &h->opencl.hme_kernel,
        &h->opencl.subpel_refine_kernel,
        &h->opencl.mode_select_kernel,
        &h->opencl.rowsum_inter_kernel
    };
    cl_int status;

    h->opencl.lookahead_program = x264_opencl_compile( h );
    if( !h->opencl.lookahead_program )
    {
        x264_opencl_free_lookahead( h );
        return -1;
    }

    for( int i = 0; i < (int)ARRAY_SIZE(kernelnames); i++ )
    {
        *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
        if( status != CL_SUCCESS )
        {
            x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
            x264_opencl_free_lookahead( h );
            return -1;
        }
    }

    h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
        x264_opencl_free_lookahead( h );
        return -1;
    }
    /* blocking map: the host pointer is valid as soon as the call returns */
    h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                                                    0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
        x264_opencl_free_lookahead( h );
        return -1;
    }

    return 0;
}
/* Error callback registered with clCreateContext(); user_data carries the
 * encoder handle.  The OpenCL context is treated as unusable after any
 * notification: OpenCL is switched off, the fatal-error flag is raised and
 * both the runtime's message and an abort notice are logged. */
static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
{
    x264_t *encoder = (x264_t*)user_data;

    /* stop all further use of the context before doing anything else */
    encoder->param.b_opencl = 0;
    encoder->opencl.b_fatal_error = 1;

    x264_log( encoder, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
    x264_log( encoder, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
}
/* Pick an OpenCL GPU device, create its context and command queue, then
 * initialize the lookahead kernels.
 *
 * Platforms and their GPU devices are scanned in enumeration order.  A device
 * qualifies when it reports image support and offers both the CL_R /
 * CL_UNSIGNED_INT32 and CL_RGBA / CL_UNSIGNED_INT8 2D image formats.  If
 * h->param.opencl_device_id is set only that device is considered; otherwise
 * the first h->param.i_opencl_device qualifying devices are skipped (the
 * counter in h->param is decremented as they are passed over).
 *
 * Returns 0 on success and -1 if no usable device was found or the lookahead
 * initialization failed. */
int x264_opencl_init( x264_t *h )
{
    cl_int status;
    cl_uint numPlatforms = 0;
    int ret = -1;

    status = clGetPlatformIDs( 0, NULL, &numPlatforms );
    if( status != CL_SUCCESS || numPlatforms == 0 )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
        return -1;
    }
    cl_platform_id *platforms = x264_malloc( numPlatforms * sizeof(cl_platform_id) );
    if( !platforms )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate platform list\n");
        return -1;
    }
    status = clGetPlatformIDs( numPlatforms, platforms, NULL );
    if( status != CL_SUCCESS )
    {
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
        x264_free( platforms );
        return -1;
    }

    /* Select the first OpenCL platform with a GPU device that supports our
     * required image (texture) formats */
    for( cl_uint i = 0; i < numPlatforms; ++i )
    {
        cl_uint gpu_count = 0;
        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
        if( status != CL_SUCCESS || !gpu_count )
            continue;

        cl_device_id *devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
        if( !devices )
            continue;

        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
        if( status != CL_SUCCESS )
        {
            x264_free( devices );
            continue;
        }

        /* Find a GPU device that supports our image formats */
        for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
        {
            h->opencl.device = devices[gpu];

            /* if the user has specified an exact device ID, skip all other
             * GPUs.  If this device matches, allow it to continue through the
             * checks for supported images, etc. */
            if( h->param.opencl_device_id && devices[gpu] != (cl_device_id) h->param.opencl_device_id )
                continue;

            /* a failed query must count as "no image support" rather than
             * leaving the flag uninitialized */
            cl_bool image_support = CL_FALSE;
            status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
            if( status != CL_SUCCESS || !image_support )
                continue;

            cl_context context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
            if( status != CL_SUCCESS )
                continue;

            cl_uint imagecount = 0;
            clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
            if( !imagecount )
            {
                clReleaseContext( context );
                continue;
            }

            cl_image_format *imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
            if( !imageType )
            {
                clReleaseContext( context );
                continue;
            }

            /* do not scan the list unless the runtime actually filled it in */
            status = clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
            if( status != CL_SUCCESS )
            {
                x264_free( imageType );
                clReleaseContext( context );
                continue;
            }

            int b_has_r = 0;
            int b_has_rgba = 0;
            for( cl_uint j = 0; j < imagecount; j++ )
            {
                if( imageType[j].image_channel_order == CL_R &&
                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
                    b_has_r = 1;
                else if( imageType[j].image_channel_order == CL_RGBA &&
                         imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
                    b_has_rgba = 1;
            }
            x264_free( imageType );
            if( !b_has_r || !b_has_rgba )
            {
                char devname[64];
                status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
                if( status == CL_SUCCESS )
                {
                    /* emit warning if we are discarding the user's explicit choice */
                    int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
                    x264_log( h, level, "OpenCL: %s does not support required image formats\n", devname);
                }
                clReleaseContext( context );
                continue;
            }

            /* user selection of GPU device, skip N first matches */
            if( h->param.i_opencl_device )
            {
                h->param.i_opencl_device--;
                clReleaseContext( context );
                continue;
            }

            h->opencl.queue = clCreateCommandQueue( context, h->opencl.device, 0, &status );
            if( status != CL_SUCCESS )
            {
                clReleaseContext( context );
                continue;
            }

            h->opencl.context = context;

            ret = 0;
            break;
        }

        x264_free( devices );

        if( !ret )
            break;
    }

    x264_free( platforms );

    /* default cache file name when the user did not supply one */
    if( !h->param.psz_clbin_file )
        h->param.psz_clbin_file = "x264_lookahead.clbin";

    if( ret )
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n");
    else
        ret = x264_opencl_init_lookahead( h );

    return ret;
}
/* Release every OpenCL memory object attached to a frame.  Handles that were
 * never created (NULL) are skipped. */
void x264_opencl_frame_delete( x264_frame_t *frame )
{
    /* one image per lowres scale level */
    for( int scale = 0; scale < NUM_IMAGE_SCALES; scale++ )
        if( frame->opencl.scaled_image2Ds[scale] )
            clReleaseMemObject( frame->opencl.scaled_image2Ds[scale] );

    /* remaining per-frame objects, released in declaration order below */
    cl_mem per_frame[] =
    {
        frame->opencl.luma_hpel,
        frame->opencl.inv_qscale_factor,
        frame->opencl.intra_cost,
        frame->opencl.lowres_mvs0,
        frame->opencl.lowres_mvs1,
        frame->opencl.lowres_mv_costs0,
        frame->opencl.lowres_mv_costs1
    };
    for( int k = 0; k < (int)(sizeof(per_frame) / sizeof(per_frame[0])); k++ )
        if( per_frame[k] )
            clReleaseMemObject( per_frame[k] );
}
/* Tear down the encoder's OpenCL state: wait for the command queue to finish,
 * release the lookahead resources, then release the queue and the context.
 * A queue or context that was never created is skipped. */
void x264_opencl_free( x264_t *h )
{
    int b_have_queue = !!h->opencl.queue;

    /* let outstanding work finish before its kernels and buffers go away */
    if( b_have_queue )
        clFinish( h->opencl.queue );

    x264_opencl_free_lookahead( h );

    if( b_have_queue )
        clReleaseCommandQueue( h->opencl.queue );

    if( h->opencl.context )
        clReleaseContext( h->opencl.context );
}
/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
* we consult AMD's ADL interface to detect this situation and disable
* OpenCL on these machines (Linux and Windows) */
/* Outside Windows there is no __stdcall calling convention and no HINSTANCE
 * type, so define them away / substitute a plain pointer to let the
 * declarations below compile unchanged. */
#ifndef _WIN32
#define __stdcall
#define HINSTANCE void *
#endif
/* Function-pointer types for the ADL entry points; the names follow the ADL
 * functions they are meant to hold. */
typedef void* ( __stdcall *ADL_MAIN_MALLOC_CALLBACK )( int );
typedef int ( *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int );
typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* );
typedef int ( *ADL_POWERXPRESS_SCHEME_GET ) ( int, int *, int *, int * );
typedef int ( *ADL_MAIN_CONTROL_DESTROY )();
/* Local copies of the two ADL constants used by this file
 * NOTE(review): values presumably mirror the ADL SDK headers - confirm */
#define ADL_OK 0
#define ADL_PX_SCHEME_DYNAMIC 2
/* Allocator with the ADL_MAIN_MALLOC_CALLBACK signature that forwards the
 * requested size to x264_malloc(). */
void* __stdcall adl_malloc_wrapper( int iSize )
{
    void *mem = x264_malloc( iSize );
    return mem;
}
static int x264_detect_switchable_graphics()
{
ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get;
ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
HINSTANCE hDLL;
int ret = 0;