	// Module header: this file is written against PTX ISA version 2.2 and is
	// compiled for the sm_20 target.
	.version 2.2
	.target sm_20
	// compiled with ../../../External/3rdParty/NVIDIA/CUDA/win/bin/../open64/lib//be.exe
	// nvopencc 3.2 built on 2010-11-04

	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])

	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)

	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)

	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)

	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])

	.visible .func (.param .f32 __cudaretf__Z8fromsRGBf) _Z8fromsRGBf (.param .f32 __cudaparmf1__Z8fromsRGBf)

	.visible .func (.param .f32 __cudaretf__Z6tosRGBf) _Z6tosRGBf (.param .f32 __cudaparmf1__Z6tosRGBf)

	.visible .func (.param .align 16 .b8 __cudaretf__Z17cuda_kernel_keyer6float4[16]) _Z17cuda_kernel_keyer6float4 (.param .align 16 .b8 __cudaparmf1__Z17cuda_kernel_keyer6float4[16])

	.visible .func (.param .f32 __cudaretf__Z10ReadMemoryPfi17DevicePixelFormat) _Z10ReadMemoryPfi17DevicePixelFormat (.param .u64 __cudaparmf1__Z10ReadMemoryPfi17DevicePixelFormat, .param .s32 __cudaparmf2__Z10ReadMemoryPfi17DevicePixelFormat, .param .u32 __cudaparmf3__Z10ReadMemoryPfi17DevicePixelFormat)

	.visible .func _Z6MinRowILi3EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi3EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi3EEvP6float2Pf)

	.visible .func _Z6MinRowILi2EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi2EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi2EEvP6float2Pf)

	.visible .func _Z6MinRowILi1EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi1EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi1EEvP6float2Pf)

	.visible .func _Z6MinRowILi0EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi0EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi0EEvP6float2Pf)

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi3EE6float2PS0_[8]) _Z9MinColumnILi3EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi3EE6float2PS0_)

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi2EE6float2PS0_[8]) _Z9MinColumnILi2EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi2EE6float2PS0_)

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi1EE6float2PS0_[8]) _Z9MinColumnILi1EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi1EE6float2PS0_)

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi0EE6float2PS0_[8]) _Z9MinColumnILi0EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi0EE6float2PS0_)

	.visible .func _Z6SumRowILi3EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi3EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi3EEvP6float2Pf)

	.visible .func _Z6SumRowILi2EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi2EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi2EEvP6float2Pf)

	.visible .func _Z6SumRowILi1EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi1EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi1EEvP6float2Pf)

	.visible .func _Z6SumRowILi0EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi0EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi0EEvP6float2Pf)

	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi3EEfP6float2f) _Z9SumColumnILi3EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi3EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi3EEfP6float2f)

	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi2EEfP6float2f) _Z9SumColumnILi2EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi2EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi2EEfP6float2f)

	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi1EEfP6float2f) _Z9SumColumnILi1EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi1EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi1EEfP6float2f)

	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi0EEfP6float2f) _Z9SumColumnILi0EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi0EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi0EEfP6float2f)

	//-----------------------------------------------------------
	// Compiling C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003d3c_00000000-11_keyer_Kernels.cpp3.i (C:/Users/dvaeng/AppData/Local/Temp/ccBI#.a16064)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	"C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003d3c_00000000-10_keyer_Kernels.cudafe2.gpu"
	.file	2	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/PixelFormat.h"
	.file	3	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/PixelRGB.h"
	.file	4	"c:\Mulder64\shared\adobe\MediaCore\Display\Inc\CUDA/Effects/Keyer_Kernels.h"
	.file	5	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
	.file	6	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\crt/device_runtime.h"
	.file	7	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\host_defines.h"
	.file	8	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\builtin_types.h"
	.file	9	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_types.h"
	.file	10	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\driver_types.h"
	.file	11	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_types.h"
	.file	12	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_types.h"
	.file	13	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\vector_types.h"
	.file	14	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\builtin_types.h"
	.file	15	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\host_defines.h"
	.file	16	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\device_launch_parameters.h"
	.file	17	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\crt\storage_class.h"
	.file	18	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
	.file	19	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/Utils.h"
	.file	20	"c:/Mulder64/shared/adobe/MediaCore/Display/Src/CUDA/Effects/keyer_Kernels.cu"
	.file	21	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\common_functions.h"
	.file	22	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions.h"
	.file	23	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_constants.h"
	.file	24	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_functions.h"
	.file	25	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_11_atomic_functions.h"
	.file	26	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_12_atomic_functions.h"
	.file	27	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_13_double_functions.h"
	.file	28	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_atomic_functions.h"
	.file	29	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_intrinsics.h"
	.file	30	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_functions.h"
	.file	31	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_fetch_functions.h"
	.file	32	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions_dbl_ptx3.h"


	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)
	{
	// Returns the low 32 bits of the product of the two 32-bit arguments.
	.reg .s32 %lhs, %rhs, %prod;
	.loc	19	60	0
$LDWbegin__Z15IntegerMultiplyii:
	ld.param.s32 	%lhs, [__cudaparmf1__Z15IntegerMultiplyii];
	ld.param.s32 	%rhs, [__cudaparmf2__Z15IntegerMultiplyii];
	.loc	19	64	0
	mul.lo.s32 	%prod, %lhs, %rhs;
	st.param.s32 	[__cudaretf__Z15IntegerMultiplyii], %prod;
	ret;
$LDWend__Z15IntegerMultiplyii:
	} // _Z15IntegerMultiplyii

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()
	{
	// Returns ctaid.x * ntid.x + tid.x (low 32 bits), i.e. the calling
	// thread's index along x across the whole grid.
	.reg .u32 %blk, %dim, %thr, %gx;
	.loc	19	73	0
$LDWbegin__Z17Standard2DKernelXv:
	.loc	19	74	0
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%dim, %ntid.x;
	mov.u32 	%thr, %tid.x;
	// Fused multiply-add; the low 32 bits match a separate mul.lo + add.
	mad.lo.s32 	%gx, %blk, %dim, %thr;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelXv], %gx;
	ret;
$LDWend__Z17Standard2DKernelXv:
	} // _Z17Standard2DKernelXv

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()
	{
	// Returns ctaid.y * ntid.y + tid.y (low 32 bits), i.e. the calling
	// thread's index along y across the whole grid.
	.reg .u32 %blk, %dim, %thr, %gy;
	.loc	19	77	0
$LDWbegin__Z17Standard2DKernelYv:
	.loc	19	78	0
	mov.u32 	%blk, %ctaid.y;
	mov.u32 	%dim, %ntid.y;
	mov.u32 	%thr, %tid.y;
	// Fused multiply-add; the low 32 bits match a separate mul.lo + add.
	mad.lo.s32 	%gy, %blk, %dim, %thr;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelYv], %gy;
	ret;
$LDWend__Z17Standard2DKernelYv:
	} // _Z17Standard2DKernelYv

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])
	{
	// Converts four 16-bit half-precision bit patterns (8-byte argument,
	// components at byte offsets 0/2/4/6) into four f32 values written to
	// the 16-byte return slot at offsets 0/4/8/12, in the same order.
	.reg .b16 %h<4>;
	.reg .f32 %f<4>;
	.loc	19	86	0
$LDWbegin__Z13Half4ToFloat47ushort4:
	// The raw 16-bit patterns go straight into 16-bit registers, so no
	// widening/chopping round trip is needed before the conversion.
	ld.param.b16 	%h0, [__cudaparmf1__Z13Half4ToFloat47ushort4+0];
	ld.param.b16 	%h1, [__cudaparmf1__Z13Half4ToFloat47ushort4+2];
	ld.param.b16 	%h2, [__cudaparmf1__Z13Half4ToFloat47ushort4+4];
	ld.param.b16 	%h3, [__cudaparmf1__Z13Half4ToFloat47ushort4+6];
	.loc	19	87	0
	cvt.ftz.f32.f16 	%f0, %h0;
	cvt.ftz.f32.f16 	%f1, %h1;
	cvt.ftz.f32.f16 	%f2, %h2;
	cvt.ftz.f32.f16 	%f3, %h3;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+0], %f0;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+4], %f1;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+8], %f2;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+12], %f3;
	ret;
$LDWend__Z13Half4ToFloat47ushort4:
	} // _Z13Half4ToFloat47ushort4

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])
	{
	// Converts four f32 values (16-byte argument, offsets 0/4/8/12) into
	// four half-precision bit patterns using round-to-nearest-even with
	// flush-to-zero, written to the 8-byte return slot at offsets 0/2/4/6.
	.reg .f32 %f<4>;
	.reg .b16 %h<4>;
	.loc	19	95	0
$LDWbegin__Z13Float4ToHalf46float4:
	ld.param.f32 	%f0, [__cudaparmf1__Z13Float4ToHalf46float4+0];
	ld.param.f32 	%f1, [__cudaparmf1__Z13Float4ToHalf46float4+4];
	ld.param.f32 	%f2, [__cudaparmf1__Z13Float4ToHalf46float4+8];
	ld.param.f32 	%f3, [__cudaparmf1__Z13Float4ToHalf46float4+12];
	.loc	19	96	0
	// Each result lands in a 16-bit register and is stored as-is, so only
	// the 16 converted bits ever reach the return slot.
	cvt.rn.ftz.f16.f32 	%h0, %f0;
	cvt.rn.ftz.f16.f32 	%h1, %f1;
	cvt.rn.ftz.f16.f32 	%h2, %f2;
	cvt.rn.ftz.f16.f32 	%h3, %f3;
	st.param.b16 	[__cudaretf__Z13Float4ToHalf46float4+0], %h0;
	st.param.b16 	[__cudaretf__Z13Float4ToHalf46float4+2], %h1;
	st.param.b16 	[__cudaretf__Z13Float4ToHalf46float4+4], %h2;
	st.param.b16 	[__cudaretf__Z13Float4ToHalf46float4+6], %h3;
	ret;
$LDWend__Z13Float4ToHalf46float4:
	} // _Z13Float4ToHalf46float4

	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)
	{
	// Mixes three 32-bit words in memory, addressed by the three pointer
	// arguments (called A, B, C below), through nine subtract/subtract/
	// shift-xor rounds. Returns the final value written to *C.
	//
	// Every intermediate result is stored back and every operand is
	// re-loaded right before use. That exact load/store order is kept
	// here - presumably it matters when the pointers alias each other.
	//
	//   %acc   running value of the word being updated in this round
	//   %opnd  most recently loaded operand (also reused for the shifted copy)
	.reg .u64 %pa, %pb, %pc;
	.reg .u32 %acc, %opnd;
	.loc	19	138	0
$LDWbegin__Z4Mix3RjS_S_:
	ld.param.u64 	%pa, [__cudaparmf1__Z4Mix3RjS_S_];
	ld.param.u64 	%pb, [__cudaparmf2__Z4Mix3RjS_S_];
	ld.param.u64 	%pc, [__cudaparmf3__Z4Mix3RjS_S_];
	.loc	19	139	0
	// Round 1: A -= B; A -= C; A ^= C >> 13
	ld.u32 	%acc, [%pa];
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	shr.u32 	%opnd, %opnd, 13;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	.loc	19	140	0
	// Round 2: B -= C; B -= A; B ^= A << 8
	ld.u32 	%acc, [%pb];
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	shl.b32 	%opnd, %opnd, 8;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	.loc	19	141	0
	// Round 3: C -= A; C -= B; C ^= B >> 13
	ld.u32 	%acc, [%pc];
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	shr.u32 	%opnd, %opnd, 13;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	.loc	19	142	0
	// Round 4: A -= B; A -= C; A ^= C >> 12
	ld.u32 	%acc, [%pa];
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	shr.u32 	%opnd, %opnd, 12;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	.loc	19	143	0
	// Round 5: B -= C; B -= A; B ^= A << 16
	ld.u32 	%acc, [%pb];
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	shl.b32 	%opnd, %opnd, 16;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	.loc	19	144	0
	// Round 6: C -= A; C -= B; C ^= B >> 5
	ld.u32 	%acc, [%pc];
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	shr.u32 	%opnd, %opnd, 5;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	.loc	19	145	0
	// Round 7: A -= B; A -= C; A ^= C >> 3
	ld.u32 	%acc, [%pa];
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	ld.u32 	%opnd, [%pc];
	shr.u32 	%opnd, %opnd, 3;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pa], %acc;
	.loc	19	146	0
	// Round 8: B -= C; B -= A; B ^= A << 10
	ld.u32 	%acc, [%pb];
	ld.u32 	%opnd, [%pc];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	ld.u32 	%opnd, [%pa];
	shl.b32 	%opnd, %opnd, 10;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pb], %acc;
	.loc	19	147	0
	// Round 9: C -= A; C -= B; C ^= B >> 15
	ld.u32 	%acc, [%pc];
	ld.u32 	%opnd, [%pa];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	sub.u32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	ld.u32 	%opnd, [%pb];
	shr.u32 	%opnd, %opnd, 15;
	xor.b32 	%acc, %acc, %opnd;
	st.u32 	[%pc], %acc;
	.loc	19	148	0
	// The value just stored to *C is also the return value.
	st.param.u32 	[__cudaretf__Z4Mix3RjS_S_], %acc;
	ret;
$LDWend__Z4Mix3RjS_S_:
	} // _Z4Mix3RjS_S_

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)
	{
	// Derives a small pseudo-random integer from the 32-bit seed argument.
	// Two multiply-add results are formed from the seed (all arithmetic
	// modulo 2^32); bits 16..23 of each are extracted, the first byte is
	// shifted left by 7 and XORed with the second, so the result fits in
	// 15 bits.
	.reg .u32 %seed, %upper, %lower, %rnd;
	.loc	19	152	0
$LDWbegin__Z4Randj:
	ld.param.u32 	%seed, [__cudaparmf1__Z4Randj];
	.loc	19	163	0
	// upper = (((seed * 1103515245 + 12345) >> 16) & 255) << 7
	mad.lo.u32 	%upper, %seed, 1103515245, 12345;
	shr.u32 	%upper, %upper, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	// lower = ((seed * -1029531031 - 740551042) >> 16) & 255
	mad.lo.u32 	%lower, %seed, -1029531031, -740551042;
	shr.u32 	%lower, %lower, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%rnd, %upper, %lower;
	st.param.s32 	[__cudaretf__Z4Randj], %rnd;
	ret;
$LDWend__Z4Randj:
	} // _Z4Randj

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)
	{
	// Mixes the three 32-bit arguments (a, b, c) through nine subtract/
	// subtract/shift-xor rounds held entirely in registers, then reduces
	// the final c to a 15-bit pseudo-random value.
	//
	//   %a, %b, %c  the three words, updated in place round by round
	//   %t          shifted copy used by the xor of the current round
	.reg .u32 %a, %b, %c, %t, %upper, %lower, %rnd;
	.loc	19	169	0
$LDWbegin__Z6Rand2Djjj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Djjj];
	ld.param.u32 	%b, [__cudaparmf2__Z6Rand2Djjj];
	ld.param.u32 	%c, [__cudaparmf3__Z6Rand2Djjj];
	.loc	19	139	0
	// a -= b; a -= c; a ^= c >> 13
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 13;
	xor.b32 	%a, %a, %t;
	.loc	19	140	0
	// b -= c; b -= a; b ^= a << 8
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 8;
	xor.b32 	%b, %b, %t;
	.loc	19	141	0
	// c -= a; c -= b; c ^= b >> 13
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 13;
	xor.b32 	%c, %c, %t;
	.loc	19	142	0
	// a -= b; a -= c; a ^= c >> 12
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 12;
	xor.b32 	%a, %a, %t;
	.loc	19	143	0
	// b -= c; b -= a; b ^= a << 16
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 16;
	xor.b32 	%b, %b, %t;
	.loc	19	144	0
	// c -= a; c -= b; c ^= b >> 5
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 5;
	xor.b32 	%c, %c, %t;
	.loc	19	145	0
	// a -= b; a -= c; a ^= c >> 3
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 3;
	xor.b32 	%a, %a, %t;
	.loc	19	146	0
	// b -= c; b -= a; b ^= a << 10
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 10;
	xor.b32 	%b, %b, %t;
	.loc	19	147	0
	// c -= a; c -= b; c ^= b >> 15
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 15;
	xor.b32 	%c, %c, %t;
	.loc	19	170	0
	// upper = (((c * 1103515245 + 12345) >> 16) & 255) << 7
	mad.lo.u32 	%upper, %c, 1103515245, 12345;
	shr.u32 	%upper, %upper, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	// lower = ((c * -1029531031 - 740551042) >> 16) & 255
	mad.lo.u32 	%lower, %c, -1029531031, -740551042;
	shr.u32 	%lower, %lower, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%rnd, %upper, %lower;
	st.param.s32 	[__cudaretf__Z6Rand2Djjj], %rnd;
	ret;
$LDWend__Z6Rand2Djjj:
	} // _Z6Rand2Djjj

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)
	{
	// Per-thread pseudo-random value. The seed argument (a) is mixed with
	// the thread's grid-wide x index (b = ctaid.x*ntid.x + tid.x) and y
	// index (c = ctaid.y*ntid.y + tid.y) through nine subtract/subtract/
	// shift-xor rounds; the final c is then reduced to a 15-bit value.
	//
	//   %a, %b, %c  the three words, updated in place round by round
	//   %t          shifted copy used by the xor of the current round
	.reg .u32 %a, %b, %c, %t, %blk, %dim, %thr, %upper, %lower, %rnd;
	.loc	19	175	0
$LDWbegin__Z6Rand2Dj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Dj];
	.loc	19	143	0
	// b = ctaid.x * ntid.x + tid.x
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%dim, %ntid.x;
	mov.u32 	%thr, %tid.x;
	mad.lo.s32 	%b, %blk, %dim, %thr;
	// c = ctaid.y * ntid.y + tid.y
	mov.u32 	%blk, %ctaid.y;
	mov.u32 	%dim, %ntid.y;
	mov.u32 	%thr, %tid.y;
	mad.lo.s32 	%c, %blk, %dim, %thr;
	// a -= b; a -= c; a ^= c >> 13
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 13;
	xor.b32 	%a, %a, %t;
	// b -= c; b -= a; b ^= a << 8
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 8;
	xor.b32 	%b, %b, %t;
	// c -= a; c -= b; c ^= b >> 13
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 13;
	xor.b32 	%c, %c, %t;
	// a -= b; a -= c; a ^= c >> 12
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 12;
	xor.b32 	%a, %a, %t;
	// b -= c; b -= a; b ^= a << 16
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 16;
	xor.b32 	%b, %b, %t;
	.loc	19	144	0
	// c -= a; c -= b; c ^= b >> 5
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 5;
	xor.b32 	%c, %c, %t;
	.loc	19	145	0
	// a -= b; a -= c; a ^= c >> 3
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 3;
	xor.b32 	%a, %a, %t;
	.loc	19	146	0
	// b -= c; b -= a; b ^= a << 10
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 10;
	xor.b32 	%b, %b, %t;
	.loc	19	147	0
	// c -= a; c -= b; c ^= b >> 15
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 15;
	xor.b32 	%c, %c, %t;
	.loc	19	176	0
	// upper = (((c * 1103515245 + 12345) >> 16) & 255) << 7
	mad.lo.u32 	%upper, %c, 1103515245, 12345;
	shr.u32 	%upper, %upper, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	// lower = ((c * -1029531031 - 740551042) >> 16) & 255
	mad.lo.u32 	%lower, %c, -1029531031, -740551042;
	shr.u32 	%lower, %lower, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%rnd, %upper, %lower;
	st.param.s32 	[__cudaretf__Z6Rand2Dj], %rnd;
	ret;
$LDWend__Z6Rand2Dj:
	} // _Z6Rand2Dj

	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)
	{
	// Loads one 8-byte element (four 16-bit components) from the array at
	// the pointer argument and returns it.
	//   element index = arg2 * arg4 + arg3   (32-bit signed arithmetic)
	//   byte offset   = index * 8            (sign-extended to 64 bits)
	// NOTE(review): arg2/arg3/arg4 look like (row, column, row pitch in
	// elements) - confirm against the callers.
	// The original also sign-extended the index into a 64-bit register
	// that was never read; that dead conversion is dropped here.
	.reg .s32 %arg2, %arg3, %arg4, %elem;
	.reg .u32 %c<4>;
	.reg .u64 %base, %byteoff, %addr;
	.loc	19	114	0
$LDWbegin__Z6Read2DI7ushort4ET_PKS1_iii:
	ld.param.u64 	%base, [__cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg2, [__cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii];
	.loc	19	115	0
	mad.lo.s32 	%elem, %arg2, %arg4, %arg3;
	mul.wide.s32 	%byteoff, %elem, 8;
	add.u64 	%addr, %base, %byteoff;
	// Single vector load of all four components.
	ld.v4.u16 	{%c0,%c1,%c2,%c3}, [%addr];
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+0], %c0;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+2], %c1;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+4], %c2;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+6], %c3;
	ret;
$LDWend__Z6Read2DI7ushort4ET_PKS1_iii:
	} // _Z6Read2DI7ushort4ET_PKS1_iii

	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)
	{
	// Loads one 16-byte element (four f32 components) from the array at
	// the pointer argument and returns it.
	//   element index = arg2 * arg4 + arg3   (32-bit signed arithmetic)
	//   byte offset   = index * 16           (sign-extended to 64 bits)
	// NOTE(review): arg2/arg3/arg4 look like (row, column, row pitch in
	// elements) - confirm against the callers.
	// The original also sign-extended the index into a 64-bit register
	// that was never read; that dead conversion is dropped here.
	.reg .s32 %arg2, %arg3, %arg4, %elem;
	.reg .f32 %c<4>;
	.reg .u64 %base, %byteoff, %addr;
	.loc	19	114	0
$LDWbegin__Z6Read2DI6float4ET_PKS1_iii:
	ld.param.u64 	%base, [__cudaparmf1__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg2, [__cudaparmf2__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z6Read2DI6float4ET_PKS1_iii];
	.loc	19	115	0
	mad.lo.s32 	%elem, %arg2, %arg4, %arg3;
	mul.wide.s32 	%byteoff, %elem, 16;
	add.u64 	%addr, %base, %byteoff;
	// Single vector load of all four components.
	ld.v4.f32 	{%c0,%c1,%c2,%c3}, [%addr];
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+0], %c0;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+4], %c1;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+8], %c2;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+12], %c3;
	ret;
$LDWend__Z6Read2DI6float4ET_PKS1_iii:
	} // _Z6Read2DI6float4ET_PKS1_iii

	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)
	{
	// Stores the 8-byte value argument (four 16-bit components) into the
	// array at the pointer argument.
	//   element index = arg3 * arg5 + arg4   (32-bit signed arithmetic)
	//   byte offset   = index * 8            (sign-extended to 64 bits)
	// NOTE(review): arg3/arg4/arg5 look like (row, column, row pitch in
	// elements) - confirm against the callers.
	// The original also sign-extended the index into a 64-bit register
	// that was never read; that dead conversion is dropped here.
	.reg .u32 %c<4>;
	.reg .s32 %arg3, %arg4, %arg5, %elem;
	.reg .u64 %base, %byteoff, %addr;
	.loc	19	125	0
$LDWbegin__Z7Write2DI7ushort4EvT_PS1_iii:
	ld.param.u16 	%c0, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+0];
	ld.param.u16 	%c1, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+2];
	ld.param.u16 	%c2, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+4];
	ld.param.u16 	%c3, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+6];
	ld.param.u64 	%base, [__cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg5, [__cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii];
	.loc	19	126	0
	mad.lo.s32 	%elem, %arg3, %arg5, %arg4;
	mul.wide.s32 	%byteoff, %elem, 8;
	add.u64 	%addr, %base, %byteoff;
	// Single vector store of all four components.
	st.v4.u16 	[%addr], {%c0,%c1,%c2,%c3};
	.loc	19	127	0
	ret;
$LDWend__Z7Write2DI7ushort4EvT_PS1_iii:
	} // _Z7Write2DI7ushort4EvT_PS1_iii

	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)
	{
	// Stores the 16-byte value argument (four f32 components) into the
	// array at the pointer argument.
	//   element index = arg3 * arg5 + arg4   (32-bit signed arithmetic)
	//   byte offset   = index * 16           (sign-extended to 64 bits)
	// NOTE(review): arg3/arg4/arg5 look like (row, column, row pitch in
	// elements) - confirm against the callers.
	// The original also sign-extended the index into a 64-bit register
	// that was never read; that dead conversion is dropped here.
	.reg .f32 %c<4>;
	.reg .s32 %arg3, %arg4, %arg5, %elem;
	.reg .u64 %base, %byteoff, %addr;
	.loc	19	125	0
$LDWbegin__Z7Write2DI6float4EvT_PS1_iii:
	ld.param.f32 	%c0, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+0];
	ld.param.f32 	%c1, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+4];
	ld.param.f32 	%c2, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+8];
	ld.param.f32 	%c3, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+12];
	ld.param.u64 	%base, [__cudaparmf2__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg5, [__cudaparmf5__Z7Write2DI6float4EvT_PS1_iii];
	.loc	19	126	0
	mad.lo.s32 	%elem, %arg3, %arg5, %arg4;
	mul.wide.s32 	%byteoff, %elem, 16;
	add.u64 	%addr, %base, %byteoff;
	// Single vector store of all four components.
	st.v4.f32 	[%addr], {%c0,%c1,%c2,%c3};
	.loc	19	127	0
	ret;
$LDWend__Z7Write2DI6float4EvT_PS1_iii:
	} // _Z7Write2DI6float4EvT_PS1_iii

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])
	{
	// Divides the first three components of a 4 x f32 pixel by its fourth
	// component (used as alpha) after clamping that component to [0, 1].
	// If the clamped alpha minus 8e-6 is <= 0, all four outputs are zero;
	// otherwise the outputs are the three scaled components followed by
	// the clamped alpha.
	.reg .f32 %c0, %c1, %c2, %c3;
	.reg .f32 %alpha, %biased, %inv;
	.reg .f32 %o0, %o1, %o2, %o3;
	.reg .pred %tiny;
	.loc	3	206	0
$LDWbegin__Z18UnpremultiplyPixel8PixelRGB:
	ld.param.f32 	%c0, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+0];
	ld.param.f32 	%c1, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+4];
	ld.param.f32 	%c2, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+8];
	ld.param.f32 	%c3, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %c3;
	add.ftz.f32 	%biased, %alpha, 0fb70637bd;    	// alpha + (-8e-006)
	setp.le.ftz.f32 	%tiny, %biased, 0f00000000;
	.loc	3	219	0
	// Start from the all-zero result; it is overwritten below only when
	// alpha is large enough to divide by.
	mov.f32 	%o0, 0f00000000;
	mov.f32 	%o1, 0f00000000;
	mov.f32 	%o2, 0f00000000;
	mov.f32 	%o3, 0f00000000;
	@%tiny bra 	$L_unpremul_store;
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv, %alpha;
	mul.ftz.f32 	%o2, %inv, %c2;
	.loc	3	214	0
	mul.ftz.f32 	%o1, %inv, %c1;
	.loc	3	215	0
	mul.ftz.f32 	%o0, %inv, %c0;
	mov.f32 	%o3, %alpha;
$L_unpremul_store:
	.loc	3	224	0
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+0], %o0;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+4], %o1;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+8], %o2;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+12], %o3;
	ret;
$LDWend__Z18UnpremultiplyPixel8PixelRGB:
	} // _Z18UnpremultiplyPixel8PixelRGB

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)
	{
	// Raises the argument to the power 2.2 using the approximate
	// ex2(2.2 * lg2(v)) sequence. When v < 0 the power is taken of -v and
	// the result is negated; otherwise v is used as-is.
	// Written without branches: both candidate values are formed and the
	// sign predicate selects between them.
	.reg .f32 %v, %negv, %mag, %lg, %scaled, %pw, %negpw, %out;
	.reg .pred %isneg;
	.loc	3	231	0
$LDWbegin__Z13ToLinearColorf:
	ld.param.f32 	%v, [__cudaparmf1__Z13ToLinearColorf];
	setp.lt.ftz.f32 	%isneg, %v, 0f00000000;
	.loc	3	234	0
	neg.ftz.f32 	%negv, %v;
	// mag = isneg ? -v : v   (bit copy of v on the non-negative path)
	selp.f32 	%mag, %negv, %v, %isneg;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%scaled, %lg, 0f400ccccd;     	// 2.2
	ex2.approx.ftz.f32 	%pw, %scaled;
	neg.ftz.f32 	%negpw, %pw;
	.loc	3	236	0
	// out = isneg ? -pw : pw
	selp.f32 	%out, %negpw, %pw, %isneg;
	st.param.f32 	[__cudaretf__Z13ToLinearColorf], %out;
	ret;
$LDWend__Z13ToLinearColorf:
	} // _Z13ToLinearColorf

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)
	{
	// Raises the argument to the power 0.454545 (about 1/2.2) using the
	// approximate ex2(0.454545 * lg2(v)) sequence. When v < 0 the power is
	// taken of -v and the result is negated; otherwise v is used as-is.
	// Written without branches: both candidate values are formed and the
	// sign predicate selects between them.
	.reg .f32 %v, %negv, %mag, %lg, %scaled, %pw, %negpw, %out;
	.reg .pred %isneg;
	.loc	3	239	0
$LDWbegin__Z15FromLinearColorf:
	ld.param.f32 	%v, [__cudaparmf1__Z15FromLinearColorf];
	setp.lt.ftz.f32 	%isneg, %v, 0f00000000;
	.loc	3	242	0
	neg.ftz.f32 	%negv, %v;
	// mag = isneg ? -v : v   (bit copy of v on the non-negative path)
	selp.f32 	%mag, %negv, %v, %isneg;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%scaled, %lg, 0f3ee8ba2e;     	// 0.454545
	ex2.approx.ftz.f32 	%pw, %scaled;
	neg.ftz.f32 	%negpw, %pw;
	.loc	3	244	0
	// out = isneg ? -pw : pw
	selp.f32 	%out, %negpw, %pw, %isneg;
	st.param.f32 	[__cudaretf__Z15FromLinearColorf], %out;
	ret;
$LDWend__Z15FromLinearColorf:
	} // _Z15FromLinearColorf

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])
	{
	// PremultiplyLinearizePixel(PixelRGB): takes a 16-byte argument read as four
	// f32 words (byte offsets 0, 4, 8, 12) and returns four f32 words.
	//   a      = saturate(word at +12)            (cvt.sat clamps to [0, 1])
	//   g(x)   = sign-preserving ex2(2.2 * lg2(|x|)), the same instruction
	//            sequence as _Z13ToLinearColorf (.loc 3 234 / 236)
	//   return = { g(w0)*a, g(w4)*a, g(w8)*a, a }
	// NOTE(review): which word is R/G/B/A is not visible from this code.
	.reg .f32 %f<47>;
	.reg .pred %p<5>;
	.loc	3	252	0
$LDWbegin__Z25PremultiplyLinearizePixel8PixelRGB:
	// Unpack the four argument words into %f2, %f4, %f6, %f8.
	ld.param.f32 	%f1, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+12];
	mov.f32 	%f8, %f7;
	.loc	3	254	0
	// %f9 = word +12 clamped to [0, 1]; used as the multiplier below.
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	.loc	3	255	0
	// --- word +0 -> %f16 : inlined 2.2 power curve ---
	mov.f32 	%f10, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p1, %f2, %f10;
	@!%p1 bra 	$Lt_16_4098;
	.loc	3	234	0
	// negative input: -ex2(2.2 * lg2(-x))
	neg.ftz.f32 	%f11, %f2;
	lg2.approx.ftz.f32 	%f12, %f11;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f15, %f14;
	neg.ftz.f32 	%f16, %f15;
	bra.uni 	$LDWendi___log2f_193_5;
$Lt_16_4098:
	.loc	3	236	0
	// non-negative input: ex2(2.2 * lg2(x))
	lg2.approx.ftz.f32 	%f17, %f2;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f16, %f19;
$LDWendi___log2f_193_5:
	.loc	3	256	0
	// --- word +4 -> %f26 : same curve ---
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f4, %f20;
	@!%p2 bra 	$Lt_16_4610;
	.loc	3	234	0
	neg.ftz.f32 	%f21, %f4;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_193_3;
$Lt_16_4610:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f27, %f4;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_193_3:
	.loc	3	257	0
	// --- word +8 -> %f36 : same curve ---
	mov.f32 	%f30, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f6, %f30;
	@!%p3 bra 	$Lt_16_5122;
	.loc	3	234	0
	neg.ftz.f32 	%f31, %f6;
	lg2.approx.ftz.f32 	%f32, %f31;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f35, %f34;
	neg.ftz.f32 	%f36, %f35;
	bra.uni 	$LDWendi___log2f_193_1;
$Lt_16_5122:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f37, %f6;
	mov.f32 	%f38, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f36, %f39;
$LDWendi___log2f_193_1:
	.loc	3	259	0
	// Scale each converted word by the clamped multiplier and write the result;
	// the multiplier itself is returned in word +12.
	mul.ftz.f32 	%f40, %f36, %f9;
	mul.ftz.f32 	%f41, %f26, %f9;
	mul.ftz.f32 	%f42, %f16, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+0], %f42;
	mov.f32 	%f43, %f41;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+4], %f43;
	mov.f32 	%f44, %f40;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+8], %f44;
	mov.f32 	%f45, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+12], %f45;
	ret;
$LDWend__Z25PremultiplyLinearizePixel8PixelRGB:
	} // _Z25PremultiplyLinearizePixel8PixelRGB

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])
	{
	// UnpremultiplyUnlinearizePixel(PixelRGB): takes a 16-byte argument read as
	// four f32 words (byte offsets 0, 4, 8, 12) and returns four f32 words.
	//   a = saturate(word at +12)
	//   if (a - 8e-6 <= 0): the three scaled words and the returned a are all 0
	//   else              : each of words +0/+4/+8 is multiplied by rcp.approx(a)
	//   h(x) = sign-preserving ex2(0.454545 * lg2(|x|)), the same instruction
	//          sequence as _Z15FromLinearColorf (.loc 3 242 / 244)
	//   return = { h(w0'), h(w4'), h(w8'), a-or-0 }
	// NOTE(review): which word is R/G/B/A is not visible from this code.
	.reg .f32 %f<53>;
	.reg .pred %p<6>;
	.loc	3	263	0
$LDWbegin__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	// Unpack the four argument words into %f2, %f4, %f6, %f8.
	ld.param.f32 	%f1, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12];
	mov.f32 	%f8, %f7;
	.loc	3	208	0
	// %f9 = clamped word +12; %f10 is the value returned in word +12.
	// The divide is skipped when the clamped value minus 8e-6 is <= 0.
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	mov.f32 	%f10, %f9;
	mov.f32 	%f11, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%f12, %f9, %f11;
	mov.f32 	%f13, 0f00000000;    	// 0
	setp.le.ftz.f32 	%p1, %f12, %f13;
	@%p1 bra 	$Lt_17_5122;
	.loc	3	213	0
	// Divisor large enough: multiply each word by the approximate reciprocal.
	rcp.approx.ftz.f32 	%f14, %f9;
	mul.ftz.f32 	%f15, %f14, %f6;
	.loc	3	214	0
	mul.ftz.f32 	%f16, %f14, %f4;
	.loc	3	215	0
	mul.ftz.f32 	%f17, %f14, %f2;
	bra.uni 	$Lt_17_4866;
$Lt_17_5122:
	.loc	3	219	0
	// Divisor too small: all four intermediate values are forced to 0.
	mov.f32 	%f15, 0f00000000;    	// 0
	mov.f32 	%f16, 0f00000000;    	// 0
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f10, 0f00000000;    	// 0
$Lt_17_4866:
	.loc	3	266	0
	// --- %f17 (from word +0) -> %f24 : inlined 0.454545 power curve ---
	mov.f32 	%f18, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f17, %f18;
	@!%p2 bra 	$Lt_17_5378;
	.loc	3	242	0
	// negative input: -ex2(0.454545 * lg2(-x))
	neg.ftz.f32 	%f19, %f17;
	lg2.approx.ftz.f32 	%f20, %f19;
	mov.f32 	%f21, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f22, %f20, %f21;
	ex2.approx.ftz.f32 	%f23, %f22;
	neg.ftz.f32 	%f24, %f23;
	bra.uni 	$LDWendi___log2f_194_5;
$Lt_17_5378:
	.loc	3	244	0
	// non-negative input: ex2(0.454545 * lg2(x))
	lg2.approx.ftz.f32 	%f25, %f17;
	mov.f32 	%f26, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f27, %f25, %f26;
	ex2.approx.ftz.f32 	%f24, %f27;
$LDWendi___log2f_194_5:
	.loc	3	267	0
	// --- %f16 (from word +4) -> %f34 : same curve ---
	mov.f32 	%f28, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f16, %f28;
	@!%p3 bra 	$Lt_17_5890;
	.loc	3	242	0
	neg.ftz.f32 	%f29, %f16;
	lg2.approx.ftz.f32 	%f30, %f29;
	mov.f32 	%f31, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f32, %f30, %f31;
	ex2.approx.ftz.f32 	%f33, %f32;
	neg.ftz.f32 	%f34, %f33;
	bra.uni 	$LDWendi___log2f_194_3;
$Lt_17_5890:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f35, %f16;
	mov.f32 	%f36, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f37, %f35, %f36;
	ex2.approx.ftz.f32 	%f34, %f37;
$LDWendi___log2f_194_3:
	.loc	3	268	0
	// --- %f15 (from word +8) -> %f44 : same curve ---
	mov.f32 	%f38, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p4, %f15, %f38;
	@!%p4 bra 	$Lt_17_6402;
	.loc	3	242	0
	neg.ftz.f32 	%f39, %f15;
	lg2.approx.ftz.f32 	%f40, %f39;
	mov.f32 	%f41, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f42, %f40, %f41;
	ex2.approx.ftz.f32 	%f43, %f42;
	neg.ftz.f32 	%f44, %f43;
	bra.uni 	$LDWendi___log2f_194_1;
$Lt_17_6402:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f45, %f15;
	mov.f32 	%f46, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f47, %f45, %f46;
	ex2.approx.ftz.f32 	%f44, %f47;
$LDWendi___log2f_194_1:
	.loc	3	269	0
	// Write the three converted words and the (possibly zeroed) clamped word +12.
	mov.f32 	%f48, %f24;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0], %f48;
	mov.f32 	%f49, %f34;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4], %f49;
	mov.f32 	%f50, %f44;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8], %f50;
	mov.f32 	%f51, %f10;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12], %f51;
	ret;
$LDWend__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	} // _Z29UnpremultiplyUnlinearizePixel8PixelRGB

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])
	{
	// PremultiplyLinearize(float4): takes a 16-byte argument read as four f32
	// words (byte offsets 0, 4, 8, 12) and returns four f32 words.
	//   a      = saturate(word at +12)            (cvt.sat clamps to [0, 1])
	//   g(x)   = sign-preserving ex2(2.2 * lg2(|x|)), the same instruction
	//            sequence as _Z13ToLinearColorf (.loc 3 234 / 236)
	//   return = { g(w0)*a, g(w4)*a, g(w8)*a, a }
	// The instruction stream matches _Z25PremultiplyLinearizePixel8PixelRGB apart
	// from symbol/label names and one extra .loc directive.
	.reg .f32 %f<47>;
	.reg .pred %p<5>;
	.loc	3	277	0
$LDWbegin__Z20PremultiplyLinearize6float4:
	// Unpack the four argument words into %f2, %f4, %f6, %f8.
	ld.param.f32 	%f1, [__cudaparmf1__Z20PremultiplyLinearize6float4+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z20PremultiplyLinearize6float4+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z20PremultiplyLinearize6float4+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z20PremultiplyLinearize6float4+12];
	mov.f32 	%f8, %f7;
	.loc	3	254	0
	// %f9 = word +12 clamped to [0, 1]; used as the multiplier below.
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	.loc	3	255	0
	// --- word +0 -> %f16 : inlined 2.2 power curve ---
	mov.f32 	%f10, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p1, %f2, %f10;
	@!%p1 bra 	$Lt_18_4098;
	.loc	3	234	0
	// negative input: -ex2(2.2 * lg2(-x))
	neg.ftz.f32 	%f11, %f2;
	lg2.approx.ftz.f32 	%f12, %f11;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f15, %f14;
	neg.ftz.f32 	%f16, %f15;
	bra.uni 	$LDWendi___log2f_195_5;
$Lt_18_4098:
	.loc	3	236	0
	// non-negative input: ex2(2.2 * lg2(x))
	lg2.approx.ftz.f32 	%f17, %f2;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f16, %f19;
$LDWendi___log2f_195_5:
	.loc	3	256	0
	// --- word +4 -> %f26 : same curve ---
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f4, %f20;
	@!%p2 bra 	$Lt_18_4610;
	.loc	3	234	0
	neg.ftz.f32 	%f21, %f4;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_195_3;
$Lt_18_4610:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f27, %f4;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_195_3:
	.loc	3	257	0
	// --- word +8 -> %f36 : same curve ---
	mov.f32 	%f30, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f6, %f30;
	@!%p3 bra 	$Lt_18_5122;
	.loc	3	234	0
	neg.ftz.f32 	%f31, %f6;
	lg2.approx.ftz.f32 	%f32, %f31;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f35, %f34;
	neg.ftz.f32 	%f36, %f35;
	bra.uni 	$LDWendi___log2f_195_1;
$Lt_18_5122:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f37, %f6;
	mov.f32 	%f38, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f36, %f39;
$LDWendi___log2f_195_1:
	.loc	3	259	0
	// Scale each converted word by the clamped multiplier and write the result;
	// the multiplier itself is returned in word +12.
	mul.ftz.f32 	%f40, %f36, %f9;
	mul.ftz.f32 	%f41, %f26, %f9;
	.loc	3	278	0
	mul.ftz.f32 	%f42, %f16, %f9;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+0], %f42;
	mov.f32 	%f43, %f41;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+4], %f43;
	mov.f32 	%f44, %f40;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+8], %f44;
	mov.f32 	%f45, %f9;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+12], %f45;
	ret;
$LDWend__Z20PremultiplyLinearize6float4:
	} // _Z20PremultiplyLinearize6float4

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])
	{
	// UnpremultiplyUnlinearize(float4): takes a 16-byte argument read as four
	// f32 words (byte offsets 0, 4, 8, 12) and returns four f32 words.
	//   a = saturate(word at +12)
	//   if (a - 8e-6 <= 0): the three scaled words and the returned a are all 0
	//   else              : each of words +0/+4/+8 is multiplied by rcp.approx(a)
	//   h(x) = sign-preserving ex2(0.454545 * lg2(|x|)), the same instruction
	//          sequence as _Z15FromLinearColorf (.loc 3 242 / 244)
	//   return = { h(w0'), h(w4'), h(w8'), a-or-0 }
	.reg .f32 %f<53>;
	.reg .pred %p<6>;
	.loc	3	284	0
$LDWbegin__Z24UnpremultiplyUnlinearize6float4:
	// Unpack the four argument words into %f2, %f4, %f6, %f8.
	ld.param.f32 	%f1, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+12];
	mov.f32 	%f8, %f7;
	.loc	3	208	0
	// %f9 = clamped word +12; %f10 is the value returned in word +12.
	// The divide is skipped when the clamped value minus 8e-6 is <= 0.
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	mov.f32 	%f10, %f9;
	mov.f32 	%f11, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%f12, %f9, %f11;
	mov.f32 	%f13, 0f00000000;    	// 0
	setp.le.ftz.f32 	%p1, %f12, %f13;
	@%p1 bra 	$Lt_19_5122;
	.loc	3	213	0
	// Divisor large enough: multiply each word by the approximate reciprocal.
	rcp.approx.ftz.f32 	%f14, %f9;
	mul.ftz.f32 	%f15, %f14, %f6;
	.loc	3	214	0
	mul.ftz.f32 	%f16, %f14, %f4;
	.loc	3	215	0
	mul.ftz.f32 	%f17, %f14, %f2;
	bra.uni 	$Lt_19_4866;
$Lt_19_5122:
	.loc	3	219	0
	// Divisor too small: all four intermediate values are forced to 0.
	mov.f32 	%f15, 0f00000000;    	// 0
	mov.f32 	%f16, 0f00000000;    	// 0
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f10, 0f00000000;    	// 0
$Lt_19_4866:
	.loc	3	266	0
	// --- %f17 (from word +0) -> %f24 : inlined 0.454545 power curve ---
	mov.f32 	%f18, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f17, %f18;
	@!%p2 bra 	$Lt_19_5378;
	.loc	3	242	0
	// negative input: -ex2(0.454545 * lg2(-x))
	neg.ftz.f32 	%f19, %f17;
	lg2.approx.ftz.f32 	%f20, %f19;
	mov.f32 	%f21, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f22, %f20, %f21;
	ex2.approx.ftz.f32 	%f23, %f22;
	neg.ftz.f32 	%f24, %f23;
	bra.uni 	$LDWendi___log2f_196_5;
$Lt_19_5378:
	.loc	3	244	0
	// non-negative input: ex2(0.454545 * lg2(x))
	lg2.approx.ftz.f32 	%f25, %f17;
	mov.f32 	%f26, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f27, %f25, %f26;
	ex2.approx.ftz.f32 	%f24, %f27;
$LDWendi___log2f_196_5:
	.loc	3	267	0
	// --- %f16 (from word +4) -> %f34 : same curve ---
	mov.f32 	%f28, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f16, %f28;
	@!%p3 bra 	$Lt_19_5890;
	.loc	3	242	0
	neg.ftz.f32 	%f29, %f16;
	lg2.approx.ftz.f32 	%f30, %f29;
	mov.f32 	%f31, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f32, %f30, %f31;
	ex2.approx.ftz.f32 	%f33, %f32;
	neg.ftz.f32 	%f34, %f33;
	bra.uni 	$LDWendi___log2f_196_3;
$Lt_19_5890:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f35, %f16;
	mov.f32 	%f36, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f37, %f35, %f36;
	ex2.approx.ftz.f32 	%f34, %f37;
$LDWendi___log2f_196_3:
	.loc	3	268	0
	// --- %f15 (from word +8) -> %f44 : same curve ---
	mov.f32 	%f38, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p4, %f15, %f38;
	@!%p4 bra 	$Lt_19_6402;
	.loc	3	242	0
	neg.ftz.f32 	%f39, %f15;
	lg2.approx.ftz.f32 	%f40, %f39;
	mov.f32 	%f41, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f42, %f40, %f41;
	ex2.approx.ftz.f32 	%f43, %f42;
	neg.ftz.f32 	%f44, %f43;
	bra.uni 	$LDWendi___log2f_196_1;
$Lt_19_6402:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f45, %f15;
	mov.f32 	%f46, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f47, %f45, %f46;
	ex2.approx.ftz.f32 	%f44, %f47;
$LDWendi___log2f_196_1:
	.loc	3	285	0
	// Write the three converted words and the (possibly zeroed) clamped word +12.
	mov.f32 	%f48, %f24;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+0], %f48;
	mov.f32 	%f49, %f34;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+4], %f49;
	mov.f32 	%f50, %f44;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+8], %f50;
	mov.f32 	%f51, %f10;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+12], %f51;
	ret;
$LDWend__Z24UnpremultiplyUnlinearize6float4:
	} // _Z24UnpremultiplyUnlinearize6float4

	.visible .func (.param .f32 __cudaretf__Z8fromsRGBf) _Z8fromsRGBf (.param .f32 __cudaparmf1__Z8fromsRGBf)
	{
	// fromsRGB(float): piecewise curve applied to the input clamped to [0, 1].
	//   c >  0.04045 : ex2(2.4 * lg2((c + 0.055) / 1.055))
	//   otherwise    : c / 12.92
	// Division, lg2 and ex2 are the .approx.ftz variants.
	.reg .f32 %c, %biased, %ratio, %lg, %scaled, %lin;
	.reg .pred %use_pow;
	.loc	20	31	0
$LDWbegin__Z8fromsRGBf:
	ld.param.f32 	%c, [__cudaparmf1__Z8fromsRGBf];
	.loc	20	33	0
	cvt.ftz.sat.f32.f32 	%c, %c;			// clamp to [0, 1]
	setp.gt.ftz.f32 	%use_pow, %c, 0f3d25aee6;	// c > 0.04045 ?
	@!%use_pow bra 	$Lt_20_1282;
	.loc	20	35	0
	// Power segment.
	add.ftz.f32 	%biased, %c, 0f3d6147ae;		// + 0.055
	div.approx.ftz.f32 	%ratio, %biased, 0f3f870a3d;	// / 1.055
	lg2.approx.ftz.f32 	%lg, %ratio;
	mul.ftz.f32 	%scaled, %lg, 0f4019999a;		// * 2.4
	ex2.approx.ftz.f32 	%lin, %scaled;
	bra.uni 	$Lt_20_1026;
$Lt_20_1282:
	.loc	20	37	0
	// Linear segment.
	div.approx.ftz.f32 	%lin, %c, 0f414eb852;	// / 12.92
$Lt_20_1026:
	.loc	20	39	0
	st.param.f32 	[__cudaretf__Z8fromsRGBf], %lin;
	ret;
$LDWend__Z8fromsRGBf:
	} // _Z8fromsRGBf

	.visible .func (.param .f32 __cudaretf__Z6tosRGBf) _Z6tosRGBf (.param .f32 __cudaparmf1__Z6tosRGBf)
	{
	// tosRGB(float): piecewise curve applied to the input clamped to [0, 1].
	//   c <  0.0031308 : c * 12.92
	//   otherwise      : 1.055 * ex2(0.41666 * lg2(c)) - 0.055   (single fma)
	// lg2 and ex2 are the .approx.ftz variants.
	// NOTE(review): the exponent constant 0f3ed55476 is 0.41666, slightly below
	// 1/2.4 (the reciprocal of the exponent used by _Z8fromsRGBf) - confirm intended.
	.reg .f32 %c, %lg, %scaled, %pw, %enc;
	.reg .pred %use_linear;
	.loc	20	48	0
$LDWbegin__Z6tosRGBf:
	ld.param.f32 	%c, [__cudaparmf1__Z6tosRGBf];
	.loc	20	50	0
	cvt.ftz.sat.f32.f32 	%c, %c;			// clamp to [0, 1]
	setp.lt.ftz.f32 	%use_linear, %c, 0f3b4d2e1c;	// c < 0.0031308 ?
	@!%use_linear bra 	$Lt_21_1282;
	.loc	20	52	0
	// Linear segment.
	mul.ftz.f32 	%enc, %c, 0f414eb852;		// * 12.92
	bra.uni 	$Lt_21_1026;
$Lt_21_1282:
	.loc	20	54	0
	// Power segment.
	lg2.approx.ftz.f32 	%lg, %c;
	mul.ftz.f32 	%scaled, %lg, 0f3ed55476;		// * 0.41666
	ex2.approx.ftz.f32 	%pw, %scaled;
	fma.rn.ftz.f32 	%enc, %pw, 0f3f870a3d, 0fbd6147ae;	// pw * 1.055 + (-0.055)
$Lt_21_1026:
	.loc	20	56	0
	st.param.f32 	[__cudaretf__Z6tosRGBf], %enc;
	ret;
$LDWend__Z6tosRGBf:
	} // _Z6tosRGBf
	// 164-byte block in the .const state space, 4-byte aligned.
	// _Z17cuda_kernel_keyer6float4 reads it with ld.const.f32 at byte offsets
	// between 0 and 160.
	// NOTE(review): the field layout/meaning of this block is not visible here;
	// see the declaring source for the structure definition.
	.const .align 4 .b8 p[164];

	.visible .func (.param .align 16 .b8 __cudaretf__Z17cuda_kernel_keyer6float4[16]) _Z17cuda_kernel_keyer6float4 (.param .align 16 .b8 __cudaparmf1__Z17cuda_kernel_keyer6float4[16])
	{
	// cuda_kernel_keyer(float4): per-pixel routine. Takes four f32 words (byte
	// offsets 0, 4, 8, 12) and returns four f32 words; all tuning values come
	// from the constant block `p`, read as f32 at fixed byte offsets.
	// Stages, in order:
	//   1. decode words +0/+4/+8 with the same piecewise curve as _Z8fromsRGBf
	//   2. three weighted sums of the decoded words (coefficients p+40..p+72);
	//      two of them are divided by the third (floored at 1e-37)
	//   3. offset the two ratios by p+148 / p+152, derive two scalar factors
	//      (%f80, %f81) from the offset's length and its dot product with
	//      (p+148, p+152)
	//   4. combine them with p+16..p+36 into a multiplier for word +12 (%f120)
	//      and an adjusted value %f115
	//   5. rebuild three values via weighted sums (p+76..p+120), encode each
	//      with the same piecewise curve as _Z6tosRGBf, and write the result
	// NOTE(review): colour-space meaning of the intermediate quantities and of
	// the individual `p` fields is not established by this code - confirm
	// against the .cu source before relying on any naming.
	.reg .f32 %f<197>;
	.reg .pred %p<10>;
	.loc	20	64	0
$LDWbegin__Z17cuda_kernel_keyer6float4:
	// Unpack the four argument words into %f2, %f4, %f6, %f8.
	ld.param.f32 	%f1, [__cudaparmf1__Z17cuda_kernel_keyer6float4+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z17cuda_kernel_keyer6float4+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z17cuda_kernel_keyer6float4+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z17cuda_kernel_keyer6float4+12];
	mov.f32 	%f8, %f7;
	.loc	20	77	0
	// --- Stage 1a: word +0 -> %f18 (clamp, then power or linear segment) ---
	cvt.ftz.sat.f32.f32 	%f9, %f2;
	mov.f32 	%f10, 0f3d25aee6;    	// 0.04045
	setp.gt.ftz.f32 	%p1, %f9, %f10;
	@!%p1 bra 	$Lt_22_6658;
	.loc	20	35	0
	// power segment: ex2(2.4 * lg2((c + 0.055) / 1.055))
	mov.f32 	%f11, 0f3d6147ae;    	// 0.055
	add.ftz.f32 	%f12, %f9, %f11;
	mov.f32 	%f13, 0f3f870a3d;    	// 1.055
	div.approx.ftz.f32 	%f14, %f12, %f13;
	lg2.approx.ftz.f32 	%f15, %f14;
	mov.f32 	%f16, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f17, %f15, %f16;
	ex2.approx.ftz.f32 	%f18, %f17;
	bra.uni 	$Lt_22_6402;
$Lt_22_6658:
	.loc	20	37	0
	// linear segment: c / 12.92
	mov.f32 	%f19, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f18, %f9, %f19;
$Lt_22_6402:
	.loc	20	78	0
	// --- Stage 1b: word +4 -> %f29 (same curve) ---
	cvt.ftz.sat.f32.f32 	%f20, %f4;
	mov.f32 	%f21, 0f3d25aee6;    	// 0.04045
	setp.gt.ftz.f32 	%p2, %f20, %f21;
	@!%p2 bra 	$Lt_22_7170;
	.loc	20	35	0
	mov.f32 	%f22, 0f3d6147ae;    	// 0.055
	add.ftz.f32 	%f23, %f20, %f22;
	mov.f32 	%f24, 0f3f870a3d;    	// 1.055
	div.approx.ftz.f32 	%f25, %f23, %f24;
	lg2.approx.ftz.f32 	%f26, %f25;
	mov.f32 	%f27, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f28, %f26, %f27;
	ex2.approx.ftz.f32 	%f29, %f28;
	bra.uni 	$Lt_22_6914;
$Lt_22_7170:
	.loc	20	37	0
	mov.f32 	%f30, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f29, %f20, %f30;
$Lt_22_6914:
	.loc	20	79	0
	// --- Stage 1c: word +8 -> %f40 (same curve) ---
	cvt.ftz.sat.f32.f32 	%f31, %f6;
	mov.f32 	%f32, 0f3d25aee6;    	// 0.04045
	setp.gt.ftz.f32 	%p3, %f31, %f32;
	@!%p3 bra 	$Lt_22_7682;
	.loc	20	35	0
	mov.f32 	%f33, 0f3d6147ae;    	// 0.055
	add.ftz.f32 	%f34, %f31, %f33;
	mov.f32 	%f35, 0f3f870a3d;    	// 1.055
	div.approx.ftz.f32 	%f36, %f34, %f35;
	lg2.approx.ftz.f32 	%f37, %f36;
	mov.f32 	%f38, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f40, %f39;
	bra.uni 	$Lt_22_7426;
$Lt_22_7682:
	.loc	20	37	0
	mov.f32 	%f41, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f40, %f31, %f41;
$Lt_22_7426:
	.loc	20	93	0
	// --- Stage 2: three weighted sums of (%f40, %f29, %f18) ---
	//   %f59 = p[40]*%f40 + p[44]*%f29 + p[48]*%f18
	//   %f55 = p[52]*%f40 + p[56]*%f29 + p[60]*%f18
	//   %f57 = p[64]*%f40 + p[68]*%f29 + p[72]*%f18
	// (p[n] = f32 at byte offset n of `p`)
	ld.const.f32 	%f42, [p+56];
	mul.ftz.f32 	%f43, %f42, %f29;
	ld.const.f32 	%f44, [p+68];
	mul.ftz.f32 	%f45, %f44, %f29;
	ld.const.f32 	%f46, [p+44];
	mul.ftz.f32 	%f47, %f46, %f29;
	ld.const.f32 	%f48, [p+52];
	fma.rn.ftz.f32 	%f49, %f48, %f40, %f43;
	ld.const.f32 	%f50, [p+64];
	fma.rn.ftz.f32 	%f51, %f50, %f40, %f45;
	ld.const.f32 	%f52, [p+40];
	fma.rn.ftz.f32 	%f53, %f52, %f40, %f47;
	ld.const.f32 	%f54, [p+60];
	fma.rn.ftz.f32 	%f55, %f54, %f18, %f49;
	ld.const.f32 	%f56, [p+72];
	fma.rn.ftz.f32 	%f57, %f56, %f18, %f51;
	ld.const.f32 	%f58, [p+48];
	fma.rn.ftz.f32 	%f59, %f58, %f18, %f53;
	// %f61 = max(%f57, 1e-37) keeps the divisions below away from zero;
	// %f62 = %f55 / %f61, %f63 = %f59 / %f61.
	mov.f32 	%f60, 0f02081cea;    	// 1e-037
	max.ftz.f32 	%f61, %f57, %f60;
	div.approx.ftz.f32 	%f62, %f55, %f61;
	div.approx.ftz.f32 	%f63, %f59, %f61;
	// --- Stage 3: offset from the reference pair (p[148], p[152]) ---
	//   %f67 = %f63 - p[148], %f65 = %f62 - p[152]
	//   %f72 = rsqrt(max(%f67^2 + %f65^2, 1e-37))
	//   %f77 = (p[148]*%f67 + p[152]*%f65) * %f72 * p[160]
	ld.const.f32 	%f64, [p+152];
	sub.ftz.f32 	%f65, %f62, %f64;
	ld.const.f32 	%f66, [p+148];
	sub.ftz.f32 	%f67, %f63, %f66;
	mul.ftz.f32 	%f68, %f65, %f65;
	fma.rn.ftz.f32 	%f69, %f67, %f67, %f68;
	mov.f32 	%f70, 0f02081cea;    	// 1e-037
	max.ftz.f32 	%f71, %f69, %f70;
	rsqrt.approx.ftz.f32 	%f72, %f71;
	ld.const.f32 	%f73, [p+160];
	mul.ftz.f32 	%f74, %f65, %f64;
	fma.rn.ftz.f32 	%f75, %f66, %f67, %f74;
	mul.ftz.f32 	%f76, %f72, %f73;
	mul.ftz.f32 	%f77, %f75, %f76;
	ld.const.f32 	%f78, [p+8];
	mov.f32 	%f79, 0f00000000;    	// 0
	setp.gt.ftz.f32 	%p4, %f77, %f79;
	@!%p4 bra 	$Lt_22_8194;
	.loc	20	95	0
	// %f77 > 0: %f80 = %f77 * p[8], %f81 = p[8]
	mul.ftz.f32 	%f80, %f77, %f78;
	.loc	20	96	0
	mov.f32 	%f81, %f78;
	bra.uni 	$Lt_22_7938;
$Lt_22_8194:
	.loc	20	99	0
	// %f77 <= 0 (or unordered): %f81 = p[8] * (%f77 + 1), %f80 = 0
	mov.f32 	%f82, 0f3f800000;    	// 1
	add.ftz.f32 	%f83, %f77, %f82;
	mul.ftz.f32 	%f81, %f78, %f83;
	mov.f32 	%f80, 0f00000000;    	// 0
$Lt_22_7938:
	.loc	20	104	0
	// %f86 = p[160] * (1 / %f72): the offset length scaled by p[160].
	mov.f32 	%f84, 0f3f800000;    	// 1
	div.approx.ftz.f32 	%f85, %f84, %f72;
	mul.ftz.f32 	%f86, %f73, %f85;
	.loc	20	106	0
	// %f80 = %f86 * (1 - %f80)
	mov.f32 	%f87, 0f3f800000;    	// 1
	sub.ftz.f32 	%f88, %f87, %f80;
	mul.ftz.f32 	%f80, %f86, %f88;
	.loc	20	107	0
	// %f81 = %f86 * (1 - %f81)
	mov.f32 	%f89, 0f3f800000;    	// 1
	sub.ftz.f32 	%f90, %f89, %f81;
	mul.ftz.f32 	%f81, %f86, %f90;
	.loc	20	125	0
	// --- Stage 4: combine two clamped terms ---
	//   %f93  = saturate(p[16] * %f80)
	//   %f96  = (%f61 - p[156]) / p[156]
	//   %f103 = saturate(%f96 < 0 ? p[20]*%f96 : p[24]*%f96)
	//   %f106 = %f93 + %f103 - %f93*%f103
	// NOTE(review): the division by p[156] has no zero guard in this code.
	ld.const.f32 	%f91, [p+16];
	mul.ftz.f32 	%f92, %f91, %f80;
	cvt.ftz.sat.f32.f32 	%f93, %f92;
	ld.const.f32 	%f94, [p+156];
	sub.ftz.f32 	%f95, %f61, %f94;
	div.approx.ftz.f32 	%f96, %f95, %f94;
	mov.f32 	%f97, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p5, %f96, %f97;
	ld.const.f32 	%f98, [p+20];
	mul.ftz.f32 	%f99, %f98, %f96;
	ld.const.f32 	%f100, [p+24];
	mul.ftz.f32 	%f101, %f100, %f96;
	selp.f32 	%f102, %f99, %f101, %p5;
	cvt.ftz.sat.f32.f32 	%f103, %f102;
	add.ftz.f32 	%f104, %f93, %f103;
	mul.ftz.f32 	%f105, %f103, %f93;
	sub.ftz.f32 	%f106, %f104, %f105;
	.loc	20	129	0
	// %f115 = %f61 + (p[28] * (1/max(%f106, 1e-37) - 1)) * (%f61 - p[156])
	ld.const.f32 	%f107, [p+28];
	mov.f32 	%f108, 0f3f800000;   	// 1
	mov.f32 	%f109, 0f02081cea;   	// 1e-037
	max.ftz.f32 	%f110, %f106, %f109;
	div.approx.ftz.f32 	%f111, %f108, %f110;
	mov.f32 	%f112, 0fbf800000;   	// -1
	add.ftz.f32 	%f113, %f111, %f112;
	mul.ftz.f32 	%f114, %f107, %f113;
	fma.rn.ftz.f32 	%f115, %f114, %f95, %f61;
	.loc	20	131	0
	// %f120 = saturate(p[32] * %f106 - p[36]); later multiplies word +12.
	ld.const.f32 	%f116, [p+32];
	mul.ftz.f32 	%f117, %f116, %f106;
	.loc	20	132	0
	ld.const.f32 	%f118, [p+36];
	sub.ftz.f32 	%f119, %f117, %f118;
	.loc	20	133	0
	cvt.ftz.sat.f32.f32 	%f120, %f119;
	.loc	20	137	0
	// --- Stage 5: adjust the two ratios and rebuild the output words ---
	//   %f125 = saturate(p[12] * %f81)
	//   %f132 = 1/max(saturate(p[0] * %f81), 1e-37) - 1
	//   %f134 = %f132 * p[4] + %f132
	//   %f135 = %f62 + %f134*%f65,  %f136 = %f63 + %f134*%f67
	//   %f139 = %f125 * %f135 * %f115,  %f140 = %f125 * %f136 * %f115
	ld.const.f32 	%f121, [p+12];
	mul.ftz.f32 	%f122, %f121, %f81;
	ld.const.f32 	%f123, [p+0];
	mul.ftz.f32 	%f124, %f123, %f81;
	cvt.ftz.sat.f32.f32 	%f125, %f122;
	cvt.ftz.sat.f32.f32 	%f126, %f124;
	mov.f32 	%f127, 0f02081cea;   	// 1e-037
	max.ftz.f32 	%f128, %f126, %f127;
	mov.f32 	%f129, 0f3f800000;   	// 1
	div.approx.ftz.f32 	%f130, %f129, %f128;
	mov.f32 	%f131, 0fbf800000;   	// -1
	add.ftz.f32 	%f132, %f130, %f131;
	ld.const.f32 	%f133, [p+4];
	fma.rn.ftz.f32 	%f134, %f132, %f133, %f132;
	fma.rn.ftz.f32 	%f135, %f134, %f65, %f62;
	fma.rn.ftz.f32 	%f136, %f67, %f134, %f63;
	mul.ftz.f32 	%f137, %f125, %f135;
	mul.ftz.f32 	%f138, %f125, %f136;
	mul.ftz.f32 	%f139, %f137, %f115;
	mul.ftz.f32 	%f140, %f138, %f115;
	// Output word +0 (pre-encode):
	//   %f148 = p[112] + p[100]*%f140 + p[104]*%f139 + p[108]*%f115
	ld.const.f32 	%f141, [p+112];
	ld.const.f32 	%f142, [p+104];
	mul.ftz.f32 	%f143, %f142, %f139;
	ld.const.f32 	%f144, [p+100];
	fma.rn.ftz.f32 	%f145, %f144, %f140, %f143;
	ld.const.f32 	%f146, [p+108];
	fma.rn.ftz.f32 	%f147, %f146, %f115, %f145;
	add.ftz.f32 	%f148, %f141, %f147;
	.loc	20	50	0
	// Encode %f148 -> %f149 with the same piecewise curve as _Z6tosRGBf:
	// clamp, then c*12.92 below 0.0031308, else 1.055*ex2(0.41666*lg2(c)) - 0.055.
	cvt.ftz.sat.f32.f32 	%f149, %f148;
	mov.f32 	%f150, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p6, %f149, %f150;
	@!%p6 bra 	$Lt_22_8706;
	.loc	20	52	0
	mov.f32 	%f151, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f149, %f149, %f151;
	bra.uni 	$Lt_22_8450;
$Lt_22_8706:
	.loc	20	54	0
	mov.f32 	%f152, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f153, %f149;
	mov.f32 	%f154, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f155, %f153, %f154;
	ex2.approx.ftz.f32 	%f156, %f155;
	mov.f32 	%f157, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f149, %f156, %f157, %f152;
$Lt_22_8450:
	.loc	20	138	0
	// Output word +4 (pre-encode):
	//   %f165 = p[116] + p[88]*%f140 + p[92]*%f139 + p[96]*%f115
	ld.const.f32 	%f158, [p+116];
	ld.const.f32 	%f159, [p+92];
	mul.ftz.f32 	%f160, %f159, %f139;
	ld.const.f32 	%f161, [p+88];
	fma.rn.ftz.f32 	%f162, %f161, %f140, %f160;
	ld.const.f32 	%f163, [p+96];
	fma.rn.ftz.f32 	%f164, %f163, %f115, %f162;
	add.ftz.f32 	%f165, %f158, %f164;
	.loc	20	50	0
	// Encode %f165 -> %f166 (same curve as above).
	cvt.ftz.sat.f32.f32 	%f166, %f165;
	mov.f32 	%f167, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p7, %f166, %f167;
	@!%p7 bra 	$Lt_22_9218;
	.loc	20	52	0
	mov.f32 	%f168, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f166, %f166, %f168;
	bra.uni 	$Lt_22_8962;
$Lt_22_9218:
	.loc	20	54	0
	mov.f32 	%f169, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f170, %f166;
	mov.f32 	%f171, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f172, %f170, %f171;
	ex2.approx.ftz.f32 	%f173, %f172;
	mov.f32 	%f174, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f166, %f173, %f174, %f169;
$Lt_22_8962:
	.loc	20	139	0
	// Output word +8 (pre-encode):
	//   %f182 = p[120] + p[76]*%f140 + p[80]*%f139 + p[84]*%f115
	ld.const.f32 	%f175, [p+120];
	ld.const.f32 	%f176, [p+80];
	mul.ftz.f32 	%f177, %f176, %f139;
	ld.const.f32 	%f178, [p+76];
	fma.rn.ftz.f32 	%f179, %f178, %f140, %f177;
	ld.const.f32 	%f180, [p+84];
	fma.rn.ftz.f32 	%f181, %f180, %f115, %f179;
	add.ftz.f32 	%f182, %f175, %f181;
	.loc	20	50	0
	// Encode %f182 -> %f183 (same curve as above).
	cvt.ftz.sat.f32.f32 	%f183, %f182;
	mov.f32 	%f184, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p8, %f183, %f184;
	@!%p8 bra 	$Lt_22_9730;
	.loc	20	52	0
	mov.f32 	%f185, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f183, %f183, %f185;
	bra.uni 	$Lt_22_9474;
$Lt_22_9730:
	.loc	20	54	0
	mov.f32 	%f186, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f187, %f183;
	mov.f32 	%f188, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f189, %f187, %f188;
	ex2.approx.ftz.f32 	%f190, %f189;
	mov.f32 	%f191, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f183, %f190, %f191, %f186;
$Lt_22_9474:
	.loc	20	140	0
	// Word +12 of the input (unclamped) is scaled by the clamped factor %f120.
	mul.ftz.f32 	%f8, %f120, %f8;
	.loc	20	142	0
	// Write the three encoded words and the scaled word +12.
	mov.f32 	%f192, %f149;
	st.param.f32 	[__cudaretf__Z17cuda_kernel_keyer6float4+0], %f192;
	mov.f32 	%f193, %f166;
	st.param.f32 	[__cudaretf__Z17cuda_kernel_keyer6float4+4], %f193;
	mov.f32 	%f194, %f183;
	st.param.f32 	[__cudaretf__Z17cuda_kernel_keyer6float4+8], %f194;
	mov.f32 	%f195, %f8;
	st.param.f32 	[__cudaretf__Z17cuda_kernel_keyer6float4+12], %f195;
	ret;
$LDWend__Z17cuda_kernel_keyer6float4:
	} // _Z17cuda_kernel_keyer6float4

	.visible .func (.param .f32 __cudaretf__Z10ReadMemoryPfi17DevicePixelFormat) _Z10ReadMemoryPfi17DevicePixelFormat (.param .u64 __cudaparmf1__Z10ReadMemoryPfi17DevicePixelFormat, .param .s32 __cudaparmf2__Z10ReadMemoryPfi17DevicePixelFormat, .param .u32 __cudaparmf3__Z10ReadMemoryPfi17DevicePixelFormat)
	{
	// ReadMemory(float*, int, DevicePixelFormat): fetch one element as f32.
	//   format == 0 : load 16 bits from base + 2*index and widen with cvt.f32.f16
	//   format != 0 : load an f32 from base + 4*index
	// The index is sign-extended to 64 bits before the byte offset is formed.
	.reg .u32 %index, %format, %half_bits;
	.reg .u64 %base, %index64, %byte_off, %addr;
	.reg .f32 %value;
	.reg .pred %fmt_nonzero;
	.loc	20	164	0
$LDWbegin__Z10ReadMemoryPfi17DevicePixelFormat:
	ld.param.u64 	%base, [__cudaparmf1__Z10ReadMemoryPfi17DevicePixelFormat];
	ld.param.u32 	%index, [__cudaparmf2__Z10ReadMemoryPfi17DevicePixelFormat];
	ld.param.u32 	%format, [__cudaparmf3__Z10ReadMemoryPfi17DevicePixelFormat];
	cvt.s64.s32 	%index64, %index;
	setp.ne.s32 	%fmt_nonzero, %format, 0;
	@%fmt_nonzero bra 	$Lt_23_1026;
	.loc	20	167	0
	// 16-bit element: byte offset = index * 2.
	shl.b64 	%byte_off, %index64, 1;
	add.u64 	%addr, %base, %byte_off;
	ld.u16 	%half_bits, [%addr];
	{ .reg .b32 %b1;
	mov.b32		%b1, %half_bits;
	cvt.ftz.f32.f16	%value, %b1; }
	bra.uni 	$LBB4__Z10ReadMemoryPfi17DevicePixelFormat;
$Lt_23_1026:
	.loc	20	169	0
	// 32-bit element: byte offset = index * 4.
	shl.b64 	%byte_off, %index64, 2;
	add.u64 	%addr, %base, %byte_off;
	ld.f32 	%value, [%addr];
$LBB4__Z10ReadMemoryPfi17DevicePixelFormat:
	st.param.f32 	[__cudaretf__Z10ReadMemoryPfi17DevicePixelFormat], %value;
	ret;
$LDWend__Z10ReadMemoryPfi17DevicePixelFormat:
	} // _Z10ReadMemoryPfi17DevicePixelFormat

	.visible .func _Z6MinRowILi3EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi3EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi3EEvP6float2Pf)
	{
	// MinRow<3>(float2*, float*): reads nine consecutive f32 values centred on
	// the second pointer (byte offsets -16 .. +16) and stores a float2 at the
	// first pointer:
	//   second component = min over the 7 inner taps (offsets -12 .. +12)
	//   first  component = min(inner minimum, tap at -16, tap at +16)
	.reg .u64 %dst, %src;
	.reg .f32 %acc, %tap, %edge_lo, %edge_hi, %wide;
	.loc	20	147	0
$LDWbegin__Z6MinRowILi3EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6MinRowILi3EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6MinRowILi3EEvP6float2Pf];
	.loc	20	153	0
	// Fold the inner taps from lowest to highest offset.
	ld.f32 	%acc, [%src+-12];
	ld.f32 	%tap, [%src+-8];
	min.ftz.f32 	%acc, %acc, %tap;
	ld.f32 	%tap, [%src+-4];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+0];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+4];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+8];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+12];
	min.ftz.f32 	%acc, %tap, %acc;
	.loc	20	155	0
	// Outermost pair.
	ld.f32 	%edge_lo, [%src+-16];
	ld.f32 	%edge_hi, [%src+16];
	min.ftz.f32 	%wide, %edge_lo, %edge_hi;
	.loc	20	157	0
	min.ftz.f32 	%wide, %wide, %acc;
	st.v2.f32 	[%dst+0], {%wide,%acc};
	.loc	20	158	0
	ret;
$LDWend__Z6MinRowILi3EEvP6float2Pf:
	} // _Z6MinRowILi3EEvP6float2Pf

	.visible .func _Z6MinRowILi2EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi2EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi2EEvP6float2Pf)
	{
	// MinRow<2>(float2*, float*): reads seven consecutive f32 values centred on
	// the second pointer (byte offsets -12 .. +12) and stores a float2 at the
	// first pointer:
	//   second component = min over the 5 inner taps (offsets -8 .. +8)
	//   first  component = min(inner minimum, tap at -12, tap at +12)
	.reg .u64 %dst, %src;
	.reg .f32 %acc, %tap, %edge_lo, %edge_hi, %wide;
	.loc	20	147	0
$LDWbegin__Z6MinRowILi2EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6MinRowILi2EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6MinRowILi2EEvP6float2Pf];
	.loc	20	153	0
	// Fold the inner taps from lowest to highest offset.
	ld.f32 	%acc, [%src+-8];
	ld.f32 	%tap, [%src+-4];
	min.ftz.f32 	%acc, %acc, %tap;
	ld.f32 	%tap, [%src+0];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+4];
	min.ftz.f32 	%acc, %tap, %acc;
	ld.f32 	%tap, [%src+8];
	min.ftz.f32 	%acc, %tap, %acc;
	.loc	20	155	0
	// Outermost pair.
	ld.f32 	%edge_lo, [%src+-12];
	ld.f32 	%edge_hi, [%src+12];
	min.ftz.f32 	%wide, %edge_lo, %edge_hi;
	.loc	20	157	0
	min.ftz.f32 	%wide, %wide, %acc;
	st.v2.f32 	[%dst+0], {%wide,%acc};
	.loc	20	158	0
	ret;
$LDWend__Z6MinRowILi2EEvP6float2Pf:
	} // _Z6MinRowILi2EEvP6float2Pf

	.visible .func _Z6MinRowILi1EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi1EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi1EEvP6float2Pf)
	{
	// MinRow<1>(float2*, float*): reads five consecutive f32 values centred on
	// the second pointer (byte offsets -8 .. +8) and stores a float2 at the
	// first pointer:
	//   second component = min over the 3 inner taps (offsets -4 .. +4)
	//   first  component = min(inner minimum, tap at -8, tap at +8)
	.reg .u64 %dst, %src;
	.reg .f32 %acc, %tap, %edge_lo, %edge_hi, %wide;
	.loc	20	147	0
$LDWbegin__Z6MinRowILi1EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6MinRowILi1EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6MinRowILi1EEvP6float2Pf];
	.loc	20	153	0
	// Fold the inner taps from lowest to highest offset.
	ld.f32 	%acc, [%src+-4];
	ld.f32 	%tap, [%src+0];
	min.ftz.f32 	%acc, %acc, %tap;
	ld.f32 	%tap, [%src+4];
	min.ftz.f32 	%acc, %tap, %acc;
	.loc	20	155	0
	// Outermost pair.
	ld.f32 	%edge_lo, [%src+-8];
	ld.f32 	%edge_hi, [%src+8];
	min.ftz.f32 	%wide, %edge_lo, %edge_hi;
	.loc	20	157	0
	min.ftz.f32 	%wide, %wide, %acc;
	st.v2.f32 	[%dst+0], {%wide,%acc};
	.loc	20	158	0
	ret;
$LDWend__Z6MinRowILi1EEvP6float2Pf:
	} // _Z6MinRowILi1EEvP6float2Pf

	.visible .func _Z6MinRowILi0EEvP6float2Pf (.param .u64 __cudaparmf1__Z6MinRowILi0EEvP6float2Pf, .param .u64 __cudaparmf2__Z6MinRowILi0EEvP6float2Pf)
	{
	// MinRow<0>(float2*, float*): reads three consecutive f32 values centred on
	// the second pointer (byte offsets -4, 0, +4) and stores a float2 at the
	// first pointer:
	//   second component = the centre value, unchanged
	//   first  component = min(centre, tap at -4, tap at +4)
	.reg .u64 %dst, %src;
	.reg .f32 %centre, %edge_lo, %edge_hi, %wide;
	.loc	20	147	0
$LDWbegin__Z6MinRowILi0EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6MinRowILi0EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6MinRowILi0EEvP6float2Pf];
	.loc	20	150	0
	ld.f32 	%centre, [%src+0];
	.loc	20	155	0
	// Outermost pair.
	ld.f32 	%edge_lo, [%src+-4];
	ld.f32 	%edge_hi, [%src+4];
	min.ftz.f32 	%wide, %edge_lo, %edge_hi;
	.loc	20	157	0
	min.ftz.f32 	%wide, %wide, %centre;
	st.v2.f32 	[%dst+0], {%wide,%centre};
	.loc	20	158	0
	ret;
$LDWend__Z6MinRowILi0EEvP6float2Pf:
	} // _Z6MinRowILi0EEvP6float2Pf

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi3EE6float2PS0_[8]) _Z9MinColumnILi3EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi3EE6float2PS0_)
	{
	// MinColumn<3>(float2*): reduces float2 elements around the pointer
	// (8-byte stride) and returns a float2:
	//   first  component = min of the first  components at elements -4 .. +4
	//   second component = min of the second components at elements -3 .. +3
	// Only the first component is loaded for the two outermost elements.
	.reg .u64 %col;
	.reg .f32 %min_x, %min_y, %cx, %cy;
	.loc	20	227	0
$LDWbegin__Z9MinColumnILi3EE6float2PS0_:
	ld.param.u64 	%col, [__cudaparmf1__Z9MinColumnILi3EE6float2PS0_];
	.loc	20	239	0
	// Walk from the lowest to the highest offset, folding both components.
	ld.f32 	%min_x, [%col+-32];
	ld.v2.f32 	{%cx,%min_y}, [%col+-24];
	min.ftz.f32 	%min_x, %min_x, %cx;
	ld.v2.f32 	{%cx,%cy}, [%col+-16];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %min_y, %cy;
	ld.v2.f32 	{%cx,%cy}, [%col+-8];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+0];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+8];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+16];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+24];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.f32 	%cx, [%col+32];
	min.ftz.f32 	%min_x, %cx, %min_x;
	st.param.f32 	[__cudaretf__Z9MinColumnILi3EE6float2PS0_+0], %min_x;
	st.param.f32 	[__cudaretf__Z9MinColumnILi3EE6float2PS0_+4], %min_y;
	ret;
$LDWend__Z9MinColumnILi3EE6float2PS0_:
	} // _Z9MinColumnILi3EE6float2PS0_

	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi2EE6float2PS0_[8]) _Z9MinColumnILi2EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi2EE6float2PS0_)
	{
	// MinColumn<2>(float2*): reduces float2 elements around the pointer
	// (8-byte stride) and returns a float2:
	//   first  component = min of the first  components at elements -3 .. +3
	//   second component = min of the second components at elements -2 .. +2
	// Only the first component is loaded for the two outermost elements.
	.reg .u64 %col;
	.reg .f32 %min_x, %min_y, %cx, %cy;
	.loc	20	227	0
$LDWbegin__Z9MinColumnILi2EE6float2PS0_:
	ld.param.u64 	%col, [__cudaparmf1__Z9MinColumnILi2EE6float2PS0_];
	.loc	20	239	0
	// Walk from the lowest to the highest offset, folding both components.
	ld.f32 	%min_x, [%col+-24];
	ld.v2.f32 	{%cx,%min_y}, [%col+-16];
	min.ftz.f32 	%min_x, %min_x, %cx;
	ld.v2.f32 	{%cx,%cy}, [%col+-8];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %min_y, %cy;
	ld.v2.f32 	{%cx,%cy}, [%col+0];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+8];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.v2.f32 	{%cx,%cy}, [%col+16];
	min.ftz.f32 	%min_x, %cx, %min_x;
	min.ftz.f32 	%min_y, %cy, %min_y;
	ld.f32 	%cx, [%col+24];
	min.ftz.f32 	%min_x, %cx, %min_x;
	st.param.f32 	[__cudaretf__Z9MinColumnILi2EE6float2PS0_+0], %min_x;
	st.param.f32 	[__cudaretf__Z9MinColumnILi2EE6float2PS0_+4], %min_y;
	ret;
$LDWend__Z9MinColumnILi2EE6float2PS0_:
	} // _Z9MinColumnILi2EE6float2PS0_

	// float2 MinColumn<1>(float2 *centre)
	//
	// Component-wise minimum over 8-byte {first, second} float pairs addressed
	// relative to the pointer argument:
	//   result +0 : min of the first  float of the 5 pairs at byte offsets -16 .. +16
	//   result +4 : min of the second float of the 3 pairs at byte offsets  -8 ..  +8
	// Each fold step is min(new sample, running minimum), lowest address first.
	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi1EE6float2PS0_[8]) _Z9MinColumnILi1EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi1EE6float2PS0_)
	{
	.reg .u64 %centre;
	.reg .f32 %x_n2, %x_n1, %x_c, %x_p1, %x_p2;
	.reg .f32 %y_n1, %y_c, %y_p1;
	.reg .f32 %x_min, %y_min;
	.loc	20	227	0
$LDWbegin__Z9MinColumnILi1EE6float2PS0_:
	ld.param.u64 	%centre, [__cudaparmf1__Z9MinColumnILi1EE6float2PS0_];
	.loc	20	239	0
	// The outermost pairs are only needed for their first float.
	ld.f32 	%x_n2, [%centre+-16];
	ld.v2.f32 	{%x_n1,%y_n1}, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.v2.f32 	{%x_p1,%y_p1}, [%centre+8];
	ld.f32 	%x_p2, [%centre+16];
	// First component: 5-sample minimum.
	min.ftz.f32 	%x_min, %x_n2, %x_n1;
	min.ftz.f32 	%x_min, %x_c, %x_min;
	min.ftz.f32 	%x_min, %x_p1, %x_min;
	min.ftz.f32 	%x_min, %x_p2, %x_min;
	st.param.f32 	[__cudaretf__Z9MinColumnILi1EE6float2PS0_+0], %x_min;
	// Second component: 3-sample minimum.
	min.ftz.f32 	%y_min, %y_n1, %y_c;
	min.ftz.f32 	%y_min, %y_p1, %y_min;
	st.param.f32 	[__cudaretf__Z9MinColumnILi1EE6float2PS0_+4], %y_min;
	ret;
$LDWend__Z9MinColumnILi1EE6float2PS0_:
	} // _Z9MinColumnILi1EE6float2PS0_

	// float2 MinColumn<0>(float2 *centre)
	//
	// Degenerate window over 8-byte {first, second} float pairs:
	//   result +0 : min of the first float of the 3 pairs at byte offsets -8, 0, +8
	//   result +4 : the second float of the pair at offset 0, passed through unchanged
	.visible .func (.param .align 8 .b8 __cudaretf__Z9MinColumnILi0EE6float2PS0_[8]) _Z9MinColumnILi0EE6float2PS0_ (.param .u64 __cudaparmf1__Z9MinColumnILi0EE6float2PS0_)
	{
	.reg .u64 %centre;
	.reg .f32 %x_n1, %x_c, %x_p1, %y_c;
	.reg .f32 %x_min;
	.loc	20	227	0
$LDWbegin__Z9MinColumnILi0EE6float2PS0_:
	ld.param.u64 	%centre, [__cudaparmf1__Z9MinColumnILi0EE6float2PS0_];
	.loc	20	239	0
	ld.f32 	%x_n1, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.f32 	%x_p1, [%centre+8];
	// Fold as min(new sample, running minimum), lowest address first.
	min.ftz.f32 	%x_min, %x_n1, %x_c;
	min.ftz.f32 	%x_min, %x_p1, %x_min;
	st.param.f32 	[__cudaretf__Z9MinColumnILi0EE6float2PS0_+0], %x_min;
	st.param.f32 	[__cudaretf__Z9MinColumnILi0EE6float2PS0_+4], %y_c;
	ret;
$LDWend__Z9MinColumnILi0EE6float2PS0_:
	} // _Z9MinColumnILi0EE6float2PS0_

	// void SumRow<3>(float2 *dst, float *src)
	//
	// Writes two running sums of the floats around src to the 8-byte pair at dst:
	//   dst +4 : inner = sum of the 7 floats src[-3 .. +3]
	//   dst +0 : outer = (src[-4] + src[+4]) + inner
	// The inner sum is accumulated from the lowest address upwards, each step
	// written as (new sample + running sum).  Every load precedes the store.
	.visible .func _Z6SumRowILi3EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi3EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi3EEvP6float2Pf)
	{
	.reg .u64 %dst, %src;
	.reg .f32 %v_n4, %v_n3, %v_n2, %v_n1, %v_c, %v_p1, %v_p2, %v_p3, %v_p4;
	.reg .f32 %inner, %edges, %outer;
	.loc	20	301	0
$LDWbegin__Z6SumRowILi3EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6SumRowILi3EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6SumRowILi3EEvP6float2Pf];
	.loc	20	307	0
	ld.f32 	%v_n3, [%src+-12];
	ld.f32 	%v_n2, [%src+-8];
	ld.f32 	%v_n1, [%src+-4];
	ld.f32 	%v_c, [%src+0];
	ld.f32 	%v_p1, [%src+4];
	ld.f32 	%v_p2, [%src+8];
	ld.f32 	%v_p3, [%src+12];
	add.ftz.f32 	%inner, %v_n3, %v_n2;
	add.ftz.f32 	%inner, %v_n1, %inner;
	add.ftz.f32 	%inner, %v_c, %inner;
	add.ftz.f32 	%inner, %v_p1, %inner;
	add.ftz.f32 	%inner, %v_p2, %inner;
	add.ftz.f32 	%inner, %v_p3, %inner;
	.loc	20	309	0
	// The two samples just outside the inner window.
	ld.f32 	%v_n4, [%src+-16];
	ld.f32 	%v_p4, [%src+16];
	add.ftz.f32 	%edges, %v_n4, %v_p4;
	.loc	20	311	0
	add.ftz.f32 	%outer, %edges, %inner;
	st.v2.f32 	[%dst+0], {%outer,%inner};
	.loc	20	312	0
	ret;
$LDWend__Z6SumRowILi3EEvP6float2Pf:
	} // _Z6SumRowILi3EEvP6float2Pf

	// void SumRow<2>(float2 *dst, float *src)
	//
	// Writes two running sums of the floats around src to the 8-byte pair at dst:
	//   dst +4 : inner = sum of the 5 floats src[-2 .. +2]
	//   dst +0 : outer = (src[-3] + src[+3]) + inner
	// The inner sum is accumulated from the lowest address upwards, each step
	// written as (new sample + running sum).  Every load precedes the store.
	.visible .func _Z6SumRowILi2EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi2EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi2EEvP6float2Pf)
	{
	.reg .u64 %dst, %src;
	.reg .f32 %v_n3, %v_n2, %v_n1, %v_c, %v_p1, %v_p2, %v_p3;
	.reg .f32 %inner, %edges, %outer;
	.loc	20	301	0
$LDWbegin__Z6SumRowILi2EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6SumRowILi2EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6SumRowILi2EEvP6float2Pf];
	.loc	20	307	0
	ld.f32 	%v_n2, [%src+-8];
	ld.f32 	%v_n1, [%src+-4];
	ld.f32 	%v_c, [%src+0];
	ld.f32 	%v_p1, [%src+4];
	ld.f32 	%v_p2, [%src+8];
	add.ftz.f32 	%inner, %v_n2, %v_n1;
	add.ftz.f32 	%inner, %v_c, %inner;
	add.ftz.f32 	%inner, %v_p1, %inner;
	add.ftz.f32 	%inner, %v_p2, %inner;
	.loc	20	309	0
	// The two samples just outside the inner window.
	ld.f32 	%v_n3, [%src+-12];
	ld.f32 	%v_p3, [%src+12];
	add.ftz.f32 	%edges, %v_n3, %v_p3;
	.loc	20	311	0
	add.ftz.f32 	%outer, %edges, %inner;
	st.v2.f32 	[%dst+0], {%outer,%inner};
	.loc	20	312	0
	ret;
$LDWend__Z6SumRowILi2EEvP6float2Pf:
	} // _Z6SumRowILi2EEvP6float2Pf

	// void SumRow<1>(float2 *dst, float *src)
	//
	// Writes two running sums of the floats around src to the 8-byte pair at dst:
	//   dst +4 : inner = sum of the 3 floats src[-1 .. +1]
	//   dst +0 : outer = (src[-2] + src[+2]) + inner
	// Every load precedes the store.
	.visible .func _Z6SumRowILi1EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi1EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi1EEvP6float2Pf)
	{
	.reg .u64 %dst, %src;
	.reg .f32 %v_n2, %v_n1, %v_c, %v_p1, %v_p2;
	.reg .f32 %inner, %edges, %outer;
	.loc	20	301	0
$LDWbegin__Z6SumRowILi1EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6SumRowILi1EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6SumRowILi1EEvP6float2Pf];
	.loc	20	307	0
	ld.f32 	%v_n1, [%src+-4];
	ld.f32 	%v_c, [%src+0];
	ld.f32 	%v_p1, [%src+4];
	// Lowest address first; each step is (new sample + running sum).
	add.ftz.f32 	%inner, %v_n1, %v_c;
	add.ftz.f32 	%inner, %v_p1, %inner;
	.loc	20	309	0
	// The two samples just outside the inner window.
	ld.f32 	%v_n2, [%src+-8];
	ld.f32 	%v_p2, [%src+8];
	add.ftz.f32 	%edges, %v_n2, %v_p2;
	.loc	20	311	0
	add.ftz.f32 	%outer, %edges, %inner;
	st.v2.f32 	[%dst+0], {%outer,%inner};
	.loc	20	312	0
	ret;
$LDWend__Z6SumRowILi1EEvP6float2Pf:
	} // _Z6SumRowILi1EEvP6float2Pf

	// void SumRow<0>(float2 *dst, float *src)
	//
	// Degenerate window:
	//   dst +4 : src[0], passed through unchanged
	//   dst +0 : (src[-1] + src[+1]) + src[0]
	// Every load precedes the store.
	.visible .func _Z6SumRowILi0EEvP6float2Pf (.param .u64 __cudaparmf1__Z6SumRowILi0EEvP6float2Pf, .param .u64 __cudaparmf2__Z6SumRowILi0EEvP6float2Pf)
	{
	.reg .u64 %dst, %src;
	.reg .f32 %v_n1, %v_c, %v_p1;
	.reg .f32 %edges, %outer;
	.loc	20	301	0
$LDWbegin__Z6SumRowILi0EEvP6float2Pf:
	ld.param.u64 	%dst, [__cudaparmf1__Z6SumRowILi0EEvP6float2Pf];
	ld.param.u64 	%src, [__cudaparmf2__Z6SumRowILi0EEvP6float2Pf];
	.loc	20	304	0
	ld.f32 	%v_c, [%src+0];
	.loc	20	309	0
	ld.f32 	%v_n1, [%src+-4];
	ld.f32 	%v_p1, [%src+4];
	add.ftz.f32 	%edges, %v_n1, %v_p1;
	.loc	20	311	0
	add.ftz.f32 	%outer, %edges, %v_c;
	st.v2.f32 	[%dst+0], {%outer,%v_c};
	.loc	20	312	0
	ret;
$LDWend__Z6SumRowILi0EEvP6float2Pf:
	} // _Z6SumRowILi0EEvP6float2Pf

	// float SumColumn<3>(float2 *centre, float mix)
	//
	// Sums a window of 8-byte {first, second} float pairs around the pointer and
	// blends the two sums:
	//   x_sum  = first  floats of the 9 pairs at byte offsets -32 .. +32
	//   y_sum  = second floats of the 7 pairs at byte offsets -24 .. +24
	//   result = fma(y_sum, (1 - mix) / 49, x_sum * (mix / 81))
	// Both divisions are div.approx.  Sums are accumulated from the lowest
	// address upwards, each step written as (new sample + running sum).
	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi3EEfP6float2f) _Z9SumColumnILi3EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi3EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi3EEfP6float2f)
	{
	.reg .u64 %centre;
	.reg .f32 %mix;
	.reg .f32 %x_n4, %x_n3, %x_n2, %x_n1, %x_c, %x_p1, %x_p2, %x_p3, %x_p4;
	.reg .f32 %y_n3, %y_n2, %y_n1, %y_c, %y_p1, %y_p2, %y_p3;
	.reg .f32 %x_sum, %y_sum;
	.reg .f32 %x_taps, %y_taps, %x_weight, %y_weight;
	.reg .f32 %one, %rest, %x_part, %blend;
	.loc	20	354	0
$LDWbegin__Z9SumColumnILi3EEfP6float2f:
	ld.param.u64 	%centre, [__cudaparmf1__Z9SumColumnILi3EEfP6float2f];
	ld.param.f32 	%mix, [__cudaparmf2__Z9SumColumnILi3EEfP6float2f];
	.loc	20	370	0
	// The outermost pairs are only needed for their first float.
	ld.f32 	%x_n4, [%centre+-32];
	ld.v2.f32 	{%x_n3,%y_n3}, [%centre+-24];
	ld.v2.f32 	{%x_n2,%y_n2}, [%centre+-16];
	ld.v2.f32 	{%x_n1,%y_n1}, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.v2.f32 	{%x_p1,%y_p1}, [%centre+8];
	ld.v2.f32 	{%x_p2,%y_p2}, [%centre+16];
	ld.v2.f32 	{%x_p3,%y_p3}, [%centre+24];
	ld.f32 	%x_p4, [%centre+32];
	// First-component sum over 9 pairs.
	add.ftz.f32 	%x_sum, %x_n4, %x_n3;
	add.ftz.f32 	%x_sum, %x_n2, %x_sum;
	add.ftz.f32 	%x_sum, %x_n1, %x_sum;
	add.ftz.f32 	%x_sum, %x_c, %x_sum;
	add.ftz.f32 	%x_sum, %x_p1, %x_sum;
	add.ftz.f32 	%x_sum, %x_p2, %x_sum;
	add.ftz.f32 	%x_sum, %x_p3, %x_sum;
	add.ftz.f32 	%x_sum, %x_p4, %x_sum;
	mov.f32 	%x_taps, 0f42a20000;	// 81.0
	div.approx.ftz.f32 	%x_weight, %mix, %x_taps;
	mul.ftz.f32 	%x_part, %x_sum, %x_weight;
	// Second-component sum over 7 pairs.
	add.ftz.f32 	%y_sum, %y_n3, %y_n2;
	add.ftz.f32 	%y_sum, %y_n1, %y_sum;
	add.ftz.f32 	%y_sum, %y_c, %y_sum;
	add.ftz.f32 	%y_sum, %y_p1, %y_sum;
	add.ftz.f32 	%y_sum, %y_p2, %y_sum;
	add.ftz.f32 	%y_sum, %y_p3, %y_sum;
	mov.f32 	%one, 0f3f800000;	// 1.0
	sub.ftz.f32 	%rest, %one, %mix;
	mov.f32 	%y_taps, 0f42440000;	// 49.0
	div.approx.ftz.f32 	%y_weight, %rest, %y_taps;
	fma.rn.ftz.f32 	%blend, %y_sum, %y_weight, %x_part;
	st.param.f32 	[__cudaretf__Z9SumColumnILi3EEfP6float2f], %blend;
	ret;
$LDWend__Z9SumColumnILi3EEfP6float2f:
	} // _Z9SumColumnILi3EEfP6float2f

	// float SumColumn<2>(float2 *centre, float mix)
	//
	// Sums a window of 8-byte {first, second} float pairs around the pointer and
	// blends the two sums:
	//   x_sum  = first  floats of the 7 pairs at byte offsets -24 .. +24
	//   y_sum  = second floats of the 5 pairs at byte offsets -16 .. +16
	//   result = fma(y_sum, (1 - mix) / 25, x_sum * (mix / 49))
	// Both divisions are div.approx.  Sums are accumulated from the lowest
	// address upwards, each step written as (new sample + running sum).
	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi2EEfP6float2f) _Z9SumColumnILi2EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi2EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi2EEfP6float2f)
	{
	.reg .u64 %centre;
	.reg .f32 %mix;
	.reg .f32 %x_n3, %x_n2, %x_n1, %x_c, %x_p1, %x_p2, %x_p3;
	.reg .f32 %y_n2, %y_n1, %y_c, %y_p1, %y_p2;
	.reg .f32 %x_sum, %y_sum;
	.reg .f32 %x_taps, %y_taps, %x_weight, %y_weight;
	.reg .f32 %one, %rest, %x_part, %blend;
	.loc	20	354	0
$LDWbegin__Z9SumColumnILi2EEfP6float2f:
	ld.param.u64 	%centre, [__cudaparmf1__Z9SumColumnILi2EEfP6float2f];
	ld.param.f32 	%mix, [__cudaparmf2__Z9SumColumnILi2EEfP6float2f];
	.loc	20	370	0
	// The outermost pairs are only needed for their first float.
	ld.f32 	%x_n3, [%centre+-24];
	ld.v2.f32 	{%x_n2,%y_n2}, [%centre+-16];
	ld.v2.f32 	{%x_n1,%y_n1}, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.v2.f32 	{%x_p1,%y_p1}, [%centre+8];
	ld.v2.f32 	{%x_p2,%y_p2}, [%centre+16];
	ld.f32 	%x_p3, [%centre+24];
	// First-component sum over 7 pairs.
	add.ftz.f32 	%x_sum, %x_n3, %x_n2;
	add.ftz.f32 	%x_sum, %x_n1, %x_sum;
	add.ftz.f32 	%x_sum, %x_c, %x_sum;
	add.ftz.f32 	%x_sum, %x_p1, %x_sum;
	add.ftz.f32 	%x_sum, %x_p2, %x_sum;
	add.ftz.f32 	%x_sum, %x_p3, %x_sum;
	mov.f32 	%x_taps, 0f42440000;	// 49.0
	div.approx.ftz.f32 	%x_weight, %mix, %x_taps;
	mul.ftz.f32 	%x_part, %x_sum, %x_weight;
	// Second-component sum over 5 pairs.
	add.ftz.f32 	%y_sum, %y_n2, %y_n1;
	add.ftz.f32 	%y_sum, %y_c, %y_sum;
	add.ftz.f32 	%y_sum, %y_p1, %y_sum;
	add.ftz.f32 	%y_sum, %y_p2, %y_sum;
	mov.f32 	%one, 0f3f800000;	// 1.0
	sub.ftz.f32 	%rest, %one, %mix;
	mov.f32 	%y_taps, 0f41c80000;	// 25.0
	div.approx.ftz.f32 	%y_weight, %rest, %y_taps;
	fma.rn.ftz.f32 	%blend, %y_sum, %y_weight, %x_part;
	st.param.f32 	[__cudaretf__Z9SumColumnILi2EEfP6float2f], %blend;
	ret;
$LDWend__Z9SumColumnILi2EEfP6float2f:
	} // _Z9SumColumnILi2EEfP6float2f

	// float SumColumn<1>(float2 *centre, float mix)
	//
	// Sums a window of 8-byte {first, second} float pairs around the pointer and
	// blends the two sums:
	//   x_sum  = first  floats of the 5 pairs at byte offsets -16 .. +16
	//   y_sum  = second floats of the 3 pairs at byte offsets  -8 ..  +8
	//   result = fma(y_sum, (1 - mix) / 9, x_sum * (mix / 25))
	// Both divisions are div.approx.  Sums are accumulated from the lowest
	// address upwards, each step written as (new sample + running sum).
	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi1EEfP6float2f) _Z9SumColumnILi1EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi1EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi1EEfP6float2f)
	{
	.reg .u64 %centre;
	.reg .f32 %mix;
	.reg .f32 %x_n2, %x_n1, %x_c, %x_p1, %x_p2;
	.reg .f32 %y_n1, %y_c, %y_p1;
	.reg .f32 %x_sum, %y_sum;
	.reg .f32 %x_taps, %y_taps, %x_weight, %y_weight;
	.reg .f32 %one, %rest, %x_part, %blend;
	.loc	20	354	0
$LDWbegin__Z9SumColumnILi1EEfP6float2f:
	ld.param.u64 	%centre, [__cudaparmf1__Z9SumColumnILi1EEfP6float2f];
	ld.param.f32 	%mix, [__cudaparmf2__Z9SumColumnILi1EEfP6float2f];
	.loc	20	370	0
	// The outermost pairs are only needed for their first float.
	ld.f32 	%x_n2, [%centre+-16];
	ld.v2.f32 	{%x_n1,%y_n1}, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.v2.f32 	{%x_p1,%y_p1}, [%centre+8];
	ld.f32 	%x_p2, [%centre+16];
	// First-component sum over 5 pairs.
	add.ftz.f32 	%x_sum, %x_n2, %x_n1;
	add.ftz.f32 	%x_sum, %x_c, %x_sum;
	add.ftz.f32 	%x_sum, %x_p1, %x_sum;
	add.ftz.f32 	%x_sum, %x_p2, %x_sum;
	mov.f32 	%x_taps, 0f41c80000;	// 25.0
	div.approx.ftz.f32 	%x_weight, %mix, %x_taps;
	mul.ftz.f32 	%x_part, %x_sum, %x_weight;
	// Second-component sum over 3 pairs.
	add.ftz.f32 	%y_sum, %y_n1, %y_c;
	add.ftz.f32 	%y_sum, %y_p1, %y_sum;
	mov.f32 	%one, 0f3f800000;	// 1.0
	sub.ftz.f32 	%rest, %one, %mix;
	mov.f32 	%y_taps, 0f41100000;	// 9.0
	div.approx.ftz.f32 	%y_weight, %rest, %y_taps;
	fma.rn.ftz.f32 	%blend, %y_sum, %y_weight, %x_part;
	st.param.f32 	[__cudaretf__Z9SumColumnILi1EEfP6float2f], %blend;
	ret;
$LDWend__Z9SumColumnILi1EEfP6float2f:
	} // _Z9SumColumnILi1EEfP6float2f

	// float SumColumn<0>(float2 *centre, float mix)
	//
	// Degenerate window over 8-byte {first, second} float pairs:
	//   x_sum  = first floats of the 3 pairs at byte offsets -8, 0, +8
	//   y_c    = second float of the pair at offset 0
	//   result = fma(y_c, (1 - mix) / 1, x_sum * (mix / 9))
	// The division by 1.0 is kept as a real div.approx instruction so the
	// instruction sequence matches the other SumColumn specialisations.
	.visible .func (.param .f32 __cudaretf__Z9SumColumnILi0EEfP6float2f) _Z9SumColumnILi0EEfP6float2f (.param .u64 __cudaparmf1__Z9SumColumnILi0EEfP6float2f, .param .f32 __cudaparmf2__Z9SumColumnILi0EEfP6float2f)
	{
	.reg .u64 %centre;
	.reg .f32 %mix;
	.reg .f32 %x_n1, %x_c, %x_p1, %y_c;
	.reg .f32 %x_sum;
	.reg .f32 %x_taps, %y_taps, %x_weight, %y_weight;
	.reg .f32 %one, %rest, %x_part, %blend;
	.loc	20	354	0
$LDWbegin__Z9SumColumnILi0EEfP6float2f:
	ld.param.u64 	%centre, [__cudaparmf1__Z9SumColumnILi0EEfP6float2f];
	ld.param.f32 	%mix, [__cudaparmf2__Z9SumColumnILi0EEfP6float2f];
	.loc	20	370	0
	ld.f32 	%x_n1, [%centre+-8];
	ld.v2.f32 	{%x_c,%y_c}, [%centre+0];
	ld.f32 	%x_p1, [%centre+8];
	// Lowest address first; each step is (new sample + running sum).
	add.ftz.f32 	%x_sum, %x_n1, %x_c;
	add.ftz.f32 	%x_sum, %x_p1, %x_sum;
	mov.f32 	%x_taps, 0f41100000;	// 9.0
	div.approx.ftz.f32 	%x_weight, %mix, %x_taps;
	mul.ftz.f32 	%x_part, %x_sum, %x_weight;
	mov.f32 	%one, 0f3f800000;	// 1.0
	sub.ftz.f32 	%rest, %one, %mix;
	mov.f32 	%y_taps, 0f3f800000;	// 1.0
	div.approx.ftz.f32 	%y_weight, %rest, %y_taps;
	fma.rn.ftz.f32 	%blend, %y_c, %y_weight, %x_part;
	st.param.f32 	[__cudaretf__Z9SumColumnILi0EEfP6float2f], %blend;
	ret;
$LDWend__Z9SumColumnILi0EEfP6float2f:
	} // _Z9SumColumnILi0EEfP6float2f

	// cuda_kernel_chokefilter_x -- running-minimum filter along x over one float
	// per pixel.
	//
	// Each thread handles pixel x = ctaid.x * ntid.x + tid.x of row ctaid.y and
	// exits immediately when x >= w.  The `h` parameter is never read here.
	//
	// Input (id): the pixel index is idPitch * ctaid.y + x, i.e. idPitch is
	// applied in pixels.  inDeviceFormat == 0 reads a 16-bit half at byte 6 of an
	// 8-byte pixel; any other value reads a 32-bit float at byte 12 of a 16-byte
	// pixel (presumably the last channel of a 4-channel pixel -- confirm with the
	// host code).
	//
	// Staging: the value is written to the shared float array at index
	// tid.x + 16.  Threads with tid.x < r + 1 also fill index
	// tid.x + 16 - (r + 1); otherwise threads with tid.x >= ntid.x - r - 1 fill
	// index tid.x + 16 + (r + 1) (both tests are unsigned compares).  The halo
	// value is the pixel r + 1 columns away when that column lies in [0, w), else
	// the thread's own value.
	//
	// Output (od): byte offset odPitch * ctaid.y + x * 8, i.e. odPitch is applied
	// in bytes.  Two floats are stored:
	//   { min over shared offsets -(r+1) .. +(r+1),  min over offsets -r .. +r }
	// r = 3, 2, 1, 0 have unrolled paths; every other r uses the loop at the end.
	//
	// NOTE(review): threads that exit on x >= w never reach the bar.sync below.
	// NOTE(review): nothing in this kernel bounds r or ntid.x against the
	// 1152-byte shared array; presumably the launch code guarantees it -- confirm.
	.entry cuda_kernel_chokefilter_x (
		.param .u64 __cudaparm_cuda_kernel_chokefilter_x_id,
		.param .u64 __cudaparm_cuda_kernel_chokefilter_x_od,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_x_idPitch,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_x_odPitch,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_x_w,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_x_h,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_x_r,
		.param .u32 __cudaparm_cuda_kernel_chokefilter_x_inDeviceFormat)
	{
	.reg .u32 %r<44>;
	.reg .u64 %rd<42>;
	.reg .f32 %f<57>;
	.reg .pred %p<14>;
	// 1152 bytes = 288 floats of per-block staging.
	.shared .align 4 .b8 __cuda___cuda_local_var_91918_31_non_const_data208[1152];
	.loc	20	173	0
$LDWbegin_cuda_kernel_chokefilter_x:
	.loc	20	176	0
	// %r1 = ntid.x, %r4 = tid.x, %r5 = x, %r6 = w.  Continue only when w > x.
	cvt.s32.u32 	%r1, %ntid.x;
	cvt.s32.u32 	%r2, %ctaid.x;
	mul.lo.s32 	%r3, %r2, %r1;
	mov.u32 	%r4, %tid.x;
	add.u32 	%r5, %r3, %r4;
	ld.param.s32 	%r6, [__cudaparm_cuda_kernel_chokefilter_x_w];
	setp.gt.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_40_16898;
	bra.uni 	$LBB38_cuda_kernel_chokefilter_x;
$Lt_40_16898:
	.loc	20	177	0
	// %p2 = (inDeviceFormat == 0), reused by every format-dependent load below.
	// %r9 = ctaid.y (row), %rd1 = idPitch * row + x (pixel index, zero-extended).
	ld.param.s32 	%r7, [__cudaparm_cuda_kernel_chokefilter_x_inDeviceFormat];
	mov.s32 	%r8, 0;
	setp.eq.s32 	%p2, %r7, %r8;
	mov.u32 	%r9, %ctaid.y;
	ld.param.u32 	%r10, [__cudaparm_cuda_kernel_chokefilter_x_idPitch];
	mul.lo.u32 	%r11, %r10, %r9;
	add.u32 	%r12, %r11, %r5;
	cvt.u64.u32 	%rd1, %r12;
	ld.param.u64 	%rd2, [__cudaparm_cuda_kernel_chokefilter_x_id];
	@!%p2 bra 	$Lt_40_17666;
	.loc	20	184	0
	// Format 0: %rd5 = id + pixel * 8 + 6.
	mul.lo.u64 	%rd3, %rd1, 8;
	add.u64 	%rd4, %rd2, %rd3;
	add.u64 	%rd5, %rd4, 6;
	bra.uni 	$Lt_40_17410;
$Lt_40_17666:
	.loc	20	188	0
	// Other formats: %rd5 = id + pixel * 16 + 12.
	mul.lo.u64 	%rd6, %rd1, 16;
	add.u64 	%rd7, %rd2, %rd6;
	add.u64 	%rd5, %rd7, 12;
$Lt_40_17410:
	.loc	20	194	0
	// %f1 = this pixel's value as f32 (half is widened with cvt.ftz.f32.f16).
	@!%p2 bra 	$Lt_40_17922;
	.loc	20	167	0
	ld.global.u16 	%r13, [%rd5+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r13;
	cvt.ftz.f32.f16	%f1, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_5;
$Lt_40_17922:
	.loc	20	169	0
	ld.global.f32 	%f1, [%rd5+0];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_5:
	.loc	20	194	0
	// %rd11 = shared base + tid.x * 4; the centre slot is at %rd11 + 64
	// (float index tid.x + 16).  %rd9 is not read again in this kernel.
	mov.u64 	%rd8, __cuda___cuda_local_var_91918_31_non_const_data208;
	cvt.u64.u32 	%rd9, %r4;
	mul.wide.u32 	%rd10, %r4, 4;
	add.u64 	%rd11, %rd10, %rd8;
	st.shared.f32 	[%rd11+64], %f1;
	// %r14 = r, %r15 = r + 1.  Threads with r + 1 <= tid.x (unsigned) skip the
	// left halo and go on to the right-halo test.
	ld.param.s32 	%r14, [__cudaparm_cuda_kernel_chokefilter_x_r];
	add.s32 	%r15, %r14, 1;
	setp.le.u32 	%p3, %r15, %r4;
	@%p3 bra 	$Lt_40_18690;
	// Left halo slot: %rd14 + 60 = centre slot - (r + 1) floats.
	// %rd12 is not read again in this kernel.
	neg.s32 	%r16, %r14;
	cvt.s64.s32 	%rd12, %r16;
	mul.wide.s32 	%rd13, %r16, 4;
	add.u64 	%rd14, %rd11, %rd13;
	setp.le.s32 	%p4, %r15, %r5;
	@%p4 bra 	$Lt_40_19202;
	.loc	20	198	0
	// x < r + 1: no pixel that far left, replicate this thread's own value.
	st.shared.f32 	[%rd14+60], %f1;
	bra.uni 	$Lt_40_20482;
$Lt_40_19202:
	.loc	20	199	0
	// x >= r + 1: fetch the pixel r + 1 columns to the left.
	// %rd15 = -4 * r; scaled by 2 (half) or 4 (float) it becomes -r pixels in
	// bytes, and the constant -8 / -16 in the load adds the remaining pixel.
	mul.lo.s32 	%r17, %r14, -4;
	cvt.s64.s32 	%rd15, %r17;
	@!%p2 bra 	$Lt_40_19458;
	.loc	20	167	0
	mul.lo.u64 	%rd16, %rd15, 2;
	add.u64 	%rd17, %rd5, %rd16;
	ld.global.u16 	%r18, [%rd17+-8];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r18;
	cvt.ftz.f32.f16	%f2, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_3;
$Lt_40_19458:
	.loc	20	169	0
	mul.lo.u64 	%rd18, %rd15, 4;
	add.u64 	%rd19, %rd5, %rd18;
	ld.global.f32 	%f2, [%rd19+-16];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_3:
	.loc	20	199	0
	st.shared.f32 	[%rd14+60], %f2;
	bra.uni 	$Lt_40_20482;
$Lt_40_18690:
	// Right halo: only threads with tid.x >= ntid.x - r - 1 (unsigned) take part.
	sub.u32 	%r19, %r1, %r14;
	sub.u32 	%r20, %r19, 1;
	setp.lt.u32 	%p5, %r4, %r20;
	@%p5 bra 	$Lt_40_20482;
	// Right halo slot: %rd22 + 68 = centre slot + (r + 1) floats.
	// %rd20 is not read again in this kernel.
	cvt.s64.s32 	%rd20, %r14;
	mul.wide.s32 	%rd21, %r14, 4;
	add.u64 	%rd22, %rd11, %rd21;
	sub.s32 	%r21, %r6, %r14;
	sub.s32 	%r22, %r21, 1;
	setp.lt.s32 	%p6, %r5, %r22;
	@%p6 bra 	$Lt_40_20738;
	.loc	20	201	0
	// x >= w - r - 1: no pixel that far right, replicate this thread's own value.
	st.shared.f32 	[%rd22+68], %f1;
	bra.uni 	$Lt_40_20482;
$Lt_40_20738:
	.loc	20	202	0
	// x < w - r - 1: fetch the pixel r + 1 columns to the right (mirror of the
	// left-halo address computation).
	mul.lo.s32 	%r23, %r14, 4;
	cvt.s64.s32 	%rd23, %r23;
	@!%p2 bra 	$Lt_40_20994;
	.loc	20	167	0
	mul.lo.u64 	%rd24, %rd23, 2;
	add.u64 	%rd25, %rd5, %rd24;
	ld.global.u16 	%r24, [%rd25+8];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f3, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_1;
$Lt_40_20994:
	.loc	20	169	0
	mul.lo.u64 	%rd26, %rd23, 4;
	add.u64 	%rd27, %rd5, %rd26;
	ld.global.f32 	%f3, [%rd27+16];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_217_1:
	.loc	20	202	0
	st.shared.f32 	[%rd22+68], %f3;
$Lt_40_20482:
$Lt_40_19970:
$Lt_40_18434:
	.loc	20	205	0
	// All staging stores are complete past this barrier.
	bar.sync 	0;
	// %rd33 = od + odPitch * row + x * 8: destination of the two-float result.
	// %rd30 is not read again in this kernel.
	ld.param.u64 	%rd28, [__cudaparm_cuda_kernel_chokefilter_x_od];
	ld.param.u32 	%r25, [__cudaparm_cuda_kernel_chokefilter_x_odPitch];
	mul.lo.u32 	%r26, %r25, %r9;
	cvt.u64.u32 	%rd29, %r26;
	cvt.s64.s32 	%rd30, %r5;
	mul.wide.s32 	%rd31, %r5, 8;
	add.u64 	%rd32, %rd29, %rd31;
	add.u64 	%rd33, %rd28, %rd32;
	// ---- r == 3: inner min over 7 slots (+52 .. +76), outer adds +48 and +80.
	mov.u32 	%r27, 3;
	setp.ne.s32 	%p7, %r14, %r27;
	@%p7 bra 	$Lt_40_21762;
	.loc	20	156	0
	ld.shared.f32 	%f4, [%rd11+64];
	ld.shared.f32 	%f5, [%rd11+60];
	ld.shared.f32 	%f6, [%rd11+68];
	ld.shared.f32 	%f7, [%rd11+56];
	ld.shared.f32 	%f8, [%rd11+72];
	ld.shared.f32 	%f9, [%rd11+52];
	ld.shared.f32 	%f10, [%rd11+76];
	min.ftz.f32 	%f11, %f9, %f7;
	min.ftz.f32 	%f12, %f11, %f5;
	min.ftz.f32 	%f13, %f12, %f4;
	min.ftz.f32 	%f14, %f13, %f6;
	min.ftz.f32 	%f15, %f14, %f8;
	min.ftz.f32 	%f16, %f15, %f10;
	.loc	20	157	0
	ld.shared.f32 	%f17, [%rd11+48];
	ld.shared.f32 	%f18, [%rd11+80];
	min.ftz.f32 	%f19, %f17, %f18;
	min.ftz.f32 	%f20, %f16, %f19;
	st.global.v2.f32 	[%rd33+0], {%f20,%f16};
	.loc	20	208	0
	bra.uni 	$LBB38_cuda_kernel_chokefilter_x;
$Lt_40_21762:
	// ---- r == 2: inner min over 5 slots (+56 .. +72), outer adds +52 and +76.
	mov.u32 	%r28, 2;
	setp.ne.s32 	%p8, %r14, %r28;
	@%p8 bra 	$Lt_40_22274;
	.loc	20	156	0
	ld.shared.f32 	%f21, [%rd11+64];
	ld.shared.f32 	%f22, [%rd11+60];
	ld.shared.f32 	%f23, [%rd11+68];
	ld.shared.f32 	%f24, [%rd11+56];
	ld.shared.f32 	%f25, [%rd11+72];
	min.ftz.f32 	%f26, %f24, %f22;
	min.ftz.f32 	%f27, %f26, %f21;
	min.ftz.f32 	%f28, %f27, %f23;
	min.ftz.f32 	%f29, %f28, %f25;
	.loc	20	157	0
	ld.shared.f32 	%f30, [%rd11+52];
	ld.shared.f32 	%f31, [%rd11+76];
	min.ftz.f32 	%f32, %f30, %f31;
	min.ftz.f32 	%f33, %f29, %f32;
	st.global.v2.f32 	[%rd33+0], {%f33,%f29};
	.loc	20	209	0
	bra.uni 	$LBB38_cuda_kernel_chokefilter_x;
$Lt_40_22274:
	// ---- r == 1: inner min over 3 slots (+60 .. +68), outer adds +56 and +72.
	mov.u32 	%r29, 1;
	setp.ne.s32 	%p9, %r14, %r29;
	@%p9 bra 	$Lt_40_22786;
	.loc	20	156	0
	ld.shared.f32 	%f34, [%rd11+64];
	ld.shared.f32 	%f35, [%rd11+60];
	ld.shared.f32 	%f36, [%rd11+68];
	min.ftz.f32 	%f37, %f35, %f34;
	min.ftz.f32 	%f38, %f37, %f36;
	.loc	20	157	0
	ld.shared.f32 	%f39, [%rd11+56];
	ld.shared.f32 	%f40, [%rd11+72];
	min.ftz.f32 	%f41, %f39, %f40;
	min.ftz.f32 	%f42, %f38, %f41;
	st.global.v2.f32 	[%rd33+0], {%f42,%f38};
	.loc	20	210	0
	bra.uni 	$LBB38_cuda_kernel_chokefilter_x;
$Lt_40_22786:
	// ---- r == 0: inner is the centre slot itself, outer adds +60 and +68.
	mov.u32 	%r30, 0;
	setp.ne.s32 	%p10, %r14, %r30;
	@%p10 bra 	$Lt_40_23298;
	.loc	20	150	0
	ld.shared.f32 	%f43, [%rd11+64];
	.loc	20	157	0
	ld.shared.f32 	%f44, [%rd11+60];
	ld.shared.f32 	%f45, [%rd11+68];
	min.ftz.f32 	%f46, %f44, %f45;
	min.ftz.f32 	%f47, %f43, %f46;
	st.global.v2.f32 	[%rd33+0], {%f47,%f43};
	.loc	20	211	0
	bra.uni 	$LBB38_cuda_kernel_chokefilter_x;
$Lt_40_23298:
	.loc	20	214	0
	// ---- Any other r: loop version.
	// %rd14 = %rd11 - 4 * r; %f48 starts as the slot at offset -r from the centre.
	// %rd34 is not read again in this kernel.
	neg.s32 	%r31, %r14;
	cvt.s64.s32 	%rd34, %r31;
	mul.wide.s32 	%rd35, %r31, 4;
	add.u64 	%rd14, %rd11, %rd35;
	ld.shared.f32 	%f48, [%rd14+64];
	// Skip the loop entirely when 2 * r <= 0.
	mul.lo.s32 	%r32, %r14, 2;
	mov.u32 	%r33, 0;
	setp.le.s32 	%p11, %r32, %r33;
	@%p11 bra 	$Lt_40_24578;
	// %r34 .. %r39 and %r42 form a (2r + 1) / 2 value that the loop below never
	// reads; the loop is controlled by %r40 (starts at -r) and %r41 (= r).
	add.s32 	%r34, %r32, 1;
	shr.s32 	%r35, %r34, 31;
	mov.s32 	%r36, 1;
	and.b32 	%r37, %r35, %r36;
	add.s32 	%r38, %r37, %r34;
	shr.s32 	%r39, %r38, 1;
	mov.s32 	%r40, %r31;
	sub.s32 	%r41, %r32, %r14;
	add.s64 	%rd36, %rd10, %rd35;
	add.u64 	%rd37, %rd8, %rd36;
	mov.s32 	%r42, %r39;
$Lt_40_25090:
 //<loop> Loop body line 214, nesting depth: 1, estimated iterations: unknown
	.loc	20	216	0
	// Two slots per iteration: offsets i + 1 and i + 2 from the centre, where i
	// is the counter in %r40; %rd37 advances by 8 bytes alongside it.
	ld.shared.f32 	%f49, [%rd37+68];
	min.ftz.f32 	%f50, %f49, %f48;
	.loc	20	217	0
	ld.shared.f32 	%f51, [%rd37+72];
	min.ftz.f32 	%f48, %f51, %f50;
	add.s32 	%r40, %r40, 2;
	add.u64 	%rd37, %rd37, 8;
	setp.lt.s32 	%p12, %r40, %r41;
	@%p12 bra 	$Lt_40_25090;
$Lt_40_24578:
	.loc	20	221	0
	// Outer result: fold in the slots at offsets -(r + 1) and +(r + 1).
	// %rd38 is not read again in this kernel.
	ld.shared.f32 	%f52, [%rd14+60];
	cvt.s64.s32 	%rd38, %r14;
	mul.wide.s32 	%rd39, %r14, 4;
	add.u64 	%rd40, %rd11, %rd39;
	ld.shared.f32 	%f53, [%rd40+68];
	min.ftz.f32 	%f54, %f52, %f53;
	min.ftz.f32 	%f55, %f48, %f54;
	st.global.v2.f32 	[%rd33+0], {%f55,%f48};
$LBB38_cuda_kernel_chokefilter_x:
	.loc	20	223	0
	exit;
$LDWend_cuda_kernel_chokefilter_x:
	} // cuda_kernel_chokefilter_x

	// cuda_kernel_chokefilter_y -- running-minimum filter along y over float
	// pairs, followed by a blend, an optional remap and a format-dependent store.
	//
	// Each thread handles x = ctaid.x * ntid.x + tid.x,
	// y = ctaid.y * ntid.y + tid.y and exits when w <= x or h <= y.
	//
	// Input (id): one 8-byte float pair at byte offset idPitch * y + x * 8, i.e.
	// idPitch is applied in bytes.
	//
	// Staging: the pair goes to the shared array at pair index
	// tid.x * 25 + tid.y + 8.  Threads with tid.y < r + 1 also fill the slot
	// r + 1 pairs below that index, and (independently) threads with
	// tid.y >= ntid.y - r - 1 fill the slot r + 1 pairs above it (unsigned
	// compares).  The halo value comes from the row r + 1 away when that row is
	// inside [0, h), else it is the thread's own pair.
	//
	// Filter: %f40 = min of first floats over slot offsets -(r+1) .. +(r+1),
	//         %f41 = min of second floats over slot offsets -r .. +r.
	// r = 3, 2, 1, 0 have unrolled paths; every other r uses the loop.
	//
	// Result: v = f * %f40 + (1 - f) * %f41; when remap != 0,
	// v = saturate(v * p[+140] + p[+144]) with `p` a .const symbol declared
	// outside this kernel.
	//
	// Output (od): pixel index x + odPitch * y, i.e. odPitch is applied in
	// pixels.  inDeviceFormat == 0 stores v as a 16-bit half at byte 6 of an
	// 8-byte pixel; any other value stores v as f32 at byte 12 of a 16-byte pixel.
	//
	// NOTE(review): threads that exit on the bounds test never reach bar.sync.
	// NOTE(review): nothing in this kernel bounds r, ntid.x or ntid.y against the
	// 3200-byte shared array; presumably the launch code guarantees it -- confirm.
	.entry cuda_kernel_chokefilter_y (
		.param .u64 __cudaparm_cuda_kernel_chokefilter_y_id,
		.param .u64 __cudaparm_cuda_kernel_chokefilter_y_od,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_y_idPitch,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_y_odPitch,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_y_w,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_y_h,
		.param .s32 __cudaparm_cuda_kernel_chokefilter_y_r,
		.param .f32 __cudaparm_cuda_kernel_chokefilter_y_f,
		.param .s8 __cudaparm_cuda_kernel_chokefilter_y_remap,
		.param .u32 __cudaparm_cuda_kernel_chokefilter_y_inDeviceFormat)
	{
	.reg .u32 %r<70>;
	.reg .u64 %rd<35>;
	.reg .f32 %f<96>;
	.reg .pred %p<15>;
	// 3200 bytes = 400 float pairs of per-block staging.
	.shared .align 8 .b8 __cuda___cuda_local_var_91987_32_non_const_data1408[3200];
	.loc	20	242	0
$LDWbegin_cuda_kernel_chokefilter_y:
	.loc	20	246	0
	// %r4 = ntid.y, %r7 = tid.x, %r8 = x, %r9 = tid.y, %r10 = y,
	// %r11 = h, %r12 = w.
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	cvt.s32.u32 	%r4, %ntid.y;
	cvt.s32.u32 	%r5, %ctaid.y;
	mul.lo.s32 	%r6, %r5, %r4;
	mov.u32 	%r7, %tid.x;
	add.u32 	%r8, %r3, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r6, %r9;
	ld.param.s32 	%r11, [__cudaparm_cuda_kernel_chokefilter_y_h];
	ld.param.s32 	%r12, [__cudaparm_cuda_kernel_chokefilter_y_w];
	// set.le yields all-ones for true; neg turns that into 1.  %r17 is non-zero
	// when w <= x or h <= y, in which case the thread exits.
	set.le.u32.s32 	%r13, %r12, %r8;
	neg.s32 	%r14, %r13;
	set.le.u32.s32 	%r15, %r11, %r10;
	neg.s32 	%r16, %r15;
	or.b32 	%r17, %r14, %r16;
	mov.u32 	%r18, 0;
	setp.eq.s32 	%p1, %r17, %r18;
	@%p1 bra 	$Lt_41_14082;
	bra.uni 	$LBB30_cuda_kernel_chokefilter_y;
$Lt_41_14082:
	.loc	20	247	0
	mov.u64 	%rd1, __cuda___cuda_local_var_91987_32_non_const_data1408;
	.loc	20	254	0
	// %rd4 = shared base + (tid.x * 25 + tid.y) * 8; the centre slot is %rd4 + 64.
	// %rd7 = id + idPitch * y + x * 8 (this thread's input pair).
	// %rd2 is not read again in this kernel.
	mul.lo.u32 	%r19, %r7, 25;
	ld.param.s32 	%r20, [__cudaparm_cuda_kernel_chokefilter_y_idPitch];
	add.u32 	%r21, %r19, %r9;
	cvt.u64.u32 	%rd2, %r21;
	mul.wide.u32 	%rd3, %r21, 8;
	add.u64 	%rd4, %rd3, %rd1;
	mul.lo.s32 	%r22, %r20, %r10;
	mul.lo.s32 	%r23, %r8, 8;
	add.s32 	%r24, %r22, %r23;
	cvt.u64.s32 	%rd5, %r24;
	ld.param.u64 	%rd6, [__cudaparm_cuda_kernel_chokefilter_y_id];
	add.u64 	%rd7, %rd5, %rd6;
	ld.global.v2.f32 	{%f1,%f2}, [%rd7+0];
	st.shared.v2.f32 	[%rd4+64], {%f1,%f2};
	// %r25 = r, %r26 = r + 1.  Threads with r + 1 <= tid.y (unsigned) skip the
	// lower halo.
	ld.param.s32 	%r25, [__cudaparm_cuda_kernel_chokefilter_y_r];
	add.s32 	%r26, %r25, 1;
	setp.le.u32 	%p2, %r26, %r9;
	@%p2 bra 	$Lt_41_15106;
	// Lower halo slot: %rd10 + 56 = centre slot - (r + 1) pairs.
	// %rd8 is not read again in this kernel.
	neg.s32 	%r27, %r25;
	cvt.s64.s32 	%rd8, %r27;
	mul.wide.s32 	%rd9, %r27, 8;
	add.u64 	%rd10, %rd4, %rd9;
	setp.le.s32 	%p3, %r26, %r10;
	@%p3 bra 	$Lt_41_15362;
	.loc	20	256	0
	// y < r + 1: replicate this thread's own pair (second float re-read from the
	// centre slot stored above).
	ld.shared.f32 	%f3, [%rd4+68];
	st.shared.v2.f32 	[%rd10+56], {%f1,%f3};
	bra.uni 	$Lt_41_15106;
$Lt_41_15362:
	.loc	20	257	0
	// y >= r + 1: fetch the pair r + 1 rows earlier.  -(r + 1) * idPitch is
	// divided by 8 with truncation toward zero (shr/and/add/shr sequence) to get
	// an offset in pairs, then scaled back to bytes by mul.wide.
	// %rd11 is not read again in this kernel.
	mul.lo.s32 	%r28, %r26, %r20;
	neg.s32 	%r29, %r28;
	shr.s32 	%r30, %r29, 31;
	mov.s32 	%r31, 7;
	and.b32 	%r32, %r30, %r31;
	add.s32 	%r33, %r32, %r29;
	shr.s32 	%r34, %r33, 3;
	cvt.s64.s32 	%rd11, %r34;
	mul.wide.s32 	%rd12, %r34, 8;
	add.u64 	%rd13, %rd7, %rd12;
	ld.global.v2.f32 	{%f4,%f5}, [%rd13+0];
	st.shared.v2.f32 	[%rd10+56], {%f4,%f5};
$Lt_41_15106:
$Lt_41_14594:
	// Upper halo: only threads with tid.y >= ntid.y - r - 1 (unsigned) take part.
	// Unlike the x kernel this test is independent of the lower-halo test.
	sub.u32 	%r35, %r4, %r25;
	sub.u32 	%r36, %r35, 1;
	setp.lt.u32 	%p4, %r9, %r36;
	@%p4 bra 	$Lt_41_16130;
	// Upper halo slot: %rd16 + 72 = centre slot + (r + 1) pairs.
	// %rd14 is not read again in this kernel.
	cvt.s64.s32 	%rd14, %r25;
	mul.wide.s32 	%rd15, %r25, 8;
	add.u64 	%rd16, %rd4, %rd15;
	sub.s32 	%r37, %r11, %r25;
	sub.s32 	%r38, %r37, 1;
	setp.lt.s32 	%p5, %r10, %r38;
	@%p5 bra 	$Lt_41_16386;
	// y >= h - r - 1: replicate this thread's own pair from the centre slot.
	ld.shared.v2.f32 	{%f6,%f7}, [%rd4+64];
	st.shared.v2.f32 	[%rd16+72], {%f6,%f7};
	.loc	20	260	0
	bra.uni 	$Lt_41_16130;
$Lt_41_16386:
	.loc	20	261	0
	// y < h - r - 1: fetch the pair r + 1 rows later (same divide-by-8 sequence
	// as the lower halo, without the negation).
	// %rd17 is not read again in this kernel.
	mul.lo.s32 	%r39, %r26, %r20;
	shr.s32 	%r40, %r39, 31;
	mov.s32 	%r41, 7;
	and.b32 	%r42, %r40, %r41;
	add.s32 	%r43, %r42, %r39;
	shr.s32 	%r44, %r43, 3;
	cvt.s64.s32 	%rd17, %r44;
	mul.wide.s32 	%rd18, %r44, 8;
	add.u64 	%rd19, %rd7, %rd18;
	ld.global.v2.f32 	{%f8,%f9}, [%rd19+0];
	st.shared.v2.f32 	[%rd16+72], {%f8,%f9};
$Lt_41_16130:
$Lt_41_15618:
	.loc	20	264	0
	// All staging stores are complete past this barrier.
	bar.sync 	0;
	// ---- r == 3: first floats over 9 slots (+32 .. +96), second floats over
	// 7 slots (+40 .. +88).
	mov.u32 	%r45, 3;
	setp.ne.s32 	%p6, %r25, %r45;
	@%p6 bra 	$Lt_41_16898;
	ld.shared.v2.f32 	{%f10,%f11}, [%rd4+80];
	ld.shared.v2.f32 	{%f12,%f13}, [%rd4+72];
	ld.shared.v2.f32 	{%f14,%f15}, [%rd4+64];
	ld.shared.v2.f32 	{%f16,%f17}, [%rd4+56];
	ld.shared.v2.f32 	{%f18,%f19}, [%rd4+48];
	.loc	20	233	0
	ld.shared.f32 	%f20, [%rd4+32];
	ld.shared.v2.f32 	{%f21,%f22}, [%rd4+40];
	min.ftz.f32 	%f23, %f20, %f21;
	min.ftz.f32 	%f24, %f18, %f23;
	min.ftz.f32 	%f25, %f16, %f24;
	min.ftz.f32 	%f26, %f14, %f25;
	min.ftz.f32 	%f27, %f12, %f26;
	min.ftz.f32 	%f28, %f10, %f27;
	ld.shared.v2.f32 	{%f29,%f30}, [%rd4+88];
	.loc	20	235	0
	min.ftz.f32 	%f31, %f29, %f28;
	.loc	20	236	0
	min.ftz.f32 	%f32, %f22, %f19;
	min.ftz.f32 	%f33, %f17, %f32;
	min.ftz.f32 	%f34, %f15, %f33;
	min.ftz.f32 	%f35, %f13, %f34;
	min.ftz.f32 	%f36, %f11, %f35;
	min.ftz.f32 	%f37, %f30, %f36;
	.loc	20	238	0
	ld.shared.f32 	%f38, [%rd4+96];
	min.ftz.f32 	%f39, %f38, %f31;
	.loc	20	267	0
	mov.f32 	%f40, %f39;
	mov.f32 	%f41, %f37;
	bra.uni 	$Lt_41_18178;
$Lt_41_16898:
	// ---- r == 2: first floats over 7 slots (+40 .. +88), second floats over
	// 5 slots (+48 .. +80).
	mov.u32 	%r46, 2;
	setp.ne.s32 	%p7, %r25, %r46;
	@%p7 bra 	$Lt_41_17410;
	.loc	20	238	0
	ld.shared.f32 	%f42, [%rd4+88];
	ld.shared.v2.f32 	{%f43,%f44}, [%rd4+80];
	ld.shared.v2.f32 	{%f45,%f46}, [%rd4+72];
	ld.shared.v2.f32 	{%f47,%f48}, [%rd4+64];
	ld.shared.v2.f32 	{%f49,%f50}, [%rd4+56];
	ld.shared.f32 	%f51, [%rd4+40];
	ld.shared.v2.f32 	{%f52,%f53}, [%rd4+48];
	min.ftz.f32 	%f54, %f51, %f52;
	min.ftz.f32 	%f55, %f49, %f54;
	min.ftz.f32 	%f56, %f47, %f55;
	min.ftz.f32 	%f57, %f45, %f56;
	min.ftz.f32 	%f58, %f43, %f57;
	min.ftz.f32 	%f59, %f42, %f58;
	.loc	20	268	0
	mov.f32 	%f40, %f59;
	min.ftz.f32 	%f60, %f53, %f50;
	min.ftz.f32 	%f61, %f48, %f60;
	min.ftz.f32 	%f62, %f46, %f61;
	min.ftz.f32 	%f41, %f44, %f62;
	bra.uni 	$Lt_41_18178;
$Lt_41_17410:
	// ---- r == 1: first floats over 5 slots (+48 .. +80), second floats over
	// 3 slots (+56 .. +72).
	mov.u32 	%r47, 1;
	setp.ne.s32 	%p8, %r25, %r47;
	@%p8 bra 	$Lt_41_17922;
	.loc	20	269	0
	ld.shared.f32 	%f63, [%rd4+80];
	ld.shared.v2.f32 	{%f64,%f65}, [%rd4+72];
	ld.shared.v2.f32 	{%f66,%f67}, [%rd4+64];
	ld.shared.f32 	%f68, [%rd4+48];
	ld.shared.v2.f32 	{%f69,%f70}, [%rd4+56];
	min.ftz.f32 	%f71, %f68, %f69;
	min.ftz.f32 	%f72, %f66, %f71;
	min.ftz.f32 	%f73, %f64, %f72;
	min.ftz.f32 	%f40, %f63, %f73;
	min.ftz.f32 	%f74, %f70, %f67;
	min.ftz.f32 	%f41, %f65, %f74;
	bra.uni 	$Lt_41_18178;
$Lt_41_17922:
	// ---- r == 0: first floats over 3 slots (+56 .. +72); the second float is
	// the centre slot's, unchanged.
	mov.u32 	%r48, 0;
	setp.ne.s32 	%p9, %r25, %r48;
	@%p9 bra 	$Lt_41_18434;
	.loc	20	270	0
	ld.shared.f32 	%f75, [%rd4+72];
	ld.shared.f32 	%f76, [%rd4+56];
	ld.shared.v2.f32 	{%f77,%f41}, [%rd4+64];
	min.ftz.f32 	%f78, %f76, %f77;
	min.ftz.f32 	%f40, %f75, %f78;
	bra.uni 	$Lt_41_18178;
$Lt_41_18434:
	.loc	20	272	0
	// ---- Any other r: loop version.
	// %rd22 = %rd4 - 8 * r.  Seed %f40 with the first floats of the slots at
	// offsets -(r + 1) and -r, and %f41 with the second float at offset -r.
	// %rd20 is not read again in this kernel.
	neg.s32 	%r49, %r25;
	cvt.s64.s32 	%rd20, %r49;
	mul.wide.s32 	%rd21, %r49, 8;
	add.u64 	%rd22, %rd4, %rd21;
	ld.shared.f32 	%f79, [%rd22+56];
	ld.shared.v2.f32 	{%f80,%f41}, [%rd22+64];
	min.ftz.f32 	%f40, %f79, %f80;
	.loc	20	273	0
	// Skip the loop entirely when 2 * r <= 0.
	mul.lo.s32 	%r50, %r25, 2;
	mov.u32 	%r51, 0;
	setp.le.s32 	%p10, %r50, %r51;
	@%p10 bra 	$Lt_41_19714;
	// %r52 .. %r57 and %r60 form a (2r + 1) / 2 value that the loop below never
	// reads; the loop is controlled by %r58 (starts at -r) and %r59 (= r).
	add.s32 	%r52, %r50, 1;
	shr.s32 	%r53, %r52, 31;
	mov.s32 	%r54, 1;
	and.b32 	%r55, %r53, %r54;
	add.s32 	%r56, %r55, %r52;
	shr.s32 	%r57, %r56, 1;
	mov.s32 	%r58, %r49;
	sub.s32 	%r59, %r50, %r25;
	add.s64 	%rd23, %rd3, %rd21;
	add.u64 	%rd24, %rd1, %rd23;
	mov.s32 	%r60, %r57;
$Lt_41_20226:
 //<loop> Loop body line 273, nesting depth: 1, estimated iterations: unknown
	// Two slots per iteration: offsets i + 1 and i + 2 from the centre, where i
	// is the counter in %r58; %rd24 advances by 16 bytes alongside it.
	ld.shared.v2.f32 	{%f81,%f82}, [%rd24+72];
	.loc	20	275	0
	min.ftz.f32 	%f83, %f81, %f40;
	.loc	20	276	0
	min.ftz.f32 	%f84, %f82, %f41;
	ld.shared.v2.f32 	{%f85,%f86}, [%rd24+80];
	.loc	20	277	0
	min.ftz.f32 	%f40, %f85, %f83;
	.loc	20	278	0
	min.ftz.f32 	%f41, %f86, %f84;
	add.s32 	%r58, %r58, 2;
	add.u64 	%rd24, %rd24, 16;
	setp.lt.s32 	%p11, %r58, %r59;
	@%p11 bra 	$Lt_41_20226;
$Lt_41_19714:
	.loc	20	280	0
	// Fold in the first float of the slot at offset +(r + 1).
	// %rd25 is not read again in this kernel.
	cvt.s64.s32 	%rd25, %r25;
	mul.wide.s32 	%rd26, %r25, 8;
	add.u64 	%rd27, %rd4, %rd26;
	ld.shared.f32 	%f87, [%rd27+72];
	min.ftz.f32 	%f40, %f87, %f40;
$Lt_41_18178:
$Lt_41_17666:
$Lt_41_17154:
$Lt_41_16642:
	.loc	20	282	0
	// Blend: %f40 = f * %f40 + %f41 * (1 - f).
	ld.param.f32 	%f88, [__cudaparm_cuda_kernel_chokefilter_y_f];
	mov.f32 	%f89, 0f3f800000;    	// 1
	sub.ftz.f32 	%f90, %f89, %f88;
	mul.ftz.f32 	%f91, %f41, %f90;
	fma.rn.ftz.f32 	%f40, %f88, %f40, %f91;
	// Optional remap, skipped when the remap flag is 0.
	ld.param.s8 	%r61, [__cudaparm_cuda_kernel_chokefilter_y_remap];
	mov.u32 	%r62, 0;
	setp.eq.s32 	%p12, %r61, %r62;
	@%p12 bra 	$Lt_41_20738;
	.loc	20	286	0
	// %f40 = saturate(%f40 * p[+140] + p[+144]); cvt.sat clamps to [0, 1].
	ld.const.f32 	%f92, [p+144];
	ld.const.f32 	%f93, [p+140];
	fma.rn.ftz.f32 	%f94, %f40, %f93, %f92;
	cvt.ftz.sat.f32.f32 	%f40, %f94;
$Lt_41_20738:
	// %rd28 = x + odPitch * y (output pixel index, sign-extended).
	ld.param.s32 	%r63, [__cudaparm_cuda_kernel_chokefilter_y_odPitch];
	mul.lo.s32 	%r64, %r63, %r10;
	add.s32 	%r65, %r8, %r64;
	cvt.s64.s32 	%rd28, %r65;
	ld.param.u64 	%rd29, [__cudaparm_cuda_kernel_chokefilter_y_od];
	ld.param.s32 	%r66, [__cudaparm_cuda_kernel_chokefilter_y_inDeviceFormat];
	mov.u32 	%r67, 0;
	setp.ne.s32 	%p13, %r66, %r67;
	@%p13 bra 	$Lt_41_21506;
	.loc	20	291	0
	// Format 0: round to half (round-to-nearest) and store 16 bits at
	// od + pixel * 8 + 6.
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f40;
	mov.b32		%r68, %b1; }
	mul.lo.u64 	%rd30, %rd28, 8;
	add.u64 	%rd31, %rd29, %rd30;
	st.global.u16 	[%rd31+6], %r68;
	bra.uni 	$LBB30_cuda_kernel_chokefilter_y;
$Lt_41_21506:
	.loc	20	295	0
	// Other formats: store f32 at od + pixel * 16 + 12.
	mul.lo.u64 	%rd32, %rd28, 16;
	add.u64 	%rd33, %rd29, %rd32;
	st.global.f32 	[%rd33+12], %f40;
$LBB30_cuda_kernel_chokefilter_y:
	.loc	20	297	0
	exit;
$LDWend_cuda_kernel_chokefilter_y:
	} // cuda_kernel_chokefilter_y

	.entry cuda_kernel_boxfilter_x (
		.param .u64 __cudaparm_cuda_kernel_boxfilter_x_id,
		.param .u64 __cudaparm_cuda_kernel_boxfilter_x_od,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_x_idPitch,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_x_odPitch,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_x_w,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_x_h,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_x_r,
		.param .u32 __cudaparm_cuda_kernel_boxfilter_x_inDeviceFormat)
	{
	.reg .u32 %r<33>;
	.reg .u64 %rd<32>;
	.reg .f32 %f<40>;
	.reg .pred %p<11>;
	.shared .align 4 .b8 __cuda___cuda_local_var_92059_31_non_const_data4648[1152];
	.loc	20	314	0
$LDWbegin_cuda_kernel_boxfilter_x:
	.loc	20	317	0
	cvt.s32.u32 	%r1, %ntid.x;
	cvt.s32.u32 	%r2, %ctaid.x;
	mul.lo.s32 	%r3, %r2, %r1;
	mov.u32 	%r4, %tid.x;
	add.u32 	%r5, %r3, %r4;
	ld.param.s32 	%r6, [__cudaparm_cuda_kernel_boxfilter_x_w];
	setp.gt.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_42_15106;
	bra.uni 	$LBB32_cuda_kernel_boxfilter_x;
$Lt_42_15106:
	.loc	20	318	0
	ld.param.s32 	%r7, [__cudaparm_cuda_kernel_boxfilter_x_inDeviceFormat];
	mov.s32 	%r8, 0;
	setp.eq.s32 	%p2, %r7, %r8;
	mov.u32 	%r9, %ctaid.y;
	ld.param.u32 	%r10, [__cudaparm_cuda_kernel_boxfilter_x_idPitch];
	mul.lo.u32 	%r11, %r10, %r9;
	add.u32 	%r12, %r11, %r5;
	cvt.u64.u32 	%rd1, %r12;
	ld.param.u64 	%rd2, [__cudaparm_cuda_kernel_boxfilter_x_id];
	@!%p2 bra 	$Lt_42_15874;
	.loc	20	325	0
	mul.lo.u64 	%rd3, %rd1, 8;
	add.u64 	%rd4, %rd2, %rd3;
	add.u64 	%rd5, %rd4, 6;
	bra.uni 	$Lt_42_15618;
$Lt_42_15874:
	.loc	20	329	0
	mul.lo.u64 	%rd6, %rd1, 16;
	add.u64 	%rd7, %rd2, %rd6;
	add.u64 	%rd5, %rd7, 12;
$Lt_42_15618:
	.loc	20	334	0
	@!%p2 bra 	$Lt_42_16130;
	.loc	20	167	0
	ld.global.u16 	%r13, [%rd5+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r13;
	cvt.ftz.f32.f16	%f1, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_5;
$Lt_42_16130:
	.loc	20	169	0
	ld.global.f32 	%f1, [%rd5+0];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_5:
	.loc	20	334	0
	mov.u64 	%rd8, __cuda___cuda_local_var_92059_31_non_const_data4648;
	cvt.u64.u32 	%rd9, %r4;
	mul.wide.u32 	%rd10, %r4, 4;
	add.u64 	%rd11, %rd8, %rd10;
	st.shared.f32 	[%rd11+64], %f1;
	ld.param.s32 	%r14, [__cudaparm_cuda_kernel_boxfilter_x_r];
	add.s32 	%r15, %r14, 1;
	setp.le.u32 	%p3, %r15, %r4;
	@%p3 bra 	$Lt_42_16898;
	neg.s32 	%r16, %r14;
	cvt.s64.s32 	%rd12, %r16;
	mul.wide.s32 	%rd13, %r16, 4;
	add.u64 	%rd14, %rd11, %rd13;
	setp.le.s32 	%p4, %r15, %r5;
	@%p4 bra 	$Lt_42_17410;
	.loc	20	336	0
	st.shared.f32 	[%rd14+60], %f1;
	bra.uni 	$Lt_42_18690;
$Lt_42_17410:
	.loc	20	337	0
	mul.lo.s32 	%r17, %r14, -4;
	cvt.s64.s32 	%rd15, %r17;
	@!%p2 bra 	$Lt_42_17666;
	.loc	20	167	0
	mul.lo.u64 	%rd16, %rd15, 2;
	add.u64 	%rd17, %rd5, %rd16;
	ld.global.u16 	%r18, [%rd17+-8];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r18;
	cvt.ftz.f32.f16	%f2, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_3;
$Lt_42_17666:
	.loc	20	169	0
	mul.lo.u64 	%rd18, %rd15, 4;
	add.u64 	%rd19, %rd5, %rd18;
	ld.global.f32 	%f2, [%rd19+-16];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_3:
	.loc	20	337	0
	st.shared.f32 	[%rd14+60], %f2;
	bra.uni 	$Lt_42_18690;
$Lt_42_16898:
	sub.u32 	%r19, %r1, %r14;
	sub.u32 	%r20, %r19, 1;
	setp.lt.u32 	%p5, %r4, %r20;
	@%p5 bra 	$Lt_42_18690;
	cvt.s64.s32 	%rd20, %r14;
	mul.wide.s32 	%rd21, %r14, 4;
	add.u64 	%rd22, %rd11, %rd21;
	sub.s32 	%r21, %r6, %r14;
	sub.s32 	%r22, %r21, 1;
	setp.lt.s32 	%p6, %r5, %r22;
	@%p6 bra 	$Lt_42_18946;
	.loc	20	339	0
	st.shared.f32 	[%rd22+68], %f1;
	bra.uni 	$Lt_42_18690;
$Lt_42_18946:
	.loc	20	340	0
	mul.lo.s32 	%r23, %r14, 4;
	cvt.s64.s32 	%rd23, %r23;
	@!%p2 bra 	$Lt_42_19202;
	.loc	20	167	0
	mul.lo.u64 	%rd24, %rd23, 2;
	add.u64 	%rd25, %rd5, %rd24;
	ld.global.u16 	%r24, [%rd25+8];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f3, %b1; }
	bra.uni 	$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_1;
$Lt_42_19202:
	.loc	20	169	0
	mul.lo.u64 	%rd26, %rd23, 4;
	add.u64 	%rd27, %rd5, %rd26;
	ld.global.f32 	%f3, [%rd27+16];
$LDWendi__Z10ReadMemoryPfi17DevicePixelFormat_219_1:
	.loc	20	340	0
	st.shared.f32 	[%rd22+68], %f3;
$Lt_42_18690:
$Lt_42_18178:
$Lt_42_16642:
	.loc	20	343	0
	bar.sync 	0;
	ld.param.u32 	%r25, [__cudaparm_cuda_kernel_boxfilter_x_odPitch];
	mul.lo.u32 	%r26, %r25, %r9;
	mul.lo.s32 	%r27, %r5, 8;
	ld.shared.f32 	%f4, [%rd11+64];
	ld.shared.f32 	%f5, [%rd11+60];
	ld.shared.f32 	%f6, [%rd11+68];
	add.u32 	%r28, %r27, %r26;
	cvt.u64.u32 	%rd28, %r28;
	ld.param.u64 	%rd29, [__cudaparm_cuda_kernel_boxfilter_x_od];
	add.u64 	%rd30, %rd28, %rd29;
	mov.u32 	%r29, 3;
	setp.ne.s32 	%p7, %r14, %r29;
	@%p7 bra 	$Lt_42_19970;
	.loc	20	310	0
	ld.shared.f32 	%f7, [%rd11+56];
	ld.shared.f32 	%f8, [%rd11+72];
	ld.shared.f32 	%f9, [%rd11+52];
	ld.shared.f32 	%f10, [%rd11+76];
	add.ftz.f32 	%f11, %f9, %f7;
	add.ftz.f32 	%f12, %f5, %f11;
	add.ftz.f32 	%f13, %f12, %f4;
	add.ftz.f32 	%f14, %f6, %f13;
	add.ftz.f32 	%f15, %f14, %f8;
	add.ftz.f32 	%f16, %f15, %f10;
	.loc	20	311	0
	ld.shared.f32 	%f17, [%rd11+48];
	ld.shared.f32 	%f18, [%rd11+80];
	add.ftz.f32 	%f19, %f17, %f18;
	add.ftz.f32 	%f20, %f16, %f19;
	st.global.v2.f32 	[%rd30+0], {%f20,%f16};
	.loc	20	345	0
	bra.uni 	$LBB32_cuda_kernel_boxfilter_x;
$Lt_42_19970:
	mov.u32 	%r30, 2;
	setp.ne.s32 	%p8, %r14, %r30;
	@%p8 bra 	$Lt_42_20482;
	.loc	20	310	0
	ld.shared.f32 	%f21, [%rd11+56];
	ld.shared.f32 	%f22, [%rd11+72];
	add.ftz.f32 	%f23, %f5, %f21;
	add.ftz.f32 	%f24, %f23, %f4;
	add.ftz.f32 	%f25, %f6, %f24;
	add.ftz.f32 	%f26, %f25, %f22;
	.loc	20	311	0
	ld.shared.f32 	%f27, [%rd11+52];
	ld.shared.f32 	%f28, [%rd11+76];
	add.ftz.f32 	%f29, %f27, %f28;
	add.ftz.f32 	%f30, %f26, %f29;
	st.global.v2.f32 	[%rd30+0], {%f30,%f26};
	.loc	20	346	0
	bra.uni 	$LBB32_cuda_kernel_boxfilter_x;
$Lt_42_20482:
	mov.u32 	%r31, 1;
	setp.ne.s32 	%p9, %r14, %r31;
	@%p9 bra 	$Lt_42_20994;
	.loc	20	310	0
	add.ftz.f32 	%f31, %f5, %f4;
	add.ftz.f32 	%f32, %f6, %f31;
	.loc	20	311	0
	ld.shared.f32 	%f33, [%rd11+56];
	ld.shared.f32 	%f34, [%rd11+72];
	add.ftz.f32 	%f35, %f33, %f34;
	add.ftz.f32 	%f36, %f32, %f35;
	st.global.v2.f32 	[%rd30+0], {%f36,%f32};
	.loc	20	347	0
	bra.uni 	$LBB32_cuda_kernel_boxfilter_x;
$Lt_42_20994:
	.loc	20	311	0
	add.ftz.f32 	%f37, %f5, %f6;
	add.ftz.f32 	%f38, %f4, %f37;
	st.global.v2.f32 	[%rd30+0], {%f38,%f4};
$LBB32_cuda_kernel_boxfilter_x:
	.loc	20	350	0
	exit;
$LDWend_cuda_kernel_boxfilter_x:
	} // cuda_kernel_boxfilter_x

	// cuda_kernel_boxfilter_y
	// -----------------------
	// One thread per pixel (x, y). Each thread stages one two-float element of
	// the input into a shared tile, threads near the top/bottom of the block
	// additionally stage one halo element, and after a barrier every thread sums
	// a column of neighbours along tid.y. Two sums are formed: one over the .x
	// components using (2r+3) slots and one over the .y components using (2r+1)
	// slots. The result is
	//     f * sumX / (2r+3)^2  +  (1 - f) * sumY / (2r+1)^2
	// and is written to the 4th channel of the output pixel.
	//
	// Parameters (as used by the code below):
	//   id             base address of the input; elements are 8 bytes (v2.f32)
	//   od             base address of the output pixels
	//   idPitch        added directly to a byte offset, i.e. used as a BYTE pitch
	//   odPitch        multiplied by the pixel size, i.e. used as a PIXEL pitch
	//                  NOTE(review): the two pitches use different units - confirm
	//                  against the host-side launch code.
	//   w, h           bounds; threads with w <= x or h <= y exit immediately
	//   r              radius; 3, 2 and 1 have dedicated paths, every other value
	//                  falls into the final (smallest-window) path
	//   f              blend weight between the two sums
	//   remap          non-zero -> result = sat(result * p[+140] + p[+144])
	//   inDeviceFormat 0 -> 8-byte pixels, f16 store at byte 6;
	//                  otherwise 16-byte pixels, f32 store at byte 12
	// `p` is a .const symbol declared outside this part of the file.
	.entry cuda_kernel_boxfilter_y (
		.param .u64 __cudaparm_cuda_kernel_boxfilter_y_id,
		.param .u64 __cudaparm_cuda_kernel_boxfilter_y_od,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_y_idPitch,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_y_odPitch,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_y_w,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_y_h,
		.param .s32 __cudaparm_cuda_kernel_boxfilter_y_r,
		.param .f32 __cudaparm_cuda_kernel_boxfilter_y_f,
		.param .s8 __cudaparm_cuda_kernel_boxfilter_y_remap,
		.param .u32 __cudaparm_cuda_kernel_boxfilter_y_inDeviceFormat)
	{
	.reg .u32 %r<57>;
	.reg .u64 %rd<27>;
	.reg .f32 %f<99>;
	.reg .pred %p<12>;
	// Shared tile: 3200 bytes = 400 eight-byte elements.
	.shared .align 8 .b8 __cuda___cuda_local_var_92118_32_non_const_data5848[3200];
	.loc	20	373	0
$LDWbegin_cuda_kernel_boxfilter_y:
	.loc	20	377	0
	// %r8  = x = ctaid.x * ntid.x + tid.x
	// %r10 = y = ctaid.y * ntid.y + tid.y
	// %r7 / %r9 keep tid.x / tid.y, %r4 keeps ntid.y for later use.
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	cvt.s32.u32 	%r4, %ntid.y;
	cvt.s32.u32 	%r5, %ctaid.y;
	mul.lo.s32 	%r6, %r5, %r4;
	mov.u32 	%r7, %tid.x;
	add.u32 	%r8, %r3, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r6, %r9;
	ld.param.s32 	%r11, [__cudaparm_cuda_kernel_boxfilter_y_h];
	ld.param.s32 	%r12, [__cudaparm_cuda_kernel_boxfilter_y_w];
	// Bounds test (signed compares): continue only if !(w <= x) && !(h <= y).
	// set.* yields all-ones/zero; neg turns that into 1/0 before the OR.
	set.le.u32.s32 	%r13, %r12, %r8;
	neg.s32 	%r14, %r13;
	set.le.u32.s32 	%r15, %r11, %r10;
	neg.s32 	%r16, %r15;
	or.b32 	%r17, %r14, %r16;
	mov.u32 	%r18, 0;
	setp.eq.s32 	%p1, %r17, %r18;
	@%p1 bra 	$Lt_43_12290;
	// Out-of-range thread: exit without touching shared memory.
	bra.uni 	$LBB25_cuda_kernel_boxfilter_y;
$Lt_43_12290:
	.loc	20	385	0
	// %rd4 = tile base + (tid.x * 25 + tid.y) * 8.
	// The thread's own ("centre") slot is at [%rd4+64], i.e. 8 elements further.
	mul.lo.u32 	%r19, %r7, 25;
	ld.param.s32 	%r20, [__cudaparm_cuda_kernel_boxfilter_y_idPitch];
	mov.u64 	%rd1, __cuda___cuda_local_var_92118_32_non_const_data5848;
	add.u32 	%r21, %r19, %r9;
	cvt.u64.u32 	%rd2, %r21;
	mul.wide.u32 	%rd3, %r21, 8;
	add.u64 	%rd4, %rd1, %rd3;
	// %rd7 = id + idPitch * y + x * 8  (address of this thread's input element)
	mul.lo.s32 	%r22, %r20, %r10;
	mul.lo.s32 	%r23, %r8, 8;
	add.s32 	%r24, %r22, %r23;
	cvt.u64.s32 	%rd5, %r24;
	ld.param.u64 	%rd6, [__cudaparm_cuda_kernel_boxfilter_y_id];
	add.u64 	%rd7, %rd5, %rd6;
	// Stage the centre element.
	ld.global.v2.f32 	{%f1,%f2}, [%rd7+0];
	st.shared.v2.f32 	[%rd4+64], {%f1,%f2};
	// %r25 = r, %r26 = r + 1.
	ld.param.s32 	%r25, [__cudaparm_cuda_kernel_boxfilter_y_r];
	add.s32 	%r26, %r25, 1;
	// Unsigned test: threads with tid.y < r + 1 fill the leading halo;
	// all others go on to the trailing-halo test.
	setp.le.u32 	%p2, %r26, %r9;
	@%p2 bra 	$Lt_43_13058;
	// %rd10+56 == %rd4 + 64 - 8*(r+1): the slot (r+1) elements before the centre.
	neg.s32 	%r27, %r25;
	cvt.s64.s32 	%rd8, %r27;
	mul.wide.s32 	%rd9, %r27, 8;
	add.u64 	%rd10, %rd4, %rd9;
	// If y >= r + 1 the neighbour is fetched from global memory; otherwise
	// the centre value is duplicated into the halo slot.
	setp.le.s32 	%p3, %r26, %r10;
	@%p3 bra 	$Lt_43_13570;
	.loc	20	387	0
	// Duplicate: .x comes from %f1, .y is re-read from the centre slot.
	ld.shared.f32 	%f3, [%rd4+68];
	st.shared.v2.f32 	[%rd10+56], {%f1,%f3};
	bra.uni 	$Lt_43_14338;
$Lt_43_13570:
	.loc	20	388	0
	// %r34 = (-(r+1) * idPitch) / 8 as a signed division rounding toward zero
	// (the shr/and/add sequence adds 7 to negative values before the shift);
	// the load address is %rd7 + %r34 * 8.
	mul.lo.s32 	%r28, %r26, %r20;
	neg.s32 	%r29, %r28;
	shr.s32 	%r30, %r29, 31;
	mov.s32 	%r31, 7;
	and.b32 	%r32, %r30, %r31;
	add.s32 	%r33, %r32, %r29;
	shr.s32 	%r34, %r33, 3;
	cvt.s64.s32 	%rd11, %r34;
	mul.wide.s32 	%rd12, %r34, 8;
	add.u64 	%rd13, %rd7, %rd12;
	ld.global.v2.f32 	{%f4,%f5}, [%rd13+0];
	st.shared.v2.f32 	[%rd10+56], {%f4,%f5};
	bra.uni 	$Lt_43_14338;
$Lt_43_13058:
	// Trailing halo: only threads with tid.y >= ntid.y - r - 1 (unsigned) do work.
	sub.u32 	%r35, %r4, %r25;
	sub.u32 	%r36, %r35, 1;
	setp.lt.u32 	%p4, %r9, %r36;
	@%p4 bra 	$Lt_43_14338;
	// %rd16+72 == %rd4 + 64 + 8*(r+1): the slot (r+1) elements after the centre.
	cvt.s64.s32 	%rd14, %r25;
	mul.wide.s32 	%rd15, %r25, 8;
	add.u64 	%rd16, %rd4, %rd15;
	// If y < h - r - 1 the neighbour is fetched from global memory; otherwise
	// the centre value is duplicated.
	sub.s32 	%r37, %r11, %r25;
	sub.s32 	%r38, %r37, 1;
	setp.lt.s32 	%p5, %r10, %r38;
	@%p5 bra 	$Lt_43_14594;
	.loc	20	390	0
	ld.shared.f32 	%f6, [%rd4+68];
	st.shared.v2.f32 	[%rd16+72], {%f1,%f6};
	bra.uni 	$Lt_43_14338;
$Lt_43_14594:
	.loc	20	391	0
	// %r44 = ((r+1) * idPitch) / 8, signed, rounding toward zero;
	// the load address is %rd7 + %r44 * 8.
	mul.lo.s32 	%r39, %r26, %r20;
	shr.s32 	%r40, %r39, 31;
	mov.s32 	%r41, 7;
	and.b32 	%r42, %r40, %r41;
	add.s32 	%r43, %r42, %r39;
	shr.s32 	%r44, %r43, 3;
	cvt.s64.s32 	%rd17, %r44;
	mul.wide.s32 	%rd18, %r44, 8;
	add.u64 	%rd19, %rd7, %rd18;
	ld.global.v2.f32 	{%f7,%f8}, [%rd19+0];
	st.shared.v2.f32 	[%rd16+72], {%f7,%f8};
$Lt_43_14338:
$Lt_43_13826:
$Lt_43_12802:
	.loc	20	394	0
	// Barrier 0: the tile is read by other threads' sums below.
	// NOTE(review): threads that took the early exit above never execute this
	// barrier - confirm the launch geometry / target makes that safe.
	bar.sync 	0;
	// %f9 = f, %f11 = 1 - f.
	ld.param.f32 	%f9, [__cudaparm_cuda_kernel_boxfilter_y_f];
	mov.f32 	%f10, 0f3f800000;    	// 1
	sub.ftz.f32 	%f11, %f10, %f9;
	// Values shared by every radius path:
	//   %f12/%f13 = centre .x/.y, %f14 = .x of slot +1, %f15 = .x of slot -1.
	ld.shared.v2.f32 	{%f12,%f13}, [%rd4+64];
	ld.shared.f32 	%f14, [%rd4+72];
	ld.shared.f32 	%f15, [%rd4+56];
	// ---- r == 3 ---------------------------------------------------------
	mov.u32 	%r45, 3;
	setp.ne.s32 	%p6, %r25, %r45;
	@%p6 bra 	$Lt_43_15106;
	ld.shared.v2.f32 	{%f16,%f17}, [%rd4+80];
	ld.shared.v2.f32 	{%f18,%f19}, [%rd4+48];
	.loc	20	363	0
	// .x sum over byte offsets +32..+88 (8 slots so far; +96 is added below).
	ld.shared.f32 	%f20, [%rd4+32];
	ld.shared.v2.f32 	{%f21,%f22}, [%rd4+40];
	add.ftz.f32 	%f23, %f20, %f21;
	add.ftz.f32 	%f24, %f18, %f23;
	add.ftz.f32 	%f25, %f15, %f24;
	add.ftz.f32 	%f26, %f12, %f25;
	add.ftz.f32 	%f27, %f14, %f26;
	add.ftz.f32 	%f28, %f16, %f27;
	ld.shared.v2.f32 	{%f29,%f30}, [%rd4+88];
	.loc	20	365	0
	add.ftz.f32 	%f31, %f29, %f28;
	.loc	20	366	0
	// .y sum over byte offsets +44..+92 step 8 (7 slots) -> %f39.
	ld.shared.f32 	%f32, [%rd4+76];
	ld.shared.f32 	%f33, [%rd4+60];
	add.ftz.f32 	%f34, %f22, %f19;
	add.ftz.f32 	%f35, %f33, %f34;
	add.ftz.f32 	%f36, %f13, %f35;
	add.ftz.f32 	%f37, %f32, %f36;
	add.ftz.f32 	%f38, %f17, %f37;
	add.ftz.f32 	%f39, %f30, %f38;
	.loc	20	368	0
	// Ninth .x slot (+96) completes the .x sum -> %f41.
	ld.shared.f32 	%f40, [%rd4+96];
	add.ftz.f32 	%f41, %f40, %f31;
	.loc	20	369	0
	// result = sumY * ((1 - f) / 49) + sumX * (f / 81)
	mov.f32 	%f42, 0f42a20000;    	// 81
	div.approx.ftz.f32 	%f43, %f9, %f42;
	mul.ftz.f32 	%f44, %f41, %f43;
	mov.f32 	%f45, 0f42440000;    	// 49
	div.approx.ftz.f32 	%f46, %f11, %f45;
	fma.rn.ftz.f32 	%f47, %f39, %f46, %f44;
	.loc	20	397	0
	mov.f32 	%f48, %f47;
	bra.uni 	$Lt_43_15874;
$Lt_43_15106:
	// ---- r == 2 ---------------------------------------------------------
	mov.u32 	%r46, 2;
	setp.ne.s32 	%p7, %r25, %r46;
	@%p7 bra 	$Lt_43_15618;
	.loc	20	368	0
	// .x sum over byte offsets +40..+88 step 8 (7 slots) -> %f60.
	ld.shared.f32 	%f49, [%rd4+88];
	ld.shared.v2.f32 	{%f50,%f51}, [%rd4+80];
	ld.shared.f32 	%f52, [%rd4+40];
	ld.shared.v2.f32 	{%f53,%f54}, [%rd4+48];
	add.ftz.f32 	%f55, %f52, %f53;
	add.ftz.f32 	%f56, %f15, %f55;
	add.ftz.f32 	%f57, %f12, %f56;
	add.ftz.f32 	%f58, %f14, %f57;
	add.ftz.f32 	%f59, %f50, %f58;
	add.ftz.f32 	%f60, %f49, %f59;
	.loc	20	398	0
	// result = sumY * ((1 - f) / 25) + sumX * (f / 49);
	// .y sum over byte offsets +52..+84 step 8 (5 slots) -> %f69.
	mov.f32 	%f61, 0f42440000;    	// 49
	div.approx.ftz.f32 	%f62, %f9, %f61;
	mul.ftz.f32 	%f63, %f60, %f62;
	ld.shared.f32 	%f64, [%rd4+76];
	ld.shared.f32 	%f65, [%rd4+60];
	add.ftz.f32 	%f66, %f54, %f65;
	add.ftz.f32 	%f67, %f13, %f66;
	add.ftz.f32 	%f68, %f64, %f67;
	add.ftz.f32 	%f69, %f51, %f68;
	mov.f32 	%f70, 0f41c80000;    	// 25
	div.approx.ftz.f32 	%f71, %f11, %f70;
	fma.rn.ftz.f32 	%f48, %f69, %f71, %f63;
	bra.uni 	$Lt_43_15874;
$Lt_43_15618:
	// ---- r == 1 ---------------------------------------------------------
	mov.u32 	%r47, 1;
	setp.ne.s32 	%p8, %r25, %r47;
	@%p8 bra 	$Lt_43_16130;
	.loc	20	369	0
	// .x sum over byte offsets +48..+80 step 8 (5 slots) -> %f77.
	ld.shared.f32 	%f72, [%rd4+80];
	ld.shared.f32 	%f73, [%rd4+48];
	add.ftz.f32 	%f74, %f73, %f15;
	add.ftz.f32 	%f75, %f12, %f74;
	add.ftz.f32 	%f76, %f14, %f75;
	add.ftz.f32 	%f77, %f72, %f76;
	mov.f32 	%f78, 0f41c80000;    	// 25
	div.approx.ftz.f32 	%f79, %f9, %f78;
	mul.ftz.f32 	%f80, %f77, %f79;
	// .y sum over byte offsets +60, +68, +76 (3 slots) -> %f84.
	ld.shared.f32 	%f81, [%rd4+76];
	ld.shared.f32 	%f82, [%rd4+60];
	add.ftz.f32 	%f83, %f82, %f13;
	add.ftz.f32 	%f84, %f81, %f83;
	// result = sumY * ((1 - f) / 9) + sumX * (f / 25)
	mov.f32 	%f85, 0f41100000;    	// 9
	div.approx.ftz.f32 	%f86, %f11, %f85;
	fma.rn.ftz.f32 	%f87, %f84, %f86, %f80;
	.loc	20	399	0
	mov.f32 	%f48, %f87;
	bra.uni 	$Lt_43_15874;
$Lt_43_16130:
	.loc	20	400	0
	// ---- any other r ----------------------------------------------------
	// .x sum over the three slots -1, 0, +1; .y uses the centre value only.
	// result = centre.y * ((1 - f) / 1) + sumX * (f / 9)
	add.ftz.f32 	%f88, %f12, %f15;
	add.ftz.f32 	%f89, %f14, %f88;
	mov.f32 	%f90, 0f41100000;    	// 9
	div.approx.ftz.f32 	%f91, %f9, %f90;
	mul.ftz.f32 	%f92, %f89, %f91;
	mov.f32 	%f93, 0f3f800000;    	// 1
	div.approx.ftz.f32 	%f94, %f11, %f93;
	fma.rn.ftz.f32 	%f48, %f13, %f94, %f92;
$Lt_43_15874:
$Lt_43_15362:
$Lt_43_14850:
	// Optional remap: skipped when remap == 0.
	ld.param.s8 	%r48, [__cudaparm_cuda_kernel_boxfilter_y_remap];
	mov.u32 	%r49, 0;
	setp.eq.s32 	%p9, %r48, %r49;
	@%p9 bra 	$Lt_43_17410;
	.loc	20	404	0
	// result = saturate(result * p[+140] + p[+144])
	ld.const.f32 	%f95, [p+144];
	ld.const.f32 	%f96, [p+140];
	fma.rn.ftz.f32 	%f97, %f48, %f96, %f95;
	cvt.ftz.sat.f32.f32 	%f48, %f97;
$Lt_43_17410:
	// %rd20 = output pixel index = odPitch * y + x (sign-extended to 64 bit).
	ld.param.s32 	%r50, [__cudaparm_cuda_kernel_boxfilter_y_odPitch];
	mul.lo.s32 	%r51, %r50, %r10;
	add.s32 	%r52, %r8, %r51;
	cvt.s64.s32 	%rd20, %r52;
	ld.param.u64 	%rd21, [__cudaparm_cuda_kernel_boxfilter_y_od];
	ld.param.s32 	%r53, [__cudaparm_cuda_kernel_boxfilter_y_inDeviceFormat];
	mov.u32 	%r54, 0;
	setp.ne.s32 	%p10, %r53, %r54;
	@%p10 bra 	$Lt_43_18178;
	.loc	20	409	0
	// inDeviceFormat == 0: convert to f16 and store 2 bytes at
	// od + index * 8 + 6 (4th 16-bit channel of an 8-byte pixel).
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f48;
	mov.b32		%r55, %b1; }
	mul.lo.u64 	%rd22, %rd20, 8;
	add.u64 	%rd23, %rd21, %rd22;
	st.global.u16 	[%rd23+6], %r55;
	bra.uni 	$LBB25_cuda_kernel_boxfilter_y;
$Lt_43_18178:
	.loc	20	413	0
	// Otherwise: store f32 at od + index * 16 + 12
	// (4th 32-bit channel of a 16-byte pixel).
	mul.lo.u64 	%rd24, %rd20, 16;
	add.u64 	%rd25, %rd21, %rd24;
	st.global.f32 	[%rd25+12], %f48;
$LBB25_cuda_kernel_boxfilter_y:
	.loc	20	415	0
	exit;
$LDWend_cuda_kernel_boxfilter_y:
	} // cuda_kernel_boxfilter_y

	// cuda_kernel_composite
	// ---------------------
	// One thread per pixel (x, y). Reads a 4-channel pixel from srcFrame, runs
	// the first three channels through a per-channel transfer curve, a 3x3
	// constant transform, a chain of weights driven by the .const block `p`,
	// a second 3x3 transform with offsets, and the inverse transfer curve;
	// the 4th output channel is a derived weight multiplied by the input's
	// 4th channel. The 4-channel result is written to dstFrame.
	//
	// Parameters (as used by the code below):
	//   srcFrame / dstFrame  base addresses of the pixel arrays
	//   width / height       threads with width <= x or height <= y exit
	//   srcPitch / dstPitch  multiplied by the pixel size, i.e. pixel pitches
	//   remap                non-zero -> ch3 = sat(ch3 * p[+140] + p[+144])
	//   inDeviceFormat       0 -> 8-byte pixels of four f16 values,
	//                        otherwise 16-byte pixels of four f32 values
	// `p` is a .const symbol declared outside this part of the file; the
	// meaning of its individual fields cannot be determined from here.
	// NOTE(review): the transfer-curve constants (0.04045, 0.055, 1.055, 2.4,
	// 12.92, 0.0031308) match the usual sRGB decode/encode - confirm against
	// the CUDA source.
	.entry cuda_kernel_composite (
		.param .u64 __cudaparm_cuda_kernel_composite_srcFrame,
		.param .u64 __cudaparm_cuda_kernel_composite_dstFrame,
		.param .s32 __cudaparm_cuda_kernel_composite_width,
		.param .s32 __cudaparm_cuda_kernel_composite_height,
		.param .s32 __cudaparm_cuda_kernel_composite_srcPitch,
		.param .s32 __cudaparm_cuda_kernel_composite_dstPitch,
		.param .s8 __cudaparm_cuda_kernel_composite_remap,
		.param .u32 __cudaparm_cuda_kernel_composite_inDeviceFormat)
	{
	.reg .u32 %r<38>;
	.reg .u64 %rd<14>;
	.reg .f32 %f<193>;
	.reg .pred %p<13>;
	.loc	20	418	0
$LDWbegin_cuda_kernel_composite:
	.loc	20	421	0
	// %r8  = x = ctaid.x * ntid.x + tid.x
	// %r10 = y = ctaid.y * ntid.y + tid.y
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	cvt.s32.u32 	%r4, %ctaid.y;
	cvt.s32.u32 	%r5, %ntid.y;
	mul.lo.s32 	%r6, %r4, %r5;
	mov.u32 	%r7, %tid.x;
	add.u32 	%r8, %r3, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r6, %r9;
	// Bounds test (signed compares): continue only if
	// !(width <= x) && !(height <= y).
	ld.param.s32 	%r11, [__cudaparm_cuda_kernel_composite_width];
	set.le.u32.s32 	%r12, %r11, %r8;
	neg.s32 	%r13, %r12;
	ld.param.s32 	%r14, [__cudaparm_cuda_kernel_composite_height];
	set.le.u32.s32 	%r15, %r14, %r10;
	neg.s32 	%r16, %r15;
	or.b32 	%r17, %r13, %r16;
	mov.u32 	%r18, 0;
	setp.eq.s32 	%p1, %r17, %r18;
	@%p1 bra 	$Lt_44_9730;
	bra.uni 	$LBB33_cuda_kernel_composite;
$Lt_44_9730:
	.loc	20	426	0
	// %p2 = (inDeviceFormat == 0); reused for the store at the end.
	ld.param.s32 	%r19, [__cudaparm_cuda_kernel_composite_inDeviceFormat];
	mov.s32 	%r20, 0;
	setp.eq.s32 	%p2, %r19, %r20;
	// %rd1 = source pixel index = srcPitch * y + x.
	ld.param.s32 	%r21, [__cudaparm_cuda_kernel_composite_srcPitch];
	mul.lo.s32 	%r22, %r21, %r10;
	add.s32 	%r23, %r8, %r22;
	cvt.s64.s32 	%rd1, %r23;
	ld.param.u64 	%rd2, [__cudaparm_cuda_kernel_composite_srcFrame];
	@!%p2 bra 	$Lt_44_10498;
	.loc	19	115	0
	// Format 0: 8-byte pixel, four 16-bit values converted f16 -> f32
	// into %f1..%f4.
	mul.lo.u64 	%rd3, %rd1, 8;
	add.u64 	%rd4, %rd2, %rd3;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd4+0];
	.loc	20	426	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f1, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f2, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f3, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f4, %b1; }
	bra.uni 	$Lt_44_10242;
$Lt_44_10498:
	// Other formats: 16-byte pixel, four f32 values loaded directly.
	mul.lo.u64 	%rd5, %rd1, 16;
	add.u64 	%rd6, %rd2, %rd5;
	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%rd6+0];
$Lt_44_10242:
	.loc	20	77	0
	// Transfer curve on channel 0: c = saturate(%f1);
	//   c > 0.04045 ? 2^(2.4 * log2((c + 0.055) / 1.055)) : c / 12.92
	// Result -> %f14.
	cvt.ftz.sat.f32.f32 	%f5, %f1;
	mov.f32 	%f6, 0f3d25aee6;     	// 0.04045
	setp.gt.ftz.f32 	%p3, %f5, %f6;
	@!%p3 bra 	$Lt_44_11010;
	.loc	20	35	0
	mov.f32 	%f7, 0f3d6147ae;     	// 0.055
	add.ftz.f32 	%f8, %f5, %f7;
	mov.f32 	%f9, 0f3f870a3d;     	// 1.055
	div.approx.ftz.f32 	%f10, %f8, %f9;
	lg2.approx.ftz.f32 	%f11, %f10;
	mov.f32 	%f12, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f13, %f11, %f12;
	ex2.approx.ftz.f32 	%f14, %f13;
	bra.uni 	$Lt_44_10754;
$Lt_44_11010:
	.loc	20	37	0
	mov.f32 	%f15, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f14, %f5, %f15;
$Lt_44_10754:
	.loc	20	78	0
	// Same curve on channel 1 (%f2) -> %f25.
	cvt.ftz.sat.f32.f32 	%f16, %f2;
	mov.f32 	%f17, 0f3d25aee6;    	// 0.04045
	setp.gt.ftz.f32 	%p4, %f16, %f17;
	@!%p4 bra 	$Lt_44_11522;
	.loc	20	35	0
	mov.f32 	%f18, 0f3d6147ae;    	// 0.055
	add.ftz.f32 	%f19, %f16, %f18;
	mov.f32 	%f20, 0f3f870a3d;    	// 1.055
	div.approx.ftz.f32 	%f21, %f19, %f20;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	bra.uni 	$Lt_44_11266;
$Lt_44_11522:
	.loc	20	37	0
	mov.f32 	%f26, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f25, %f16, %f26;
$Lt_44_11266:
	.loc	20	79	0
	// Same curve on channel 2 (%f3) -> %f36.
	cvt.ftz.sat.f32.f32 	%f27, %f3;
	mov.f32 	%f28, 0f3d25aee6;    	// 0.04045
	setp.gt.ftz.f32 	%p5, %f27, %f28;
	@!%p5 bra 	$Lt_44_12034;
	.loc	20	35	0
	mov.f32 	%f29, 0f3d6147ae;    	// 0.055
	add.ftz.f32 	%f30, %f27, %f29;
	mov.f32 	%f31, 0f3f870a3d;    	// 1.055
	div.approx.ftz.f32 	%f32, %f30, %f31;
	lg2.approx.ftz.f32 	%f33, %f32;
	mov.f32 	%f34, 0f4019999a;    	// 2.4
	mul.ftz.f32 	%f35, %f33, %f34;
	ex2.approx.ftz.f32 	%f36, %f35;
	bra.uni 	$Lt_44_11778;
$Lt_44_12034:
	.loc	20	37	0
	mov.f32 	%f37, 0f414eb852;    	// 12.92
	div.approx.ftz.f32 	%f36, %f27, %f37;
$Lt_44_11778:
	.loc	20	93	0
	// 3x3 transform with coefficients p[+40 .. +72]
	// (c0 = %f14, c1 = %f25, c2 = %f36):
	//   %f55 = p[+48]*c0 + p[+44]*c1 + p[+40]*c2
	//   %f51 = p[+60]*c0 + p[+56]*c1 + p[+52]*c2
	//   %f53 = p[+72]*c0 + p[+68]*c1 + p[+64]*c2
	ld.const.f32 	%f38, [p+56];
	mul.ftz.f32 	%f39, %f38, %f25;
	ld.const.f32 	%f40, [p+68];
	mul.ftz.f32 	%f41, %f40, %f25;
	ld.const.f32 	%f42, [p+44];
	mul.ftz.f32 	%f43, %f42, %f25;
	ld.const.f32 	%f44, [p+52];
	fma.rn.ftz.f32 	%f45, %f44, %f36, %f39;
	ld.const.f32 	%f46, [p+64];
	fma.rn.ftz.f32 	%f47, %f46, %f36, %f41;
	ld.const.f32 	%f48, [p+40];
	fma.rn.ftz.f32 	%f49, %f48, %f36, %f43;
	ld.const.f32 	%f50, [p+60];
	fma.rn.ftz.f32 	%f51, %f50, %f14, %f45;
	ld.const.f32 	%f52, [p+72];
	fma.rn.ftz.f32 	%f53, %f52, %f14, %f47;
	ld.const.f32 	%f54, [p+48];
	fma.rn.ftz.f32 	%f55, %f54, %f14, %f49;
	// %f57 = max(%f53, 1e-37) guards the two divisions below;
	// %f58 = %f51 / %f57, %f59 = %f55 / %f57.
	mov.f32 	%f56, 0f02081cea;    	// 1e-037
	max.ftz.f32 	%f57, %f53, %f56;
	div.approx.ftz.f32 	%f58, %f51, %f57;
	div.approx.ftz.f32 	%f59, %f55, %f57;
	// Offsets from the reference pair (p[+148], p[+152]):
	//   %f61 = %f58 - p[+152], %f63 = %f59 - p[+148]
	ld.const.f32 	%f60, [p+152];
	sub.ftz.f32 	%f61, %f58, %f60;
	ld.const.f32 	%f62, [p+148];
	sub.ftz.f32 	%f63, %f59, %f62;
	// %f68 = 1 / sqrt(max(%f63^2 + %f61^2, 1e-37))
	mul.ftz.f32 	%f64, %f61, %f61;
	fma.rn.ftz.f32 	%f65, %f63, %f63, %f64;
	mov.f32 	%f66, 0f02081cea;    	// 1e-037
	max.ftz.f32 	%f67, %f65, %f66;
	rsqrt.approx.ftz.f32 	%f68, %f67;
	// %f73 = (p[+148]*%f63 + p[+152]*%f61) * (%f68 * p[+160])
	ld.const.f32 	%f69, [p+160];
	mul.ftz.f32 	%f70, %f61, %f60;
	fma.rn.ftz.f32 	%f71, %f62, %f63, %f70;
	mul.ftz.f32 	%f72, %f68, %f69;
	mul.ftz.f32 	%f73, %f71, %f72;
	// %f74 = p[+8]. Sign of %f73 selects how %f76 / %f77 are seeded.
	ld.const.f32 	%f74, [p+8];
	mov.f32 	%f75, 0f00000000;    	// 0
	setp.gt.ftz.f32 	%p6, %f73, %f75;
	@!%p6 bra 	$Lt_44_12546;
	.loc	20	95	0
	// %f73 > 0: %f76 = %f73 * p[+8], %f77 = p[+8].
	mul.ftz.f32 	%f76, %f73, %f74;
	.loc	20	96	0
	mov.f32 	%f77, %f74;
	bra.uni 	$Lt_44_12290;
$Lt_44_12546:
	.loc	20	99	0
	// Otherwise: %f77 = p[+8] * (%f73 + 1), %f76 = 0.
	mov.f32 	%f78, 0f3f800000;    	// 1
	add.ftz.f32 	%f79, %f73, %f78;
	mul.ftz.f32 	%f77, %f74, %f79;
	mov.f32 	%f76, 0f00000000;    	// 0
$Lt_44_12290:
	.loc	20	104	0
	// %f82 = p[+160] * (1 / %f68)
	mov.f32 	%f80, 0f3f800000;    	// 1
	div.approx.ftz.f32 	%f81, %f80, %f68;
	mul.ftz.f32 	%f82, %f69, %f81;
	.loc	20	106	0
	// %f76 = %f82 * (1 - %f76)
	mov.f32 	%f83, 0f3f800000;    	// 1
	sub.ftz.f32 	%f84, %f83, %f76;
	mul.ftz.f32 	%f76, %f82, %f84;
	.loc	20	107	0
	// %f77 = %f82 * (1 - %f77)
	mov.f32 	%f85, 0f3f800000;    	// 1
	sub.ftz.f32 	%f86, %f85, %f77;
	mul.ftz.f32 	%f77, %f82, %f86;
	.loc	20	125	0
	// %f89 = saturate(p[+16] * %f76)
	ld.const.f32 	%f87, [p+16];
	mul.ftz.f32 	%f88, %f87, %f76;
	cvt.ftz.sat.f32.f32 	%f89, %f88;
	// %f91 = %f57 - p[+156]; %f92 = %f91 / p[+156]
	ld.const.f32 	%f90, [p+156];
	sub.ftz.f32 	%f91, %f57, %f90;
	div.approx.ftz.f32 	%f92, %f91, %f90;
	// %f99 = saturate(%f92 < 0 ? p[+20] * %f92 : p[+24] * %f92)
	mov.f32 	%f93, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p7, %f92, %f93;
	ld.const.f32 	%f94, [p+20];
	mul.ftz.f32 	%f95, %f94, %f92;
	ld.const.f32 	%f96, [p+24];
	mul.ftz.f32 	%f97, %f96, %f92;
	selp.f32 	%f98, %f95, %f97, %p7;
	cvt.ftz.sat.f32.f32 	%f99, %f98;
	// %f102 = %f89 + %f99 - %f99 * %f89
	add.ftz.f32 	%f100, %f89, %f99;
	mul.ftz.f32 	%f101, %f99, %f89;
	sub.ftz.f32 	%f102, %f100, %f101;
	.loc	20	129	0
	// %f111 = %f57 + p[+28] * (1 / max(%f102, 1e-37) - 1) * %f91
	ld.const.f32 	%f103, [p+28];
	mov.f32 	%f104, 0f3f800000;   	// 1
	mov.f32 	%f105, 0f02081cea;   	// 1e-037
	max.ftz.f32 	%f106, %f102, %f105;
	div.approx.ftz.f32 	%f107, %f104, %f106;
	mov.f32 	%f108, 0fbf800000;   	// -1
	add.ftz.f32 	%f109, %f107, %f108;
	mul.ftz.f32 	%f110, %f103, %f109;
	fma.rn.ftz.f32 	%f111, %f110, %f91, %f57;
	.loc	20	131	0
	// %f116 = saturate(p[+32] * %f102 - p[+36]); scales the 4th channel later.
	ld.const.f32 	%f112, [p+32];
	mul.ftz.f32 	%f113, %f112, %f102;
	.loc	20	132	0
	ld.const.f32 	%f114, [p+36];
	sub.ftz.f32 	%f115, %f113, %f114;
	.loc	20	133	0
	cvt.ftz.sat.f32.f32 	%f116, %f115;
	.loc	20	137	0
	// %f121 = saturate(p[+12] * %f77), %f122 = saturate(p[+0] * %f77)
	ld.const.f32 	%f117, [p+12];
	mul.ftz.f32 	%f118, %f117, %f77;
	ld.const.f32 	%f119, [p+0];
	mul.ftz.f32 	%f120, %f119, %f77;
	cvt.ftz.sat.f32.f32 	%f121, %f118;
	cvt.ftz.sat.f32.f32 	%f122, %f120;
	// %f128 = 1 / max(%f122, 1e-37) - 1; %f130 = %f128 * p[+4] + %f128
	mov.f32 	%f123, 0f02081cea;   	// 1e-037
	max.ftz.f32 	%f124, %f122, %f123;
	mov.f32 	%f125, 0f3f800000;   	// 1
	div.approx.ftz.f32 	%f126, %f125, %f124;
	mov.f32 	%f127, 0fbf800000;   	// -1
	add.ftz.f32 	%f128, %f126, %f127;
	ld.const.f32 	%f129, [p+4];
	fma.rn.ftz.f32 	%f130, %f128, %f129, %f128;
	// Push the ratio pair along its offset by %f130, then scale:
	//   %f135 = %f121 * (%f58 + %f130 * %f61) * %f111
	//   %f136 = %f121 * (%f59 + %f130 * %f63) * %f111
	fma.rn.ftz.f32 	%f131, %f130, %f61, %f58;
	fma.rn.ftz.f32 	%f132, %f63, %f130, %f59;
	mul.ftz.f32 	%f133, %f121, %f131;
	mul.ftz.f32 	%f134, %f121, %f132;
	mul.ftz.f32 	%f135, %f133, %f111;
	mul.ftz.f32 	%f136, %f134, %f111;
	// Output channel 0 before the curve:
	//   %f144 = p[+112] + p[+108]*%f111 + p[+100]*%f136 + p[+104]*%f135
	ld.const.f32 	%f137, [p+112];
	ld.const.f32 	%f138, [p+104];
	mul.ftz.f32 	%f139, %f138, %f135;
	ld.const.f32 	%f140, [p+100];
	fma.rn.ftz.f32 	%f141, %f140, %f136, %f139;
	ld.const.f32 	%f142, [p+108];
	fma.rn.ftz.f32 	%f143, %f142, %f111, %f141;
	add.ftz.f32 	%f144, %f137, %f143;
	.loc	20	50	0
	// Inverse curve: c = saturate(v);
	//   c < 0.0031308 ? c * 12.92 : 1.055 * 2^(0.41666 * log2(c)) - 0.055
	// Result stays in %f145.
	cvt.ftz.sat.f32.f32 	%f145, %f144;
	mov.f32 	%f146, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p8, %f145, %f146;
	@!%p8 bra 	$Lt_44_13058;
	.loc	20	52	0
	mov.f32 	%f147, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f145, %f145, %f147;
	bra.uni 	$Lt_44_12802;
$Lt_44_13058:
	.loc	20	54	0
	mov.f32 	%f148, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f149, %f145;
	mov.f32 	%f150, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f151, %f149, %f150;
	ex2.approx.ftz.f32 	%f152, %f151;
	mov.f32 	%f153, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f145, %f152, %f153, %f148;
$Lt_44_12802:
	.loc	20	138	0
	// Output channel 1 before the curve:
	//   %f161 = p[+116] + p[+96]*%f111 + p[+88]*%f136 + p[+92]*%f135
	ld.const.f32 	%f154, [p+116];
	ld.const.f32 	%f155, [p+92];
	mul.ftz.f32 	%f156, %f155, %f135;
	ld.const.f32 	%f157, [p+88];
	fma.rn.ftz.f32 	%f158, %f157, %f136, %f156;
	ld.const.f32 	%f159, [p+96];
	fma.rn.ftz.f32 	%f160, %f159, %f111, %f158;
	add.ftz.f32 	%f161, %f154, %f160;
	.loc	20	50	0
	// Same inverse curve as above; result stays in %f162.
	cvt.ftz.sat.f32.f32 	%f162, %f161;
	mov.f32 	%f163, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p9, %f162, %f163;
	@!%p9 bra 	$Lt_44_13570;
	.loc	20	52	0
	mov.f32 	%f164, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f162, %f162, %f164;
	bra.uni 	$Lt_44_13314;
$Lt_44_13570:
	.loc	20	54	0
	mov.f32 	%f165, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f166, %f162;
	mov.f32 	%f167, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f168, %f166, %f167;
	ex2.approx.ftz.f32 	%f169, %f168;
	mov.f32 	%f170, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f162, %f169, %f170, %f165;
$Lt_44_13314:
	.loc	20	139	0
	// Output channel 2 before the curve:
	//   %f178 = p[+120] + p[+84]*%f111 + p[+76]*%f136 + p[+80]*%f135
	ld.const.f32 	%f171, [p+120];
	ld.const.f32 	%f172, [p+80];
	mul.ftz.f32 	%f173, %f172, %f135;
	ld.const.f32 	%f174, [p+76];
	fma.rn.ftz.f32 	%f175, %f174, %f136, %f173;
	ld.const.f32 	%f176, [p+84];
	fma.rn.ftz.f32 	%f177, %f176, %f111, %f175;
	add.ftz.f32 	%f178, %f171, %f177;
	.loc	20	50	0
	// Same inverse curve as above; result stays in %f179.
	cvt.ftz.sat.f32.f32 	%f179, %f178;
	mov.f32 	%f180, 0f3b4d2e1c;   	// 0.0031308
	setp.lt.ftz.f32 	%p10, %f179, %f180;
	@!%p10 bra 	$Lt_44_14082;
	.loc	20	52	0
	mov.f32 	%f181, 0f414eb852;   	// 12.92
	mul.ftz.f32 	%f179, %f179, %f181;
	bra.uni 	$Lt_44_13826;
$Lt_44_14082:
	.loc	20	54	0
	mov.f32 	%f182, 0fbd6147ae;   	// -0.055
	lg2.approx.ftz.f32 	%f183, %f179;
	mov.f32 	%f184, 0f3ed55476;   	// 0.41666
	mul.ftz.f32 	%f185, %f183, %f184;
	ex2.approx.ftz.f32 	%f186, %f185;
	mov.f32 	%f187, 0f3f870a3d;   	// 1.055
	fma.rn.ftz.f32 	%f179, %f186, %f187, %f182;
$Lt_44_13826:
	.loc	20	426	0
	// 4th output channel: %f188 = %f116 * (input channel 3).
	mul.ftz.f32 	%f188, %f116, %f4;
	// Optional remap: skipped when remap == 0.
	ld.param.s8 	%r28, [__cudaparm_cuda_kernel_composite_remap];
	mov.u32 	%r29, 0;
	setp.eq.s32 	%p11, %r28, %r29;
	@%p11 bra 	$Lt_44_14338;
	.loc	20	429	0
	// %f188 = saturate(%f188 * p[+140] + p[+144])
	ld.const.f32 	%f189, [p+144];
	ld.const.f32 	%f190, [p+140];
	fma.rn.ftz.f32 	%f191, %f188, %f190, %f189;
	cvt.ftz.sat.f32.f32 	%f188, %f191;
$Lt_44_14338:
	// %rd7 = destination pixel index = dstPitch * y + x.
	ld.param.s32 	%r30, [__cudaparm_cuda_kernel_composite_dstPitch];
	mul.lo.s32 	%r31, %r30, %r10;
	add.s32 	%r32, %r8, %r31;
	cvt.s64.s32 	%rd7, %r32;
	ld.param.u64 	%rd8, [__cudaparm_cuda_kernel_composite_dstFrame];
	@!%p2 bra 	$Lt_44_15106;
	.loc	19	126	0
	// Format 0: convert the four results to f16 and store one 8-byte pixel.
	mul.lo.u64 	%rd9, %rd7, 8;
	add.u64 	%rd10, %rd8, %rd9;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f145;
	mov.b32		%r33, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f162;
	mov.b32		%r34, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f179;
	mov.b32		%r35, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f188;
	mov.b32		%r36, %b1; }
	st.global.v4.u16 	[%rd10+0], {%r33,%r34,%r35,%r36};
	.loc	20	431	0
	bra.uni 	$LBB33_cuda_kernel_composite;
$Lt_44_15106:
	.loc	19	126	0
	// Other formats: store the four f32 results as one 16-byte pixel.
	mul.lo.u64 	%rd11, %rd7, 16;
	add.u64 	%rd12, %rd8, %rd11;
	st.global.v4.f32 	[%rd12+0], {%f145,%f162,%f179,%f188};
$LBB33_cuda_kernel_composite:
	.loc	20	432	0
	exit;
$LDWend_cuda_kernel_composite:
	} // cuda_kernel_composite

	// cuda_kernel_showAlpha
	// ---------------------
	// One thread per pixel (x, y). Reads the 4th channel of the source pixel,
	// saturates it to [0, 1] and writes it to the first three channels of the
	// destination pixel; the 4th destination channel is set to 1.
	//   inDeviceFormat == 0 : 8-byte pixels holding four f16 values
	//   otherwise           : 16-byte pixels holding four f32 values
	// srcPitch / dstPitch are multiplied by the pixel size (pixel pitches).
	// Threads with width <= x or height <= y (signed compares) do nothing.
	.entry cuda_kernel_showAlpha (
		.param .u64 __cudaparm_cuda_kernel_showAlpha_srcFrame,
		.param .u64 __cudaparm_cuda_kernel_showAlpha_dstFrame,
		.param .s32 __cudaparm_cuda_kernel_showAlpha_width,
		.param .s32 __cudaparm_cuda_kernel_showAlpha_height,
		.param .s32 __cudaparm_cuda_kernel_showAlpha_srcPitch,
		.param .s32 __cudaparm_cuda_kernel_showAlpha_dstPitch,
		.param .u32 __cudaparm_cuda_kernel_showAlpha_inDeviceFormat)
	{
	.reg .u32 %r<32>;
	.reg .u64 %rd<14>;
	.reg .f32 %f<7>;
	.reg .pred %p<4>;
	.loc	20	435	0
$LDWbegin_cuda_kernel_showAlpha:
	.loc	20	438	0
	// %r4 = x = ctaid.x * ntid.x + tid.x
	mov.u32 	%r1, %ctaid.x;
	mov.u32 	%r2, %ntid.x;
	mov.u32 	%r3, %tid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	// %r8 = y = ctaid.y * ntid.y + tid.y
	mov.u32 	%r5, %ctaid.y;
	mov.u32 	%r6, %ntid.y;
	mov.u32 	%r7, %tid.y;
	mad.lo.s32 	%r8, %r5, %r6, %r7;
	// %p1 = (width <= x) || (height <= y): nothing to do for this thread.
	ld.param.s32 	%r9, [__cudaparm_cuda_kernel_showAlpha_width];
	ld.param.s32 	%r10, [__cudaparm_cuda_kernel_showAlpha_height];
	setp.le.s32 	%p1, %r9, %r4;
	setp.le.or.s32 	%p1, %r10, %r8, %p1;
	@%p1 bra 	$LBB10_cuda_kernel_showAlpha;
	.loc	20	441	0
	// %p2 selects the 8-byte (f16) pixel layout for both the load and the store.
	ld.param.s32 	%r11, [__cudaparm_cuda_kernel_showAlpha_inDeviceFormat];
	setp.eq.s32 	%p2, %r11, 0;
	// %r13 = source pixel index = srcPitch * y + x
	ld.param.s32 	%r12, [__cudaparm_cuda_kernel_showAlpha_srcPitch];
	mad.lo.s32 	%r13, %r12, %r8, %r4;
	ld.param.u64 	%rd1, [__cudaparm_cuda_kernel_showAlpha_srcFrame];
	@%p2 bra 	$Lt_45_src_half;
	.loc	20	443	0
	// 16-byte pixel: the 4th channel is the f32 at byte offset 12.
	mul.wide.s32 	%rd2, %r13, 16;
	add.u64 	%rd3, %rd1, %rd2;
	ld.global.f32 	%f1, [%rd3+12];
	bra.uni 	$Lt_45_src_done;
$Lt_45_src_half:
	.loc	19	115	0
	// 8-byte pixel: the 4th channel is the 16-bit value at byte offset 6.
	mul.wide.s32 	%rd2, %r13, 8;
	add.u64 	%rd3, %rd1, %rd2;
	ld.global.u16 	%r14, [%rd3+6];
	.loc	20	443	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r14;
	cvt.ftz.f32.f16	%f1, %b1; }
$Lt_45_src_done:
	// %f2 = saturate(alpha); %f3 = 1. Both output layouts use these values.
	cvt.ftz.sat.f32.f32 	%f2, %f1;
	mov.f32 	%f3, 0f3f800000;     	// 1
	// %r16 = destination pixel index = dstPitch * y + x
	ld.param.s32 	%r15, [__cudaparm_cuda_kernel_showAlpha_dstPitch];
	mad.lo.s32 	%r16, %r15, %r8, %r4;
	ld.param.u64 	%rd4, [__cudaparm_cuda_kernel_showAlpha_dstFrame];
	@%p2 bra 	$Lt_45_dst_half;
	.loc	19	126	0
	// 16-byte pixel: store {a, a, a, 1} as four f32 values.
	mul.wide.s32 	%rd5, %r16, 16;
	add.u64 	%rd6, %rd4, %rd5;
	st.global.v4.f32 	[%rd6+0], {%f2,%f2,%f2,%f3};
	bra.uni 	$LBB10_cuda_kernel_showAlpha;
$Lt_45_dst_half:
	.loc	19	126	0
	// 8-byte pixel: convert both values to f16 and store {a, a, a, 1}.
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f2;
	mov.b32		%r17, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f3;
	mov.b32		%r18, %b1; }
	mul.wide.s32 	%rd5, %r16, 8;
	add.u64 	%rd6, %rd4, %rd5;
	st.global.v4.u16 	[%rd6+0], {%r17,%r17,%r17,%r18};
	.loc	20	449	0
$LBB10_cuda_kernel_showAlpha:
	.loc	20	450	0
	exit;
$LDWend_cuda_kernel_showAlpha:
	} // cuda_kernel_showAlpha

	// cuda_kernel_showColor
	// ---------------------
	// One thread per pixel (x, y). Reads a 4-channel source pixel, multiplies
	// the first three channels by the saturated 4th channel and writes the
	// products to the destination pixel with the 4th channel set to 1.
	//   inDeviceFormat == 0 : 8-byte pixels holding four f16 values
	//   otherwise           : 16-byte pixels holding four f32 values
	// srcPitch / dstPitch are multiplied by the pixel size (pixel pitches).
	// Threads with width <= x or height <= y (signed compares) do nothing.
	.entry cuda_kernel_showColor (
		.param .u64 __cudaparm_cuda_kernel_showColor_srcFrame,
		.param .u64 __cudaparm_cuda_kernel_showColor_dstFrame,
		.param .s32 __cudaparm_cuda_kernel_showColor_width,
		.param .s32 __cudaparm_cuda_kernel_showColor_height,
		.param .s32 __cudaparm_cuda_kernel_showColor_srcPitch,
		.param .s32 __cudaparm_cuda_kernel_showColor_dstPitch,
		.param .u32 __cudaparm_cuda_kernel_showColor_inDeviceFormat)
	{
	.reg .u32 %r<36>;
	.reg .u64 %rd<14>;
	.reg .f32 %f<16>;
	.reg .pred %p<4>;
	.loc	20	453	0
$LDWbegin_cuda_kernel_showColor:
	.loc	20	456	0
	// %r4 = x = ctaid.x * ntid.x + tid.x
	mov.u32 	%r1, %ctaid.x;
	mov.u32 	%r2, %ntid.x;
	mov.u32 	%r3, %tid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	// %r8 = y = ctaid.y * ntid.y + tid.y
	mov.u32 	%r5, %ctaid.y;
	mov.u32 	%r6, %ntid.y;
	mov.u32 	%r7, %tid.y;
	mad.lo.s32 	%r8, %r5, %r6, %r7;
	// %p1 = (width <= x) || (height <= y): nothing to do for this thread.
	ld.param.s32 	%r9, [__cudaparm_cuda_kernel_showColor_width];
	ld.param.s32 	%r10, [__cudaparm_cuda_kernel_showColor_height];
	setp.le.s32 	%p1, %r9, %r4;
	setp.le.or.s32 	%p1, %r10, %r8, %p1;
	@%p1 bra 	$LBB10_cuda_kernel_showColor;
	.loc	20	459	0
	// %p2 selects the 8-byte (f16) pixel layout for both the load and the store.
	ld.param.s32 	%r11, [__cudaparm_cuda_kernel_showColor_inDeviceFormat];
	setp.eq.s32 	%p2, %r11, 0;
	// %r13 = source pixel index = srcPitch * y + x
	ld.param.s32 	%r12, [__cudaparm_cuda_kernel_showColor_srcPitch];
	mad.lo.s32 	%r13, %r12, %r8, %r4;
	ld.param.u64 	%rd1, [__cudaparm_cuda_kernel_showColor_srcFrame];
	@%p2 bra 	$Lt_46_src_half;
	.loc	20	461	0
	// 16-byte pixel: four f32 channels loaded directly into %f1..%f4.
	mul.wide.s32 	%rd2, %r13, 16;
	add.u64 	%rd3, %rd1, %rd2;
	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%rd3+0];
	bra.uni 	$Lt_46_src_done;
$Lt_46_src_half:
	.loc	19	115	0
	// 8-byte pixel: four 16-bit channels, each converted f16 -> f32.
	mul.wide.s32 	%rd2, %r13, 8;
	add.u64 	%rd3, %rd1, %rd2;
	ld.global.v4.u16 	{%r14,%r15,%r16,%r17}, [%rd3+0];
	.loc	20	461	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r14;
	cvt.ftz.f32.f16	%f1, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r15;
	cvt.ftz.f32.f16	%f2, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r16;
	cvt.ftz.f32.f16	%f3, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r17;
	cvt.ftz.f32.f16	%f4, %b1; }
$Lt_46_src_done:
	// %f5 = saturate(channel 3); %f6..%f8 = %f5 * channel 0..2; %f9 = 1.
	// Both output layouts store exactly these four values.
	cvt.ftz.sat.f32.f32 	%f5, %f4;
	mul.ftz.f32 	%f6, %f5, %f1;
	mul.ftz.f32 	%f7, %f5, %f2;
	mul.ftz.f32 	%f8, %f5, %f3;
	mov.f32 	%f9, 0f3f800000;     	// 1
	// %r19 = destination pixel index = dstPitch * y + x
	ld.param.s32 	%r18, [__cudaparm_cuda_kernel_showColor_dstPitch];
	mad.lo.s32 	%r19, %r18, %r8, %r4;
	ld.param.u64 	%rd4, [__cudaparm_cuda_kernel_showColor_dstFrame];
	@%p2 bra 	$Lt_46_dst_half;
	.loc	19	126	0
	// 16-byte pixel: store the four f32 values.
	mul.wide.s32 	%rd5, %r19, 16;
	add.u64 	%rd6, %rd4, %rd5;
	st.global.v4.f32 	[%rd6+0], {%f6,%f7,%f8,%f9};
	bra.uni 	$LBB10_cuda_kernel_showColor;
$Lt_46_dst_half:
	.loc	19	126	0
	// 8-byte pixel: convert each value to f16, then store.
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f6;
	mov.b32		%r20, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f7;
	mov.b32		%r21, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f8;
	mov.b32		%r22, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f9;
	mov.b32		%r23, %b1; }
	mul.wide.s32 	%rd5, %r19, 8;
	add.u64 	%rd6, %rd4, %rd5;
	st.global.v4.u16 	[%rd6+0], {%r20,%r21,%r22,%r23};
	.loc	20	467	0
$LBB10_cuda_kernel_showColor:
	.loc	20	468	0
	exit;
$LDWend_cuda_kernel_showColor:
	} // cuda_kernel_showColor

