	.version 2.2
	.target sm_20
	// compiled with ../../../External/3rdParty/NVIDIA/CUDA/win/bin/../open64/lib//be.exe
	// nvopencc 3.2 built on 2010-11-04

	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])

	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)

	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)

	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)

	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])

	.visible .func _Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii (.param .u64 __cudaparmf1__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .u64 __cudaparmf2__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf3__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .f32 __cudaparmf4__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .u32 __cudaparmf5__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf6__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf7__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii)

	.visible .func (.param .u64 __cudaretf__ZpLI8PixelRGBERT_S2_RKS1_) _ZpLI8PixelRGBERT_S2_RKS1_ (.param .u64 __cudaparmf1__ZpLI8PixelRGBERT_S2_RKS1_, .param .u64 __cudaparmf2__ZpLI8PixelRGBERT_S2_RKS1_)

	//-----------------------------------------------------------
	// Compiling C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003fe4_00000000-11_Accumulate.cpp3.i (C:/Users/dvaeng/AppData/Local/Temp/ccBI#.a14116)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	"C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003fe4_00000000-10_Accumulate.cudafe2.gpu"
	.file	2	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/PixelFormat.h"
	.file	3	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/PixelRGB.h"
	.file	4	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
	.file	5	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\crt/device_runtime.h"
	.file	6	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\host_defines.h"
	.file	7	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\builtin_types.h"
	.file	8	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_types.h"
	.file	9	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\driver_types.h"
	.file	10	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_types.h"
	.file	11	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_types.h"
	.file	12	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\vector_types.h"
	.file	13	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\builtin_types.h"
	.file	14	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\host_defines.h"
	.file	15	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\device_launch_parameters.h"
	.file	16	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\crt\storage_class.h"
	.file	17	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
	.file	18	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/Utils.h"
	.file	19	"c:/Mulder64/shared/adobe/MediaCore/GPUFoundation/Src/ImageProcessing/Accumulate.cu"
	.file	20	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/VectorUtils.h"
	.file	21	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\common_functions.h"
	.file	22	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions.h"
	.file	23	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_constants.h"
	.file	24	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_functions.h"
	.file	25	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_11_atomic_functions.h"
	.file	26	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_12_atomic_functions.h"
	.file	27	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_13_double_functions.h"
	.file	28	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_atomic_functions.h"
	.file	29	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_intrinsics.h"
	.file	30	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_functions.h"
	.file	31	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_fetch_functions.h"
	.file	32	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions_dbl_ptx3.h"


	// IntegerMultiply(int, int) -> int   (signature per the mangled symbol)
	// Returns the low 32 bits of the product of the two arguments.
	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)
	{
	.reg .s32 %lhs, %rhs, %prod;
	.loc	18	60	0
$LDWbegin__Z15IntegerMultiplyii:
	ld.param.s32 	%lhs, [__cudaparmf1__Z15IntegerMultiplyii];
	ld.param.s32 	%rhs, [__cudaparmf2__Z15IntegerMultiplyii];
	.loc	18	64	0
	// mul.lo keeps only the low word, so overflow wraps silently.
	mul.lo.s32 	%prod, %lhs, %rhs;
	st.param.s32 	[__cudaretf__Z15IntegerMultiplyii], %prod;
	ret;
$LDWend__Z15IntegerMultiplyii:
	} // _Z15IntegerMultiplyii

	// Standard2DKernelX() -> int   (signature per the mangled symbol)
	// Returns ctaid.x * ntid.x + tid.x, computed with 32-bit wrap-around.
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()
	{
	.reg .u32 %block, %width, %lane, %coord;
	.loc	18	73	0
$LDWbegin__Z17Standard2DKernelXv:
	.loc	18	74	0
	mov.u32 	%block, %ctaid.x;
	mov.u32 	%width, %ntid.x;
	mov.u32 	%lane, %tid.x;
	// Fused multiply-add; the low 32 bits match a separate mul.lo + add.
	mad.lo.u32 	%coord, %block, %width, %lane;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelXv], %coord;
	ret;
$LDWend__Z17Standard2DKernelXv:
	} // _Z17Standard2DKernelXv

	// Standard2DKernelY() -> int   (signature per the mangled symbol)
	// Returns ctaid.y * ntid.y + tid.y, computed with 32-bit wrap-around.
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()
	{
	.reg .u32 %block, %height, %lane, %coord;
	.loc	18	77	0
$LDWbegin__Z17Standard2DKernelYv:
	.loc	18	78	0
	mov.u32 	%block, %ctaid.y;
	mov.u32 	%height, %ntid.y;
	mov.u32 	%lane, %tid.y;
	// Fused multiply-add; the low 32 bits match a separate mul.lo + add.
	mad.lo.u32 	%coord, %block, %height, %lane;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelYv], %coord;
	ret;
$LDWend__Z17Standard2DKernelYv:
	} // _Z17Standard2DKernelYv

	// Half4ToFloat4(ushort4) -> float4   (signature per the mangled symbol)
	// Interprets each 16-bit lane of the 8-byte argument as an f16 bit pattern
	// and widens it to f32. Input lane i (byte offset 2*i) becomes output
	// lane i (byte offset 4*i).
	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])
	{
	.reg .b32 %half0, %half1, %half2, %half3;
	.reg .f32 %wide0, %wide1, %wide2, %wide3;
	.loc	18	86	0
$LDWbegin__Z13Half4ToFloat47ushort4:
	// 16-bit loads zero-extend into untyped 32-bit registers; the f16
	// conversions below read only the low 16 bits of each.
	ld.param.u16 	%half0, [__cudaparmf1__Z13Half4ToFloat47ushort4+0];
	ld.param.u16 	%half1, [__cudaparmf1__Z13Half4ToFloat47ushort4+2];
	ld.param.u16 	%half2, [__cudaparmf1__Z13Half4ToFloat47ushort4+4];
	ld.param.u16 	%half3, [__cudaparmf1__Z13Half4ToFloat47ushort4+6];
	.loc	18	87	0
	cvt.ftz.f32.f16 	%wide0, %half0;
	cvt.ftz.f32.f16 	%wide1, %half1;
	cvt.ftz.f32.f16 	%wide2, %half2;
	cvt.ftz.f32.f16 	%wide3, %half3;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+0], %wide0;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+4], %wide1;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+8], %wide2;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+12], %wide3;
	ret;
$LDWend__Z13Half4ToFloat47ushort4:
	} // _Z13Half4ToFloat47ushort4

	// Float4ToHalf4(float4) -> ushort4   (signature per the mangled symbol)
	// Rounds each f32 lane of the 16-byte argument to f16 (round-to-nearest,
	// flush-to-zero) and returns the four 16-bit patterns. Input lane i
	// (byte offset 4*i) becomes output lane i (byte offset 2*i).
	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])
	{
	.reg .f32 %wide0, %wide1, %wide2, %wide3;
	.reg .b32 %half0, %half1, %half2, %half3;
	.loc	18	95	0
$LDWbegin__Z13Float4ToHalf46float4:
	ld.param.f32 	%wide0, [__cudaparmf1__Z13Float4ToHalf46float4+0];
	ld.param.f32 	%wide1, [__cudaparmf1__Z13Float4ToHalf46float4+4];
	ld.param.f32 	%wide2, [__cudaparmf1__Z13Float4ToHalf46float4+8];
	ld.param.f32 	%wide3, [__cudaparmf1__Z13Float4ToHalf46float4+12];
	.loc	18	96	0
	// The f16 result lands in the low half of an untyped 32-bit register.
	cvt.rn.ftz.f16.f32 	%half0, %wide0;
	cvt.rn.ftz.f16.f32 	%half1, %wide1;
	cvt.rn.ftz.f16.f32 	%half2, %wide2;
	cvt.rn.ftz.f16.f32 	%half3, %wide3;
	// 16-bit stores keep only the low half of each register.
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+0], %half0;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+2], %half1;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+4], %half2;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+6], %half3;
	ret;
$LDWend__Z13Float4ToHalf46float4:
	} // _Z13Float4ToHalf46float4

	// Mix3(unsigned&, unsigned&, unsigned&) -> unsigned   (signature per the mangled symbol)
	// Scrambles three 32-bit words in place with nine rounds of
	// subtract / subtract / shift / xor and returns the final value of the
	// third word. In the comments below a, b and c are the words addressed by
	// parameters 1, 2 and 3.
	// Every intermediate result is stored back, and operands are re-loaded
	// from memory before each use; the ld/st carry no state space (generic
	// addressing).
	// NOTE(review): the reloads are presumably kept because the three
	// references may alias each other - confirm against Utils.h before
	// folding any of them into registers.
	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)
	{
	.reg .u32 %r<75>;
	.reg .u64 %rd<8>;
	.loc	18	138	0
$LDWbegin__Z4Mix3RjS_S_:
	// %rd2 = &a, %rd4 = &b, %rd6 = &c
	ld.param.u64 	%rd1, [__cudaparmf1__Z4Mix3RjS_S_];
	mov.s64 	%rd2, %rd1;
	ld.param.u64 	%rd3, [__cudaparmf2__Z4Mix3RjS_S_];
	mov.s64 	%rd4, %rd3;
	ld.param.u64 	%rd5, [__cudaparmf3__Z4Mix3RjS_S_];
	mov.s64 	%rd6, %rd5;
	.loc	18	139	0
	// a -= b; a -= c; a ^= c >> 13
	ld.u32 	%r1, [%rd2+0];
	ld.u32 	%r2, [%rd4+0];
	sub.u32 	%r3, %r1, %r2;
	st.u32 	[%rd2+0], %r3;
	ld.u32 	%r4, [%rd6+0];
	sub.u32 	%r5, %r3, %r4;
	st.u32 	[%rd2+0], %r5;
	ld.u32 	%r6, [%rd6+0];
	shr.u32 	%r7, %r6, 13;
	xor.b32 	%r8, %r5, %r7;
	st.u32 	[%rd2+0], %r8;
	.loc	18	140	0
	// b -= c; b -= a; b ^= a << 8
	ld.u32 	%r9, [%rd4+0];
	ld.u32 	%r10, [%rd6+0];
	sub.u32 	%r11, %r9, %r10;
	st.u32 	[%rd4+0], %r11;
	ld.u32 	%r12, [%rd2+0];
	sub.u32 	%r13, %r11, %r12;
	st.u32 	[%rd4+0], %r13;
	ld.u32 	%r14, [%rd2+0];
	shl.b32 	%r15, %r14, 8;
	xor.b32 	%r16, %r13, %r15;
	st.u32 	[%rd4+0], %r16;
	.loc	18	141	0
	// c -= a; c -= b; c ^= b >> 13
	ld.u32 	%r17, [%rd6+0];
	ld.u32 	%r18, [%rd2+0];
	sub.u32 	%r19, %r17, %r18;
	st.u32 	[%rd6+0], %r19;
	ld.u32 	%r20, [%rd4+0];
	sub.u32 	%r21, %r19, %r20;
	st.u32 	[%rd6+0], %r21;
	ld.u32 	%r22, [%rd4+0];
	shr.u32 	%r23, %r22, 13;
	xor.b32 	%r24, %r21, %r23;
	st.u32 	[%rd6+0], %r24;
	.loc	18	142	0
	// a -= b; a -= c; a ^= c >> 12
	ld.u32 	%r25, [%rd2+0];
	ld.u32 	%r26, [%rd4+0];
	sub.u32 	%r27, %r25, %r26;
	st.u32 	[%rd2+0], %r27;
	ld.u32 	%r28, [%rd6+0];
	sub.u32 	%r29, %r27, %r28;
	st.u32 	[%rd2+0], %r29;
	ld.u32 	%r30, [%rd6+0];
	shr.u32 	%r31, %r30, 12;
	xor.b32 	%r32, %r29, %r31;
	st.u32 	[%rd2+0], %r32;
	.loc	18	143	0
	// b -= c; b -= a; b ^= a << 16
	ld.u32 	%r33, [%rd4+0];
	ld.u32 	%r34, [%rd6+0];
	sub.u32 	%r35, %r33, %r34;
	st.u32 	[%rd4+0], %r35;
	ld.u32 	%r36, [%rd2+0];
	sub.u32 	%r37, %r35, %r36;
	st.u32 	[%rd4+0], %r37;
	ld.u32 	%r38, [%rd2+0];
	shl.b32 	%r39, %r38, 16;
	xor.b32 	%r40, %r37, %r39;
	st.u32 	[%rd4+0], %r40;
	.loc	18	144	0
	// c -= a; c -= b; c ^= b >> 5
	ld.u32 	%r41, [%rd6+0];
	ld.u32 	%r42, [%rd2+0];
	sub.u32 	%r43, %r41, %r42;
	st.u32 	[%rd6+0], %r43;
	ld.u32 	%r44, [%rd4+0];
	sub.u32 	%r45, %r43, %r44;
	st.u32 	[%rd6+0], %r45;
	ld.u32 	%r46, [%rd4+0];
	shr.u32 	%r47, %r46, 5;
	xor.b32 	%r48, %r45, %r47;
	st.u32 	[%rd6+0], %r48;
	.loc	18	145	0
	// a -= b; a -= c; a ^= c >> 3
	ld.u32 	%r49, [%rd2+0];
	ld.u32 	%r50, [%rd4+0];
	sub.u32 	%r51, %r49, %r50;
	st.u32 	[%rd2+0], %r51;
	ld.u32 	%r52, [%rd6+0];
	sub.u32 	%r53, %r51, %r52;
	st.u32 	[%rd2+0], %r53;
	ld.u32 	%r54, [%rd6+0];
	shr.u32 	%r55, %r54, 3;
	xor.b32 	%r56, %r53, %r55;
	st.u32 	[%rd2+0], %r56;
	.loc	18	146	0
	// b -= c; b -= a; b ^= a << 10
	ld.u32 	%r57, [%rd4+0];
	ld.u32 	%r58, [%rd6+0];
	sub.u32 	%r59, %r57, %r58;
	st.u32 	[%rd4+0], %r59;
	ld.u32 	%r60, [%rd2+0];
	sub.u32 	%r61, %r59, %r60;
	st.u32 	[%rd4+0], %r61;
	ld.u32 	%r62, [%rd2+0];
	shl.b32 	%r63, %r62, 10;
	xor.b32 	%r64, %r61, %r63;
	st.u32 	[%rd4+0], %r64;
	.loc	18	147	0
	// c -= a; c -= b; c ^= b >> 15
	ld.u32 	%r65, [%rd6+0];
	ld.u32 	%r66, [%rd2+0];
	sub.u32 	%r67, %r65, %r66;
	st.u32 	[%rd6+0], %r67;
	ld.u32 	%r68, [%rd4+0];
	sub.u32 	%r69, %r67, %r68;
	st.u32 	[%rd6+0], %r69;
	ld.u32 	%r70, [%rd4+0];
	shr.u32 	%r71, %r70, 15;
	xor.b32 	%r72, %r69, %r71;
	st.u32 	[%rd6+0], %r72;
	.loc	18	148	0
	// Return the value just written to c (taken from the register, not re-read).
	mov.s32 	%r73, %r72;
	st.param.u32 	[__cudaretf__Z4Mix3RjS_S_], %r73;
	ret;
$LDWend__Z4Mix3RjS_S_:
	} // _Z4Mix3RjS_S_

	// Rand(unsigned) -> int   (signature per the mangled symbol)
	// Derives a 15-bit value from the seed using two steps of the linear
	// congruential generator s -> s * 1103515245 + 12345 (mod 2^32):
	//   result = (((step1 >> 16) & 255) << 7) ^ ((step2 >> 16) & 255)
	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)
	{
	.reg .u32 %seed, %step1, %step2, %upper, %lower, %mixed;
	.loc	18	152	0
$LDWbegin__Z4Randj:
	ld.param.u32 	%seed, [__cudaparmf1__Z4Randj];
	.loc	18	163	0
	mad.lo.u32 	%step1, %seed, 1103515245, 12345;
	// Two generator steps folded into one multiply-add:
	// -1029531031 == 1103515245^2 and -740551042 == 12345 * (1103515245 + 1), both mod 2^32.
	mad.lo.u32 	%step2, %seed, -1029531031, -740551042;
	shr.u32 	%upper, %step1, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	shr.u32 	%lower, %step2, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%mixed, %upper, %lower;
	st.param.s32 	[__cudaretf__Z4Randj], %mixed;
	ret;
$LDWend__Z4Randj:
	} // _Z4Randj

	// Rand2D(unsigned, unsigned, unsigned) -> int   (signature per the mangled symbol)
	// Runs the nine-round subtract/shift/xor mix on private copies of the three
	// arguments (a, b, c = parameters 1, 2, 3), then applies the two-step LCG
	// extraction to the final c:
	//   result = (((s1 >> 16) & 255) << 7) ^ ((s2 >> 16) & 255)
	// where s1 and s2 are c advanced one and two steps through
	// s -> s * 1103515245 + 12345 (mod 2^32).
	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)
	{
	.reg .u32 %a, %b, %c, %sh;
	.reg .u32 %step1, %step2, %upper, %lower, %mixed;
	.loc	18	169	0
$LDWbegin__Z6Rand2Djjj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Djjj];
	ld.param.u32 	%b, [__cudaparmf2__Z6Rand2Djjj];
	ld.param.u32 	%c, [__cudaparmf3__Z6Rand2Djjj];
	.loc	18	139	0
	// a = (a - b - c) ^ (c >> 13)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 13;
	xor.b32 	%a, %a, %sh;
	.loc	18	140	0
	// b = (b - c - a) ^ (a << 8)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 8;
	xor.b32 	%b, %b, %sh;
	.loc	18	141	0
	// c = (c - a - b) ^ (b >> 13)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 13;
	xor.b32 	%c, %c, %sh;
	.loc	18	142	0
	// a = (a - b - c) ^ (c >> 12)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 12;
	xor.b32 	%a, %a, %sh;
	.loc	18	143	0
	// b = (b - c - a) ^ (a << 16)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 16;
	xor.b32 	%b, %b, %sh;
	.loc	18	144	0
	// c = (c - a - b) ^ (b >> 5)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 5;
	xor.b32 	%c, %c, %sh;
	.loc	18	145	0
	// a = (a - b - c) ^ (c >> 3)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 3;
	xor.b32 	%a, %a, %sh;
	.loc	18	146	0
	// b = (b - c - a) ^ (a << 10)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 10;
	xor.b32 	%b, %b, %sh;
	.loc	18	147	0
	// c = (c - a - b) ^ (b >> 15)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 15;
	xor.b32 	%c, %c, %sh;
	.loc	18	170	0
	mad.lo.u32 	%step1, %c, 1103515245, 12345;
	// Two generator steps at once: the constants are 1103515245^2 and
	// 12345 * (1103515245 + 1), both reduced mod 2^32.
	mad.lo.u32 	%step2, %c, -1029531031, -740551042;
	shr.u32 	%upper, %step1, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	shr.u32 	%lower, %step2, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%mixed, %upper, %lower;
	st.param.s32 	[__cudaretf__Z6Rand2Djjj], %mixed;
	ret;
$LDWend__Z6Rand2Djjj:
	} // _Z6Rand2Djjj

	// Rand2D(unsigned) -> int   (signature per the mangled symbol)
	// Per-thread variant: mixes the seed argument with the thread's 2-D
	// coordinates, then applies the two-step LCG extraction to the result.
	//   a = seed
	//   b = ctaid.x * ntid.x + tid.x
	//   c = ctaid.y * ntid.y + tid.y
	// followed by the nine-round subtract/shift/xor mix on (a, b, c) and
	//   result = (((s1 >> 16) & 255) << 7) ^ ((s2 >> 16) & 255)
	// where s1 and s2 are c advanced one and two steps through
	// s -> s * 1103515245 + 12345 (mod 2^32).
	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)
	{
	.reg .u32 %a, %b, %c, %sh;
	.reg .u32 %blk, %dim, %lane;
	.reg .u32 %step1, %step2, %upper, %lower, %mixed;
	.loc	18	175	0
$LDWbegin__Z6Rand2Dj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Dj];
	.loc	18	143	0
	mov.u32 	%blk, %ctaid.y;
	mov.u32 	%dim, %ntid.y;
	mov.u32 	%lane, %tid.y;
	mad.lo.u32 	%c, %blk, %dim, %lane;
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%dim, %ntid.x;
	mov.u32 	%lane, %tid.x;
	mad.lo.u32 	%b, %blk, %dim, %lane;
	// a = (a - b - c) ^ (c >> 13)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 13;
	xor.b32 	%a, %a, %sh;
	// b = (b - c - a) ^ (a << 8)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 8;
	xor.b32 	%b, %b, %sh;
	// c = (c - a - b) ^ (b >> 13)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 13;
	xor.b32 	%c, %c, %sh;
	// a = (a - b - c) ^ (c >> 12)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 12;
	xor.b32 	%a, %a, %sh;
	// b = (b - c - a) ^ (a << 16)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 16;
	xor.b32 	%b, %b, %sh;
	.loc	18	144	0
	// c = (c - a - b) ^ (b >> 5)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 5;
	xor.b32 	%c, %c, %sh;
	.loc	18	145	0
	// a = (a - b - c) ^ (c >> 3)
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%sh, %c, 3;
	xor.b32 	%a, %a, %sh;
	.loc	18	146	0
	// b = (b - c - a) ^ (a << 10)
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%sh, %a, 10;
	xor.b32 	%b, %b, %sh;
	.loc	18	147	0
	// c = (c - a - b) ^ (b >> 15)
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%sh, %b, 15;
	xor.b32 	%c, %c, %sh;
	.loc	18	176	0
	mad.lo.u32 	%step1, %c, 1103515245, 12345;
	// Two generator steps at once: the constants are 1103515245^2 and
	// 12345 * (1103515245 + 1), both reduced mod 2^32.
	mad.lo.u32 	%step2, %c, -1029531031, -740551042;
	shr.u32 	%upper, %step1, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	shr.u32 	%lower, %step2, 16;
	and.b32 	%lower, %lower, 255;
	xor.b32 	%mixed, %upper, %lower;
	st.param.s32 	[__cudaretf__Z6Rand2Dj], %mixed;
	ret;
$LDWend__Z6Rand2Dj:
	} // _Z6Rand2Dj

	// Read2D<ushort4>(const ushort4*, int, int, int) -> ushort4   (signature per the mangled symbol)
	// Loads one 8-byte element from a linear buffer and returns it.
	//   element index = parm3 + parm2 * parm4   (32-bit, wraps on overflow)
	//   byte address  = parm1 + sign_extend(index) * 8
	// NOTE(review): the roles of the three ints (which is pitch, x, y) are not
	// visible here - confirm against Utils.h.
	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)
	{
	.reg .u64 %base, %addr;
	.reg .s32 %arg2, %arg3, %arg4, %elem;
	.reg .u32 %lane0, %lane1, %lane2, %lane3;
	.loc	18	114	0
$LDWbegin__Z6Read2DI7ushort4ET_PKS1_iii:
	ld.param.u64 	%base, [__cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg2, [__cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii];
	.loc	18	115	0
	mad.lo.s32 	%elem, %arg2, %arg4, %arg3;
	// Widening multiply-add: scales the signed index by the element size and adds the base.
	mad.wide.s32 	%addr, %elem, 8, %base;
	// No state space on the load, so the address is a generic one.
	ld.v4.u16 	{%lane0,%lane1,%lane2,%lane3}, [%addr];
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+0], %lane0;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+2], %lane1;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+4], %lane2;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+6], %lane3;
	ret;
$LDWend__Z6Read2DI7ushort4ET_PKS1_iii:
	} // _Z6Read2DI7ushort4ET_PKS1_iii

	// Read2D<float4>(const float4*, int, int, int) -> float4   (signature per the mangled symbol)
	// Loads one 16-byte element from a linear buffer and returns it.
	//   element index = parm3 + parm2 * parm4   (32-bit, wraps on overflow)
	//   byte address  = parm1 + sign_extend(index) * 16
	// NOTE(review): the roles of the three ints (which is pitch, x, y) are not
	// visible here - confirm against Utils.h.
	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)
	{
	.reg .u64 %base, %addr;
	.reg .s32 %arg2, %arg3, %arg4, %elem;
	.reg .f32 %lane0, %lane1, %lane2, %lane3;
	.loc	18	114	0
$LDWbegin__Z6Read2DI6float4ET_PKS1_iii:
	ld.param.u64 	%base, [__cudaparmf1__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg2, [__cudaparmf2__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z6Read2DI6float4ET_PKS1_iii];
	.loc	18	115	0
	mad.lo.s32 	%elem, %arg2, %arg4, %arg3;
	// Widening multiply-add: scales the signed index by the element size and adds the base.
	mad.wide.s32 	%addr, %elem, 16, %base;
	// No state space on the load, so the address is a generic one.
	ld.v4.f32 	{%lane0,%lane1,%lane2,%lane3}, [%addr];
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+0], %lane0;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+4], %lane1;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+8], %lane2;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+12], %lane3;
	ret;
$LDWend__Z6Read2DI6float4ET_PKS1_iii:
	} // _Z6Read2DI6float4ET_PKS1_iii

	// Write2D<ushort4>(ushort4, ushort4*, int, int, int)   (signature per the mangled symbol)
	// Stores the 8-byte value passed in parameter 1 into a linear buffer.
	//   element index = parm4 + parm3 * parm5   (32-bit, wraps on overflow)
	//   byte address  = parm2 + sign_extend(index) * 8
	// NOTE(review): the roles of the three ints (which is pitch, x, y) are not
	// visible here - confirm against Utils.h.
	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)
	{
	.reg .u32 %lane0, %lane1, %lane2, %lane3;
	.reg .u64 %base, %addr;
	.reg .s32 %arg3, %arg4, %arg5, %elem;
	.loc	18	125	0
$LDWbegin__Z7Write2DI7ushort4EvT_PS1_iii:
	ld.param.u16 	%lane0, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+0];
	ld.param.u16 	%lane1, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+2];
	ld.param.u16 	%lane2, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+4];
	ld.param.u16 	%lane3, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+6];
	ld.param.u64 	%base, [__cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.s32 	%arg5, [__cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii];
	.loc	18	126	0
	mad.lo.s32 	%elem, %arg3, %arg5, %arg4;
	// Widening multiply-add: scales the signed index by the element size and adds the base.
	mad.wide.s32 	%addr, %elem, 8, %base;
	// No state space on the store, so the address is a generic one.
	st.v4.u16 	[%addr], {%lane0,%lane1,%lane2,%lane3};
	.loc	18	127	0
	ret;
$LDWend__Z7Write2DI7ushort4EvT_PS1_iii:
	} // _Z7Write2DI7ushort4EvT_PS1_iii

	// Write2D<float4>(float4, float4*, int, int, int)   (signature per the mangled symbol)
	// Stores the 16-byte value passed in parameter 1 into a linear buffer.
	//   element index = parm4 + parm3 * parm5   (32-bit, wraps on overflow)
	//   byte address  = parm2 + sign_extend(index) * 16
	// NOTE(review): the roles of the three ints (which is pitch, x, y) are not
	// visible here - confirm against Utils.h.
	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)
	{
	.reg .f32 %lane0, %lane1, %lane2, %lane3;
	.reg .u64 %base, %addr;
	.reg .s32 %arg3, %arg4, %arg5, %elem;
	.loc	18	125	0
$LDWbegin__Z7Write2DI6float4EvT_PS1_iii:
	ld.param.f32 	%lane0, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+0];
	ld.param.f32 	%lane1, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+4];
	ld.param.f32 	%lane2, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+8];
	ld.param.f32 	%lane3, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+12];
	ld.param.u64 	%base, [__cudaparmf2__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg3, [__cudaparmf3__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg4, [__cudaparmf4__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.s32 	%arg5, [__cudaparmf5__Z7Write2DI6float4EvT_PS1_iii];
	.loc	18	126	0
	mad.lo.s32 	%elem, %arg3, %arg5, %arg4;
	// Widening multiply-add: scales the signed index by the element size and adds the base.
	mad.wide.s32 	%addr, %elem, 16, %base;
	// No state space on the store, so the address is a generic one.
	st.v4.f32 	[%addr], {%lane0,%lane1,%lane2,%lane3};
	.loc	18	127	0
	ret;
$LDWend__Z7Write2DI6float4EvT_PS1_iii:
	} // _Z7Write2DI6float4EvT_PS1_iii

	// UnpremultiplyPixel(PixelRGB) -> PixelRGB   (signature per the mangled symbol)
	// Takes a 16-byte pixel of four f32 lanes. The lane at byte offset 12 is
	// clamped to [0, 1]; call the clamped value k.
	//   k - 8e-6 <= 0 : all four output lanes are 0
	//   otherwise     : lanes 0..2 = input lane * (1 / k), lane 3 = k
	// NOTE(review): lane 3 is presumably alpha and lanes 0..2 colour channels -
	// confirm against PixelRGB.h.
	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])
	{
	.reg .f32 %in0, %in1, %in2, %in3;
	.reg .f32 %out0, %out1, %out2, %out3;
	.reg .f32 %k, %margin, %inv;
	.reg .pred %cut;
	.loc	3	206	0
$LDWbegin__Z18UnpremultiplyPixel8PixelRGB:
	ld.param.f32 	%in0, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+0];
	ld.param.f32 	%in1, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+4];
	ld.param.f32 	%in2, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+8];
	ld.param.f32 	%in3, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%k, %in3;
	// 0fb70637bd is -8e-6: values of k at or below that threshold count as zero.
	add.ftz.f32 	%margin, %k, 0fb70637bd;
	setp.le.ftz.f32 	%cut, %margin, 0f00000000;
	.loc	3	219	0
	// Start from the all-zero result; the guarded instructions below
	// overwrite it only when k is above the threshold.
	mov.f32 	%out0, 0f00000000;
	mov.f32 	%out1, 0f00000000;
	mov.f32 	%out2, 0f00000000;
	mov.f32 	%out3, 0f00000000;
	.loc	3	213	0
	@!%cut rcp.approx.ftz.f32 	%inv, %k;
	@!%cut mul.ftz.f32 	%out2, %inv, %in2;
	.loc	3	214	0
	@!%cut mul.ftz.f32 	%out1, %inv, %in1;
	.loc	3	215	0
	@!%cut mul.ftz.f32 	%out0, %inv, %in0;
	@!%cut mov.f32 	%out3, %k;
	.loc	3	224	0
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+0], %out0;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+4], %out1;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+8], %out2;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+12], %out3;
	ret;
$LDWend__Z18UnpremultiplyPixel8PixelRGB:
	} // _Z18UnpremultiplyPixel8PixelRGB

	// ToLinearColor(float) -> float   (signature per the mangled symbol)
	// Sign-preserving power curve with exponent 2.2:
	//   x <  0 : -exp2(2.2 * log2(-x))
	//   else   :  exp2(2.2 * log2(x))
	// Uses the approximate, flush-to-zero lg2/ex2 instructions.
	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)
	{
	.reg .f32 %x, %mag, %pw;
	.reg .pred %flip;
	.loc	3	231	0
$LDWbegin__Z13ToLinearColorf:
	ld.param.f32 	%x, [__cudaparmf1__Z13ToLinearColorf];
	setp.lt.ftz.f32 	%flip, %x, 0f00000000;
	// Work on the negated value for negative input; otherwise on x unchanged.
	mov.f32 	%mag, %x;
	.loc	3	234	0
	@%flip neg.ftz.f32 	%mag, %x;
	.loc	3	236	0
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%pw, %pw;
	.loc	3	234	0
	// Restore the sign that was stripped above.
	@%flip neg.ftz.f32 	%pw, %pw;
	st.param.f32 	[__cudaretf__Z13ToLinearColorf], %pw;
	ret;
$LDWend__Z13ToLinearColorf:
	} // _Z13ToLinearColorf

	// FromLinearColor(float) -> float   (signature per the mangled symbol)
	// Sign-preserving power curve with exponent 0.454545 (0f3ee8ba2e):
	//   x <  0 : -exp2(0.454545 * log2(-x))
	//   else   :  exp2(0.454545 * log2(x))
	// Uses the approximate, flush-to-zero lg2/ex2 instructions.
	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)
	{
	.reg .f32 %x, %mag, %pw;
	.reg .pred %flip;
	.loc	3	239	0
$LDWbegin__Z15FromLinearColorf:
	ld.param.f32 	%x, [__cudaparmf1__Z15FromLinearColorf];
	setp.lt.ftz.f32 	%flip, %x, 0f00000000;
	// Work on the negated value for negative input; otherwise on x unchanged.
	mov.f32 	%mag, %x;
	.loc	3	242	0
	@%flip neg.ftz.f32 	%mag, %x;
	.loc	3	244	0
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%pw, %pw;
	.loc	3	242	0
	// Restore the sign that was stripped above.
	@%flip neg.ftz.f32 	%pw, %pw;
	st.param.f32 	[__cudaretf__Z15FromLinearColorf], %pw;
	ret;
$LDWend__Z15FromLinearColorf:
	} // _Z15FromLinearColorf

	// PremultiplyLinearizePixel(PixelRGB) -> PixelRGB   (signature per the mangled symbol)
	// Takes a 16-byte pixel of four f32 lanes. Let k be the lane at byte
	// offset 12 clamped to [0, 1]. Each of lanes 0..2 goes through the
	// sign-preserving 2.2 power curve
	//   v < 0 : -exp2(2.2 * log2(-v))      else : exp2(2.2 * log2(v))
	// and is then multiplied by k. Output lane 3 is k.
	// NOTE(review): lane 3 is presumably alpha and lanes 0..2 colour channels -
	// confirm against PixelRGB.h.
	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])
	{
	.reg .f32 %in0, %in1, %in2, %in3, %k;
	.reg .f32 %mag0, %mag1, %mag2;
	.reg .f32 %lin0, %lin1, %lin2;
	.reg .pred %flip0, %flip1, %flip2;
	.loc	3	252	0
$LDWbegin__Z25PremultiplyLinearizePixel8PixelRGB:
	ld.param.f32 	%in0, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+0];
	ld.param.f32 	%in1, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+4];
	ld.param.f32 	%in2, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+8];
	ld.param.f32 	%in3, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+12];
	.loc	3	254	0
	cvt.ftz.sat.f32.f32 	%k, %in3;
	.loc	3	255	0
	// Lane 0: strip the sign if negative, apply the curve, restore the sign.
	setp.lt.ftz.f32 	%flip0, %in0, 0f00000000;
	mov.f32 	%mag0, %in0;
	.loc	3	234	0
	@%flip0 neg.ftz.f32 	%mag0, %in0;
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lin0, %mag0;
	mul.ftz.f32 	%lin0, %lin0, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%lin0, %lin0;
	.loc	3	234	0
	@%flip0 neg.ftz.f32 	%lin0, %lin0;
	.loc	3	256	0
	// Lane 1: same treatment.
	setp.lt.ftz.f32 	%flip1, %in1, 0f00000000;
	mov.f32 	%mag1, %in1;
	.loc	3	234	0
	@%flip1 neg.ftz.f32 	%mag1, %in1;
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lin1, %mag1;
	mul.ftz.f32 	%lin1, %lin1, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%lin1, %lin1;
	.loc	3	234	0
	@%flip1 neg.ftz.f32 	%lin1, %lin1;
	.loc	3	257	0
	// Lane 2: same treatment.
	setp.lt.ftz.f32 	%flip2, %in2, 0f00000000;
	mov.f32 	%mag2, %in2;
	.loc	3	234	0
	@%flip2 neg.ftz.f32 	%mag2, %in2;
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lin2, %mag2;
	mul.ftz.f32 	%lin2, %lin2, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%lin2, %lin2;
	.loc	3	234	0
	@%flip2 neg.ftz.f32 	%lin2, %lin2;
	.loc	3	259	0
	// Scale the three curved lanes by k and pass k through as lane 3.
	mul.ftz.f32 	%lin2, %lin2, %k;
	mul.ftz.f32 	%lin1, %lin1, %k;
	mul.ftz.f32 	%lin0, %lin0, %k;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+0], %lin0;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+4], %lin1;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+8], %lin2;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+12], %k;
	ret;
$LDWend__Z25PremultiplyLinearizePixel8PixelRGB:
	} // _Z25PremultiplyLinearizePixel8PixelRGB

	// UnpremultiplyUnlinearizePixel(PixelRGB) -> PixelRGB   (demangled symbol name)
	//
	// Input : 16-byte param block, read as four f32 values at byte offsets 0, 4, 8, 12.
	// Output: 16-byte return block, written as four f32 values at the same offsets.
	//
	//   alpha = saturate(in[12])
	//   if (alpha - 8e-6 <= 0)   straight[k] = 0 for all three values, out[12] = 0
	//   else                     straight[k] = in[k] * rcp(alpha),     out[12] = alpha
	//   out[k] = spow(straight[k], 0.454545)       for byte offsets k = 0, 4, 8
	//
	// spow(v, e): when v < 0 the value is negated before lg2 and the result is negated
	// again; otherwise v is used directly.  The power itself is ex2(e * lg2(v)) using
	// the approximate flush-to-zero lg2/ex2 instructions.
	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])
	{
	.reg .f32 %src<4>;                      // input floats at offsets 0, 4, 8, 12
	.reg .f32 %str<3>;                      // values after division by alpha
	.reg .f32 %enc<3>;                      // spow(str, 0.454545)
	.reg .f32 %alpha, %aout, %inv, %negeps, %zero, %igamma, %tmp;
	.reg .pred %clear;
	.reg .pred %isneg<3>;
	.loc	3	263	0
$LDWbegin__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	ld.param.f32 	%src0, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0];
	ld.param.f32 	%src1, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4];
	ld.param.f32 	%src2, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8];
	ld.param.f32 	%src3, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12];
	// Constants shared by all three channels.
	mov.f32 	%zero, 0f00000000;      	// 0
	mov.f32 	%igamma, 0f3ee8ba2e;    	// 0.454545
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %src3;
	mov.f32 	%aout, %alpha;
	// Treat alpha as zero when alpha - 8e-6 <= 0.
	mov.f32 	%negeps, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%tmp, %alpha, %negeps;
	setp.le.ftz.f32 	%clear, %tmp, %zero;
	@%clear bra 	$Lt_17_5122;
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv, %alpha;
	mul.ftz.f32 	%str2, %inv, %src2;
	.loc	3	214	0
	mul.ftz.f32 	%str1, %inv, %src1;
	.loc	3	215	0
	mul.ftz.f32 	%str0, %inv, %src0;
	bra.uni 	$Lt_17_4866;
$Lt_17_5122:
	.loc	3	219	0
	// Alpha too small: every output component starts from zero.
	mov.f32 	%str2, 0f00000000;    	// 0
	mov.f32 	%str1, 0f00000000;    	// 0
	mov.f32 	%str0, 0f00000000;    	// 0
	mov.f32 	%aout, 0f00000000;    	// 0
$Lt_17_4866:
	.loc	3	266	0
	// ---- value at offset 0 ----
	setp.lt.ftz.f32 	%isneg0, %str0, %zero;
	@!%isneg0 bra 	$Lt_17_5378;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str0;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc0, %tmp;
	bra.uni 	$LDWendi___log2f_194_5;
$Lt_17_5378:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str0;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc0, %tmp;
$LDWendi___log2f_194_5:
	.loc	3	267	0
	// ---- value at offset 4 ----
	setp.lt.ftz.f32 	%isneg1, %str1, %zero;
	@!%isneg1 bra 	$Lt_17_5890;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str1;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc1, %tmp;
	bra.uni 	$LDWendi___log2f_194_3;
$Lt_17_5890:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str1;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc1, %tmp;
$LDWendi___log2f_194_3:
	.loc	3	268	0
	// ---- value at offset 8 ----
	setp.lt.ftz.f32 	%isneg2, %str2, %zero;
	@!%isneg2 bra 	$Lt_17_6402;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str2;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc2, %tmp;
	bra.uni 	$LDWendi___log2f_194_1;
$Lt_17_6402:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str2;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc2, %tmp;
$LDWendi___log2f_194_1:
	.loc	3	269	0
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0], %enc0;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4], %enc1;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8], %enc2;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12], %aout;
	ret;
$LDWend__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	} // _Z29UnpremultiplyUnlinearizePixel8PixelRGB

	// PremultiplyLinearize(float4) -> float4   (demangled symbol name)
	//
	// Input : 16-byte param block, read as four f32 values at byte offsets 0, 4, 8, 12.
	// Output: 16-byte return block, written as four f32 values at the same offsets.
	//
	//   alpha   = saturate(in[12])                  (cvt.sat clamps to [0, 1])
	//   out[k]  = spow(in[k], 2.2) * alpha          for byte offsets k = 0, 4, 8
	//   out[12] = alpha
	//
	// spow(v, e): when v < 0 the value is negated before lg2 and the result is negated
	// again; otherwise v is used directly.  The power itself is ex2(e * lg2(v)) using
	// the approximate flush-to-zero lg2/ex2 instructions.
	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])
	{
	.reg .f32 %src<4>;                      // input floats at offsets 0, 4, 8, 12
	.reg .f32 %lin<3>;                      // spow(src, 2.2) for offsets 0, 4, 8
	.reg .f32 %out<3>;                      // lin * alpha
	.reg .f32 %alpha, %zero, %gamma, %tmp;
	.reg .pred %isneg<3>;
	.loc	3	277	0
$LDWbegin__Z20PremultiplyLinearize6float4:
	ld.param.f32 	%src0, [__cudaparmf1__Z20PremultiplyLinearize6float4+0];
	ld.param.f32 	%src1, [__cudaparmf1__Z20PremultiplyLinearize6float4+4];
	ld.param.f32 	%src2, [__cudaparmf1__Z20PremultiplyLinearize6float4+8];
	ld.param.f32 	%src3, [__cudaparmf1__Z20PremultiplyLinearize6float4+12];
	// Constants shared by all three channels.
	mov.f32 	%zero, 0f00000000;     	// 0
	mov.f32 	%gamma, 0f400ccccd;    	// 2.2
	.loc	3	254	0
	cvt.ftz.sat.f32.f32 	%alpha, %src3;
	.loc	3	255	0
	// ---- value at offset 0 ----
	setp.lt.ftz.f32 	%isneg0, %src0, %zero;
	@!%isneg0 bra 	$Lt_18_4098;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %src0;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin0, %tmp;
	bra.uni 	$LDWendi___log2f_195_5;
$Lt_18_4098:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %src0;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin0, %tmp;
$LDWendi___log2f_195_5:
	.loc	3	256	0
	// ---- value at offset 4 ----
	setp.lt.ftz.f32 	%isneg1, %src1, %zero;
	@!%isneg1 bra 	$Lt_18_4610;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %src1;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin1, %tmp;
	bra.uni 	$LDWendi___log2f_195_3;
$Lt_18_4610:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %src1;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin1, %tmp;
$LDWendi___log2f_195_3:
	.loc	3	257	0
	// ---- value at offset 8 ----
	setp.lt.ftz.f32 	%isneg2, %src2, %zero;
	@!%isneg2 bra 	$Lt_18_5122;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %src2;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin2, %tmp;
	bra.uni 	$LDWendi___log2f_195_1;
$Lt_18_5122:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %src2;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin2, %tmp;
$LDWendi___log2f_195_1:
	.loc	3	259	0
	// Premultiply by the clamped alpha and write the result block.
	mul.ftz.f32 	%out2, %lin2, %alpha;
	mul.ftz.f32 	%out1, %lin1, %alpha;
	.loc	3	278	0
	mul.ftz.f32 	%out0, %lin0, %alpha;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+0], %out0;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+4], %out1;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+8], %out2;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+12], %alpha;
	ret;
$LDWend__Z20PremultiplyLinearize6float4:
	} // _Z20PremultiplyLinearize6float4

	// UnpremultiplyUnlinearize(float4) -> float4   (demangled symbol name)
	//
	// Input : 16-byte param block, read as four f32 values at byte offsets 0, 4, 8, 12.
	// Output: 16-byte return block, written as four f32 values at the same offsets.
	//
	//   alpha = saturate(in[12])
	//   if (alpha - 8e-6 <= 0)   straight[k] = 0 for all three values, out[12] = 0
	//   else                     straight[k] = in[k] * rcp(alpha),     out[12] = alpha
	//   out[k] = spow(straight[k], 0.454545)       for byte offsets k = 0, 4, 8
	//
	// spow(v, e): when v < 0 the value is negated before lg2 and the result is negated
	// again; otherwise v is used directly.  The power itself is ex2(e * lg2(v)) using
	// the approximate flush-to-zero lg2/ex2 instructions.
	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])
	{
	.reg .f32 %src<4>;                      // input floats at offsets 0, 4, 8, 12
	.reg .f32 %str<3>;                      // values after division by alpha
	.reg .f32 %enc<3>;                      // spow(str, 0.454545)
	.reg .f32 %alpha, %aout, %inv, %negeps, %zero, %igamma, %tmp;
	.reg .pred %clear;
	.reg .pred %isneg<3>;
	.loc	3	284	0
$LDWbegin__Z24UnpremultiplyUnlinearize6float4:
	ld.param.f32 	%src0, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+0];
	ld.param.f32 	%src1, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+4];
	ld.param.f32 	%src2, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+8];
	ld.param.f32 	%src3, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+12];
	// Constants shared by all three channels.
	mov.f32 	%zero, 0f00000000;      	// 0
	mov.f32 	%igamma, 0f3ee8ba2e;    	// 0.454545
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %src3;
	mov.f32 	%aout, %alpha;
	// Treat alpha as zero when alpha - 8e-6 <= 0.
	mov.f32 	%negeps, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%tmp, %alpha, %negeps;
	setp.le.ftz.f32 	%clear, %tmp, %zero;
	@%clear bra 	$Lt_19_5122;
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv, %alpha;
	mul.ftz.f32 	%str2, %inv, %src2;
	.loc	3	214	0
	mul.ftz.f32 	%str1, %inv, %src1;
	.loc	3	215	0
	mul.ftz.f32 	%str0, %inv, %src0;
	bra.uni 	$Lt_19_4866;
$Lt_19_5122:
	.loc	3	219	0
	// Alpha too small: every output component starts from zero.
	mov.f32 	%str2, 0f00000000;    	// 0
	mov.f32 	%str1, 0f00000000;    	// 0
	mov.f32 	%str0, 0f00000000;    	// 0
	mov.f32 	%aout, 0f00000000;    	// 0
$Lt_19_4866:
	.loc	3	266	0
	// ---- value at offset 0 ----
	setp.lt.ftz.f32 	%isneg0, %str0, %zero;
	@!%isneg0 bra 	$Lt_19_5378;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str0;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc0, %tmp;
	bra.uni 	$LDWendi___log2f_196_5;
$Lt_19_5378:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str0;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc0, %tmp;
$LDWendi___log2f_196_5:
	.loc	3	267	0
	// ---- value at offset 4 ----
	setp.lt.ftz.f32 	%isneg1, %str1, %zero;
	@!%isneg1 bra 	$Lt_19_5890;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str1;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc1, %tmp;
	bra.uni 	$LDWendi___log2f_196_3;
$Lt_19_5890:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str1;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc1, %tmp;
$LDWendi___log2f_196_3:
	.loc	3	268	0
	// ---- value at offset 8 ----
	setp.lt.ftz.f32 	%isneg2, %str2, %zero;
	@!%isneg2 bra 	$Lt_19_6402;
	.loc	3	242	0
	neg.ftz.f32 	%tmp, %str2;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%enc2, %tmp;
	bra.uni 	$LDWendi___log2f_196_1;
$Lt_19_6402:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%tmp, %str2;
	mul.ftz.f32 	%tmp, %tmp, %igamma;
	ex2.approx.ftz.f32 	%enc2, %tmp;
$LDWendi___log2f_196_1:
	.loc	3	285	0
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+0], %enc0;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+4], %enc1;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+8], %enc2;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+12], %aout;
	ret;
$LDWend__Z24UnpremultiplyUnlinearize6float4:
	} // _Z24UnpremultiplyUnlinearize6float4

	// AccumulatePixel(PixelRGB&, const float4*, int, float, DevicePixelFormat, int, int)
	// (demangled symbol name)
	//
	//   parm1  address of the 16-byte accumulator that is read, updated and written back
	//   parm2  source image base address; a zero address makes the call a no-op
	//   parm3  source pitch, counted in pixels (it is scaled by the pixel size below)
	//   parm4  weight applied to the source alpha
	//   parm5  pixel format: 0 selects 8-byte pixels (four f16), anything else selects
	//          16-byte pixels (four f32)
	//   parm6, parm7  pixel coordinates; element index = parm6 + parm3 * parm7
	//
	// The fetched pixel's first three values are raised to the power 2.2 with the sign
	// preserved (ex2(2.2 * lg2(|v|)), approximate flush-to-zero instructions).  Then
	//   a       = saturate(weight * pixel[3])
	//   acc[k] += lin[k] * a      (k = 0..2, fused multiply-add)
	//   acc[3] += a
	// Both accumulator accesses are 16-byte vector accesses through generic addressing.
	.visible .func _Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii (.param .u64 __cudaparmf1__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .u64 __cudaparmf2__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf3__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .f32 __cudaparmf4__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .u32 __cudaparmf5__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf6__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii, .param .s32 __cudaparmf7__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii)
	{
	.reg .u64 %dst, %img, %idx, %addr;
	.reg .u32 %pitch, %fmt, %xpos, %ypos, %elem;
	.reg .u32 %h<4>;                        // raw 16-bit values of an f16 pixel
	.reg .f32 %px<4>;                       // fetched pixel as f32
	.reg .f32 %lin<3>;                      // sign-preserving pow(px, 2.2)
	.reg .f32 %acc<4>;                      // accumulator contents
	.reg .f32 %weight, %alpha, %zero, %gamma, %tmp;
	.reg .pred %nosrc, %isf32;
	.reg .pred %isneg<3>;
	.loc	19	39	0
$LDWbegin__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii:
	ld.param.u64 	%dst, [__cudaparmf1__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.u64 	%img, [__cudaparmf2__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.u32 	%pitch, [__cudaparmf3__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.f32 	%weight, [__cudaparmf4__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.u32 	%fmt, [__cudaparmf5__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.u32 	%xpos, [__cudaparmf6__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	ld.param.u32 	%ypos, [__cudaparmf7__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii];
	// No source image: leave the accumulator untouched.
	setp.eq.u64 	%nosrc, %img, 0;
	@%nosrc bra 	$Lt_20_5634;
	// Element index = x + pitch * y, sign-extended for the address arithmetic.
	mul.lo.s32 	%elem, %pitch, %ypos;
	add.s32 	%elem, %xpos, %elem;
	cvt.s64.s32 	%idx, %elem;
	setp.ne.s32 	%isf32, %fmt, 0;
	@%isf32 bra 	$Lt_20_6402;
	.loc	18	115	0
	// Format 0: 8 bytes per pixel, four f16 values widened to f32.
	mul.lo.u64 	%addr, %idx, 8;
	add.u64 	%addr, %img, %addr;
	ld.v4.u16 	{%h0,%h1,%h2,%h3}, [%addr+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %h0;
	cvt.ftz.f32.f16	%px0, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %h1;
	cvt.ftz.f32.f16	%px1, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %h2;
	cvt.ftz.f32.f16	%px2, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %h3;
	cvt.ftz.f32.f16	%px3, %b1; }
	bra.uni 	$Lt_20_6146;
$Lt_20_6402:
	// Any other format: 16 bytes per pixel, four f32 values.
	mul.lo.u64 	%addr, %idx, 16;
	add.u64 	%addr, %img, %addr;
	ld.v4.f32 	{%px0,%px1,%px2,%px3}, [%addr+0];
$Lt_20_6146:
	.loc	3	255	0
	// Constants shared by all three channels.
	mov.f32 	%zero, 0f00000000;     	// 0
	mov.f32 	%gamma, 0f400ccccd;    	// 2.2
	// ---- first value ----
	setp.lt.ftz.f32 	%isneg0, %px0, %zero;
	@!%isneg0 bra 	$Lt_20_6658;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %px0;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin0, %tmp;
	bra.uni 	$LDWendi___log2f_197_5;
$Lt_20_6658:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %px0;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin0, %tmp;
$LDWendi___log2f_197_5:
	.loc	3	256	0
	// ---- second value ----
	setp.lt.ftz.f32 	%isneg1, %px1, %zero;
	@!%isneg1 bra 	$Lt_20_7170;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %px1;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin1, %tmp;
	bra.uni 	$LDWendi___log2f_197_3;
$Lt_20_7170:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %px1;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin1, %tmp;
$LDWendi___log2f_197_3:
	.loc	3	257	0
	// ---- third value ----
	setp.lt.ftz.f32 	%isneg2, %px2, %zero;
	@!%isneg2 bra 	$Lt_20_7682;
	.loc	3	234	0
	neg.ftz.f32 	%tmp, %px2;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%lin2, %tmp;
	bra.uni 	$LDWendi___log2f_197_1;
$Lt_20_7682:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%tmp, %px2;
	mul.ftz.f32 	%tmp, %tmp, %gamma;
	ex2.approx.ftz.f32 	%lin2, %tmp;
$LDWendi___log2f_197_1:
	.loc	20	80	0
	// Weighted, clamped alpha; then accumulate the premultiplied pixel in place.
	mul.ftz.f32 	%tmp, %weight, %px3;
	cvt.ftz.sat.f32.f32 	%alpha, %tmp;
	ld.v4.f32 	{%acc0,%acc1,%acc2,%acc3}, [%dst+0];
	fma.rn.ftz.f32 	%acc0, %lin0, %alpha, %acc0;
	.loc	20	81	0
	fma.rn.ftz.f32 	%acc1, %lin1, %alpha, %acc1;
	.loc	20	82	0
	fma.rn.ftz.f32 	%acc2, %lin2, %alpha, %acc2;
	.loc	20	83	0
	add.ftz.f32 	%acc3, %acc3, %alpha;
	st.v4.f32 	[%dst+0], {%acc0,%acc1,%acc2,%acc3};
$Lt_20_5634:
	.loc	19	46	0
	ret;
$LDWend__Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii:
	} // _Z15AccumulatePixelR8PixelRGBPK6float4if17DevicePixelFormatii

	// PixelRGB& operator+=<PixelRGB>(PixelRGB&, const PixelRGB&)   (demangled symbol name)
	//
	//   parm1  address of the left-hand operand; read and written as one 16-byte
	//          vector of four f32 values
	//   parm2  address of the right-hand operand; read as four scalar f32 loads at
	//          byte offsets 0, 4, 8, 12
	//   return the left-hand address (parm1), unchanged
	//
	// Adds the four floats component-wise (flush-to-zero adds) and stores the sums
	// back over the left-hand operand.
	.visible .func (.param .u64 __cudaretf__ZpLI8PixelRGBERT_S2_RKS1_) _ZpLI8PixelRGBERT_S2_RKS1_ (.param .u64 __cudaparmf1__ZpLI8PixelRGBERT_S2_RKS1_, .param .u64 __cudaparmf2__ZpLI8PixelRGBERT_S2_RKS1_)
	{
	.reg .u64 %lhs, %rhs;
	.reg .f32 %acc<4>;                      // left-hand values, updated in place
	.reg .f32 %inc<4>;                      // right-hand values
	.loc	20	79	0
$LDWbegin__ZpLI8PixelRGBERT_S2_RKS1_:
	ld.param.u64 	%lhs, [__cudaparmf1__ZpLI8PixelRGBERT_S2_RKS1_];
	ld.param.u64 	%rhs, [__cudaparmf2__ZpLI8PixelRGBERT_S2_RKS1_];
	// The whole left-hand operand is fetched before any right-hand load.
	ld.v4.f32 	{%acc0,%acc1,%acc2,%acc3}, [%lhs+0];
	.loc	20	80	0
	ld.f32 	%inc0, [%rhs+0];
	add.ftz.f32 	%acc0, %acc0, %inc0;
	.loc	20	81	0
	ld.f32 	%inc1, [%rhs+4];
	add.ftz.f32 	%acc1, %acc1, %inc1;
	.loc	20	82	0
	ld.f32 	%inc2, [%rhs+8];
	add.ftz.f32 	%acc2, %acc2, %inc2;
	.loc	20	83	0
	ld.f32 	%inc3, [%rhs+12];
	add.ftz.f32 	%acc3, %acc3, %inc3;
	st.v4.f32 	[%lhs+0], {%acc0,%acc1,%acc2,%acc3};
	.loc	20	84	0
	// Hand the left-hand address back to the caller.
	st.param.u64 	[__cudaretf__ZpLI8PixelRGBERT_S2_RKS1_], %lhs;
	ret;
$LDWend__ZpLI8PixelRGBERT_S2_RKS1_:
	} // _ZpLI8PixelRGBERT_S2_RKS1_

	.entry AccumulateKernel (
		.param .u64 __cudaparm_AccumulateKernel_inSrc0,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch0,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights0,
		.param .u64 __cudaparm_AccumulateKernel_inSrc1,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch1,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights1,
		.param .u64 __cudaparm_AccumulateKernel_inSrc2,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch2,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights2,
		.param .u64 __cudaparm_AccumulateKernel_inSrc3,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch3,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights3,
		.param .u64 __cudaparm_AccumulateKernel_inSrc4,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch4,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights4,
		.param .u64 __cudaparm_AccumulateKernel_inSrc5,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch5,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights5,
		.param .u64 __cudaparm_AccumulateKernel_inSrc6,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch6,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights6,
		.param .u64 __cudaparm_AccumulateKernel_inSrc7,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch7,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights7,
		.param .u64 __cudaparm_AccumulateKernel_inSrc8,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch8,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights8,
		.param .u64 __cudaparm_AccumulateKernel_inSrc9,
		.param .s32 __cudaparm_AccumulateKernel_inSrcPitch9,
		.param .f32 __cudaparm_AccumulateKernel_inSrcWeights9,
		.param .u64 __cudaparm_AccumulateKernel_inDest,
		.param .s32 __cudaparm_AccumulateKernel_inDestPitch,
		.param .u32 __cudaparm_AccumulateKernel_inDeviceFormat,
		.param .s32 __cudaparm_AccumulateKernel_inWidth,
		.param .s32 __cudaparm_AccumulateKernel_inHeight)
	{
	.reg .u32 %r<63>;
	.reg .u64 %rd<78>;
	.reg .f32 %f<388>;
	.reg .pred %p<48>;
	.loc	19	87	0
$LDWbegin_AccumulateKernel:
	.loc	19	90	0
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	cvt.s32.u32 	%r4, %ctaid.y;
	cvt.s32.u32 	%r5, %ntid.y;
	mul.lo.s32 	%r6, %r4, %r5;
	mov.u32 	%r7, %tid.x;
	add.u32 	%r8, %r3, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r6, %r9;
	ld.param.s32 	%r11, [__cudaparm_AccumulateKernel_inWidth];
	set.gt.u32.s32 	%r12, %r11, %r8;
	neg.s32 	%r13, %r12;
	ld.param.s32 	%r14, [__cudaparm_AccumulateKernel_inHeight];
	set.gt.u32.s32 	%r15, %r14, %r10;
	neg.s32 	%r16, %r15;
	and.b32 	%r17, %r13, %r16;
	mov.u32 	%r18, 0;
	setp.eq.s32 	%p1, %r17, %r18;
	@%p1 bra 	$Lt_22_60418;
	.loc	19	96	0
	ld.param.u32 	%r19, [__cudaparm_AccumulateKernel_inDeviceFormat];
	mov.s32 	%r20, 0;
	setp.eq.s32 	%p2, %r19, %r20;
	ld.param.u64 	%rd1, [__cudaparm_AccumulateKernel_inSrc0];
	mov.u64 	%rd2, 0;
	setp.eq.u64 	%p3, %rd1, %rd2;
	@%p3 bra 	$Lt_22_61186;
	ld.param.s32 	%r21, [__cudaparm_AccumulateKernel_inSrcPitch0];
	mul.lo.s32 	%r22, %r21, %r10;
	add.s32 	%r23, %r8, %r22;
	cvt.s64.s32 	%rd3, %r23;
	@!%p2 bra 	$Lt_22_61698;
	.loc	18	115	0
	mul.lo.u64 	%rd4, %rd3, 8;
	add.u64 	%rd5, %rd1, %rd4;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd5+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f1, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f2, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f3, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f4, %b1; }
	bra.uni 	$Lt_22_61442;
$Lt_22_61698:
	mul.lo.u64 	%rd6, %rd3, 16;
	add.u64 	%rd7, %rd1, %rd6;
	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%rd7+0];
$Lt_22_61442:
	.loc	3	255	0
	mov.f32 	%f5, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p4, %f1, %f5;
	@!%p4 bra 	$Lt_22_61954;
	.loc	3	234	0
	neg.ftz.f32 	%f6, %f1;
	lg2.approx.ftz.f32 	%f7, %f6;
	mov.f32 	%f8, 0f400ccccd;     	// 2.2
	mul.ftz.f32 	%f9, %f7, %f8;
	ex2.approx.ftz.f32 	%f10, %f9;
	neg.ftz.f32 	%f11, %f10;
	bra.uni 	$LDWendi___log2f_199_65;
$Lt_22_61954:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f12, %f1;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f11, %f14;
$LDWendi___log2f_199_65:
	.loc	3	256	0
	mov.f32 	%f15, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p5, %f2, %f15;
	@!%p5 bra 	$Lt_22_62466;
	.loc	3	234	0
	neg.ftz.f32 	%f16, %f2;
	lg2.approx.ftz.f32 	%f17, %f16;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f20, %f19;
	neg.ftz.f32 	%f21, %f20;
	bra.uni 	$LDWendi___log2f_199_63;
$Lt_22_62466:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f22, %f2;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f21, %f24;
$LDWendi___log2f_199_63:
	.loc	3	257	0
	mov.f32 	%f25, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p6, %f3, %f25;
	@!%p6 bra 	$Lt_22_62978;
	.loc	3	234	0
	neg.ftz.f32 	%f26, %f3;
	lg2.approx.ftz.f32 	%f27, %f26;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f30, %f29;
	neg.ftz.f32 	%f31, %f30;
	bra.uni 	$LDWendi___log2f_199_61;
$Lt_22_62978:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f32, %f3;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f31, %f34;
$LDWendi___log2f_199_61:
	.loc	20	80	0
	ld.param.f32 	%f35, [__cudaparm_AccumulateKernel_inSrcWeights0];
	mul.ftz.f32 	%f36, %f35, %f4;
	cvt.ftz.sat.f32.f32 	%f37, %f36;
	mul.ftz.f32 	%f38, %f11, %f37;
	.loc	20	81	0
	mul.ftz.f32 	%f39, %f21, %f37;
	.loc	20	82	0
	mul.ftz.f32 	%f40, %f31, %f37;
	.loc	20	83	0
	mov.f32 	%f41, %f37;
	.loc	19	44	0
	bra.uni 	$Lt_22_60930;
$Lt_22_61186:
	mov.f32 	%f41, 0f00000000;    	// 0
	mov.f32 	%f40, 0f00000000;    	// 0
	mov.f32 	%f39, 0f00000000;    	// 0
	mov.f32 	%f38, 0f00000000;    	// 0
$Lt_22_60930:
	.loc	19	97	0
	ld.param.u64 	%rd8, [__cudaparm_AccumulateKernel_inSrc1];
	mov.u64 	%rd9, 0;
	setp.eq.u64 	%p7, %rd8, %rd9;
	@%p7 bra 	$Lt_22_63490;
	ld.param.s32 	%r28, [__cudaparm_AccumulateKernel_inSrcPitch1];
	mul.lo.s32 	%r29, %r28, %r10;
	add.s32 	%r30, %r8, %r29;
	cvt.s64.s32 	%rd10, %r30;
	@!%p2 bra 	$Lt_22_64258;
	.loc	18	115	0
	mul.lo.u64 	%rd11, %rd10, 8;
	add.u64 	%rd12, %rd8, %rd11;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd12+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f42, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f43, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f44, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f45, %b1; }
	bra.uni 	$Lt_22_64002;
$Lt_22_64258:
	mul.lo.u64 	%rd13, %rd10, 16;
	add.u64 	%rd14, %rd8, %rd13;
	ld.global.v4.f32 	{%f42,%f43,%f44,%f45}, [%rd14+0];
$Lt_22_64002:
	.loc	3	255	0
	mov.f32 	%f46, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p8, %f42, %f46;
	@!%p8 bra 	$Lt_22_64514;
	.loc	3	234	0
	neg.ftz.f32 	%f47, %f42;
	lg2.approx.ftz.f32 	%f48, %f47;
	mov.f32 	%f49, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f50, %f48, %f49;
	ex2.approx.ftz.f32 	%f51, %f50;
	neg.ftz.f32 	%f11, %f51;
	bra.uni 	$LDWendi___log2f_199_59;
$Lt_22_64514:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f52, %f42;
	mov.f32 	%f53, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f54, %f52, %f53;
	ex2.approx.ftz.f32 	%f11, %f54;
$LDWendi___log2f_199_59:
	.loc	3	256	0
	mov.f32 	%f55, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p9, %f43, %f55;
	@!%p9 bra 	$Lt_22_65026;
	.loc	3	234	0
	neg.ftz.f32 	%f56, %f43;
	lg2.approx.ftz.f32 	%f57, %f56;
	mov.f32 	%f58, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f59, %f57, %f58;
	ex2.approx.ftz.f32 	%f60, %f59;
	neg.ftz.f32 	%f21, %f60;
	bra.uni 	$LDWendi___log2f_199_57;
$Lt_22_65026:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f61, %f43;
	mov.f32 	%f62, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f63, %f61, %f62;
	ex2.approx.ftz.f32 	%f21, %f63;
$LDWendi___log2f_199_57:
	.loc	3	257	0
	mov.f32 	%f64, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p10, %f44, %f64;
	@!%p10 bra 	$Lt_22_65538;
	.loc	3	234	0
	neg.ftz.f32 	%f65, %f44;
	lg2.approx.ftz.f32 	%f66, %f65;
	mov.f32 	%f67, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f68, %f66, %f67;
	ex2.approx.ftz.f32 	%f69, %f68;
	neg.ftz.f32 	%f31, %f69;
	bra.uni 	$LDWendi___log2f_199_55;
$Lt_22_65538:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f70, %f44;
	mov.f32 	%f71, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f72, %f70, %f71;
	ex2.approx.ftz.f32 	%f31, %f72;
$LDWendi___log2f_199_55:
	.loc	20	80	0
	ld.param.f32 	%f73, [__cudaparm_AccumulateKernel_inSrcWeights1];
	mul.ftz.f32 	%f74, %f73, %f45;
	cvt.ftz.sat.f32.f32 	%f75, %f74;
	fma.rn.ftz.f32 	%f38, %f11, %f75, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f75, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f75, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f75, %f41;
$Lt_22_63490:
	.loc	19	98	0
	ld.param.u64 	%rd15, [__cudaparm_AccumulateKernel_inSrc2];
	mov.u64 	%rd16, 0;
	setp.eq.u64 	%p11, %rd15, %rd16;
	@%p11 bra 	$Lt_22_66050;
	ld.param.s32 	%r31, [__cudaparm_AccumulateKernel_inSrcPitch2];
	mul.lo.s32 	%r32, %r31, %r10;
	add.s32 	%r33, %r8, %r32;
	cvt.s64.s32 	%rd17, %r33;
	@!%p2 bra 	$Lt_22_66818;
	.loc	18	115	0
	mul.lo.u64 	%rd18, %rd17, 8;
	add.u64 	%rd19, %rd15, %rd18;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd19+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f76, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f77, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f78, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f79, %b1; }
	bra.uni 	$Lt_22_66562;
$Lt_22_66818:
	mul.lo.u64 	%rd20, %rd17, 16;
	add.u64 	%rd21, %rd15, %rd20;
	ld.global.v4.f32 	{%f76,%f77,%f78,%f79}, [%rd21+0];
$Lt_22_66562:
	.loc	3	255	0
	mov.f32 	%f80, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p12, %f76, %f80;
	@!%p12 bra 	$Lt_22_67074;
	.loc	3	234	0
	neg.ftz.f32 	%f81, %f76;
	lg2.approx.ftz.f32 	%f82, %f81;
	mov.f32 	%f83, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f84, %f82, %f83;
	ex2.approx.ftz.f32 	%f85, %f84;
	neg.ftz.f32 	%f11, %f85;
	bra.uni 	$LDWendi___log2f_199_53;
$Lt_22_67074:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f86, %f76;
	mov.f32 	%f87, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f88, %f86, %f87;
	ex2.approx.ftz.f32 	%f11, %f88;
$LDWendi___log2f_199_53:
	.loc	3	256	0
	mov.f32 	%f89, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p13, %f77, %f89;
	@!%p13 bra 	$Lt_22_67586;
	.loc	3	234	0
	neg.ftz.f32 	%f90, %f77;
	lg2.approx.ftz.f32 	%f91, %f90;
	mov.f32 	%f92, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f93, %f91, %f92;
	ex2.approx.ftz.f32 	%f94, %f93;
	neg.ftz.f32 	%f21, %f94;
	bra.uni 	$LDWendi___log2f_199_51;
$Lt_22_67586:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f95, %f77;
	mov.f32 	%f96, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f97, %f95, %f96;
	ex2.approx.ftz.f32 	%f21, %f97;
$LDWendi___log2f_199_51:
	.loc	3	257	0
	mov.f32 	%f98, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p14, %f78, %f98;
	@!%p14 bra 	$Lt_22_68098;
	.loc	3	234	0
	neg.ftz.f32 	%f99, %f78;
	lg2.approx.ftz.f32 	%f100, %f99;
	mov.f32 	%f101, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f102, %f100, %f101;
	ex2.approx.ftz.f32 	%f103, %f102;
	neg.ftz.f32 	%f31, %f103;
	bra.uni 	$LDWendi___log2f_199_49;
$Lt_22_68098:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f104, %f78;
	mov.f32 	%f105, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f106, %f104, %f105;
	ex2.approx.ftz.f32 	%f31, %f106;
$LDWendi___log2f_199_49:
	.loc	20	80	0
	ld.param.f32 	%f107, [__cudaparm_AccumulateKernel_inSrcWeights2];
	mul.ftz.f32 	%f108, %f107, %f79;
	cvt.ftz.sat.f32.f32 	%f109, %f108;
	fma.rn.ftz.f32 	%f38, %f11, %f109, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f109, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f109, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f109, %f41;
$Lt_22_66050:
	.loc	19	99	0
	ld.param.u64 	%rd22, [__cudaparm_AccumulateKernel_inSrc3];
	mov.u64 	%rd23, 0;
	setp.eq.u64 	%p15, %rd22, %rd23;
	@%p15 bra 	$Lt_22_68610;
	ld.param.s32 	%r34, [__cudaparm_AccumulateKernel_inSrcPitch3];
	mul.lo.s32 	%r35, %r34, %r10;
	add.s32 	%r36, %r8, %r35;
	cvt.s64.s32 	%rd24, %r36;
	@!%p2 bra 	$Lt_22_69378;
	.loc	18	115	0
	mul.lo.u64 	%rd25, %rd24, 8;
	add.u64 	%rd26, %rd22, %rd25;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd26+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f110, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f111, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f112, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f113, %b1; }
	bra.uni 	$Lt_22_69122;
$Lt_22_69378:
	mul.lo.u64 	%rd27, %rd24, 16;
	add.u64 	%rd28, %rd22, %rd27;
	ld.global.v4.f32 	{%f110,%f111,%f112,%f113}, [%rd28+0];
$Lt_22_69122:
	.loc	3	255	0
	mov.f32 	%f114, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p16, %f110, %f114;
	@!%p16 bra 	$Lt_22_69634;
	.loc	3	234	0
	neg.ftz.f32 	%f115, %f110;
	lg2.approx.ftz.f32 	%f116, %f115;
	mov.f32 	%f117, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f118, %f116, %f117;
	ex2.approx.ftz.f32 	%f119, %f118;
	neg.ftz.f32 	%f11, %f119;
	bra.uni 	$LDWendi___log2f_199_47;
$Lt_22_69634:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f120, %f110;
	mov.f32 	%f121, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f122, %f120, %f121;
	ex2.approx.ftz.f32 	%f11, %f122;
$LDWendi___log2f_199_47:
	.loc	3	256	0
	mov.f32 	%f123, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p17, %f111, %f123;
	@!%p17 bra 	$Lt_22_70146;
	.loc	3	234	0
	neg.ftz.f32 	%f124, %f111;
	lg2.approx.ftz.f32 	%f125, %f124;
	mov.f32 	%f126, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f127, %f125, %f126;
	ex2.approx.ftz.f32 	%f128, %f127;
	neg.ftz.f32 	%f21, %f128;
	bra.uni 	$LDWendi___log2f_199_45;
$Lt_22_70146:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f129, %f111;
	mov.f32 	%f130, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f131, %f129, %f130;
	ex2.approx.ftz.f32 	%f21, %f131;
$LDWendi___log2f_199_45:
	.loc	3	257	0
	mov.f32 	%f132, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p18, %f112, %f132;
	@!%p18 bra 	$Lt_22_70658;
	.loc	3	234	0
	neg.ftz.f32 	%f133, %f112;
	lg2.approx.ftz.f32 	%f134, %f133;
	mov.f32 	%f135, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f136, %f134, %f135;
	ex2.approx.ftz.f32 	%f137, %f136;
	neg.ftz.f32 	%f31, %f137;
	bra.uni 	$LDWendi___log2f_199_43;
$Lt_22_70658:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f138, %f112;
	mov.f32 	%f139, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f140, %f138, %f139;
	ex2.approx.ftz.f32 	%f31, %f140;
$LDWendi___log2f_199_43:
	.loc	20	80	0
	ld.param.f32 	%f141, [__cudaparm_AccumulateKernel_inSrcWeights3];
	mul.ftz.f32 	%f142, %f141, %f113;
	cvt.ftz.sat.f32.f32 	%f143, %f142;
	fma.rn.ftz.f32 	%f38, %f11, %f143, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f143, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f143, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f143, %f41;
$Lt_22_68610:
	.loc	19	100	0
	ld.param.u64 	%rd29, [__cudaparm_AccumulateKernel_inSrc4];
	mov.u64 	%rd30, 0;
	setp.eq.u64 	%p19, %rd29, %rd30;
	@%p19 bra 	$Lt_22_71170;
	ld.param.s32 	%r37, [__cudaparm_AccumulateKernel_inSrcPitch4];
	mul.lo.s32 	%r38, %r37, %r10;
	add.s32 	%r39, %r8, %r38;
	cvt.s64.s32 	%rd31, %r39;
	@!%p2 bra 	$Lt_22_71938;
	.loc	18	115	0
	mul.lo.u64 	%rd32, %rd31, 8;
	add.u64 	%rd33, %rd29, %rd32;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd33+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f144, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f145, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f146, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f147, %b1; }
	bra.uni 	$Lt_22_71682;
$Lt_22_71938:
	mul.lo.u64 	%rd34, %rd31, 16;
	add.u64 	%rd35, %rd29, %rd34;
	ld.global.v4.f32 	{%f144,%f145,%f146,%f147}, [%rd35+0];
$Lt_22_71682:
	.loc	3	255	0
	mov.f32 	%f148, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p20, %f144, %f148;
	@!%p20 bra 	$Lt_22_72194;
	.loc	3	234	0
	neg.ftz.f32 	%f149, %f144;
	lg2.approx.ftz.f32 	%f150, %f149;
	mov.f32 	%f151, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f152, %f150, %f151;
	ex2.approx.ftz.f32 	%f153, %f152;
	neg.ftz.f32 	%f11, %f153;
	bra.uni 	$LDWendi___log2f_199_41;
$Lt_22_72194:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f154, %f144;
	mov.f32 	%f155, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f156, %f154, %f155;
	ex2.approx.ftz.f32 	%f11, %f156;
$LDWendi___log2f_199_41:
	.loc	3	256	0
	mov.f32 	%f157, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p21, %f145, %f157;
	@!%p21 bra 	$Lt_22_72706;
	.loc	3	234	0
	neg.ftz.f32 	%f158, %f145;
	lg2.approx.ftz.f32 	%f159, %f158;
	mov.f32 	%f160, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f161, %f159, %f160;
	ex2.approx.ftz.f32 	%f162, %f161;
	neg.ftz.f32 	%f21, %f162;
	bra.uni 	$LDWendi___log2f_199_39;
$Lt_22_72706:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f163, %f145;
	mov.f32 	%f164, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f165, %f163, %f164;
	ex2.approx.ftz.f32 	%f21, %f165;
$LDWendi___log2f_199_39:
	.loc	3	257	0
	mov.f32 	%f166, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p22, %f146, %f166;
	@!%p22 bra 	$Lt_22_73218;
	.loc	3	234	0
	neg.ftz.f32 	%f167, %f146;
	lg2.approx.ftz.f32 	%f168, %f167;
	mov.f32 	%f169, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f170, %f168, %f169;
	ex2.approx.ftz.f32 	%f171, %f170;
	neg.ftz.f32 	%f31, %f171;
	bra.uni 	$LDWendi___log2f_199_37;
$Lt_22_73218:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f172, %f146;
	mov.f32 	%f173, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f174, %f172, %f173;
	ex2.approx.ftz.f32 	%f31, %f174;
$LDWendi___log2f_199_37:
	.loc	20	80	0
	ld.param.f32 	%f175, [__cudaparm_AccumulateKernel_inSrcWeights4];
	mul.ftz.f32 	%f176, %f175, %f147;
	cvt.ftz.sat.f32.f32 	%f177, %f176;
	fma.rn.ftz.f32 	%f38, %f11, %f177, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f177, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f177, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f177, %f41;
$Lt_22_71170:
	.loc	19	101	0
	ld.param.u64 	%rd36, [__cudaparm_AccumulateKernel_inSrc5];
	mov.u64 	%rd37, 0;
	setp.eq.u64 	%p23, %rd36, %rd37;
	@%p23 bra 	$Lt_22_73730;
	ld.param.s32 	%r40, [__cudaparm_AccumulateKernel_inSrcPitch5];
	mul.lo.s32 	%r41, %r40, %r10;
	add.s32 	%r42, %r8, %r41;
	cvt.s64.s32 	%rd38, %r42;
	@!%p2 bra 	$Lt_22_74498;
	.loc	18	115	0
	mul.lo.u64 	%rd39, %rd38, 8;
	add.u64 	%rd40, %rd36, %rd39;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd40+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f178, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f179, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f180, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f181, %b1; }
	bra.uni 	$Lt_22_74242;
$Lt_22_74498:
	mul.lo.u64 	%rd41, %rd38, 16;
	add.u64 	%rd42, %rd36, %rd41;
	ld.global.v4.f32 	{%f178,%f179,%f180,%f181}, [%rd42+0];
$Lt_22_74242:
	.loc	3	255	0
	mov.f32 	%f182, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p24, %f178, %f182;
	@!%p24 bra 	$Lt_22_74754;
	.loc	3	234	0
	neg.ftz.f32 	%f183, %f178;
	lg2.approx.ftz.f32 	%f184, %f183;
	mov.f32 	%f185, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f186, %f184, %f185;
	ex2.approx.ftz.f32 	%f187, %f186;
	neg.ftz.f32 	%f11, %f187;
	bra.uni 	$LDWendi___log2f_199_35;
$Lt_22_74754:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f188, %f178;
	mov.f32 	%f189, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f190, %f188, %f189;
	ex2.approx.ftz.f32 	%f11, %f190;
$LDWendi___log2f_199_35:
	.loc	3	256	0
	mov.f32 	%f191, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p25, %f179, %f191;
	@!%p25 bra 	$Lt_22_75266;
	.loc	3	234	0
	neg.ftz.f32 	%f192, %f179;
	lg2.approx.ftz.f32 	%f193, %f192;
	mov.f32 	%f194, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f195, %f193, %f194;
	ex2.approx.ftz.f32 	%f196, %f195;
	neg.ftz.f32 	%f21, %f196;
	bra.uni 	$LDWendi___log2f_199_33;
$Lt_22_75266:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f197, %f179;
	mov.f32 	%f198, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f199, %f197, %f198;
	ex2.approx.ftz.f32 	%f21, %f199;
$LDWendi___log2f_199_33:
	.loc	3	257	0
	mov.f32 	%f200, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p26, %f180, %f200;
	@!%p26 bra 	$Lt_22_75778;
	.loc	3	234	0
	neg.ftz.f32 	%f201, %f180;
	lg2.approx.ftz.f32 	%f202, %f201;
	mov.f32 	%f203, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f204, %f202, %f203;
	ex2.approx.ftz.f32 	%f205, %f204;
	neg.ftz.f32 	%f31, %f205;
	bra.uni 	$LDWendi___log2f_199_31;
$Lt_22_75778:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f206, %f180;
	mov.f32 	%f207, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f208, %f206, %f207;
	ex2.approx.ftz.f32 	%f31, %f208;
$LDWendi___log2f_199_31:
	.loc	20	80	0
	ld.param.f32 	%f209, [__cudaparm_AccumulateKernel_inSrcWeights5];
	mul.ftz.f32 	%f210, %f209, %f181;
	cvt.ftz.sat.f32.f32 	%f211, %f210;
	fma.rn.ftz.f32 	%f38, %f11, %f211, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f211, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f211, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f211, %f41;
$Lt_22_73730:
	.loc	19	102	0
	ld.param.u64 	%rd43, [__cudaparm_AccumulateKernel_inSrc6];
	mov.u64 	%rd44, 0;
	setp.eq.u64 	%p27, %rd43, %rd44;
	@%p27 bra 	$Lt_22_76290;
	ld.param.s32 	%r43, [__cudaparm_AccumulateKernel_inSrcPitch6];
	mul.lo.s32 	%r44, %r43, %r10;
	add.s32 	%r45, %r8, %r44;
	cvt.s64.s32 	%rd45, %r45;
	@!%p2 bra 	$Lt_22_77058;
	.loc	18	115	0
	mul.lo.u64 	%rd46, %rd45, 8;
	add.u64 	%rd47, %rd43, %rd46;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd47+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f212, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f213, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f214, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f215, %b1; }
	bra.uni 	$Lt_22_76802;
$Lt_22_77058:
	mul.lo.u64 	%rd48, %rd45, 16;
	add.u64 	%rd49, %rd43, %rd48;
	ld.global.v4.f32 	{%f212,%f213,%f214,%f215}, [%rd49+0];
$Lt_22_76802:
	.loc	3	255	0
	mov.f32 	%f216, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p28, %f212, %f216;
	@!%p28 bra 	$Lt_22_77314;
	.loc	3	234	0
	neg.ftz.f32 	%f217, %f212;
	lg2.approx.ftz.f32 	%f218, %f217;
	mov.f32 	%f219, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f220, %f218, %f219;
	ex2.approx.ftz.f32 	%f221, %f220;
	neg.ftz.f32 	%f11, %f221;
	bra.uni 	$LDWendi___log2f_199_29;
$Lt_22_77314:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f222, %f212;
	mov.f32 	%f223, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f224, %f222, %f223;
	ex2.approx.ftz.f32 	%f11, %f224;
$LDWendi___log2f_199_29:
	.loc	3	256	0
	mov.f32 	%f225, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p29, %f213, %f225;
	@!%p29 bra 	$Lt_22_77826;
	.loc	3	234	0
	neg.ftz.f32 	%f226, %f213;
	lg2.approx.ftz.f32 	%f227, %f226;
	mov.f32 	%f228, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f229, %f227, %f228;
	ex2.approx.ftz.f32 	%f230, %f229;
	neg.ftz.f32 	%f21, %f230;
	bra.uni 	$LDWendi___log2f_199_27;
$Lt_22_77826:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f231, %f213;
	mov.f32 	%f232, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f233, %f231, %f232;
	ex2.approx.ftz.f32 	%f21, %f233;
$LDWendi___log2f_199_27:
	.loc	3	257	0
	mov.f32 	%f234, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p30, %f214, %f234;
	@!%p30 bra 	$Lt_22_78338;
	.loc	3	234	0
	neg.ftz.f32 	%f235, %f214;
	lg2.approx.ftz.f32 	%f236, %f235;
	mov.f32 	%f237, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f238, %f236, %f237;
	ex2.approx.ftz.f32 	%f239, %f238;
	neg.ftz.f32 	%f31, %f239;
	bra.uni 	$LDWendi___log2f_199_25;
$Lt_22_78338:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f240, %f214;
	mov.f32 	%f241, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f242, %f240, %f241;
	ex2.approx.ftz.f32 	%f31, %f242;
$LDWendi___log2f_199_25:
	.loc	20	80	0
	ld.param.f32 	%f243, [__cudaparm_AccumulateKernel_inSrcWeights6];
	mul.ftz.f32 	%f244, %f243, %f215;
	cvt.ftz.sat.f32.f32 	%f245, %f244;
	fma.rn.ftz.f32 	%f38, %f11, %f245, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f245, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f245, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f245, %f41;
$Lt_22_76290:
	.loc	19	103	0
	ld.param.u64 	%rd50, [__cudaparm_AccumulateKernel_inSrc7];
	mov.u64 	%rd51, 0;
	setp.eq.u64 	%p31, %rd50, %rd51;
	@%p31 bra 	$Lt_22_78850;
	ld.param.s32 	%r46, [__cudaparm_AccumulateKernel_inSrcPitch7];
	mul.lo.s32 	%r47, %r46, %r10;
	add.s32 	%r48, %r8, %r47;
	cvt.s64.s32 	%rd52, %r48;
	@!%p2 bra 	$Lt_22_79618;
	.loc	18	115	0
	mul.lo.u64 	%rd53, %rd52, 8;
	add.u64 	%rd54, %rd50, %rd53;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd54+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f246, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f247, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f248, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f249, %b1; }
	bra.uni 	$Lt_22_79362;
$Lt_22_79618:
	mul.lo.u64 	%rd55, %rd52, 16;
	add.u64 	%rd56, %rd50, %rd55;
	ld.global.v4.f32 	{%f246,%f247,%f248,%f249}, [%rd56+0];
$Lt_22_79362:
	.loc	3	255	0
	mov.f32 	%f250, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p32, %f246, %f250;
	@!%p32 bra 	$Lt_22_79874;
	.loc	3	234	0
	neg.ftz.f32 	%f251, %f246;
	lg2.approx.ftz.f32 	%f252, %f251;
	mov.f32 	%f253, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f254, %f252, %f253;
	ex2.approx.ftz.f32 	%f255, %f254;
	neg.ftz.f32 	%f11, %f255;
	bra.uni 	$LDWendi___log2f_199_23;
$Lt_22_79874:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f256, %f246;
	mov.f32 	%f257, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f258, %f256, %f257;
	ex2.approx.ftz.f32 	%f11, %f258;
$LDWendi___log2f_199_23:
	.loc	3	256	0
	mov.f32 	%f259, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p33, %f247, %f259;
	@!%p33 bra 	$Lt_22_80386;
	.loc	3	234	0
	neg.ftz.f32 	%f260, %f247;
	lg2.approx.ftz.f32 	%f261, %f260;
	mov.f32 	%f262, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f263, %f261, %f262;
	ex2.approx.ftz.f32 	%f264, %f263;
	neg.ftz.f32 	%f21, %f264;
	bra.uni 	$LDWendi___log2f_199_21;
$Lt_22_80386:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f265, %f247;
	mov.f32 	%f266, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f267, %f265, %f266;
	ex2.approx.ftz.f32 	%f21, %f267;
$LDWendi___log2f_199_21:
	.loc	3	257	0
	mov.f32 	%f268, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p34, %f248, %f268;
	@!%p34 bra 	$Lt_22_80898;
	.loc	3	234	0
	neg.ftz.f32 	%f269, %f248;
	lg2.approx.ftz.f32 	%f270, %f269;
	mov.f32 	%f271, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f272, %f270, %f271;
	ex2.approx.ftz.f32 	%f273, %f272;
	neg.ftz.f32 	%f31, %f273;
	bra.uni 	$LDWendi___log2f_199_19;
$Lt_22_80898:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f274, %f248;
	mov.f32 	%f275, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f276, %f274, %f275;
	ex2.approx.ftz.f32 	%f31, %f276;
$LDWendi___log2f_199_19:
	.loc	20	80	0
	ld.param.f32 	%f277, [__cudaparm_AccumulateKernel_inSrcWeights7];
	mul.ftz.f32 	%f278, %f277, %f249;
	cvt.ftz.sat.f32.f32 	%f279, %f278;
	fma.rn.ftz.f32 	%f38, %f11, %f279, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f279, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f279, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f279, %f41;
$Lt_22_78850:
	.loc	19	104	0
	ld.param.u64 	%rd57, [__cudaparm_AccumulateKernel_inSrc8];
	mov.u64 	%rd58, 0;
	setp.eq.u64 	%p35, %rd57, %rd58;
	@%p35 bra 	$Lt_22_81410;
	ld.param.s32 	%r49, [__cudaparm_AccumulateKernel_inSrcPitch8];
	mul.lo.s32 	%r50, %r49, %r10;
	add.s32 	%r51, %r8, %r50;
	cvt.s64.s32 	%rd59, %r51;
	@!%p2 bra 	$Lt_22_82178;
	.loc	18	115	0
	mul.lo.u64 	%rd60, %rd59, 8;
	add.u64 	%rd61, %rd57, %rd60;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd61+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f280, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f281, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f282, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f283, %b1; }
	bra.uni 	$Lt_22_81922;
$Lt_22_82178:
	mul.lo.u64 	%rd62, %rd59, 16;
	add.u64 	%rd63, %rd57, %rd62;
	ld.global.v4.f32 	{%f280,%f281,%f282,%f283}, [%rd63+0];
$Lt_22_81922:
	.loc	3	255	0
	mov.f32 	%f284, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p36, %f280, %f284;
	@!%p36 bra 	$Lt_22_82434;
	.loc	3	234	0
	neg.ftz.f32 	%f285, %f280;
	lg2.approx.ftz.f32 	%f286, %f285;
	mov.f32 	%f287, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f288, %f286, %f287;
	ex2.approx.ftz.f32 	%f289, %f288;
	neg.ftz.f32 	%f11, %f289;
	bra.uni 	$LDWendi___log2f_199_17;
$Lt_22_82434:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f290, %f280;
	mov.f32 	%f291, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f292, %f290, %f291;
	ex2.approx.ftz.f32 	%f11, %f292;
$LDWendi___log2f_199_17:
	.loc	3	256	0
	mov.f32 	%f293, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p37, %f281, %f293;
	@!%p37 bra 	$Lt_22_82946;
	.loc	3	234	0
	neg.ftz.f32 	%f294, %f281;
	lg2.approx.ftz.f32 	%f295, %f294;
	mov.f32 	%f296, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f297, %f295, %f296;
	ex2.approx.ftz.f32 	%f298, %f297;
	neg.ftz.f32 	%f21, %f298;
	bra.uni 	$LDWendi___log2f_199_15;
$Lt_22_82946:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f299, %f281;
	mov.f32 	%f300, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f301, %f299, %f300;
	ex2.approx.ftz.f32 	%f21, %f301;
$LDWendi___log2f_199_15:
	.loc	3	257	0
	mov.f32 	%f302, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p38, %f282, %f302;
	@!%p38 bra 	$Lt_22_83458;
	.loc	3	234	0
	neg.ftz.f32 	%f303, %f282;
	lg2.approx.ftz.f32 	%f304, %f303;
	mov.f32 	%f305, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f306, %f304, %f305;
	ex2.approx.ftz.f32 	%f307, %f306;
	neg.ftz.f32 	%f31, %f307;
	bra.uni 	$LDWendi___log2f_199_13;
$Lt_22_83458:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f308, %f282;
	mov.f32 	%f309, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f310, %f308, %f309;
	ex2.approx.ftz.f32 	%f31, %f310;
$LDWendi___log2f_199_13:
	.loc	20	80	0
	ld.param.f32 	%f311, [__cudaparm_AccumulateKernel_inSrcWeights8];
	mul.ftz.f32 	%f312, %f311, %f283;
	cvt.ftz.sat.f32.f32 	%f313, %f312;
	fma.rn.ftz.f32 	%f38, %f11, %f313, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f313, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f313, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f313, %f41;
$Lt_22_81410:
	.loc	19	105	0
	ld.param.u64 	%rd64, [__cudaparm_AccumulateKernel_inSrc9];
	mov.u64 	%rd65, 0;
	setp.eq.u64 	%p39, %rd64, %rd65;
	@%p39 bra 	$Lt_22_83970;
	ld.param.s32 	%r52, [__cudaparm_AccumulateKernel_inSrcPitch9];
	mul.lo.s32 	%r53, %r52, %r10;
	add.s32 	%r54, %r8, %r53;
	cvt.s64.s32 	%rd66, %r54;
	@!%p2 bra 	$Lt_22_84738;
	.loc	18	115	0
	mul.lo.u64 	%rd67, %rd66, 8;
	add.u64 	%rd68, %rd64, %rd67;
	ld.global.v4.u16 	{%r24,%r25,%r26,%r27}, [%rd68+0];
	.loc	19	42	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r24;
	cvt.ftz.f32.f16	%f314, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r25;
	cvt.ftz.f32.f16	%f315, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r26;
	cvt.ftz.f32.f16	%f316, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r27;
	cvt.ftz.f32.f16	%f317, %b1; }
	bra.uni 	$Lt_22_84482;
$Lt_22_84738:
	mul.lo.u64 	%rd69, %rd66, 16;
	add.u64 	%rd70, %rd64, %rd69;
	ld.global.v4.f32 	{%f314,%f315,%f316,%f317}, [%rd70+0];
$Lt_22_84482:
	.loc	3	255	0
	mov.f32 	%f318, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p40, %f314, %f318;
	@!%p40 bra 	$Lt_22_84994;
	.loc	3	234	0
	neg.ftz.f32 	%f319, %f314;
	lg2.approx.ftz.f32 	%f320, %f319;
	mov.f32 	%f321, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f322, %f320, %f321;
	ex2.approx.ftz.f32 	%f323, %f322;
	neg.ftz.f32 	%f11, %f323;
	bra.uni 	$LDWendi___log2f_199_11;
$Lt_22_84994:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f324, %f314;
	mov.f32 	%f325, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f326, %f324, %f325;
	ex2.approx.ftz.f32 	%f11, %f326;
$LDWendi___log2f_199_11:
	.loc	3	256	0
	mov.f32 	%f327, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p41, %f315, %f327;
	@!%p41 bra 	$Lt_22_85506;
	.loc	3	234	0
	neg.ftz.f32 	%f328, %f315;
	lg2.approx.ftz.f32 	%f329, %f328;
	mov.f32 	%f330, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f331, %f329, %f330;
	ex2.approx.ftz.f32 	%f332, %f331;
	neg.ftz.f32 	%f21, %f332;
	bra.uni 	$LDWendi___log2f_199_9;
$Lt_22_85506:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f333, %f315;
	mov.f32 	%f334, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f335, %f333, %f334;
	ex2.approx.ftz.f32 	%f21, %f335;
$LDWendi___log2f_199_9:
	.loc	3	257	0
	mov.f32 	%f336, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p42, %f316, %f336;
	@!%p42 bra 	$Lt_22_86018;
	.loc	3	234	0
	neg.ftz.f32 	%f337, %f316;
	lg2.approx.ftz.f32 	%f338, %f337;
	mov.f32 	%f339, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f340, %f338, %f339;
	ex2.approx.ftz.f32 	%f341, %f340;
	neg.ftz.f32 	%f31, %f341;
	bra.uni 	$LDWendi___log2f_199_7;
$Lt_22_86018:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f342, %f316;
	mov.f32 	%f343, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f344, %f342, %f343;
	ex2.approx.ftz.f32 	%f31, %f344;
$LDWendi___log2f_199_7:
	.loc	20	80	0
	ld.param.f32 	%f345, [__cudaparm_AccumulateKernel_inSrcWeights9];
	mul.ftz.f32 	%f346, %f345, %f317;
	cvt.ftz.sat.f32.f32 	%f347, %f346;
	fma.rn.ftz.f32 	%f38, %f11, %f347, %f38;
	.loc	20	81	0
	fma.rn.ftz.f32 	%f39, %f21, %f347, %f39;
	.loc	20	82	0
	fma.rn.ftz.f32 	%f40, %f31, %f347, %f40;
	.loc	20	83	0
	add.ftz.f32 	%f41, %f347, %f41;
$Lt_22_83970:
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%f348, %f41;
	mov.f32 	%f349, %f348;
	mov.f32 	%f350, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f351, %f348, %f350;
	mov.f32 	%f352, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p43, %f351, %f352;
	@%p43 bra 	$Lt_22_86786;
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f353, %f348;
	mul.ftz.f32 	%f354, %f353, %f40;
	.loc	3	214	0
	mul.ftz.f32 	%f355, %f353, %f39;
	.loc	3	215	0
	mul.ftz.f32 	%f356, %f353, %f38;
	bra.uni 	$Lt_22_86530;
$Lt_22_86786:
	.loc	3	219	0
	mov.f32 	%f354, 0f00000000;   	// 0
	mov.f32 	%f355, 0f00000000;   	// 0
	mov.f32 	%f356, 0f00000000;   	// 0
	mov.f32 	%f349, 0f00000000;   	// 0
$Lt_22_86530:
	.loc	3	266	0
	mov.f32 	%f357, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p44, %f356, %f357;
	@!%p44 bra 	$Lt_22_87042;
	.loc	3	242	0
	neg.ftz.f32 	%f358, %f356;
	lg2.approx.ftz.f32 	%f359, %f358;
	mov.f32 	%f360, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f361, %f359, %f360;
	ex2.approx.ftz.f32 	%f362, %f361;
	neg.ftz.f32 	%f363, %f362;
	bra.uni 	$LDWendi___log2f_199_5;
$Lt_22_87042:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f364, %f356;
	mov.f32 	%f365, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f366, %f364, %f365;
	ex2.approx.ftz.f32 	%f363, %f366;
$LDWendi___log2f_199_5:
	.loc	3	267	0
	mov.f32 	%f367, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p45, %f355, %f367;
	@!%p45 bra 	$Lt_22_87554;
	.loc	3	242	0
	neg.ftz.f32 	%f368, %f355;
	lg2.approx.ftz.f32 	%f369, %f368;
	mov.f32 	%f370, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f371, %f369, %f370;
	ex2.approx.ftz.f32 	%f372, %f371;
	neg.ftz.f32 	%f373, %f372;
	bra.uni 	$LDWendi___log2f_199_3;
$Lt_22_87554:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f374, %f355;
	mov.f32 	%f375, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f376, %f374, %f375;
	ex2.approx.ftz.f32 	%f373, %f376;
$LDWendi___log2f_199_3:
	.loc	3	268	0
	mov.f32 	%f377, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p46, %f354, %f377;
	@!%p46 bra 	$Lt_22_88066;
	.loc	3	242	0
	neg.ftz.f32 	%f378, %f354;
	lg2.approx.ftz.f32 	%f379, %f378;
	mov.f32 	%f380, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f381, %f379, %f380;
	ex2.approx.ftz.f32 	%f382, %f381;
	neg.ftz.f32 	%f383, %f382;
	bra.uni 	$LDWendi___log2f_199_1;
$Lt_22_88066:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f384, %f354;
	mov.f32 	%f385, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f386, %f384, %f385;
	ex2.approx.ftz.f32 	%f383, %f386;
$LDWendi___log2f_199_1:
	.loc	19	107	0
	ld.param.s32 	%r55, [__cudaparm_AccumulateKernel_inDestPitch];
	mul.lo.s32 	%r56, %r55, %r10;
	add.s32 	%r57, %r8, %r56;
	cvt.s64.s32 	%rd71, %r57;
	ld.param.u64 	%rd72, [__cudaparm_AccumulateKernel_inDest];
	@!%p2 bra 	$Lt_22_88834;
	.loc	18	126	0
	mul.lo.u64 	%rd73, %rd71, 8;
	add.u64 	%rd74, %rd72, %rd73;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f363;
	mov.b32		%r58, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f373;
	mov.b32		%r59, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f383;
	mov.b32		%r60, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f349;
	mov.b32		%r61, %b1; }
	st.global.v4.u16 	[%rd74+0], {%r58,%r59,%r60,%r61};
	.loc	19	107	0
	bra.uni 	$Lt_22_88578;
$Lt_22_88834:
	.loc	18	126	0
	mul.lo.u64 	%rd75, %rd71, 16;
	add.u64 	%rd76, %rd72, %rd75;
	st.global.v4.f32 	[%rd76+0], {%f363,%f373,%f383,%f349};
$Lt_22_88578:
$Lt_22_60418:
	.loc	19	115	0
	exit;
$LDWend_AccumulateKernel:
	} // AccumulateKernel

