	.version 2.2
	.target sm_20
	// compiled with ../../../External/3rdParty/NVIDIA/CUDA/win/bin/../open64/lib//be.exe
	// nvopencc 3.2 built on 2010-11-04

	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])

	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)

	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)

	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)

	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__ZmlI8PixelRGBET_RKS1_f[16]) _ZmlI8PixelRGBET_RKS1_f (.param .u64 __cudaparmf1__ZmlI8PixelRGBET_RKS1_f, .param .f32 __cudaparmf2__ZmlI8PixelRGBET_RKS1_f)

	.visible .func (.param .align 16 .b8 __cudaretf__ZmlI8PixelRGBET_fRKS1_[16]) _ZmlI8PixelRGBET_fRKS1_ (.param .f32 __cudaparmf1__ZmlI8PixelRGBET_fRKS1_, .param .u64 __cudaparmf2__ZmlI8PixelRGBET_fRKS1_)

	//-----------------------------------------------------------
	// Compiling C:/Users/dvaeng/AppData/Local/Temp/tmpxft_000039f0_00000000-11_BoxBlur.cpp3.i (C:/Users/dvaeng/AppData/Local/Temp/ccBI#.a15480)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	"C:/Users/dvaeng/AppData/Local/Temp/tmpxft_000039f0_00000000-10_BoxBlur.cudafe2.gpu"
	.file	2	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/PixelFormat.h"
	.file	3	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/PixelRGB.h"
	.file	4	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
	.file	5	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\crt/device_runtime.h"
	.file	6	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\host_defines.h"
	.file	7	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\builtin_types.h"
	.file	8	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_types.h"
	.file	9	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\driver_types.h"
	.file	10	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_types.h"
	.file	11	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_types.h"
	.file	12	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\vector_types.h"
	.file	13	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\builtin_types.h"
	.file	14	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\host_defines.h"
	.file	15	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\device_launch_parameters.h"
	.file	16	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\crt\storage_class.h"
	.file	17	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
	.file	18	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/Utils.h"
	.file	19	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/VectorUtils.h"
	.file	20	"c:/Mulder64/shared/adobe/MediaCore/GPUFoundation/Src/ImageProcessing/BoxBlur.cu"
	.file	21	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\common_functions.h"
	.file	22	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions.h"
	.file	23	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_constants.h"
	.file	24	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_functions.h"
	.file	25	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_11_atomic_functions.h"
	.file	26	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_12_atomic_functions.h"
	.file	27	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_13_double_functions.h"
	.file	28	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_atomic_functions.h"
	.file	29	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_intrinsics.h"
	.file	30	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_functions.h"
	.file	31	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_fetch_functions.h"
	.file	32	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions_dbl_ptx3.h"


	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)
	{
	// IntegerMultiply(int, int)
	//   In : two 32-bit integer parameters.
	//   Out: low 32 bits of their signed product.
	.reg .u32 %r<4>;
	.loc	18	60	0
$LDWbegin__Z15IntegerMultiplyii:
	// Operands are consumed straight from the parameter loads.
	ld.param.u32 	%r1, [__cudaparmf1__Z15IntegerMultiplyii];
	ld.param.u32 	%r2, [__cudaparmf2__Z15IntegerMultiplyii];
	.loc	18	64	0
	mul.lo.s32 	%r3, %r1, %r2;
	st.param.s32 	[__cudaretf__Z15IntegerMultiplyii], %r3;
	ret;
$LDWend__Z15IntegerMultiplyii:
	} // _Z15IntegerMultiplyii

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()
	{
	// Standard2DKernelX()
	//   Out: %ctaid.x * %ntid.x + %tid.x (low 32 bits).
	.reg .u32 %r<5>;
	.loc	18	73	0
$LDWbegin__Z17Standard2DKernelXv:
	.loc	18	74	0
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mov.u32 	%r3, %tid.x;
	// Single multiply-add; same low 32 bits as mul.lo followed by add.
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelXv], %r4;
	ret;
$LDWend__Z17Standard2DKernelXv:
	} // _Z17Standard2DKernelXv

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()
	{
	// Standard2DKernelY()
	//   Out: %ctaid.y * %ntid.y + %tid.y (low 32 bits).
	.reg .u32 %r<5>;
	.loc	18	77	0
$LDWbegin__Z17Standard2DKernelYv:
	.loc	18	78	0
	cvt.s32.u32 	%r1, %ctaid.y;
	cvt.s32.u32 	%r2, %ntid.y;
	mov.u32 	%r3, %tid.y;
	// Single multiply-add; same low 32 bits as mul.lo followed by add.
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelYv], %r4;
	ret;
$LDWend__Z17Standard2DKernelYv:
	} // _Z17Standard2DKernelYv

	// Half4ToFloat4(ushort4)
	//   In : 8-byte parameter read as four 16-bit values at byte offsets 0, 2, 4, 6.
	//   Out: 16-byte return value holding four f32 values at byte offsets 0, 4, 8, 12.
	//   Each 16-bit input is interpreted as an f16 and widened to f32 (cvt.ftz.f32.f16);
	//   the component at input offset 2*i ends up at output offset 4*i.
	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])
	{
	.reg .u32 %r<14>;
	.reg .f32 %f<9>;
	.loc	18	86	0
$LDWbegin__Z13Half4ToFloat47ushort4:
	// Load the four 16-bit components into 32-bit registers.
	ld.param.u16 	%r1, [__cudaparmf1__Z13Half4ToFloat47ushort4+0];
	mov.s32 	%r2, %r1;
	ld.param.u16 	%r3, [__cudaparmf1__Z13Half4ToFloat47ushort4+2];
	mov.s32 	%r4, %r3;
	ld.param.u16 	%r5, [__cudaparmf1__Z13Half4ToFloat47ushort4+4];
	mov.s32 	%r6, %r5;
	ld.param.u16 	%r7, [__cudaparmf1__Z13Half4ToFloat47ushort4+6];
	mov.s32 	%r8, %r7;
	.loc	18	87	0
	// Component 1 (input offset 2) -> %f1
	cvt.u16.u32 	%r9, %r4;
	{ .reg .b32 %b1;
	mov.b32		%b1, %r9;
	cvt.ftz.f32.f16	%f1, %b1; }
	// Component 2 (input offset 4) -> %f2
	cvt.u16.u32 	%r10, %r6;
	{ .reg .b32 %b1;
	mov.b32		%b1, %r10;
	cvt.ftz.f32.f16	%f2, %b1; }
	// Component 3 (input offset 6) -> %f3
	cvt.u16.u32 	%r11, %r8;
	{ .reg .b32 %b1;
	mov.b32		%b1, %r11;
	cvt.ftz.f32.f16	%f3, %b1; }
	// Component 0 (input offset 0) -> %f4
	cvt.u16.u32 	%r12, %r2;
	{ .reg .b32 %b1;
	mov.b32		%b1, %r12;
	cvt.ftz.f32.f16	%f4, %b1; }
	// Write the four floats back in component order 0, 1, 2, 3.
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+0], %f4;
	mov.f32 	%f5, %f1;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+4], %f5;
	mov.f32 	%f6, %f2;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+8], %f6;
	mov.f32 	%f7, %f3;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+12], %f7;
	ret;
$LDWend__Z13Half4ToFloat47ushort4:
	} // _Z13Half4ToFloat47ushort4

	// Float4ToHalf4(float4)
	//   In : 16-byte parameter read as four f32 values at byte offsets 0, 4, 8, 12.
	//   Out: 8-byte return value holding four 16-bit values at byte offsets 0, 2, 4, 6.
	//   Each f32 is narrowed to f16 with round-to-nearest-even and flush-to-zero
	//   (cvt.rn.ftz.f16.f32); the component at input offset 4*i ends up at output offset 2*i.
	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])
	{
	.reg .u32 %r<13>;
	.reg .f32 %f<10>;
	.loc	18	95	0
$LDWbegin__Z13Float4ToHalf46float4:
	// Load the four float components.
	ld.param.f32 	%f1, [__cudaparmf1__Z13Float4ToHalf46float4+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z13Float4ToHalf46float4+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z13Float4ToHalf46float4+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z13Float4ToHalf46float4+12];
	mov.f32 	%f8, %f7;
	.loc	18	96	0
	// Component 1 (input offset 4) -> %r2
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f4;
	mov.b32		%r1, %b1; }
	cvt.u16.u32 	%r2, %r1;
	// Component 2 (input offset 8) -> %r4
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f6;
	mov.b32		%r3, %b1; }
	cvt.u16.u32 	%r4, %r3;
	// Component 3 (input offset 12) -> %r6
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f8;
	mov.b32		%r5, %b1; }
	cvt.u16.u32 	%r6, %r5;
	// Component 0 (input offset 0) -> %r8
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f2;
	mov.b32		%r7, %b1; }
	cvt.u16.u32 	%r8, %r7;
	// Write the four halves back in component order 0, 1, 2, 3.
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+0], %r8;
	mov.s32 	%r9, %r2;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+2], %r9;
	mov.s32 	%r10, %r4;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+4], %r10;
	mov.s32 	%r11, %r6;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+6], %r11;
	ret;
$LDWend__Z13Float4ToHalf46float4:
	} // _Z13Float4ToHalf46float4

	// Mix3(unsigned&, unsigned&, unsigned&)
	//   In : three 64-bit addresses, called a (%rd2), b (%rd4) and c (%rd6) below,
	//        each pointing at a 32-bit word that is updated in place.
	//   Out: the final value stored through c.
	//   Nine rounds, each of the form  x -= y; x -= z; x ^= (z shifted), cycling
	//   x through a, b, c with shifts >>13, <<8, >>13, >>12, <<16, >>5, >>3, <<10, >>15.
	//   NOTE(review): this looks like Bob Jenkins' 96-bit "mix" - confirm against Utils.h.
	//   Every operand is re-loaded after each store and every intermediate is stored
	//   back, which keeps the result well defined even if the three addresses alias;
	//   do not reorder or merge these memory operations.
	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)
	{
	.reg .u32 %r<75>;
	.reg .u64 %rd<8>;
	.loc	18	138	0
$LDWbegin__Z4Mix3RjS_S_:
	// %rd2 = &a, %rd4 = &b, %rd6 = &c
	ld.param.u64 	%rd1, [__cudaparmf1__Z4Mix3RjS_S_];
	mov.s64 	%rd2, %rd1;
	ld.param.u64 	%rd3, [__cudaparmf2__Z4Mix3RjS_S_];
	mov.s64 	%rd4, %rd3;
	ld.param.u64 	%rd5, [__cudaparmf3__Z4Mix3RjS_S_];
	mov.s64 	%rd6, %rd5;
	.loc	18	139	0
	// a -= b; a -= c; a ^= (c >> 13);
	ld.u32 	%r1, [%rd2+0];
	ld.u32 	%r2, [%rd4+0];
	sub.u32 	%r3, %r1, %r2;
	st.u32 	[%rd2+0], %r3;
	ld.u32 	%r4, [%rd6+0];
	sub.u32 	%r5, %r3, %r4;
	st.u32 	[%rd2+0], %r5;
	ld.u32 	%r6, [%rd6+0];
	shr.u32 	%r7, %r6, 13;
	xor.b32 	%r8, %r5, %r7;
	st.u32 	[%rd2+0], %r8;
	.loc	18	140	0
	// b -= c; b -= a; b ^= (a << 8);
	ld.u32 	%r9, [%rd4+0];
	ld.u32 	%r10, [%rd6+0];
	sub.u32 	%r11, %r9, %r10;
	st.u32 	[%rd4+0], %r11;
	ld.u32 	%r12, [%rd2+0];
	sub.u32 	%r13, %r11, %r12;
	st.u32 	[%rd4+0], %r13;
	ld.u32 	%r14, [%rd2+0];
	shl.b32 	%r15, %r14, 8;
	xor.b32 	%r16, %r13, %r15;
	st.u32 	[%rd4+0], %r16;
	.loc	18	141	0
	// c -= a; c -= b; c ^= (b >> 13);
	ld.u32 	%r17, [%rd6+0];
	ld.u32 	%r18, [%rd2+0];
	sub.u32 	%r19, %r17, %r18;
	st.u32 	[%rd6+0], %r19;
	ld.u32 	%r20, [%rd4+0];
	sub.u32 	%r21, %r19, %r20;
	st.u32 	[%rd6+0], %r21;
	ld.u32 	%r22, [%rd4+0];
	shr.u32 	%r23, %r22, 13;
	xor.b32 	%r24, %r21, %r23;
	st.u32 	[%rd6+0], %r24;
	.loc	18	142	0
	// a -= b; a -= c; a ^= (c >> 12);
	ld.u32 	%r25, [%rd2+0];
	ld.u32 	%r26, [%rd4+0];
	sub.u32 	%r27, %r25, %r26;
	st.u32 	[%rd2+0], %r27;
	ld.u32 	%r28, [%rd6+0];
	sub.u32 	%r29, %r27, %r28;
	st.u32 	[%rd2+0], %r29;
	ld.u32 	%r30, [%rd6+0];
	shr.u32 	%r31, %r30, 12;
	xor.b32 	%r32, %r29, %r31;
	st.u32 	[%rd2+0], %r32;
	.loc	18	143	0
	// b -= c; b -= a; b ^= (a << 16);
	ld.u32 	%r33, [%rd4+0];
	ld.u32 	%r34, [%rd6+0];
	sub.u32 	%r35, %r33, %r34;
	st.u32 	[%rd4+0], %r35;
	ld.u32 	%r36, [%rd2+0];
	sub.u32 	%r37, %r35, %r36;
	st.u32 	[%rd4+0], %r37;
	ld.u32 	%r38, [%rd2+0];
	shl.b32 	%r39, %r38, 16;
	xor.b32 	%r40, %r37, %r39;
	st.u32 	[%rd4+0], %r40;
	.loc	18	144	0
	// c -= a; c -= b; c ^= (b >> 5);
	ld.u32 	%r41, [%rd6+0];
	ld.u32 	%r42, [%rd2+0];
	sub.u32 	%r43, %r41, %r42;
	st.u32 	[%rd6+0], %r43;
	ld.u32 	%r44, [%rd4+0];
	sub.u32 	%r45, %r43, %r44;
	st.u32 	[%rd6+0], %r45;
	ld.u32 	%r46, [%rd4+0];
	shr.u32 	%r47, %r46, 5;
	xor.b32 	%r48, %r45, %r47;
	st.u32 	[%rd6+0], %r48;
	.loc	18	145	0
	// a -= b; a -= c; a ^= (c >> 3);
	ld.u32 	%r49, [%rd2+0];
	ld.u32 	%r50, [%rd4+0];
	sub.u32 	%r51, %r49, %r50;
	st.u32 	[%rd2+0], %r51;
	ld.u32 	%r52, [%rd6+0];
	sub.u32 	%r53, %r51, %r52;
	st.u32 	[%rd2+0], %r53;
	ld.u32 	%r54, [%rd6+0];
	shr.u32 	%r55, %r54, 3;
	xor.b32 	%r56, %r53, %r55;
	st.u32 	[%rd2+0], %r56;
	.loc	18	146	0
	// b -= c; b -= a; b ^= (a << 10);
	ld.u32 	%r57, [%rd4+0];
	ld.u32 	%r58, [%rd6+0];
	sub.u32 	%r59, %r57, %r58;
	st.u32 	[%rd4+0], %r59;
	ld.u32 	%r60, [%rd2+0];
	sub.u32 	%r61, %r59, %r60;
	st.u32 	[%rd4+0], %r61;
	ld.u32 	%r62, [%rd2+0];
	shl.b32 	%r63, %r62, 10;
	xor.b32 	%r64, %r61, %r63;
	st.u32 	[%rd4+0], %r64;
	.loc	18	147	0
	// c -= a; c -= b; c ^= (b >> 15);
	ld.u32 	%r65, [%rd6+0];
	ld.u32 	%r66, [%rd2+0];
	sub.u32 	%r67, %r65, %r66;
	st.u32 	[%rd6+0], %r67;
	ld.u32 	%r68, [%rd4+0];
	sub.u32 	%r69, %r67, %r68;
	st.u32 	[%rd6+0], %r69;
	ld.u32 	%r70, [%rd4+0];
	shr.u32 	%r71, %r70, 15;
	xor.b32 	%r72, %r69, %r71;
	st.u32 	[%rd6+0], %r72;
	.loc	18	148	0
	// Return the value just written through c.
	mov.s32 	%r73, %r72;
	st.param.u32 	[__cudaretf__Z4Mix3RjS_S_], %r73;
	ret;
$LDWend__Z4Mix3RjS_S_:
	} // _Z4Mix3RjS_S_

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)
	{
	// Rand(unsigned seed)
	//   s1 = seed * 1103515245 + 12345
	//   s2 = seed * -1029531031 - 740551042
	//        (equals s1 * 1103515245 + 12345 modulo 2^32, i.e. one further step
	//         of the same recurrence, pre-folded into a single multiply/subtract)
	//   Out: (((s1 >> 16) & 255) << 7) ^ ((s2 >> 16) & 255)
	.reg .u32 %r<12>;
	.loc	18	152	0
$LDWbegin__Z4Randj:
	ld.param.u32 	%r1, [__cudaparmf1__Z4Randj];
	.loc	18	163	0
	// Byte taken from s2.
	mul.lo.u32 	%r2, %r1, -1029531031;
	sub.u32 	%r3, %r2, 740551042;
	shr.u32 	%r4, %r3, 16;
	and.b32 	%r5, %r4, 255;
	// Byte taken from s1, moved up by seven bits.
	mul.lo.u32 	%r6, %r1, 1103515245;
	add.u32 	%r7, %r6, 12345;
	shr.u32 	%r8, %r7, 16;
	and.b32 	%r9, %r8, 255;
	shl.b32 	%r10, %r9, 7;
	// Combine the two bytes.
	xor.b32 	%r11, %r10, %r5;
	st.param.s32 	[__cudaretf__Z4Randj], %r11;
	ret;
$LDWend__Z4Randj:
	} // _Z4Randj

	// Rand2D(unsigned, unsigned, unsigned)
	//   In : three 32-bit values, called a (%r2), b (%r4) and c (%r6) below.
	//   Step 1 (.loc lines 139-147): nine register-only mixing rounds of the form
	//          x -= y; x -= z; x ^= (z shifted), cycling x through a, b, c.
	//   Step 2 (.loc line 170): the final c (%r42) is used as seed s:
	//          s1 = s * 1103515245 + 12345
	//          s2 = s * -1029531031 - 740551042
	//   Out: (((s1 >> 16) & 255) << 7) ^ ((s2 >> 16) & 255)
	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)
	{
	.reg .u32 %r<54>;
	.loc	18	169	0
$LDWbegin__Z6Rand2Djjj:
	ld.param.u32 	%r1, [__cudaparmf1__Z6Rand2Djjj];
	mov.s32 	%r2, %r1;
	ld.param.u32 	%r3, [__cudaparmf2__Z6Rand2Djjj];
	mov.s32 	%r4, %r3;
	ld.param.u32 	%r5, [__cudaparmf3__Z6Rand2Djjj];
	mov.s32 	%r6, %r5;
	.loc	18	139	0
	// a = (a - b - c) ^ (c >> 13)            -> %r10
	sub.u32 	%r7, %r2, %r4;
	sub.u32 	%r8, %r7, %r6;
	shr.u32 	%r9, %r6, 13;
	xor.b32 	%r10, %r8, %r9;
	.loc	18	140	0
	// b = (b - c - a) ^ (a << 8)             -> %r14
	sub.u32 	%r11, %r4, %r6;
	sub.u32 	%r12, %r11, %r10;
	shl.b32 	%r13, %r10, 8;
	xor.b32 	%r14, %r12, %r13;
	.loc	18	141	0
	// c = (c - a - b) ^ (b >> 13)            -> %r18
	sub.u32 	%r15, %r6, %r10;
	sub.u32 	%r16, %r15, %r14;
	shr.u32 	%r17, %r14, 13;
	xor.b32 	%r18, %r16, %r17;
	.loc	18	142	0
	// a = (a - b - c) ^ (c >> 12)            -> %r22
	sub.u32 	%r19, %r10, %r14;
	sub.u32 	%r20, %r19, %r18;
	shr.u32 	%r21, %r18, 12;
	xor.b32 	%r22, %r20, %r21;
	.loc	18	143	0
	// b = (b - c - a) ^ (a << 16)            -> %r26
	sub.u32 	%r23, %r14, %r18;
	sub.u32 	%r24, %r23, %r22;
	shl.b32 	%r25, %r22, 16;
	xor.b32 	%r26, %r24, %r25;
	.loc	18	144	0
	// c = (c - a - b) ^ (b >> 5)             -> %r30
	sub.u32 	%r27, %r18, %r22;
	sub.u32 	%r28, %r27, %r26;
	shr.u32 	%r29, %r26, 5;
	xor.b32 	%r30, %r28, %r29;
	.loc	18	145	0
	// a = (a - b - c) ^ (c >> 3)             -> %r34
	sub.u32 	%r31, %r22, %r26;
	sub.u32 	%r32, %r31, %r30;
	shr.u32 	%r33, %r30, 3;
	xor.b32 	%r34, %r32, %r33;
	.loc	18	146	0
	// b = (b - c - a) ^ (a << 10)            -> %r38
	sub.u32 	%r35, %r26, %r30;
	sub.u32 	%r36, %r35, %r34;
	shl.b32 	%r37, %r34, 10;
	xor.b32 	%r38, %r36, %r37;
	.loc	18	147	0
	// c = (c - a - b) ^ (b >> 15)            -> %r42 (seed for the step below)
	sub.u32 	%r39, %r30, %r34;
	sub.u32 	%r40, %r39, %r38;
	shr.u32 	%r41, %r38, 15;
	xor.b32 	%r42, %r40, %r41;
	.loc	18	170	0
	// High part: bits 16..23 of s1, shifted left by 7.
	mul.lo.u32 	%r43, %r42, 1103515245;
	add.u32 	%r44, %r43, 12345;
	shr.u32 	%r45, %r44, 16;
	and.b32 	%r46, %r45, 255;
	shl.b32 	%r47, %r46, 7;
	// Low part: bits 16..23 of s2.
	mul.lo.u32 	%r48, %r42, -1029531031;
	sub.u32 	%r49, %r48, 740551042;
	shr.u32 	%r50, %r49, 16;
	and.b32 	%r51, %r50, 255;
	xor.b32 	%r52, %r47, %r51;
	st.param.s32 	[__cudaretf__Z6Rand2Djjj], %r52;
	ret;
$LDWend__Z6Rand2Djjj:
	} // _Z6Rand2Djjj

	// Rand2D(unsigned seed)
	//   In : one 32-bit seed (%r2).
	//   Derives x = %ctaid.x * %ntid.x + %tid.x (%r12) and
	//           y = %ctaid.y * %ntid.y + %tid.y (%r10),
	//   then runs the same nine mixing rounds as the three-argument overload with
	//   a = seed, b = x, c = y, and finally the same two-multiplier byte extraction
	//   on the mixed c (%r48).
	//   The first five rounds are interleaved by the instruction scheduler, so the
	//   statement order below does not follow the source-line order.
	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)
	{
	.reg .u32 %r<60>;
	.loc	18	175	0
$LDWbegin__Z6Rand2Dj:
	ld.param.u32 	%r1, [__cudaparmf1__Z6Rand2Dj];
	mov.s32 	%r2, %r1;
	.loc	18	143	0
	// y = ctaid.y * ntid.y + tid.y  (%r10);  x = ctaid.x * ntid.x + tid.x  (%r12)
	cvt.s32.u32 	%r3, %ctaid.y;
	cvt.s32.u32 	%r4, %ntid.y;
	mul.lo.s32 	%r5, %r3, %r4;
	cvt.s32.u32 	%r6, %ctaid.x;
	cvt.s32.u32 	%r7, %ntid.x;
	mul.lo.s32 	%r8, %r6, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r5, %r9;
	mov.u32 	%r11, %tid.x;
	add.u32 	%r12, %r8, %r11;
	// Round 1: a = (seed - x - y) ^ (y >> 13)   -> %r17
	shr.u32 	%r13, %r10, 13;
	sub.u32 	%r14, %r2, %r12;
	sub.u32 	%r15, %r12, %r10;
	sub.u32 	%r16, %r14, %r10;
	xor.b32 	%r17, %r13, %r16;
	// Round 2: b = (x - y - a) ^ (a << 8)       -> %r21
	shl.b32 	%r18, %r17, 8;
	sub.u32 	%r19, %r15, %r17;
	sub.u32 	%r20, %r10, %r17;
	xor.b32 	%r21, %r18, %r19;
	// Round 3: c = (y - a - b) ^ (b >> 13)      -> %r25
	shr.u32 	%r22, %r21, 13;
	sub.u32 	%r23, %r20, %r21;
	sub.u32 	%r24, %r17, %r21;
	xor.b32 	%r25, %r22, %r23;
	// Round 4: a = (a - b - c) ^ (c >> 12)      -> %r28
	shr.u32 	%r26, %r25, 12;
	sub.u32 	%r27, %r24, %r25;
	xor.b32 	%r28, %r26, %r27;
	// Round 5: b = (b - c - a) ^ (a << 16)      -> %r32
	sub.u32 	%r29, %r21, %r25;
	sub.u32 	%r30, %r29, %r28;
	shl.b32 	%r31, %r28, 16;
	xor.b32 	%r32, %r30, %r31;
	.loc	18	144	0
	// Round 6: c = (c - a - b) ^ (b >> 5)       -> %r36
	sub.u32 	%r33, %r25, %r28;
	sub.u32 	%r34, %r33, %r32;
	shr.u32 	%r35, %r32, 5;
	xor.b32 	%r36, %r34, %r35;
	.loc	18	145	0
	// Round 7: a = (a - b - c) ^ (c >> 3)       -> %r40
	sub.u32 	%r37, %r28, %r32;
	sub.u32 	%r38, %r37, %r36;
	shr.u32 	%r39, %r36, 3;
	xor.b32 	%r40, %r38, %r39;
	.loc	18	146	0
	// Round 8: b = (b - c - a) ^ (a << 10)      -> %r44
	sub.u32 	%r41, %r32, %r36;
	sub.u32 	%r42, %r41, %r40;
	shl.b32 	%r43, %r40, 10;
	xor.b32 	%r44, %r42, %r43;
	.loc	18	147	0
	// Round 9: c = (c - a - b) ^ (b >> 15)      -> %r48 (seed for the step below)
	sub.u32 	%r45, %r36, %r40;
	sub.u32 	%r46, %r45, %r44;
	shr.u32 	%r47, %r44, 15;
	xor.b32 	%r48, %r46, %r47;
	.loc	18	176	0
	// result = (((s1 >> 16) & 255) << 7) ^ ((s2 >> 16) & 255)
	//   s1 = c * 1103515245 + 12345,  s2 = c * -1029531031 - 740551042
	mul.lo.u32 	%r49, %r48, 1103515245;
	add.u32 	%r50, %r49, 12345;
	shr.u32 	%r51, %r50, 16;
	and.b32 	%r52, %r51, 255;
	shl.b32 	%r53, %r52, 7;
	mul.lo.u32 	%r54, %r48, -1029531031;
	sub.u32 	%r55, %r54, 740551042;
	shr.u32 	%r56, %r55, 16;
	and.b32 	%r57, %r56, 255;
	xor.b32 	%r58, %r53, %r57;
	st.param.s32 	[__cudaretf__Z6Rand2Dj], %r58;
	ret;
$LDWend__Z6Rand2Dj:
	} // _Z6Rand2Dj

	.visible .func (.param .align 8 .b8 __cudaretf__Z6Read2DI7ushort4ET_PKS1_iii[8]) _Z6Read2DI7ushort4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii)
	{
	// Read2D<ushort4>(base, p2, p3, p4)
	//   element index = p2 * p4 + p3   (32-bit signed arithmetic)
	//   address       = base + sign-extended index * 8
	//   Out: the four 16-bit values read from that address with one vector load.
	//   NOTE(review): p2/p3/p4 presumably are (row, column, pitch in elements) - confirm against Utils.h.
	.reg .u32 %r<9>;
	.reg .u64 %rd<4>;
	.loc	18	114	0
$LDWbegin__Z6Read2DI7ushort4ET_PKS1_iii:
	ld.param.u64 	%rd1, [__cudaparmf1__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.u32 	%r1, [__cudaparmf2__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.u32 	%r2, [__cudaparmf3__Z6Read2DI7ushort4ET_PKS1_iii];
	ld.param.u32 	%r3, [__cudaparmf4__Z6Read2DI7ushort4ET_PKS1_iii];
	.loc	18	115	0
	// index = p2 * p4 + p3
	mad.lo.s32 	%r4, %r1, %r3, %r2;
	// byte offset = index * sizeof(ushort4), widened to 64 bits
	mul.wide.s32 	%rd2, %r4, 8;
	add.u64 	%rd3, %rd1, %rd2;
	ld.v4.u16 	{%r5,%r6,%r7,%r8}, [%rd3+0];
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+0], %r5;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+2], %r6;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+4], %r7;
	st.param.u16 	[__cudaretf__Z6Read2DI7ushort4ET_PKS1_iii+6], %r8;
	ret;
$LDWend__Z6Read2DI7ushort4ET_PKS1_iii:
	} // _Z6Read2DI7ushort4ET_PKS1_iii

	.visible .func (.param .align 16 .b8 __cudaretf__Z6Read2DI6float4ET_PKS1_iii[16]) _Z6Read2DI6float4ET_PKS1_iii (.param .u64 __cudaparmf1__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf2__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf3__Z6Read2DI6float4ET_PKS1_iii, .param .s32 __cudaparmf4__Z6Read2DI6float4ET_PKS1_iii)
	{
	// Read2D<float4>(base, p2, p3, p4)
	//   element index = p2 * p4 + p3   (32-bit signed arithmetic)
	//   address       = base + sign-extended index * 16
	//   Out: the four f32 values read from that address with one vector load.
	//   NOTE(review): p2/p3/p4 presumably are (row, column, pitch in elements) - confirm against Utils.h.
	.reg .u32 %r<5>;
	.reg .u64 %rd<4>;
	.reg .f32 %f<5>;
	.loc	18	114	0
$LDWbegin__Z6Read2DI6float4ET_PKS1_iii:
	ld.param.u64 	%rd1, [__cudaparmf1__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.u32 	%r1, [__cudaparmf2__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.u32 	%r2, [__cudaparmf3__Z6Read2DI6float4ET_PKS1_iii];
	ld.param.u32 	%r3, [__cudaparmf4__Z6Read2DI6float4ET_PKS1_iii];
	.loc	18	115	0
	// index = p2 * p4 + p3
	mad.lo.s32 	%r4, %r1, %r3, %r2;
	// byte offset = index * sizeof(float4), widened to 64 bits
	mul.wide.s32 	%rd2, %r4, 16;
	add.u64 	%rd3, %rd1, %rd2;
	ld.v4.f32 	{%f1,%f2,%f3,%f4}, [%rd3+0];
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+0], %f1;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+4], %f2;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+8], %f3;
	st.param.f32 	[__cudaretf__Z6Read2DI6float4ET_PKS1_iii+12], %f4;
	ret;
$LDWend__Z6Read2DI6float4ET_PKS1_iii:
	} // _Z6Read2DI6float4ET_PKS1_iii

	.visible .func _Z7Write2DI7ushort4EvT_PS1_iii (.param .align 8 .b8 __cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii[8], .param .u64 __cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii)
	{
	// Write2D<ushort4>(value, base, p3, p4, p5)
	//   element index = p3 * p5 + p4   (32-bit signed arithmetic)
	//   address       = base + sign-extended index * 8
	//   Stores the four 16-bit components of value at that address with one vector store.
	//   NOTE(review): p3/p4/p5 presumably are (row, column, pitch in elements) - confirm against Utils.h.
	.reg .u32 %r<9>;
	.reg .u64 %rd<4>;
	.loc	18	125	0
$LDWbegin__Z7Write2DI7ushort4EvT_PS1_iii:
	// Destination and indexing parameters.
	ld.param.u64 	%rd1, [__cudaparmf2__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.u32 	%r1, [__cudaparmf3__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.u32 	%r2, [__cudaparmf4__Z7Write2DI7ushort4EvT_PS1_iii];
	ld.param.u32 	%r3, [__cudaparmf5__Z7Write2DI7ushort4EvT_PS1_iii];
	// The four components of the value to write.
	ld.param.u16 	%r5, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+0];
	ld.param.u16 	%r6, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+2];
	ld.param.u16 	%r7, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+4];
	ld.param.u16 	%r8, [__cudaparmf1__Z7Write2DI7ushort4EvT_PS1_iii+6];
	.loc	18	126	0
	// index = p3 * p5 + p4
	mad.lo.s32 	%r4, %r1, %r3, %r2;
	// byte offset = index * sizeof(ushort4), widened to 64 bits
	mul.wide.s32 	%rd2, %r4, 8;
	add.u64 	%rd3, %rd1, %rd2;
	st.v4.u16 	[%rd3+0], {%r5,%r6,%r7,%r8};
	.loc	18	127	0
	ret;
$LDWend__Z7Write2DI7ushort4EvT_PS1_iii:
	} // _Z7Write2DI7ushort4EvT_PS1_iii

	.visible .func _Z7Write2DI6float4EvT_PS1_iii (.param .align 16 .b8 __cudaparmf1__Z7Write2DI6float4EvT_PS1_iii[16], .param .u64 __cudaparmf2__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf3__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf4__Z7Write2DI6float4EvT_PS1_iii, .param .s32 __cudaparmf5__Z7Write2DI6float4EvT_PS1_iii)
	{
	// Write2D<float4>(value, base, p3, p4, p5)
	//   element index = p3 * p5 + p4   (32-bit signed arithmetic)
	//   address       = base + sign-extended index * 16
	//   Stores the four f32 components of value at that address with one vector store.
	//   NOTE(review): p3/p4/p5 presumably are (row, column, pitch in elements) - confirm against Utils.h.
	.reg .u32 %r<5>;
	.reg .u64 %rd<4>;
	.reg .f32 %f<5>;
	.loc	18	125	0
$LDWbegin__Z7Write2DI6float4EvT_PS1_iii:
	// Destination and indexing parameters.
	ld.param.u64 	%rd1, [__cudaparmf2__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.u32 	%r1, [__cudaparmf3__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.u32 	%r2, [__cudaparmf4__Z7Write2DI6float4EvT_PS1_iii];
	ld.param.u32 	%r3, [__cudaparmf5__Z7Write2DI6float4EvT_PS1_iii];
	// The four components of the value to write.
	ld.param.f32 	%f1, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+0];
	ld.param.f32 	%f2, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+4];
	ld.param.f32 	%f3, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+8];
	ld.param.f32 	%f4, [__cudaparmf1__Z7Write2DI6float4EvT_PS1_iii+12];
	.loc	18	126	0
	// index = p3 * p5 + p4
	mad.lo.s32 	%r4, %r1, %r3, %r2;
	// byte offset = index * sizeof(float4), widened to 64 bits
	mul.wide.s32 	%rd2, %r4, 16;
	add.u64 	%rd3, %rd1, %rd2;
	st.v4.f32 	[%rd3+0], {%f1,%f2,%f3,%f4};
	.loc	18	127	0
	ret;
$LDWend__Z7Write2DI6float4EvT_PS1_iii:
	} // _Z7Write2DI6float4EvT_PS1_iii

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])
	{
	// UnpremultiplyPixel(PixelRGB)
	//   In : four f32 values at byte offsets 0, 4, 8, 12; the one at offset 12 acts as alpha.
	//   alpha = saturate(in[12])                       (clamped to [0, 1])
	//   if (alpha - 8e-6 <= 0)  out = {0, 0, 0, 0}
	//   else                    out = {in[0]/alpha, in[4]/alpha, in[8]/alpha, alpha}
	//   The division is an approximate reciprocal followed by a multiply.
	.reg .f32 %f<14>;
	.reg .pred %p<2>;
	.loc	3	206	0
$LDWbegin__Z18UnpremultiplyPixel8PixelRGB:
	ld.param.f32 	%f1, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+0];
	ld.param.f32 	%f2, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+4];
	ld.param.f32 	%f3, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+8];
	ld.param.f32 	%f4, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%f5, %f4;
	mov.f32 	%f6, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%f7, %f5, %f6;
	mov.f32 	%f8, 0f00000000;    	// 0
	setp.le.ftz.f32 	%p1, %f7, %f8;
	.loc	3	219	0
	// Start from the all-zero result; it is kept when alpha is (almost) zero.
	mov.f32 	%f9, 0f00000000;    	// 0
	mov.f32 	%f10, 0f00000000;    	// 0
	mov.f32 	%f11, 0f00000000;    	// 0
	mov.f32 	%f12, 0f00000000;    	// 0
	@%p1 bra 	$Lt_13_store;
	// Usable alpha: scale the three colour values by 1/alpha and keep alpha.
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f13, %f5;
	mul.ftz.f32 	%f11, %f13, %f3;
	.loc	3	214	0
	mul.ftz.f32 	%f10, %f13, %f2;
	.loc	3	215	0
	mul.ftz.f32 	%f9, %f13, %f1;
	mov.f32 	%f12, %f5;
$Lt_13_store:
	.loc	3	224	0
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+0], %f9;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+4], %f10;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+8], %f11;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+12], %f12;
	ret;
$LDWend__Z18UnpremultiplyPixel8PixelRGB:
	} // _Z18UnpremultiplyPixel8PixelRGB

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)
	{
	// ToLinearColor(float x)
	//   x <  0 : -exp2(2.2 * log2(-x))
	//   else   :  exp2(2.2 * log2(x))
	//   i.e. a sign-preserving power of 2.2 built from the approximate lg2/ex2 units.
	.reg .f32 %f<11>;
	.reg .pred %p<2>;
	.loc	3	231	0
$LDWbegin__Z13ToLinearColorf:
	ld.param.f32 	%f1, [__cudaparmf1__Z13ToLinearColorf];
	mov.f32 	%f2, 0f400ccccd;     	// 2.2 (exponent, shared by both paths)
	mov.f32 	%f3, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p1, %f1, %f3;
	@%p1 bra 	$Lt_14_negative;
	.loc	3	236	0
	// Non-negative input (also taken when the comparison is unordered).
	lg2.approx.ftz.f32 	%f5, %f1;
	mul.ftz.f32 	%f6, %f5, %f2;
	ex2.approx.ftz.f32 	%f4, %f6;
	bra.uni 	$LBB4__Z13ToLinearColorf;
$Lt_14_negative:
	.loc	3	234	0
	// Negative input: work on the magnitude, then restore the sign.
	neg.ftz.f32 	%f7, %f1;
	lg2.approx.ftz.f32 	%f8, %f7;
	mul.ftz.f32 	%f9, %f8, %f2;
	ex2.approx.ftz.f32 	%f10, %f9;
	neg.ftz.f32 	%f4, %f10;
$LBB4__Z13ToLinearColorf:
	st.param.f32 	[__cudaretf__Z13ToLinearColorf], %f4;
	ret;
$LDWend__Z13ToLinearColorf:
	} // _Z13ToLinearColorf

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)
	{
	// FromLinearColor(float x)
	//   x <  0 : -exp2(0.454545 * log2(-x))
	//   else   :  exp2(0.454545 * log2(x))
	//   i.e. a sign-preserving power of 0.454545 built from the approximate lg2/ex2 units.
	.reg .f32 %f<11>;
	.reg .pred %p<2>;
	.loc	3	239	0
$LDWbegin__Z15FromLinearColorf:
	ld.param.f32 	%f1, [__cudaparmf1__Z15FromLinearColorf];
	mov.f32 	%f2, 0f3ee8ba2e;     	// 0.454545 (exponent, shared by both paths)
	mov.f32 	%f3, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p1, %f1, %f3;
	@%p1 bra 	$Lt_15_negative;
	.loc	3	244	0
	// Non-negative input (also taken when the comparison is unordered).
	lg2.approx.ftz.f32 	%f5, %f1;
	mul.ftz.f32 	%f6, %f5, %f2;
	ex2.approx.ftz.f32 	%f4, %f6;
	bra.uni 	$LBB4__Z15FromLinearColorf;
$Lt_15_negative:
	.loc	3	242	0
	// Negative input: work on the magnitude, then restore the sign.
	neg.ftz.f32 	%f7, %f1;
	lg2.approx.ftz.f32 	%f8, %f7;
	mul.ftz.f32 	%f9, %f8, %f2;
	ex2.approx.ftz.f32 	%f10, %f9;
	neg.ftz.f32 	%f4, %f10;
$LBB4__Z15FromLinearColorf:
	st.param.f32 	[__cudaretf__Z15FromLinearColorf], %f4;
	ret;
$LDWend__Z15FromLinearColorf:
	} // _Z15FromLinearColorf

	// PremultiplyLinearizePixel(PixelRGB)
	//   In : four f32 values at byte offsets 0, 4, 8, 12; the one at offset 12 acts as alpha.
	//   alpha = saturate(in[12])
	//   For each of in[0], in[4], in[8]:
	//       lin = (v < 0) ? -exp2(2.2 * log2(-v)) : exp2(2.2 * log2(v))
	//   Out: {lin0 * alpha, lin1 * alpha, lin2 * alpha, alpha}
	//   The three per-channel sections are the same code inlined three times
	//   (the .loc entries 234/236 repeat for each).
	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])
	{
	.reg .f32 %f<47>;
	.reg .pred %p<5>;
	.loc	3	252	0
$LDWbegin__Z25PremultiplyLinearizePixel8PixelRGB:
	ld.param.f32 	%f1, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+12];
	mov.f32 	%f8, %f7;
	.loc	3	254	0
	// alpha clamped to [0, 1]
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	.loc	3	255	0
	// ---- channel at offset 0 -> %f16 ----
	mov.f32 	%f10, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p1, %f2, %f10;
	@!%p1 bra 	$Lt_16_4098;
	.loc	3	234	0
	// negative input: -(|v| ^ 2.2)
	neg.ftz.f32 	%f11, %f2;
	lg2.approx.ftz.f32 	%f12, %f11;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f15, %f14;
	neg.ftz.f32 	%f16, %f15;
	bra.uni 	$LDWendi___log2f_193_5;
$Lt_16_4098:
	.loc	3	236	0
	// non-negative input: v ^ 2.2
	lg2.approx.ftz.f32 	%f17, %f2;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f16, %f19;
$LDWendi___log2f_193_5:
	.loc	3	256	0
	// ---- channel at offset 4 -> %f26 ----
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f4, %f20;
	@!%p2 bra 	$Lt_16_4610;
	.loc	3	234	0
	neg.ftz.f32 	%f21, %f4;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_193_3;
$Lt_16_4610:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f27, %f4;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_193_3:
	.loc	3	257	0
	// ---- channel at offset 8 -> %f36 ----
	mov.f32 	%f30, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f6, %f30;
	@!%p3 bra 	$Lt_16_5122;
	.loc	3	234	0
	neg.ftz.f32 	%f31, %f6;
	lg2.approx.ftz.f32 	%f32, %f31;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f35, %f34;
	neg.ftz.f32 	%f36, %f35;
	bra.uni 	$LDWendi___log2f_193_1;
$Lt_16_5122:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f37, %f6;
	mov.f32 	%f38, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f36, %f39;
$LDWendi___log2f_193_1:
	.loc	3	259	0
	// Premultiply the three linearized channels by alpha and return alpha unchanged.
	mul.ftz.f32 	%f40, %f36, %f9;
	mul.ftz.f32 	%f41, %f26, %f9;
	mul.ftz.f32 	%f42, %f16, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+0], %f42;
	mov.f32 	%f43, %f41;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+4], %f43;
	mov.f32 	%f44, %f40;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+8], %f44;
	mov.f32 	%f45, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+12], %f45;
	ret;
$LDWend__Z25PremultiplyLinearizePixel8PixelRGB:
	} // _Z25PremultiplyLinearizePixel8PixelRGB

	// UnpremultiplyUnlinearizePixel(PixelRGB): takes a 16-byte pixel of four f32
	// components (byte offsets +0, +4, +8, +12) and returns a pixel of the same layout.
	//   1. The +12 component is clamped to [0,1] (cvt.sat) and used as the divisor.
	//      If (clamped - 8e-6) <= 0, the other three components and the returned
	//      +12 component are set to 0; otherwise the three components are
	//      multiplied by the approximate reciprocal of the clamped value.
	//   2. Each of the three components x is then mapped to
	//         x < 0 ? -exp2(0.454545 * log2(-x)) : exp2(0.454545 * log2(x))
	//      using the approximate lg2/ex2 instructions.
	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])
	{
	.reg .f32 %ch0, %ch1, %ch2;         // components at +0, +4, +8
	.reg .f32 %alpha, %margin, %inv_a;  // clamped +12 component, threshold test value, 1/alpha
	.reg .f32 %mag, %pw;                // scratch for the signed power step
	.reg .pred %a_tiny, %is_neg;
	.loc	3	263	0
$LDWbegin__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	ld.param.f32 	%ch0, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	add.ftz.f32 	%margin, %alpha, 0fb70637bd;    	// -8e-006
	setp.le.ftz.f32 	%a_tiny, %margin, 0f00000000;    	// 0
	@%a_tiny bra 	$Lunpremul_rgb_zero;
	// Divisor is large enough: scale the three components by 1/alpha.
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv_a, %alpha;
	mul.ftz.f32 	%ch2, %inv_a, %ch2;
	.loc	3	214	0
	mul.ftz.f32 	%ch1, %inv_a, %ch1;
	.loc	3	215	0
	mul.ftz.f32 	%ch0, %inv_a, %ch0;
	bra.uni 	$Lunpremul_rgb_gamma;
$Lunpremul_rgb_zero:
	// Divisor too small: everything, including the returned +12 value, becomes 0.
	.loc	3	219	0
	mov.f32 	%ch2, 0f00000000;    	// 0
	mov.f32 	%ch1, 0f00000000;    	// 0
	mov.f32 	%ch0, 0f00000000;    	// 0
	mov.f32 	%alpha, 0f00000000;    	// 0
$Lunpremul_rgb_gamma:
	// Signed power on component +0: negate on the way in and on the way out
	// only when the input is strictly negative.
	.loc	3	266	0
	setp.lt.ftz.f32 	%is_neg, %ch0, 0f00000000;    	// 0
	mov.f32 	%mag, %ch0;
	@%is_neg neg.ftz.f32 	%mag, %ch0;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch0, %pw;
	@%is_neg neg.ftz.f32 	%ch0, %ch0;
	// Same for component +4.
	.loc	3	267	0
	setp.lt.ftz.f32 	%is_neg, %ch1, 0f00000000;    	// 0
	mov.f32 	%mag, %ch1;
	@%is_neg neg.ftz.f32 	%mag, %ch1;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch1, %pw;
	@%is_neg neg.ftz.f32 	%ch1, %ch1;
	// Same for component +8.
	.loc	3	268	0
	setp.lt.ftz.f32 	%is_neg, %ch2, 0f00000000;    	// 0
	mov.f32 	%mag, %ch2;
	@%is_neg neg.ftz.f32 	%mag, %ch2;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch2, %pw;
	@%is_neg neg.ftz.f32 	%ch2, %ch2;
	// Write the result pixel.
	.loc	3	269	0
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0], %ch0;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4], %ch1;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8], %ch2;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12], %alpha;
	ret;
$LDWend__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	} // _Z29UnpremultiplyUnlinearizePixel8PixelRGB

	// PremultiplyLinearize(float4): takes a 16-byte vector of four f32 components
	// (byte offsets +0, +4, +8, +12) and returns a vector of the same layout.
	//   1. The +12 component is clamped to [0,1] (cvt.sat).
	//   2. Each of the first three components x is mapped to
	//         x < 0 ? -exp2(2.2 * log2(-x)) : exp2(2.2 * log2(x))
	//      using the approximate lg2/ex2 instructions.
	//   3. The three results are multiplied by the clamped +12 value, which is
	//      itself returned at +12.
	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])
	{
	.reg .f32 %ch0, %ch1, %ch2;   // components at +0, +4, +8
	.reg .f32 %alpha;             // clamped +12 component
	.reg .f32 %mag, %pw;          // scratch for the signed power step
	.reg .pred %is_neg;
	.loc	3	277	0
$LDWbegin__Z20PremultiplyLinearize6float4:
	ld.param.f32 	%ch0, [__cudaparmf1__Z20PremultiplyLinearize6float4+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z20PremultiplyLinearize6float4+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z20PremultiplyLinearize6float4+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z20PremultiplyLinearize6float4+12];
	.loc	3	254	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	// Signed power on component +0: negate on the way in and on the way out
	// only when the input is strictly negative.
	.loc	3	255	0
	setp.lt.ftz.f32 	%is_neg, %ch0, 0f00000000;    	// 0
	mov.f32 	%mag, %ch0;
	@%is_neg neg.ftz.f32 	%mag, %ch0;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch0, %pw;
	@%is_neg neg.ftz.f32 	%ch0, %ch0;
	// Same for component +4.
	.loc	3	256	0
	setp.lt.ftz.f32 	%is_neg, %ch1, 0f00000000;    	// 0
	mov.f32 	%mag, %ch1;
	@%is_neg neg.ftz.f32 	%mag, %ch1;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch1, %pw;
	@%is_neg neg.ftz.f32 	%ch1, %ch1;
	// Same for component +8.
	.loc	3	257	0
	setp.lt.ftz.f32 	%is_neg, %ch2, 0f00000000;    	// 0
	mov.f32 	%mag, %ch2;
	@%is_neg neg.ftz.f32 	%mag, %ch2;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch2, %pw;
	@%is_neg neg.ftz.f32 	%ch2, %ch2;
	// Scale the three converted components by the clamped +12 value.
	.loc	3	259	0
	mul.ftz.f32 	%ch2, %ch2, %alpha;
	mul.ftz.f32 	%ch1, %ch1, %alpha;
	.loc	3	278	0
	mul.ftz.f32 	%ch0, %ch0, %alpha;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+0], %ch0;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+4], %ch1;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+8], %ch2;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+12], %alpha;
	ret;
$LDWend__Z20PremultiplyLinearize6float4:
	} // _Z20PremultiplyLinearize6float4

	// UnpremultiplyUnlinearize(float4): takes a 16-byte vector of four f32
	// components (byte offsets +0, +4, +8, +12) and returns a vector of the same layout.
	//   1. The +12 component is clamped to [0,1] (cvt.sat) and used as the divisor.
	//      If (clamped - 8e-6) <= 0, the other three components and the returned
	//      +12 component are set to 0; otherwise the three components are
	//      multiplied by the approximate reciprocal of the clamped value.
	//   2. Each of the three components x is then mapped to
	//         x < 0 ? -exp2(0.454545 * log2(-x)) : exp2(0.454545 * log2(x))
	//      using the approximate lg2/ex2 instructions.
	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])
	{
	.reg .f32 %ch0, %ch1, %ch2;         // components at +0, +4, +8
	.reg .f32 %alpha, %margin, %inv_a;  // clamped +12 component, threshold test value, 1/alpha
	.reg .f32 %mag, %pw;                // scratch for the signed power step
	.reg .pred %a_tiny, %is_neg;
	.loc	3	284	0
$LDWbegin__Z24UnpremultiplyUnlinearize6float4:
	ld.param.f32 	%ch0, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	add.ftz.f32 	%margin, %alpha, 0fb70637bd;    	// -8e-006
	setp.le.ftz.f32 	%a_tiny, %margin, 0f00000000;    	// 0
	@%a_tiny bra 	$Lunpremul_f4_zero;
	// Divisor is large enough: scale the three components by 1/alpha.
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv_a, %alpha;
	mul.ftz.f32 	%ch2, %inv_a, %ch2;
	.loc	3	214	0
	mul.ftz.f32 	%ch1, %inv_a, %ch1;
	.loc	3	215	0
	mul.ftz.f32 	%ch0, %inv_a, %ch0;
	bra.uni 	$Lunpremul_f4_gamma;
$Lunpremul_f4_zero:
	// Divisor too small: everything, including the returned +12 value, becomes 0.
	.loc	3	219	0
	mov.f32 	%ch2, 0f00000000;    	// 0
	mov.f32 	%ch1, 0f00000000;    	// 0
	mov.f32 	%ch0, 0f00000000;    	// 0
	mov.f32 	%alpha, 0f00000000;    	// 0
$Lunpremul_f4_gamma:
	// Signed power on component +0: negate on the way in and on the way out
	// only when the input is strictly negative.
	.loc	3	266	0
	setp.lt.ftz.f32 	%is_neg, %ch0, 0f00000000;    	// 0
	mov.f32 	%mag, %ch0;
	@%is_neg neg.ftz.f32 	%mag, %ch0;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch0, %pw;
	@%is_neg neg.ftz.f32 	%ch0, %ch0;
	// Same for component +4.
	.loc	3	267	0
	setp.lt.ftz.f32 	%is_neg, %ch1, 0f00000000;    	// 0
	mov.f32 	%mag, %ch1;
	@%is_neg neg.ftz.f32 	%mag, %ch1;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch1, %pw;
	@%is_neg neg.ftz.f32 	%ch1, %ch1;
	// Same for component +8.
	.loc	3	268	0
	setp.lt.ftz.f32 	%is_neg, %ch2, 0f00000000;    	// 0
	mov.f32 	%mag, %ch2;
	@%is_neg neg.ftz.f32 	%mag, %ch2;
	lg2.approx.ftz.f32 	%pw, %mag;
	mul.ftz.f32 	%pw, %pw, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch2, %pw;
	@%is_neg neg.ftz.f32 	%ch2, %ch2;
	// Write the result vector.
	.loc	3	285	0
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+0], %ch0;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+4], %ch1;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+8], %ch2;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+12], %alpha;
	ret;
$LDWend__Z24UnpremultiplyUnlinearize6float4:
	} // _Z24UnpremultiplyUnlinearize6float4

	// operator*<PixelRGB>(const PixelRGB&, float): loads four f32 components
	// through the 64-bit pointer in the first parameter, multiplies each by the
	// f32 second parameter, and returns them as a 16-byte value.
	.visible .func (.param .align 16 .b8 __cudaretf__ZmlI8PixelRGBET_RKS1_f[16]) _ZmlI8PixelRGBET_RKS1_f (.param .u64 __cudaparmf1__ZmlI8PixelRGBET_RKS1_f, .param .f32 __cudaparmf2__ZmlI8PixelRGBET_RKS1_f)
	{
	.reg .u64 %pixel_ptr;   // address of the source pixel
	.reg .f32 %scale;       // scalar factor
	.reg .f32 %px<4>;       // the four components, scaled in place
	.loc	19	150	0
$LDWbegin__ZmlI8PixelRGBET_RKS1_f:
	ld.param.u64 	%pixel_ptr, [__cudaparmf1__ZmlI8PixelRGBET_RKS1_f];
	ld.param.f32 	%scale, [__cudaparmf2__ZmlI8PixelRGBET_RKS1_f];
	// Single 16-byte vector load of all four components (no state space
	// qualifier, i.e. generic addressing).
	ld.v4.f32 	{%px0,%px1,%px2,%px3}, [%pixel_ptr+0];
	.loc	19	153	0
	mul.ftz.f32 	%px0, %px0, %scale;
	mul.ftz.f32 	%px1, %px1, %scale;
	mul.ftz.f32 	%px2, %px2, %scale;
	mul.ftz.f32 	%px3, %px3, %scale;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_RKS1_f+0], %px0;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_RKS1_f+4], %px1;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_RKS1_f+8], %px2;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_RKS1_f+12], %px3;
	ret;
$LDWend__ZmlI8PixelRGBET_RKS1_f:
	} // _ZmlI8PixelRGBET_RKS1_f

	// operator*<PixelRGB>(float, const PixelRGB&): loads four f32 components
	// through the 64-bit pointer in the second parameter, multiplies each by the
	// f32 first parameter, and returns them as a 16-byte value.
	.visible .func (.param .align 16 .b8 __cudaretf__ZmlI8PixelRGBET_fRKS1_[16]) _ZmlI8PixelRGBET_fRKS1_ (.param .f32 __cudaparmf1__ZmlI8PixelRGBET_fRKS1_, .param .u64 __cudaparmf2__ZmlI8PixelRGBET_fRKS1_)
	{
	.reg .u64 %pixel_ptr;   // address of the source pixel
	.reg .f32 %scale;       // scalar factor
	.reg .f32 %px<4>;       // the four components, scaled in place
	.loc	19	160	0
$LDWbegin__ZmlI8PixelRGBET_fRKS1_:
	ld.param.f32 	%scale, [__cudaparmf1__ZmlI8PixelRGBET_fRKS1_];
	ld.param.u64 	%pixel_ptr, [__cudaparmf2__ZmlI8PixelRGBET_fRKS1_];
	// Single 16-byte vector load of all four components (no state space
	// qualifier, i.e. generic addressing).
	ld.v4.f32 	{%px0,%px1,%px2,%px3}, [%pixel_ptr+0];
	.loc	19	163	0
	mul.ftz.f32 	%px0, %px0, %scale;
	mul.ftz.f32 	%px1, %px1, %scale;
	mul.ftz.f32 	%px2, %px2, %scale;
	mul.ftz.f32 	%px3, %px3, %scale;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_fRKS1_+0], %px0;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_fRKS1_+4], %px1;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_fRKS1_+8], %px2;
	st.param.f32 	[__cudaretf__ZmlI8PixelRGBET_fRKS1_+12], %px3;
	ret;
$LDWend__ZmlI8PixelRGBET_fRKS1_:
	} // _ZmlI8PixelRGBET_fRKS1_
	// Module-scope texture reference. HorizontalBoxBlurKernel samples it with
	// tex.2d.v4.f32.f32 (f32 coordinates, four f32 components returned).
	// NOTE(review): presumably bound to the blur input image by the host — confirm.
	.global .texref tex;

	// HorizontalBoxBlurKernel
	//
	// One thread per row: row = ctaid.x * ntid.x + tid.x; threads with
	// row >= inOutputHeight exit immediately. Each thread walks its row keeping
	// four running f32 sums (one per texel component) of texels fetched from
	// `tex`, and for each output x stores sum * rcp.approx(2*inRadius + 1).
	//
	// Parameters as used by this code:
	//   inDeviceFormat     == 0: each pixel is stored as four f16 (8 bytes);
	//                      != 0: each pixel is stored as four f32 (16 bytes).
	//   inOutput           base address for the st.global stores.
	//   inOutputWidth      number of output columns walked.
	//   inOutputHeight     row count used for the bounds check.
	//   inOutputPitch      multiplied by the row index and added to x BEFORE the
	//                      index is scaled by the pixel size, i.e. it is in pixels.
	//   inInputWidth,
	//   inInputHeight      not read anywhere in this kernel.
	//   inRadius           box radius; window size is 2*inRadius + 1.
	//   inRepeatEdgePixels != 0: the window is primed from x = -inRadius .. +inRadius
	//                            (coordinates outside the image are passed to the
	//                            texture unit as-is; NOTE(review): presumably relies
	//                            on a clamping address mode — confirm on the host side);
	//                      == 0: the window is grown from x = 0 and drained at the
	//                            right-hand end of the row.
	//
	// Texture fetches use coordinates {x, row, 0, 0} converted from s32 to f32.
	// Running sums: %f4 <- texel component 0, %f3 <- 1, %f2 <- 2, %f5 <- 3.
	.entry HorizontalBoxBlurKernel (
		.param .u32 __cudaparm_HorizontalBoxBlurKernel_inDeviceFormat,
		.param .u64 __cudaparm_HorizontalBoxBlurKernel_inOutput,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inOutputWidth,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inOutputHeight,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inOutputPitch,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inInputWidth,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inInputHeight,
		.param .s32 __cudaparm_HorizontalBoxBlurKernel_inRadius,
		.param .s8 __cudaparm_HorizontalBoxBlurKernel_inRepeatEdgePixels)
	{
	.reg .u32 %r<76>;
	.reg .u64 %rd<17>;
	.reg .f32 %f<92>;
	.reg .pred %p<15>;
	.loc	20	42	0
$LDWbegin_HorizontalBoxBlurKernel:
	.loc	20	45	0
	// %r5 = global thread index = row handled by this thread.
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	mov.u32 	%r4, %tid.x;
	add.u32 	%r5, %r3, %r4;
	ld.param.s32 	%r6, [__cudaparm_HorizontalBoxBlurKernel_inOutputHeight];
	// Continue only when inOutputHeight > row; otherwise jump to the exit.
	setp.gt.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_22_9730;
	bra.uni 	$LBB45_HorizontalBoxBlurKernel;
$Lt_22_9730:
	.loc	20	49	0
	// %r7 = radius, %r8 = 2*radius, %r9 = 2*radius + 1 (window size),
	// %r10 = output width, %r11 = inRepeatEdgePixels (sign-extended s8).
	ld.param.s32 	%r7, [__cudaparm_HorizontalBoxBlurKernel_inRadius];
	mul.lo.s32 	%r8, %r7, 2;
	add.s32 	%r9, %r8, 1;
	ld.param.s32 	%r10, [__cudaparm_HorizontalBoxBlurKernel_inOutputWidth];
	ld.param.s8 	%r11, [__cudaparm_HorizontalBoxBlurKernel_inRepeatEdgePixels];
	mov.u32 	%r12, 0;
	setp.eq.s32 	%p2, %r11, %r12;
	// inRepeatEdgePixels == 0 -> non-repeating path at $Lt_22_10498.
	@%p2 bra 	$Lt_22_10498;
	// ---- Repeat-edge path: prime the window with x = -radius .. +radius ----
	.loc	20	69	0
	// %r13 = last x handled by the sliding loop = width - 1.
	sub.s32 	%r13, %r10, 1;
	.loc	20	71	0
	neg.s32 	%r14, %r7;
	mov.s32 	%r15, %r14;
	// Skip the priming loop when -radius > radius (negative radius).
	setp.gt.s32 	%p3, %r14, %r7;
	@%p3 bra 	$Lt_22_17410;
	add.s32 	%r16, %r7, %r7;
	add.s32 	%r17, %r16, 1;
	// %r18 = radius + 1 is the loop's end value for x (%r15).
	add.s32 	%r18, %r7, 1;
	cvt.rn.f32.s32 	%f1, %r5;
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.f32 	%f4, 0f00000000;     	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.s32 	%r19, %r17;
$Lt_22_11266:
 //<loop> Loop body line 71, nesting depth: 1, estimated iterations: unknown
	// Fetch texel (x = %r15, y = row) and add it into the running sums.
	cvt.rn.f32.s32 	%f6, %r15;
	mov.f32 	%f7, %f6;
	mov.f32 	%f8, %f1;
	mov.f32 	%f9, 0f00000000;     	// 0
	mov.f32 	%f10, %f9;
	mov.f32 	%f11, 0f00000000;    	// 0
	mov.f32 	%f12, %f11;
	tex.2d.v4.f32.f32 {%f13,%f14,%f15,%f16},[tex,{%f7,%f8,%f10,%f12}];
 //<loop> Part of loop body line 71, head labeled $Lt_22_11266
	.loc	20	73	0
	mov.f32 	%f17, %f13;
	mov.f32 	%f18, %f14;
	mov.f32 	%f19, %f15;
	mov.f32 	%f20, %f16;
	add.ftz.f32 	%f4, %f17, %f4;
	add.ftz.f32 	%f3, %f18, %f3;
	add.ftz.f32 	%f2, %f19, %f2;
	add.ftz.f32 	%f5, %f20, %f5;
	add.s32 	%r15, %r15, 1;
	setp.ne.s32 	%p4, %r18, %r15;
	@%p4 bra 	$Lt_22_11266;
	bra.uni 	$Lt_22_10754;
$Lt_22_17410:
	// Priming loop skipped: sums start at zero.
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.f32 	%f4, 0f00000000;     	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
$Lt_22_10754:
	.loc	20	75	0
	// Store the first output pixel of the row at pixel index pitch * row (x = 0).
	// %f22 = approximate 1 / (2*radius + 1); %f23..%f26 = scaled sums.
	ld.param.s32 	%r20, [__cudaparm_HorizontalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r21, %r20, %r5;
	cvt.rn.f32.s32 	%f21, %r9;
	cvt.s64.s32 	%rd1, %r21;
	rcp.approx.ftz.f32 	%f22, %f21;
	mul.ftz.f32 	%f23, %f22, %f4;
	mul.ftz.f32 	%f24, %f22, %f3;
	mul.ftz.f32 	%f25, %f22, %f2;
	mul.ftz.f32 	%f26, %f22, %f5;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalBoxBlurKernel_inOutput];
	ld.param.s32 	%r22, [__cudaparm_HorizontalBoxBlurKernel_inDeviceFormat];
	mov.u32 	%r23, 0;
	setp.ne.s32 	%p5, %r22, %r23;
	// inDeviceFormat != 0 -> f32 store at $Lt_22_12034; otherwise f16 store below.
	@%p5 bra 	$Lt_22_12034;
	.loc	18	126	0
	// f16 output: 8 bytes per pixel, each component rounded to nearest f16.
	mul.lo.u64 	%rd3, %rd1, 8;
	add.u64 	%rd4, %rd2, %rd3;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f23;
	mov.b32		%r24, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f24;
	mov.b32		%r25, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f25;
	mov.b32		%r26, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f26;
	mov.b32		%r27, %b1; }
	st.global.v4.u16 	[%rd4+0], {%r24,%r25,%r26,%r27};
	.loc	20	76	0
	bra.uni 	$Lt_22_11778;
$Lt_22_12034:
	.loc	18	126	0
	// f32 output: 16 bytes per pixel.
	mul.lo.u64 	%rd5, %rd1, 16;
	add.u64 	%rd6, %rd2, %rd5;
	st.global.v4.f32 	[%rd6+0], {%f23,%f24,%f25,%f26};
$Lt_22_11778:
	.loc	20	84	0
	// Sliding-loop setup for the repeat-edge path:
	// %r28 = offset of the incoming texel relative to x (= radius),
	// %r29 = first x of the sliding loop (= 1).
	mov.s32 	%r28, %r7;
	mov.s32 	%r29, 1;
	bra.uni 	$Lt_22_10242;
$Lt_22_10498:
	// ---- Non-repeating path: grow the window over x = 0 .. 2*radius ----
	.loc	20	88	0
	// %r29 = first x of the sliding loop = 2*radius + 1.
	mov.s32 	%r29, %r9;
	.loc	20	89	0
	// %r13 = last x of the sliding loop = width - 2*radius - 1.
	sub.s32 	%r30, %r10, %r8;
	sub.s32 	%r13, %r30, 1;
	mov.u32 	%r31, 0;
	// Skip the grow loop when 2*radius < 0.
	setp.lt.s32 	%p6, %r8, %r31;
	@%p6 bra 	$Lt_22_17666;
	mov.s32 	%r32, %r9;
	ld.param.s32 	%r33, [__cudaparm_HorizontalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r34, 0;
	// %p7 = (inDeviceFormat == 0), i.e. use the f16 store.
	setp.eq.s32 	%p7, %r33, %r34;
	cvt.rn.f32.s32 	%f1, %r5;
	ld.param.s32 	%r35, [__cudaparm_HorizontalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r21, %r35, %r5;
	cvt.rn.f32.s32 	%f21, %r9;
	rcp.approx.ftz.f32 	%f22, %f21;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalBoxBlurKernel_inOutput];
	mov.s32 	%r36, 0;
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.f32 	%f4, 0f00000000;     	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.s32 	%r37, %r32;
$Lt_22_12802:
 //<loop> Loop body line 89, nesting depth: 1, estimated iterations: unknown
	// Add texel (x = %r36, y = row) to the sums, then store the scaled sums at x.
	cvt.rn.f32.s32 	%f27, %r36;
	mov.f32 	%f28, %f27;
	mov.f32 	%f29, %f1;
	mov.f32 	%f30, 0f00000000;    	// 0
	mov.f32 	%f31, %f30;
	mov.f32 	%f32, 0f00000000;    	// 0
	mov.f32 	%f33, %f32;
	tex.2d.v4.f32.f32 {%f34,%f35,%f36,%f37},[tex,{%f28,%f29,%f31,%f33}];
 //<loop> Part of loop body line 89, head labeled $Lt_22_12802
	.loc	20	94	0
	mov.f32 	%f38, %f34;
	mov.f32 	%f39, %f35;
	mov.f32 	%f40, %f36;
	mov.f32 	%f41, %f37;
	add.ftz.f32 	%f4, %f38, %f4;
	add.ftz.f32 	%f3, %f39, %f3;
	add.ftz.f32 	%f2, %f40, %f2;
	add.ftz.f32 	%f5, %f41, %f5;
	.loc	20	95	0
	// %rd7 = pixel index = pitch * row + x.
	add.s32 	%r38, %r21, %r36;
	cvt.s64.s32 	%rd7, %r38;
	mul.ftz.f32 	%f23, %f22, %f4;
	mul.ftz.f32 	%f24, %f22, %f3;
	mul.ftz.f32 	%f25, %f22, %f2;
	mul.ftz.f32 	%f26, %f22, %f5;
	@!%p7 bra 	$Lt_22_13314;
 //<loop> Part of loop body line 89, head labeled $Lt_22_12802
	.loc	18	126	0
	// f16 store (8 bytes per pixel).
	mul.lo.u64 	%rd8, %rd7, 8;
	add.u64 	%rd9, %rd2, %rd8;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f23;
	mov.b32		%r39, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f24;
	mov.b32		%r40, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f25;
	mov.b32		%r41, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f26;
	mov.b32		%r42, %b1; }
	st.global.v4.u16 	[%rd9+0], {%r39,%r40,%r41,%r42};
	.loc	20	96	0
	bra.uni 	$Lt_22_13058;
$Lt_22_13314:
 //<loop> Part of loop body line 89, head labeled $Lt_22_12802
	.loc	18	126	0
	// f32 store (16 bytes per pixel).
	mul.lo.u64 	%rd10, %rd7, 16;
	add.u64 	%rd11, %rd2, %rd10;
	st.global.v4.f32 	[%rd11+0], {%f23,%f24,%f25,%f26};
$Lt_22_13058:
 //<loop> Part of loop body line 89, head labeled $Lt_22_12802
	.loc	20	96	0
	// Next x; loop until x == 2*radius + 1.
	add.s32 	%r36, %r36, 1;
	setp.ne.s32 	%p8, %r9, %r36;
	@%p8 bra 	$Lt_22_12802;
	bra.uni 	$Lt_22_12290;
$Lt_22_17666:
	// Grow loop skipped: sums start at zero.
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.f32 	%f4, 0f00000000;     	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
$Lt_22_12290:
	// Non-repeating path: the incoming texel of the sliding loop is at x + 0.
	mov.s32 	%r28, 0;
$Lt_22_10242:
	// ---- Sliding loop shared by both paths: x = %r29 .. %r13 ----
	// Per step: add texel at (x + %r28), subtract texel at (x + %r28 - 2*radius - 1).
	.loc	20	107	0
	mov.s32 	%r36, %r29;
	// Nothing to do when the last x is below the first x.
	setp.lt.s32 	%p9, %r13, %r29;
	@%p9 bra 	$Lt_22_13826;
	sub.s32 	%r43, %r13, %r29;
	add.s32 	%r44, %r43, 1;
	ld.param.s32 	%r45, [__cudaparm_HorizontalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r46, 0;
	setp.eq.s32 	%p7, %r45, %r46;
	// %r47 = loop end value for x (last x + 1).
	add.s32 	%r47, %r13, 1;
	mov.s32 	%r48, -1;
	// %r49 = -1 - 2*radius; %r51 = %r49 + %r28 = offset of the outgoing texel.
	sub.s32 	%r49, %r48, %r8;
	cvt.rn.f32.s32 	%f1, %r5;
	ld.param.s32 	%r50, [__cudaparm_HorizontalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r21, %r50, %r5;
	cvt.rn.f32.s32 	%f21, %r9;
	add.s32 	%r51, %r49, %r28;
	rcp.approx.ftz.f32 	%f22, %f21;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalBoxBlurKernel_inOutput];
	mov.s32 	%r52, %r44;
$Lt_22_14338:
 //<loop> Loop body line 107, nesting depth: 1, estimated iterations: unknown
	// Incoming texel at (x + %r28, row).
	add.s32 	%r53, %r36, %r28;
	cvt.rn.f32.s32 	%f42, %r53;
	mov.f32 	%f43, %f42;
	mov.f32 	%f44, %f1;
	mov.f32 	%f45, 0f00000000;    	// 0
	mov.f32 	%f46, %f45;
	mov.f32 	%f47, 0f00000000;    	// 0
	mov.f32 	%f48, %f47;
	tex.2d.v4.f32.f32 {%f49,%f50,%f51,%f52},[tex,{%f43,%f44,%f46,%f48}];
 //<loop> Part of loop body line 107, head labeled $Lt_22_14338
	.loc	20	109	0
	mov.f32 	%f53, %f49;
	mov.f32 	%f54, %f50;
	mov.f32 	%f55, %f51;
	mov.f32 	%f56, %f52;
	// Outgoing texel at (x + %r51, row).
	add.s32 	%r54, %r51, %r36;
	cvt.rn.f32.s32 	%f57, %r54;
	mov.f32 	%f58, %f57;
	mov.f32 	%f59, %f1;
	mov.f32 	%f60, 0f00000000;    	// 0
	mov.f32 	%f61, %f60;
	mov.f32 	%f62, 0f00000000;    	// 0
	mov.f32 	%f63, %f62;
	tex.2d.v4.f32.f32 {%f64,%f65,%f66,%f67},[tex,{%f58,%f59,%f61,%f63}];
 //<loop> Part of loop body line 107, head labeled $Lt_22_14338
	.loc	20	110	0
	mov.f32 	%f68, %f64;
	mov.f32 	%f69, %f65;
	mov.f32 	%f70, %f66;
	mov.f32 	%f71, %f67;
	// sum = (incoming + sum) - outgoing, per component.
	add.ftz.f32 	%f72, %f53, %f4;
	sub.ftz.f32 	%f4, %f72, %f68;
	add.ftz.f32 	%f73, %f54, %f3;
	sub.ftz.f32 	%f3, %f73, %f69;
	add.ftz.f32 	%f74, %f55, %f2;
	sub.ftz.f32 	%f2, %f74, %f70;
	add.ftz.f32 	%f75, %f56, %f5;
	sub.ftz.f32 	%f5, %f75, %f71;
	.loc	20	111	0
	// Store scaled sums at pixel index pitch * row + x.
	add.s32 	%r55, %r21, %r36;
	cvt.s64.s32 	%rd7, %r55;
	mul.ftz.f32 	%f23, %f22, %f4;
	mul.ftz.f32 	%f24, %f22, %f3;
	mul.ftz.f32 	%f25, %f22, %f2;
	mul.ftz.f32 	%f26, %f22, %f5;
	@!%p7 bra 	$Lt_22_14850;
 //<loop> Part of loop body line 107, head labeled $Lt_22_14338
	.loc	18	126	0
	// f16 store (8 bytes per pixel).
	mul.lo.u64 	%rd12, %rd7, 8;
	add.u64 	%rd9, %rd2, %rd12;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f23;
	mov.b32		%r56, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f24;
	mov.b32		%r57, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f25;
	mov.b32		%r58, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f26;
	mov.b32		%r59, %b1; }
	st.global.v4.u16 	[%rd9+0], {%r56,%r57,%r58,%r59};
	.loc	20	112	0
	bra.uni 	$Lt_22_14594;
$Lt_22_14850:
 //<loop> Part of loop body line 107, head labeled $Lt_22_14338
	.loc	18	126	0
	// f32 store (16 bytes per pixel).
	mul.lo.u64 	%rd13, %rd7, 16;
	add.u64 	%rd11, %rd2, %rd13;
	st.global.v4.f32 	[%rd11+0], {%f23,%f24,%f25,%f26};
$Lt_22_14594:
 //<loop> Part of loop body line 107, head labeled $Lt_22_14338
	.loc	20	112	0
	add.s32 	%r36, %r36, 1;
	setp.ne.s32 	%p10, %r47, %r36;
	@%p10 bra 	$Lt_22_14338;
$Lt_22_13826:
	// The repeat-edge path is finished here; only the non-repeating path
	// continues into the drain loop below.
	mov.u32 	%r60, 0;
	setp.ne.s32 	%p11, %r11, %r60;
	@%p11 bra 	$LBB45_HorizontalBoxBlurKernel;
	// ---- Non-repeating path: drain the window over x = width - 2*radius .. width - 1 ----
	.loc	20	124	0
	sub.s32 	%r61, %r10, %r8;
	mov.s32 	%r36, %r61;
	sub.s32 	%r62, %r10, 1;
	// Exit when the first drain x is already past width - 1.
	setp.gt.s32 	%p12, %r61, %r62;
	@%p12 bra 	$LBB45_HorizontalBoxBlurKernel;
	mov.s32 	%r63, %r8;
	ld.param.s32 	%r64, [__cudaparm_HorizontalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r65, 0;
	setp.eq.s32 	%p7, %r64, %r65;
	mov.s32 	%r66, -1;
	// %r49 = -1 - 2*radius = offset of the outgoing texel relative to x.
	sub.s32 	%r49, %r66, %r8;
	cvt.rn.f32.s32 	%f1, %r5;
	ld.param.s32 	%r67, [__cudaparm_HorizontalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r21, %r67, %r5;
	cvt.rn.f32.s32 	%f21, %r9;
	rcp.approx.ftz.f32 	%f22, %f21;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalBoxBlurKernel_inOutput];
	mov.s32 	%r68, %r63;
$Lt_22_16386:
 //<loop> Loop body line 124, nesting depth: 1, estimated iterations: unknown
	// Subtract texel at (x - 2*radius - 1, row); no new texel is added.
	add.s32 	%r69, %r49, %r36;
	cvt.rn.f32.s32 	%f76, %r69;
	mov.f32 	%f77, %f76;
	mov.f32 	%f78, %f1;
	mov.f32 	%f79, 0f00000000;    	// 0
	mov.f32 	%f80, %f79;
	mov.f32 	%f81, 0f00000000;    	// 0
	mov.f32 	%f82, %f81;
	tex.2d.v4.f32.f32 {%f83,%f84,%f85,%f86},[tex,{%f77,%f78,%f80,%f82}];
 //<loop> Part of loop body line 124, head labeled $Lt_22_16386
	.loc	20	126	0
	mov.f32 	%f87, %f83;
	mov.f32 	%f88, %f84;
	mov.f32 	%f89, %f85;
	mov.f32 	%f90, %f86;
	sub.ftz.f32 	%f4, %f4, %f87;
	sub.ftz.f32 	%f3, %f3, %f88;
	sub.ftz.f32 	%f2, %f2, %f89;
	sub.ftz.f32 	%f5, %f5, %f90;
	.loc	20	127	0
	// Store scaled sums at pixel index pitch * row + x.
	add.s32 	%r70, %r21, %r36;
	cvt.s64.s32 	%rd7, %r70;
	mul.ftz.f32 	%f23, %f22, %f4;
	mul.ftz.f32 	%f24, %f22, %f3;
	mul.ftz.f32 	%f25, %f22, %f2;
	mul.ftz.f32 	%f26, %f22, %f5;
	@!%p7 bra 	$Lt_22_16898;
 //<loop> Part of loop body line 124, head labeled $Lt_22_16386
	.loc	18	126	0
	// f16 store (8 bytes per pixel).
	mul.lo.u64 	%rd14, %rd7, 8;
	add.u64 	%rd9, %rd2, %rd14;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f23;
	mov.b32		%r71, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f24;
	mov.b32		%r72, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f25;
	mov.b32		%r73, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f26;
	mov.b32		%r74, %b1; }
	st.global.v4.u16 	[%rd9+0], {%r71,%r72,%r73,%r74};
	.loc	20	128	0
	bra.uni 	$Lt_22_16642;
$Lt_22_16898:
 //<loop> Part of loop body line 124, head labeled $Lt_22_16386
	.loc	18	126	0
	// f32 store (16 bytes per pixel).
	mul.lo.u64 	%rd15, %rd7, 16;
	add.u64 	%rd11, %rd2, %rd15;
	st.global.v4.f32 	[%rd11+0], {%f23,%f24,%f25,%f26};
$Lt_22_16642:
 //<loop> Part of loop body line 124, head labeled $Lt_22_16386
	.loc	20	128	0
	// Next x; loop until x == width.
	add.s32 	%r36, %r36, 1;
	setp.ne.s32 	%p13, %r36, %r10;
	@%p13 bra 	$Lt_22_16386;
$LBB45_HorizontalBoxBlurKernel:
	.loc	20	137	0
	exit;
$LDWend_HorizontalBoxBlurKernel:
	} // HorizontalBoxBlurKernel

	.entry VerticalBoxBlurKernel (
		.param .u32 __cudaparm_VerticalBoxBlurKernel_inDeviceFormat,
		.param .u64 __cudaparm_VerticalBoxBlurKernel_inOutput,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inOutputWidth,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inOutputHeight,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inOutputPitch,
		.param .u64 __cudaparm_VerticalBoxBlurKernel_inInput,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inInputWidth,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inInputHeight,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inInputPitch,
		.param .s32 __cudaparm_VerticalBoxBlurKernel_inRadius,
		.param .s8 __cudaparm_VerticalBoxBlurKernel_inRepeatEdgePixels)
	{
	.reg .u32 %r<156>;
	.reg .u64 %rd<72>;
	.reg .f32 %f<65>;
	.reg .pred %p<15>;
	.loc	20	152	0
$LDWbegin_VerticalBoxBlurKernel:
	.loc	20	155	0
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	mov.u32 	%r4, %tid.x;
	add.u32 	%r5, %r3, %r4;
	ld.param.s32 	%r6, [__cudaparm_VerticalBoxBlurKernel_inOutputWidth];
	setp.gt.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_23_20994;
	bra.uni 	$LBB85_VerticalBoxBlurKernel;
$Lt_23_20994:
	.loc	20	159	0
	ld.param.s8 	%r7, [__cudaparm_VerticalBoxBlurKernel_inRepeatEdgePixels];
	mov.s32 	%r8, 0;
	setp.ne.s32 	%p2, %r7, %r8;
	ld.param.s32 	%r9, [__cudaparm_VerticalBoxBlurKernel_inRadius];
	mul.lo.s32 	%r10, %r9, 2;
	add.s32 	%r11, %r10, 1;
	ld.param.s32 	%r12, [__cudaparm_VerticalBoxBlurKernel_inOutputHeight];
	@!%p2 bra 	$Lt_23_21762;
	.loc	20	178	0
	add.s32 	%r13, %r9, 1;
	mov.s32 	%r14, %r13;
	.loc	20	179	0
	sub.s32 	%r15, %r12, %r9;
	sub.s32 	%r16, %r15, 1;
	ld.param.s32 	%r17, [__cudaparm_VerticalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r18, 0;
	setp.eq.s32 	%p3, %r17, %r18;
	cvt.s64.s32 	%rd1, %r5;
	ld.param.u64 	%rd2, [__cudaparm_VerticalBoxBlurKernel_inInput];
	@!%p3 bra 	$Lt_23_22274;
	.loc	18	115	0
	mul.lo.u64 	%rd3, %rd1, 8;
	add.u64 	%rd4, %rd2, %rd3;
	ld.global.v4.u16 	{%r19,%r20,%r21,%r22}, [%rd4+0];
	.loc	20	181	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r19;
	cvt.ftz.f32.f16	%f1, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r20;
	cvt.ftz.f32.f16	%f2, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r21;
	cvt.ftz.f32.f16	%f3, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r22;
	cvt.ftz.f32.f16	%f4, %b1; }
	bra.uni 	$Lt_23_22018;
$Lt_23_22274:
	mul.lo.u64 	%rd5, %rd1, 16;
	add.u64 	%rd6, %rd2, %rd5;
	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%rd6+0];
$Lt_23_22018:
	cvt.rn.f32.s32 	%f5, %r13;
	mul.ftz.f32 	%f6, %f5, %f1;
	mul.ftz.f32 	%f7, %f5, %f2;
	mul.ftz.f32 	%f8, %f5, %f3;
	mul.ftz.f32 	%f9, %f5, %f4;
	mov.s32 	%r23, 1;
	setp.ge.s32 	%p4, %r9, %r23;
	@!%p4 bra 	$Lt_23_22530;
	mov.s32 	%r24, %r9;
	ld.param.s32 	%r25, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	add.s32 	%r26, %r25, %r5;
	cvt.s64.s32 	%rd7, %r26;
	cvt.s64.s32 	%rd8, %r25;
	mov.s32 	%r27, 1;
	mov.s32 	%r28, %r24;
$Lt_23_23042:
 //<loop> Loop body line 181, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_23554;
 //<loop> Part of loop body line 181, head labeled $Lt_23_23042
	.loc	18	115	0
	mul.lo.u64 	%rd9, %rd7, 8;
	add.u64 	%rd10, %rd2, %rd9;
	ld.global.v4.u16 	{%r29,%r30,%r31,%r32}, [%rd10+0];
	.loc	20	190	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r29;
	cvt.ftz.f32.f16	%f10, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r30;
	cvt.ftz.f32.f16	%f11, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r31;
	cvt.ftz.f32.f16	%f12, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r32;
	cvt.ftz.f32.f16	%f13, %b1; }
	bra.uni 	$Lt_23_23298;
$Lt_23_23554:
 //<loop> Part of loop body line 181, head labeled $Lt_23_23042
	mul.lo.u64 	%rd11, %rd7, 16;
	add.u64 	%rd12, %rd2, %rd11;
	ld.global.v4.f32 	{%f10,%f11,%f12,%f13}, [%rd12+0];
$Lt_23_23298:
 //<loop> Part of loop body line 181, head labeled $Lt_23_23042
	add.ftz.f32 	%f6, %f10, %f6;
	add.ftz.f32 	%f7, %f11, %f7;
	add.ftz.f32 	%f8, %f12, %f8;
	add.ftz.f32 	%f9, %f13, %f9;
	add.s32 	%r27, %r27, 1;
	add.s64 	%rd7, %rd7, %rd8;
	setp.ne.s32 	%p5, %r13, %r27;
	@%p5 bra 	$Lt_23_23042;
$Lt_23_22530:
	.loc	20	197	0
	cvt.rn.f32.s32 	%f14, %r11;
	rcp.approx.ftz.f32 	%f15, %f14;
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	ld.param.u64 	%rd13, [__cudaparm_VerticalBoxBlurKernel_inOutput];
	@!%p3 bra 	$Lt_23_24322;
	.loc	18	126	0
	mul.lo.u64 	%rd14, %rd1, 8;
	add.u64 	%rd15, %rd13, %rd14;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r33, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r34, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r35, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r36, %b1; }
	st.global.v4.u16 	[%rd15+0], {%r33,%r34,%r35,%r36};
	.loc	20	198	0
	bra.uni 	$Lt_23_24066;
$Lt_23_24322:
	.loc	18	126	0
	mul.lo.u64 	%rd16, %rd1, 16;
	add.u64 	%rd17, %rd13, %rd16;
	st.global.v4.f32 	[%rd17+0], {%f16,%f17,%f18,%f19};
$Lt_23_24066:
	.loc	20	198	0
	@!%p4 bra 	$Lt_23_24578;
	mov.s32 	%r37, %r9;
	ld.param.s32 	%r38, [__cudaparm_VerticalBoxBlurKernel_inOutputPitch];
	mov.s32 	%r39, %r13;
	add.s32 	%r40, %r13, %r9;
	ld.param.s32 	%r25, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	add.s32 	%r41, %r38, %r5;
	cvt.s64.s32 	%rd18, %r41;
	cvt.s64.s32 	%rd19, %r38;
	mul.lo.s32 	%r42, %r13, %r25;
	add.s32 	%r43, %r5, %r42;
	cvt.s64.s32 	%rd20, %r43;
	cvt.s64.s32 	%rd8, %r25;
	mov.s32 	%r44, %r37;
$Lt_23_25090:
 //<loop> Loop body line 198, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_25602;
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	.loc	18	115	0
	mul.lo.u64 	%rd21, %rd20, 8;
	add.u64 	%rd22, %rd2, %rd21;
	ld.global.v4.u16 	{%r45,%r46,%r47,%r48}, [%rd22+0];
	.loc	20	208	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f20, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f21, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f22, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r48;
	cvt.ftz.f32.f16	%f23, %b1; }
	bra.uni 	$Lt_23_25346;
$Lt_23_25602:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	mul.lo.u64 	%rd23, %rd20, 16;
	add.u64 	%rd24, %rd2, %rd23;
	ld.global.v4.f32 	{%f20,%f21,%f22,%f23}, [%rd24+0];
$Lt_23_25346:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	@!%p3 bra 	$Lt_23_26114;
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	.loc	18	115	0
	mul.lo.u64 	%rd25, %rd1, 8;
	add.u64 	%rd26, %rd2, %rd25;
	ld.global.v4.u16 	{%r49,%r50,%r51,%r52}, [%rd26+0];
	.loc	20	214	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r49;
	cvt.ftz.f32.f16	%f24, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r50;
	cvt.ftz.f32.f16	%f25, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r51;
	cvt.ftz.f32.f16	%f26, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r52;
	cvt.ftz.f32.f16	%f27, %b1; }
	bra.uni 	$Lt_23_25858;
$Lt_23_26114:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	mul.lo.u64 	%rd27, %rd1, 16;
	add.u64 	%rd28, %rd2, %rd27;
	ld.global.v4.f32 	{%f24,%f25,%f26,%f27}, [%rd28+0];
$Lt_23_25858:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	add.ftz.f32 	%f28, %f20, %f6;
	sub.ftz.f32 	%f6, %f28, %f24;
	add.ftz.f32 	%f29, %f21, %f7;
	sub.ftz.f32 	%f7, %f29, %f25;
	add.ftz.f32 	%f30, %f22, %f8;
	sub.ftz.f32 	%f8, %f30, %f26;
	add.ftz.f32 	%f31, %f23, %f9;
	sub.ftz.f32 	%f9, %f31, %f27;
	.loc	20	220	0
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	@!%p3 bra 	$Lt_23_26626;
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	.loc	18	126	0
	mul.lo.u64 	%rd29, %rd18, 8;
	add.u64 	%rd30, %rd13, %rd29;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r53, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r54, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r55, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r56, %b1; }
	st.global.v4.u16 	[%rd30+0], {%r53,%r54,%r55,%r56};
	.loc	20	221	0
	bra.uni 	$Lt_23_26370;
$Lt_23_26626:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	.loc	18	126	0
	mul.lo.u64 	%rd31, %rd18, 16;
	add.u64 	%rd32, %rd13, %rd31;
	st.global.v4.f32 	[%rd32+0], {%f16,%f17,%f18,%f19};
$Lt_23_26370:
 //<loop> Part of loop body line 198, head labeled $Lt_23_25090
	.loc	20	221	0
	add.s32 	%r39, %r39, 1;
	add.s64 	%rd20, %rd20, %rd8;
	add.s64 	%rd18, %rd18, %rd19;
	setp.ne.s32 	%p6, %r39, %r40;
	@%p6 bra 	$Lt_23_25090;
$Lt_23_24578:
	.loc	20	229	0
	mov.s32 	%r57, %r9;
	bra.uni 	$Lt_23_21506;
$Lt_23_21762:
	.loc	20	233	0
	mov.s32 	%r14, %r11;
	.loc	20	234	0
	sub.s32 	%r58, %r12, %r10;
	sub.s32 	%r16, %r58, 1;
	mov.u32 	%r59, 0;
	setp.lt.s32 	%p7, %r10, %r59;
	@%p7 bra 	$Lt_23_37378;
	mov.s32 	%r60, %r11;
	ld.param.s32 	%r61, [__cudaparm_VerticalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r62, 0;
	setp.eq.s32 	%p3, %r61, %r62;
	cvt.rn.f32.s32 	%f14, %r11;
	cvt.s64.s32 	%rd33, %r5;
	mov.s64 	%rd34, %rd33;
	ld.param.s32 	%r63, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	cvt.s64.s32 	%rd8, %r63;
	mov.s64 	%rd18, %rd33;
	ld.param.s32 	%r64, [__cudaparm_VerticalBoxBlurKernel_inOutputPitch];
	cvt.s64.s32 	%rd19, %r64;
	rcp.approx.ftz.f32 	%f15, %f14;
	ld.param.u64 	%rd13, [__cudaparm_VerticalBoxBlurKernel_inOutput];
	ld.param.u64 	%rd2, [__cudaparm_VerticalBoxBlurKernel_inInput];
	mov.s32 	%r65, 0;
	mov.f32 	%f8, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f9, 0f00000000;     	// 0
	mov.s32 	%r66, %r60;
$Lt_23_27650:
 //<loop> Loop body line 234, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_28162;
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	.loc	18	115	0
	mul.lo.u64 	%rd35, %rd34, 8;
	add.u64 	%rd36, %rd2, %rd35;
	ld.global.v4.u16 	{%r67,%r68,%r69,%r70}, [%rd36+0];
	.loc	20	239	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r67;
	cvt.ftz.f32.f16	%f32, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r68;
	cvt.ftz.f32.f16	%f33, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r69;
	cvt.ftz.f32.f16	%f34, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r70;
	cvt.ftz.f32.f16	%f35, %b1; }
	bra.uni 	$Lt_23_27906;
$Lt_23_28162:
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	mul.lo.u64 	%rd37, %rd34, 16;
	add.u64 	%rd38, %rd2, %rd37;
	ld.global.v4.f32 	{%f32,%f33,%f34,%f35}, [%rd38+0];
$Lt_23_27906:
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	add.ftz.f32 	%f6, %f32, %f6;
	add.ftz.f32 	%f7, %f33, %f7;
	add.ftz.f32 	%f8, %f34, %f8;
	add.ftz.f32 	%f9, %f35, %f9;
	.loc	20	245	0
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	@!%p3 bra 	$Lt_23_28674;
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	.loc	18	126	0
	mul.lo.u64 	%rd39, %rd18, 8;
	add.u64 	%rd30, %rd13, %rd39;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r71, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r72, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r73, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r74, %b1; }
	st.global.v4.u16 	[%rd30+0], {%r71,%r72,%r73,%r74};
	.loc	20	246	0
	bra.uni 	$Lt_23_28418;
$Lt_23_28674:
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	.loc	18	126	0
	mul.lo.u64 	%rd40, %rd18, 16;
	add.u64 	%rd32, %rd13, %rd40;
	st.global.v4.f32 	[%rd32+0], {%f16,%f17,%f18,%f19};
$Lt_23_28418:
 //<loop> Part of loop body line 234, head labeled $Lt_23_27650
	.loc	20	246	0
	add.s32 	%r65, %r65, 1;
	add.s64 	%rd18, %rd18, %rd19;
	add.s64 	%rd34, %rd34, %rd8;
	setp.ne.s32 	%p8, %r11, %r65;
	@%p8 bra 	$Lt_23_27650;
	bra.uni 	$Lt_23_27138;
$Lt_23_37378:
	mov.f32 	%f8, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f9, 0f00000000;     	// 0
$Lt_23_27138:
	mov.s32 	%r57, 0;
$Lt_23_21506:
	.loc	20	257	0
	mov.s32 	%r75, %r14;
	setp.gt.s32 	%p9, %r75, %r16;
	@%p9 bra 	$Lt_23_37634;
	sub.s32 	%r76, %r16, %r75;
	add.s32 	%r77, %r76, 1;
	ld.param.s32 	%r78, [__cudaparm_VerticalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r79, 0;
	setp.eq.s32 	%p3, %r78, %r79;
	ld.param.s32 	%r38, [__cudaparm_VerticalBoxBlurKernel_inOutputPitch];
	add.s32 	%r80, %r16, 1;
	add.s32 	%r81, %r75, %r57;
	add.s32 	%r82, %r80, %r57;
	sub.s32 	%r83, %r75, %r10;
	ld.param.s32 	%r25, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	cvt.rn.f32.s32 	%f14, %r11;
	add.s32 	%r84, %r83, %r57;
	mul.lo.s32 	%r85, %r75, %r38;
	add.s32 	%r86, %r5, %r85;
	cvt.s64.s32 	%rd18, %r86;
	cvt.s64.s32 	%rd19, %r38;
	rcp.approx.ftz.f32 	%f15, %f14;
	sub.s32 	%r87, %r84, 1;
	mul.lo.s32 	%r88, %r81, %r25;
	add.s32 	%r89, %r5, %r88;
	cvt.s64.s32 	%rd41, %r89;
	cvt.s64.s32 	%rd8, %r25;
	mul.lo.s32 	%r90, %r87, %r25;
	add.s32 	%r91, %r5, %r90;
	cvt.s64.s32 	%rd42, %r91;
	ld.param.u64 	%rd13, [__cudaparm_VerticalBoxBlurKernel_inOutput];
	ld.param.u64 	%rd2, [__cudaparm_VerticalBoxBlurKernel_inInput];
	mov.s32 	%r92, %r77;
$Lt_23_29698:
 //<loop> Loop body line 257, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_30210;
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	.loc	18	115	0
	mul.lo.u64 	%rd43, %rd41, 8;
	add.u64 	%rd44, %rd2, %rd43;
	ld.global.v4.u16 	{%r93,%r94,%r95,%r96}, [%rd44+0];
	.loc	20	259	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r93;
	cvt.ftz.f32.f16	%f36, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r94;
	cvt.ftz.f32.f16	%f37, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r95;
	cvt.ftz.f32.f16	%f38, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r96;
	cvt.ftz.f32.f16	%f39, %b1; }
	bra.uni 	$Lt_23_29954;
$Lt_23_30210:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	mul.lo.u64 	%rd45, %rd41, 16;
	add.u64 	%rd46, %rd2, %rd45;
	ld.global.v4.f32 	{%f36,%f37,%f38,%f39}, [%rd46+0];
$Lt_23_29954:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	@!%p3 bra 	$Lt_23_30722;
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	.loc	18	115	0
	mul.lo.u64 	%rd47, %rd42, 8;
	add.u64 	%rd48, %rd2, %rd47;
	ld.global.v4.u16 	{%r97,%r98,%r99,%r100}, [%rd48+0];
	.loc	20	265	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r97;
	cvt.ftz.f32.f16	%f40, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r98;
	cvt.ftz.f32.f16	%f41, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r99;
	cvt.ftz.f32.f16	%f42, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r100;
	cvt.ftz.f32.f16	%f43, %b1; }
	bra.uni 	$Lt_23_30466;
$Lt_23_30722:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	mul.lo.u64 	%rd49, %rd42, 16;
	add.u64 	%rd50, %rd2, %rd49;
	ld.global.v4.f32 	{%f40,%f41,%f42,%f43}, [%rd50+0];
$Lt_23_30466:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	add.ftz.f32 	%f44, %f36, %f6;
	sub.ftz.f32 	%f6, %f44, %f40;
	add.ftz.f32 	%f45, %f37, %f7;
	sub.ftz.f32 	%f7, %f45, %f41;
	add.ftz.f32 	%f46, %f38, %f8;
	sub.ftz.f32 	%f8, %f46, %f42;
	add.ftz.f32 	%f47, %f39, %f9;
	sub.ftz.f32 	%f9, %f47, %f43;
	.loc	20	272	0
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	@!%p3 bra 	$Lt_23_31234;
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	.loc	18	126	0
	mul.lo.u64 	%rd51, %rd18, 8;
	add.u64 	%rd30, %rd13, %rd51;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r101, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r102, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r103, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r104, %b1; }
	st.global.v4.u16 	[%rd30+0], {%r101,%r102,%r103,%r104};
	.loc	20	273	0
	bra.uni 	$Lt_23_30978;
$Lt_23_31234:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	.loc	18	126	0
	mul.lo.u64 	%rd52, %rd18, 16;
	add.u64 	%rd32, %rd13, %rd52;
	st.global.v4.f32 	[%rd32+0], {%f16,%f17,%f18,%f19};
$Lt_23_30978:
 //<loop> Part of loop body line 257, head labeled $Lt_23_29698
	.loc	20	273	0
	add.s64 	%rd42, %rd42, %rd8;
	add.s32 	%r81, %r81, 1;
	add.s64 	%rd41, %rd41, %rd8;
	add.s64 	%rd18, %rd18, %rd19;
	setp.ne.s32 	%p10, %r81, %r82;
	@%p10 bra 	$Lt_23_29698;
	bra.uni 	$Lt_23_29186;
$Lt_23_37634:
	add.s32 	%r80, %r16, 1;
$Lt_23_29186:
	sub.s32 	%r105, %r12, 1;
	setp.ge.s32 	%p11, %r105, %r80;
	@!%p2 bra 	$Lt_23_32002;
	@!%p11 bra 	$LBB85_VerticalBoxBlurKernel;
	sub.s32 	%r106, %r12, %r16;
	sub.s32 	%r107, %r106, 1;
	ld.param.s32 	%r108, [__cudaparm_VerticalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r109, 0;
	setp.eq.s32 	%p3, %r108, %r109;
	ld.param.s32 	%r38, [__cudaparm_VerticalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r110, %r80, %r38;
	sub.s32 	%r111, %r80, %r9;
	sub.s32 	%r112, %r12, %r9;
	ld.param.s32 	%r25, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	sub.s32 	%r113, %r111, 1;
	sub.s32 	%r114, %r112, 1;
	add.s32 	%r115, %r110, %r5;
	cvt.rn.f32.s32 	%f14, %r11;
	cvt.s64.s32 	%rd18, %r115;
	cvt.s64.s32 	%rd19, %r38;
	rcp.approx.ftz.f32 	%f15, %f14;
	mul.lo.s32 	%r116, %r105, %r25;
	add.s32 	%r117, %r5, %r116;
	cvt.s64.s32 	%rd53, %r117;
	mul.lo.s32 	%r118, %r113, %r25;
	add.s32 	%r119, %r5, %r118;
	cvt.s64.s32 	%rd54, %r119;
	cvt.s64.s32 	%rd8, %r25;
	ld.param.u64 	%rd13, [__cudaparm_VerticalBoxBlurKernel_inOutput];
	ld.param.u64 	%rd2, [__cudaparm_VerticalBoxBlurKernel_inInput];
	mov.s32 	%r120, %r107;
$Lt_23_32770:
 //<loop> Loop body line 273, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_33282;
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	.loc	18	115	0
	mul.lo.u64 	%rd55, %rd53, 8;
	add.u64 	%rd56, %rd2, %rd55;
	ld.global.v4.u16 	{%r121,%r122,%r123,%r124}, [%rd56+0];
	.loc	20	286	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r121;
	cvt.ftz.f32.f16	%f48, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r122;
	cvt.ftz.f32.f16	%f49, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r123;
	cvt.ftz.f32.f16	%f50, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r124;
	cvt.ftz.f32.f16	%f51, %b1; }
	bra.uni 	$Lt_23_33026;
$Lt_23_33282:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	mul.lo.u64 	%rd57, %rd53, 16;
	add.u64 	%rd58, %rd2, %rd57;
	ld.global.v4.f32 	{%f48,%f49,%f50,%f51}, [%rd58+0];
$Lt_23_33026:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	@!%p3 bra 	$Lt_23_33794;
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	.loc	18	115	0
	mul.lo.u64 	%rd59, %rd54, 8;
	add.u64 	%rd60, %rd2, %rd59;
	ld.global.v4.u16 	{%r125,%r126,%r127,%r128}, [%rd60+0];
	.loc	20	292	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r125;
	cvt.ftz.f32.f16	%f52, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r126;
	cvt.ftz.f32.f16	%f53, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r127;
	cvt.ftz.f32.f16	%f54, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r128;
	cvt.ftz.f32.f16	%f55, %b1; }
	bra.uni 	$Lt_23_33538;
$Lt_23_33794:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	mul.lo.u64 	%rd61, %rd54, 16;
	add.u64 	%rd62, %rd2, %rd61;
	ld.global.v4.f32 	{%f52,%f53,%f54,%f55}, [%rd62+0];
$Lt_23_33538:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	add.ftz.f32 	%f56, %f48, %f6;
	sub.ftz.f32 	%f6, %f56, %f52;
	add.ftz.f32 	%f57, %f49, %f7;
	sub.ftz.f32 	%f7, %f57, %f53;
	add.ftz.f32 	%f58, %f50, %f8;
	sub.ftz.f32 	%f8, %f58, %f54;
	add.ftz.f32 	%f59, %f51, %f9;
	sub.ftz.f32 	%f9, %f59, %f55;
	.loc	20	298	0
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	@!%p3 bra 	$Lt_23_34306;
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	.loc	18	126	0
	mul.lo.u64 	%rd63, %rd18, 8;
	add.u64 	%rd30, %rd13, %rd63;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r129, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r130, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r131, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r132, %b1; }
	st.global.v4.u16 	[%rd30+0], {%r129,%r130,%r131,%r132};
	.loc	20	299	0
	bra.uni 	$Lt_23_34050;
$Lt_23_34306:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	.loc	18	126	0
	mul.lo.u64 	%rd64, %rd18, 16;
	add.u64 	%rd32, %rd13, %rd64;
	st.global.v4.f32 	[%rd32+0], {%f16,%f17,%f18,%f19};
$Lt_23_34050:
 //<loop> Part of loop body line 273, head labeled $Lt_23_32770
	.loc	20	299	0
	add.s32 	%r113, %r113, 1;
	add.s64 	%rd54, %rd54, %rd8;
	add.s64 	%rd18, %rd18, %rd19;
	setp.ne.s32 	%p12, %r113, %r114;
	@%p12 bra 	$Lt_23_32770;
	bra.uni 	$LBB85_VerticalBoxBlurKernel;
$Lt_23_32002:
	@!%p11 bra 	$LBB85_VerticalBoxBlurKernel;
	sub.s32 	%r133, %r12, %r16;
	sub.s32 	%r134, %r133, 1;
	ld.param.s32 	%r135, [__cudaparm_VerticalBoxBlurKernel_inDeviceFormat];
	mov.s32 	%r136, 0;
	setp.eq.s32 	%p3, %r135, %r136;
	ld.param.s32 	%r38, [__cudaparm_VerticalBoxBlurKernel_inOutputPitch];
	mul.lo.s32 	%r137, %r80, %r38;
	sub.s32 	%r138, %r80, %r10;
	sub.s32 	%r139, %r12, %r10;
	add.s32 	%r140, %r137, %r5;
	cvt.rn.f32.s32 	%f14, %r11;
	add.s32 	%r141, %r138, %r57;
	add.s32 	%r142, %r139, %r57;
	cvt.s64.s32 	%rd18, %r140;
	cvt.s64.s32 	%rd19, %r38;
	rcp.approx.ftz.f32 	%f15, %f14;
	sub.s32 	%r87, %r141, 1;
	sub.s32 	%r143, %r142, 1;
	ld.param.s32 	%r25, [__cudaparm_VerticalBoxBlurKernel_inInputPitch];
	mul.lo.s32 	%r144, %r87, %r25;
	add.s32 	%r145, %r5, %r144;
	cvt.s64.s32 	%rd42, %r145;
	cvt.s64.s32 	%rd8, %r25;
	ld.param.u64 	%rd13, [__cudaparm_VerticalBoxBlurKernel_inOutput];
	ld.param.u64 	%rd2, [__cudaparm_VerticalBoxBlurKernel_inInput];
	mov.s32 	%r146, %r134;
$Lt_23_35330:
 //<loop> Loop body line 299, nesting depth: 1, estimated iterations: unknown
	@!%p3 bra 	$Lt_23_35842;
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	.loc	18	115	0
	mul.lo.u64 	%rd65, %rd42, 8;
	add.u64 	%rd66, %rd2, %rd65;
	ld.global.v4.u16 	{%r147,%r148,%r149,%r150}, [%rd66+0];
	.loc	20	312	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r147;
	cvt.ftz.f32.f16	%f60, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r148;
	cvt.ftz.f32.f16	%f61, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r149;
	cvt.ftz.f32.f16	%f62, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r150;
	cvt.ftz.f32.f16	%f63, %b1; }
	bra.uni 	$Lt_23_35586;
$Lt_23_35842:
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	mul.lo.u64 	%rd67, %rd42, 16;
	add.u64 	%rd68, %rd2, %rd67;
	ld.global.v4.f32 	{%f60,%f61,%f62,%f63}, [%rd68+0];
$Lt_23_35586:
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	sub.ftz.f32 	%f6, %f6, %f60;
	sub.ftz.f32 	%f7, %f7, %f61;
	sub.ftz.f32 	%f8, %f8, %f62;
	sub.ftz.f32 	%f9, %f9, %f63;
	.loc	20	318	0
	mul.ftz.f32 	%f16, %f15, %f6;
	mul.ftz.f32 	%f17, %f15, %f7;
	mul.ftz.f32 	%f18, %f15, %f8;
	mul.ftz.f32 	%f19, %f15, %f9;
	@!%p3 bra 	$Lt_23_36354;
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	.loc	18	126	0
	mul.lo.u64 	%rd69, %rd18, 8;
	add.u64 	%rd30, %rd13, %rd69;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f16;
	mov.b32		%r151, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f17;
	mov.b32		%r152, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f18;
	mov.b32		%r153, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f19;
	mov.b32		%r154, %b1; }
	st.global.v4.u16 	[%rd30+0], {%r151,%r152,%r153,%r154};
	.loc	20	319	0
	bra.uni 	$Lt_23_36098;
$Lt_23_36354:
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	.loc	18	126	0
	mul.lo.u64 	%rd70, %rd18, 16;
	add.u64 	%rd32, %rd13, %rd70;
	st.global.v4.f32 	[%rd32+0], {%f16,%f17,%f18,%f19};
$Lt_23_36098:
 //<loop> Part of loop body line 299, head labeled $Lt_23_35330
	.loc	20	319	0
	add.s32 	%r87, %r87, 1;
	add.s64 	%rd42, %rd42, %rd8;
	add.s64 	%rd18, %rd18, %rd19;
	setp.ne.s32 	%p13, %r87, %r143;
	@%p13 bra 	$Lt_23_35330;
$LBB85_VerticalBoxBlurKernel:
	.loc	20	328	0
	exit;
$LDWend_VerticalBoxBlurKernel:
	} // VerticalBoxBlurKernel

