	.version 2.2
	.target sm_20
	// compiled with ../../../External/3rdParty/NVIDIA/CUDA/win/bin/../open64/lib//be.exe
	// nvopencc 3.2 built on 2010-11-04

	// Forward declarations (prototypes) of the device functions defined further
	// down in this module. The symbol names are Itanium-ABI mangled C++ names;
	// the demangled name and parameter list is noted above each prototype
	// (the mangled name does not encode the return type; see the .param list).

	// IntegerMultiply(int, int)
	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)

	// Standard2DKernelX()
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()

	// Standard2DKernelY()
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()

	// Half4ToFloat4(ushort4): 8-byte aggregate in, 16-byte aggregate out
	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])

	// Float4ToHalf4(float4): 16-byte aggregate in, 8-byte aggregate out
	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])

	// Mix3(unsigned int&, unsigned int&, unsigned int&): the references are passed as 64-bit addresses
	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)

	// Rand(unsigned int)
	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)

	// Rand2D(unsigned int, unsigned int, unsigned int)
	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)

	// Rand2D(unsigned int)
	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)

	// UnpremultiplyPixel(PixelRGB): 16-byte aggregate in and out
	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])

	// ToLinearColor(float)
	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)

	// FromLinearColor(float)
	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)

	// PremultiplyLinearizePixel(PixelRGB): 16-byte aggregate in and out
	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])

	// UnpremultiplyUnlinearizePixel(PixelRGB): 16-byte aggregate in and out
	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])

	// PremultiplyLinearize(float4): 16-byte aggregate in and out
	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])

	// UnpremultiplyUnlinearize(float4): 16-byte aggregate in and out
	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])

	//-----------------------------------------------------------
	// Compiling C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003fac_00000000-11_SingleChannelBlur.cpp3.i (C:/Users/dvaeng/AppData/Local/Temp/ccBI#.a15136)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	"C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00003fac_00000000-10_SingleChannelBlur.cudafe2.gpu"
	.file	2	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/PixelRGB.h"
	.file	3	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
	.file	4	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\crt/device_runtime.h"
	.file	5	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\host_defines.h"
	.file	6	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\builtin_types.h"
	.file	7	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_types.h"
	.file	8	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\driver_types.h"
	.file	9	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_types.h"
	.file	10	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_types.h"
	.file	11	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\vector_types.h"
	.file	12	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\builtin_types.h"
	.file	13	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\host_defines.h"
	.file	14	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\device_launch_parameters.h"
	.file	15	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\crt\storage_class.h"
	.file	16	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
	.file	17	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/Utils.h"
	.file	18	"c:/Mulder64/shared/adobe/MediaCore/Display/Src/CUDA/Effects/SingleChannelBlur.cu"
	.file	19	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\common_functions.h"
	.file	20	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions.h"
	.file	21	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_constants.h"
	.file	22	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_functions.h"
	.file	23	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_11_atomic_functions.h"
	.file	24	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_12_atomic_functions.h"
	.file	25	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_13_double_functions.h"
	.file	26	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_atomic_functions.h"
	.file	27	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_intrinsics.h"
	.file	28	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_functions.h"
	.file	29	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_fetch_functions.h"
	.file	30	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions_dbl_ptx3.h"
	.file	31	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/ColorSpaceConvert.h"


	// IntegerMultiply(int, int)
	// Returns the low 32 bits of the product of the two 32-bit parameters.
	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)
	{
	.reg .u32 %lhs, %rhs, %product;
	.loc	17	60	0
$LDWbegin__Z15IntegerMultiplyii:
	// Fetch both operands straight into their working registers.
	ld.param.u32 	%lhs, [__cudaparmf1__Z15IntegerMultiplyii];
	ld.param.u32 	%rhs, [__cudaparmf2__Z15IntegerMultiplyii];
	.loc	17	64	0
	// mul.lo keeps only the low half of the 64-bit product (wraps on overflow).
	mul.lo.s32 	%product, %lhs, %rhs;
	st.param.s32 	[__cudaretf__Z15IntegerMultiplyii], %product;
	ret;
$LDWend__Z15IntegerMultiplyii:
	} // _Z15IntegerMultiplyii

	// Standard2DKernelX()
	// Returns %ctaid.x * %ntid.x + %tid.x (32-bit wrap-around arithmetic).
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()
	{
	.reg .u32 %thread, %block, %block_dim, %index;
	.loc	17	73	0
$LDWbegin__Z17Standard2DKernelXv:
	.loc	17	74	0
	// Special registers are copied into ordinary registers before use.
	mov.u32 	%thread, %tid.x;
	mov.u32 	%block, %ctaid.x;
	mov.u32 	%block_dim, %ntid.x;
	// index = block * block_dim + thread, low 32 bits only.
	mad.lo.u32 	%index, %block, %block_dim, %thread;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelXv], %index;
	ret;
$LDWend__Z17Standard2DKernelXv:
	} // _Z17Standard2DKernelXv

	// Standard2DKernelY()
	// Returns %ctaid.y * %ntid.y + %tid.y (32-bit wrap-around arithmetic).
	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()
	{
	.reg .u32 %thread, %block, %block_dim, %index;
	.loc	17	77	0
$LDWbegin__Z17Standard2DKernelYv:
	.loc	17	78	0
	// Special registers are copied into ordinary registers before use.
	mov.u32 	%thread, %tid.y;
	mov.u32 	%block, %ctaid.y;
	mov.u32 	%block_dim, %ntid.y;
	// index = block * block_dim + thread, low 32 bits only.
	mad.lo.u32 	%index, %block, %block_dim, %thread;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelYv], %index;
	ret;
$LDWend__Z17Standard2DKernelYv:
	} // _Z17Standard2DKernelYv

	// Half4ToFloat4(ushort4)
	// Reads four 16-bit values at byte offsets 0/2/4/6 of the parameter,
	// interprets each as an f16 bit pattern, widens it to f32 (subnormals
	// flushed to zero by .ftz) and writes the results at byte offsets
	// 0/4/8/12 of the return aggregate, in the same component order.
	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])
	{
	.reg .b32 %half<4>;
	.reg .f32 %out<4>;
	.loc	17	86	0
$LDWbegin__Z13Half4ToFloat47ushort4:
	// 16-bit loads zero-extend into the 32-bit bit-typed registers.
	ld.param.u16 	%half0, [__cudaparmf1__Z13Half4ToFloat47ushort4+0];
	ld.param.u16 	%half1, [__cudaparmf1__Z13Half4ToFloat47ushort4+2];
	ld.param.u16 	%half2, [__cudaparmf1__Z13Half4ToFloat47ushort4+4];
	ld.param.u16 	%half3, [__cudaparmf1__Z13Half4ToFloat47ushort4+6];
	.loc	17	87	0
	// cvt consumes only the low 16 bits of each source register.
	cvt.ftz.f32.f16 	%out0, %half0;
	cvt.ftz.f32.f16 	%out1, %half1;
	cvt.ftz.f32.f16 	%out2, %half2;
	cvt.ftz.f32.f16 	%out3, %half3;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+0], %out0;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+4], %out1;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+8], %out2;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+12], %out3;
	ret;
$LDWend__Z13Half4ToFloat47ushort4:
	} // _Z13Half4ToFloat47ushort4

	// Float4ToHalf4(float4)
	// Reads four f32 values at byte offsets 0/4/8/12 of the parameter, converts
	// each to an f16 bit pattern (round-to-nearest-even, subnormal inputs
	// flushed by .ftz) and writes the 16-bit results at byte offsets 0/2/4/6
	// of the return aggregate, in the same component order.
	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])
	{
	.reg .f32 %in<4>;
	.reg .b32 %half<4>;
	.loc	17	95	0
$LDWbegin__Z13Float4ToHalf46float4:
	ld.param.f32 	%in0, [__cudaparmf1__Z13Float4ToHalf46float4+0];
	ld.param.f32 	%in1, [__cudaparmf1__Z13Float4ToHalf46float4+4];
	ld.param.f32 	%in2, [__cudaparmf1__Z13Float4ToHalf46float4+8];
	ld.param.f32 	%in3, [__cudaparmf1__Z13Float4ToHalf46float4+12];
	.loc	17	96	0
	// The f16 result occupies the low 16 bits of each 32-bit register.
	cvt.rn.ftz.f16.f32 	%half0, %in0;
	cvt.rn.ftz.f16.f32 	%half1, %in1;
	cvt.rn.ftz.f16.f32 	%half2, %in2;
	cvt.rn.ftz.f16.f32 	%half3, %in3;
	// 16-bit stores write only the low half of each source register.
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+0], %half0;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+2], %half1;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+4], %half2;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+6], %half3;
	ret;
$LDWend__Z13Float4ToHalf46float4:
	} // _Z13Float4ToHalf46float4

	// Mix3(unsigned int&, unsigned int&, unsigned int&)
	// In-place integer mixing of three 32-bit words reached through the three
	// 64-bit address parameters (called a, b, c in the comments below, for
	// parameters 1, 2, 3). Nine rounds are applied; each round has the shape
	//     x -= y;  x -= z;  x ^= (z shifted by a constant)
	// with shift schedule >>13, <<8, >>13, >>12, <<16, >>5, >>3, <<10, >>15.
	// The final value of c is also returned.
	// Every partial result is stored back and the operands are re-loaded before
	// the next use, so the exact load/store order below is what defines the
	// result if any of the three addresses refer to the same word.
	// The ld/st instructions carry no state-space qualifier.
	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)
	{
	.reg .u32 %r<75>;
	.reg .u64 %rd<8>;
	.loc	17	138	0
$LDWbegin__Z4Mix3RjS_S_:
	// %rd2 = &a, %rd4 = &b, %rd6 = &c
	ld.param.u64 	%rd1, [__cudaparmf1__Z4Mix3RjS_S_];
	mov.s64 	%rd2, %rd1;
	ld.param.u64 	%rd3, [__cudaparmf2__Z4Mix3RjS_S_];
	mov.s64 	%rd4, %rd3;
	ld.param.u64 	%rd5, [__cudaparmf3__Z4Mix3RjS_S_];
	mov.s64 	%rd6, %rd5;
	.loc	17	139	0
	// round 1: a -= b; a -= c; a ^= (c >> 13)
	ld.u32 	%r1, [%rd2+0];
	ld.u32 	%r2, [%rd4+0];
	sub.u32 	%r3, %r1, %r2;
	st.u32 	[%rd2+0], %r3;
	ld.u32 	%r4, [%rd6+0];
	sub.u32 	%r5, %r3, %r4;
	st.u32 	[%rd2+0], %r5;
	ld.u32 	%r6, [%rd6+0];
	shr.u32 	%r7, %r6, 13;
	xor.b32 	%r8, %r5, %r7;
	st.u32 	[%rd2+0], %r8;
	.loc	17	140	0
	// round 2: b -= c; b -= a; b ^= (a << 8)
	ld.u32 	%r9, [%rd4+0];
	ld.u32 	%r10, [%rd6+0];
	sub.u32 	%r11, %r9, %r10;
	st.u32 	[%rd4+0], %r11;
	ld.u32 	%r12, [%rd2+0];
	sub.u32 	%r13, %r11, %r12;
	st.u32 	[%rd4+0], %r13;
	ld.u32 	%r14, [%rd2+0];
	shl.b32 	%r15, %r14, 8;
	xor.b32 	%r16, %r13, %r15;
	st.u32 	[%rd4+0], %r16;
	.loc	17	141	0
	// round 3: c -= a; c -= b; c ^= (b >> 13)
	ld.u32 	%r17, [%rd6+0];
	ld.u32 	%r18, [%rd2+0];
	sub.u32 	%r19, %r17, %r18;
	st.u32 	[%rd6+0], %r19;
	ld.u32 	%r20, [%rd4+0];
	sub.u32 	%r21, %r19, %r20;
	st.u32 	[%rd6+0], %r21;
	ld.u32 	%r22, [%rd4+0];
	shr.u32 	%r23, %r22, 13;
	xor.b32 	%r24, %r21, %r23;
	st.u32 	[%rd6+0], %r24;
	.loc	17	142	0
	// round 4: a -= b; a -= c; a ^= (c >> 12)
	ld.u32 	%r25, [%rd2+0];
	ld.u32 	%r26, [%rd4+0];
	sub.u32 	%r27, %r25, %r26;
	st.u32 	[%rd2+0], %r27;
	ld.u32 	%r28, [%rd6+0];
	sub.u32 	%r29, %r27, %r28;
	st.u32 	[%rd2+0], %r29;
	ld.u32 	%r30, [%rd6+0];
	shr.u32 	%r31, %r30, 12;
	xor.b32 	%r32, %r29, %r31;
	st.u32 	[%rd2+0], %r32;
	.loc	17	143	0
	// round 5: b -= c; b -= a; b ^= (a << 16)
	ld.u32 	%r33, [%rd4+0];
	ld.u32 	%r34, [%rd6+0];
	sub.u32 	%r35, %r33, %r34;
	st.u32 	[%rd4+0], %r35;
	ld.u32 	%r36, [%rd2+0];
	sub.u32 	%r37, %r35, %r36;
	st.u32 	[%rd4+0], %r37;
	ld.u32 	%r38, [%rd2+0];
	shl.b32 	%r39, %r38, 16;
	xor.b32 	%r40, %r37, %r39;
	st.u32 	[%rd4+0], %r40;
	.loc	17	144	0
	// round 6: c -= a; c -= b; c ^= (b >> 5)
	ld.u32 	%r41, [%rd6+0];
	ld.u32 	%r42, [%rd2+0];
	sub.u32 	%r43, %r41, %r42;
	st.u32 	[%rd6+0], %r43;
	ld.u32 	%r44, [%rd4+0];
	sub.u32 	%r45, %r43, %r44;
	st.u32 	[%rd6+0], %r45;
	ld.u32 	%r46, [%rd4+0];
	shr.u32 	%r47, %r46, 5;
	xor.b32 	%r48, %r45, %r47;
	st.u32 	[%rd6+0], %r48;
	.loc	17	145	0
	// round 7: a -= b; a -= c; a ^= (c >> 3)
	ld.u32 	%r49, [%rd2+0];
	ld.u32 	%r50, [%rd4+0];
	sub.u32 	%r51, %r49, %r50;
	st.u32 	[%rd2+0], %r51;
	ld.u32 	%r52, [%rd6+0];
	sub.u32 	%r53, %r51, %r52;
	st.u32 	[%rd2+0], %r53;
	ld.u32 	%r54, [%rd6+0];
	shr.u32 	%r55, %r54, 3;
	xor.b32 	%r56, %r53, %r55;
	st.u32 	[%rd2+0], %r56;
	.loc	17	146	0
	// round 8: b -= c; b -= a; b ^= (a << 10)
	ld.u32 	%r57, [%rd4+0];
	ld.u32 	%r58, [%rd6+0];
	sub.u32 	%r59, %r57, %r58;
	st.u32 	[%rd4+0], %r59;
	ld.u32 	%r60, [%rd2+0];
	sub.u32 	%r61, %r59, %r60;
	st.u32 	[%rd4+0], %r61;
	ld.u32 	%r62, [%rd2+0];
	shl.b32 	%r63, %r62, 10;
	xor.b32 	%r64, %r61, %r63;
	st.u32 	[%rd4+0], %r64;
	.loc	17	147	0
	// round 9: c -= a; c -= b; c ^= (b >> 15)
	ld.u32 	%r65, [%rd6+0];
	ld.u32 	%r66, [%rd2+0];
	sub.u32 	%r67, %r65, %r66;
	st.u32 	[%rd6+0], %r67;
	ld.u32 	%r68, [%rd4+0];
	sub.u32 	%r69, %r67, %r68;
	st.u32 	[%rd6+0], %r69;
	ld.u32 	%r70, [%rd4+0];
	shr.u32 	%r71, %r70, 15;
	xor.b32 	%r72, %r69, %r71;
	st.u32 	[%rd6+0], %r72;
	.loc	17	148	0
	// return the value just stored to c (taken from the register, not re-loaded)
	mov.s32 	%r73, %r72;
	st.param.u32 	[__cudaretf__Z4Mix3RjS_S_], %r73;
	ret;
$LDWend__Z4Mix3RjS_S_:
	} // _Z4Mix3RjS_S_

	// Rand(unsigned int)
	// Builds a 15-bit value from two affine transforms of the seed
	// (all arithmetic wraps modulo 2^32):
	//     first  = seed * 1103515245 + 12345
	//     second = seed * -1029531031 - 740551042
	//              (equal to applying the first transform twice to the seed)
	//     result = (((first >> 16) & 255) << 7) ^ ((second >> 16) & 255)
	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)
	{
	.reg .u32 %seed, %first, %second, %upper, %lower, %result;
	.loc	17	152	0
$LDWbegin__Z4Randj:
	ld.param.u32 	%seed, [__cudaparmf1__Z4Randj];
	.loc	17	163	0
	// Upper contribution: bits 16..23 of the first transform, moved to bits 7..14.
	mad.lo.u32 	%first, %seed, 1103515245, 12345;
	shr.u32 	%upper, %first, 16;
	and.b32 	%upper, %upper, 255;
	shl.b32 	%upper, %upper, 7;
	// Lower contribution: bits 16..23 of the second transform, left at bits 0..7.
	mad.lo.u32 	%second, %seed, -1029531031, -740551042;
	shr.u32 	%lower, %second, 16;
	and.b32 	%lower, %lower, 255;
	// The two fields overlap in bit 7, hence xor rather than or.
	xor.b32 	%result, %upper, %lower;
	st.param.s32 	[__cudaretf__Z4Randj], %result;
	ret;
$LDWend__Z4Randj:
	} // _Z4Randj

	// Rand2D(unsigned int, unsigned int, unsigned int)
	// Applies, entirely in registers, the same nine mixing rounds that
	// _Z4Mix3RjS_S_ performs through memory (the .loc lines 139-147 below are
	// the ones used there), taking a, b, c from parameters 1, 2, 3. The final
	// c value is then fed through the same arithmetic as _Z4Randj:
	//     first  = c * 1103515245 + 12345
	//     second = c * -1029531031 - 740551042
	//     result = (((first >> 16) & 255) << 7) ^ ((second >> 16) & 255)
	// All integer arithmetic wraps modulo 2^32.
	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)
	{
	.reg .u32 %r<54>;
	.loc	17	169	0
$LDWbegin__Z6Rand2Djjj:
	// %r2 = a, %r4 = b, %r6 = c
	ld.param.u32 	%r1, [__cudaparmf1__Z6Rand2Djjj];
	mov.s32 	%r2, %r1;
	ld.param.u32 	%r3, [__cudaparmf2__Z6Rand2Djjj];
	mov.s32 	%r4, %r3;
	ld.param.u32 	%r5, [__cudaparmf3__Z6Rand2Djjj];
	mov.s32 	%r6, %r5;
	.loc	17	139	0
	// round 1: a = (a - b - c) ^ (c >> 13)      -> %r10
	sub.u32 	%r7, %r2, %r4;
	sub.u32 	%r8, %r7, %r6;
	shr.u32 	%r9, %r6, 13;
	xor.b32 	%r10, %r8, %r9;
	.loc	17	140	0
	// round 2: b = (b - c - a) ^ (a << 8)       -> %r14
	sub.u32 	%r11, %r4, %r6;
	sub.u32 	%r12, %r11, %r10;
	shl.b32 	%r13, %r10, 8;
	xor.b32 	%r14, %r12, %r13;
	.loc	17	141	0
	// round 3: c = (c - a - b) ^ (b >> 13)      -> %r18
	sub.u32 	%r15, %r6, %r10;
	sub.u32 	%r16, %r15, %r14;
	shr.u32 	%r17, %r14, 13;
	xor.b32 	%r18, %r16, %r17;
	.loc	17	142	0
	// round 4: a = (a - b - c) ^ (c >> 12)      -> %r22
	sub.u32 	%r19, %r10, %r14;
	sub.u32 	%r20, %r19, %r18;
	shr.u32 	%r21, %r18, 12;
	xor.b32 	%r22, %r20, %r21;
	.loc	17	143	0
	// round 5: b = (b - c - a) ^ (a << 16)      -> %r26
	sub.u32 	%r23, %r14, %r18;
	sub.u32 	%r24, %r23, %r22;
	shl.b32 	%r25, %r22, 16;
	xor.b32 	%r26, %r24, %r25;
	.loc	17	144	0
	// round 6: c = (c - a - b) ^ (b >> 5)       -> %r30
	sub.u32 	%r27, %r18, %r22;
	sub.u32 	%r28, %r27, %r26;
	shr.u32 	%r29, %r26, 5;
	xor.b32 	%r30, %r28, %r29;
	.loc	17	145	0
	// round 7: a = (a - b - c) ^ (c >> 3)       -> %r34
	sub.u32 	%r31, %r22, %r26;
	sub.u32 	%r32, %r31, %r30;
	shr.u32 	%r33, %r30, 3;
	xor.b32 	%r34, %r32, %r33;
	.loc	17	146	0
	// round 8: b = (b - c - a) ^ (a << 10)      -> %r38
	sub.u32 	%r35, %r26, %r30;
	sub.u32 	%r36, %r35, %r34;
	shl.b32 	%r37, %r34, 10;
	xor.b32 	%r38, %r36, %r37;
	.loc	17	147	0
	// round 9: c = (c - a - b) ^ (b >> 15)      -> %r42 (the mixed word)
	sub.u32 	%r39, %r30, %r34;
	sub.u32 	%r40, %r39, %r38;
	shr.u32 	%r41, %r38, 15;
	xor.b32 	%r42, %r40, %r41;
	.loc	17	170	0
	// Same output stage as _Z4Randj, applied to the mixed word %r42.
	mul.lo.u32 	%r43, %r42, 1103515245;
	add.u32 	%r44, %r43, 12345;
	shr.u32 	%r45, %r44, 16;
	and.b32 	%r46, %r45, 255;
	shl.b32 	%r47, %r46, 7;
	mul.lo.u32 	%r48, %r42, -1029531031;
	sub.u32 	%r49, %r48, 740551042;
	shr.u32 	%r50, %r49, 16;
	and.b32 	%r51, %r50, 255;
	xor.b32 	%r52, %r47, %r51;
	st.param.s32 	[__cudaretf__Z6Rand2Djjj], %r52;
	ret;
$LDWend__Z6Rand2Djjj:
	} // _Z6Rand2Djjj

	// Rand2D(unsigned int)
	// Single-argument overload. Computes
	//     x = %ctaid.x * %ntid.x + %tid.x
	//     y = %ctaid.y * %ntid.y + %tid.y
	// (the same expressions as _Z17Standard2DKernelXv / _Z17Standard2DKernelYv),
	// then runs the nine register-only mixing rounds of the three-argument
	// overload with a = parameter, b = x, c = y, and finally the same output
	// stage as _Z4Randj on the mixed word.
	// The first five rounds are interleaved by the compiler's scheduler and all
	// carry the single .loc 17 143 annotation.
	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)
	{
	.reg .u32 %r<60>;
	.loc	17	175	0
$LDWbegin__Z6Rand2Dj:
	// %r2 = a (the caller-supplied word)
	ld.param.u32 	%r1, [__cudaparmf1__Z6Rand2Dj];
	mov.s32 	%r2, %r1;
	.loc	17	143	0
	// %r10 = y = ctaid.y * ntid.y + tid.y ; %r12 = x = ctaid.x * ntid.x + tid.x
	cvt.s32.u32 	%r3, %ctaid.y;
	cvt.s32.u32 	%r4, %ntid.y;
	mul.lo.s32 	%r5, %r3, %r4;
	cvt.s32.u32 	%r6, %ctaid.x;
	cvt.s32.u32 	%r7, %ntid.x;
	mul.lo.s32 	%r8, %r6, %r7;
	mov.u32 	%r9, %tid.y;
	add.u32 	%r10, %r5, %r9;
	mov.u32 	%r11, %tid.x;
	add.u32 	%r12, %r8, %r11;
	// round 1: a = (a - x - y) ^ (y >> 13)      -> %r17
	// (%r15 = x - y is the first half of round 2, hoisted here)
	shr.u32 	%r13, %r10, 13;
	sub.u32 	%r14, %r2, %r12;
	sub.u32 	%r15, %r12, %r10;
	sub.u32 	%r16, %r14, %r10;
	xor.b32 	%r17, %r13, %r16;
	// round 2: b = (x - y - a) ^ (a << 8)       -> %r21
	// (%r20 = y - a is the first half of round 3)
	shl.b32 	%r18, %r17, 8;
	sub.u32 	%r19, %r15, %r17;
	sub.u32 	%r20, %r10, %r17;
	xor.b32 	%r21, %r18, %r19;
	// round 3: c = (y - a - b) ^ (b >> 13)      -> %r25
	// (%r24 = a - b is the first half of round 4)
	shr.u32 	%r22, %r21, 13;
	sub.u32 	%r23, %r20, %r21;
	sub.u32 	%r24, %r17, %r21;
	xor.b32 	%r25, %r22, %r23;
	// round 4: a = (a - b - c) ^ (c >> 12)      -> %r28
	shr.u32 	%r26, %r25, 12;
	sub.u32 	%r27, %r24, %r25;
	xor.b32 	%r28, %r26, %r27;
	// round 5: b = (b - c - a) ^ (a << 16)      -> %r32
	sub.u32 	%r29, %r21, %r25;
	sub.u32 	%r30, %r29, %r28;
	shl.b32 	%r31, %r28, 16;
	xor.b32 	%r32, %r30, %r31;
	.loc	17	144	0
	// round 6: c = (c - a - b) ^ (b >> 5)       -> %r36
	sub.u32 	%r33, %r25, %r28;
	sub.u32 	%r34, %r33, %r32;
	shr.u32 	%r35, %r32, 5;
	xor.b32 	%r36, %r34, %r35;
	.loc	17	145	0
	// round 7: a = (a - b - c) ^ (c >> 3)       -> %r40
	sub.u32 	%r37, %r28, %r32;
	sub.u32 	%r38, %r37, %r36;
	shr.u32 	%r39, %r36, 3;
	xor.b32 	%r40, %r38, %r39;
	.loc	17	146	0
	// round 8: b = (b - c - a) ^ (a << 10)      -> %r44
	sub.u32 	%r41, %r32, %r36;
	sub.u32 	%r42, %r41, %r40;
	shl.b32 	%r43, %r40, 10;
	xor.b32 	%r44, %r42, %r43;
	.loc	17	147	0
	// round 9: c = (c - a - b) ^ (b >> 15)      -> %r48 (the mixed word)
	sub.u32 	%r45, %r36, %r40;
	sub.u32 	%r46, %r45, %r44;
	shr.u32 	%r47, %r44, 15;
	xor.b32 	%r48, %r46, %r47;
	.loc	17	176	0
	// Same output stage as _Z4Randj, applied to the mixed word %r48.
	mul.lo.u32 	%r49, %r48, 1103515245;
	add.u32 	%r50, %r49, 12345;
	shr.u32 	%r51, %r50, 16;
	and.b32 	%r52, %r51, 255;
	shl.b32 	%r53, %r52, 7;
	mul.lo.u32 	%r54, %r48, -1029531031;
	sub.u32 	%r55, %r54, 740551042;
	shr.u32 	%r56, %r55, 16;
	and.b32 	%r57, %r56, 255;
	xor.b32 	%r58, %r53, %r57;
	st.param.s32 	[__cudaretf__Z6Rand2Dj], %r58;
	ret;
$LDWend__Z6Rand2Dj:
	} // _Z6Rand2Dj

	// UnpremultiplyPixel(PixelRGB)
	// The parameter and the result are 16-byte aggregates of four f32
	// components at byte offsets 0/4/8/12 (component 3 is presumably alpha --
	// confirm against PixelRGB.h).
	//   divisor = component 3 clamped to [0, 1]
	//   if (divisor - 8e-6 <= 0)  result = {0, 0, 0, 0}
	//   else                      result = {c0/divisor, c1/divisor, c2/divisor, divisor}
	// The division is an approximate reciprocal followed by a multiply.
	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])
	{
	.reg .f32 %in<4>;
	.reg .f32 %out<4>;
	.reg .f32 %divisor, %biased, %scale;
	.reg .pred %too_small;
	.loc	2	206	0
$LDWbegin__Z18UnpremultiplyPixel8PixelRGB:
	ld.param.f32 	%in0, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+0];
	ld.param.f32 	%in1, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+4];
	ld.param.f32 	%in2, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+8];
	ld.param.f32 	%in3, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+12];
	.loc	2	208	0
	cvt.ftz.sat.f32.f32 	%divisor, %in3;
	add.ftz.f32 	%biased, %divisor, 0fb70637bd;    	// -8e-006
	setp.le.ftz.f32 	%too_small, %biased, 0f00000000;
	.loc	2	219	0
	// Start from the all-zero result; it is kept when the divisor is too small.
	mov.f32 	%out0, 0f00000000;
	mov.f32 	%out1, 0f00000000;
	mov.f32 	%out2, 0f00000000;
	mov.f32 	%out3, 0f00000000;
	@%too_small bra 	$Lt_9_store;
	.loc	2	213	0
	rcp.approx.ftz.f32 	%scale, %divisor;
	mul.ftz.f32 	%out2, %scale, %in2;
	.loc	2	214	0
	mul.ftz.f32 	%out1, %scale, %in1;
	.loc	2	215	0
	mul.ftz.f32 	%out0, %scale, %in0;
	// Component 3 of the result is the clamped divisor itself.
	mov.f32 	%out3, %divisor;
$Lt_9_store:
	.loc	2	224	0
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+0], %out0;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+4], %out1;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+8], %out2;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+12], %out3;
	ret;
$LDWend__Z18UnpremultiplyPixel8PixelRGB:
	} // _Z18UnpremultiplyPixel8PixelRGB

	// ToLinearColor(float)
	// Sign-preserving power function with exponent 2.2:
	//     v <  0 :  -(2 ^ (2.2 * log2(-v)))
	//     else   :    2 ^ (2.2 * log2(v))
	// log2 / exp2 are the approximate hardware forms; subnormals are flushed (.ftz).
	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)
	{
	.reg .f32 %value, %magnitude, %log2v, %scaled, %result;
	.reg .pred %is_negative;
	.loc	2	231	0
$LDWbegin__Z13ToLinearColorf:
	ld.param.f32 	%value, [__cudaparmf1__Z13ToLinearColorf];
	setp.lt.ftz.f32 	%is_negative, %value, 0f00000000;
	// Work on -v for negative inputs and on v itself otherwise.
	mov.f32 	%magnitude, %value;
	.loc	2	234	0
	@%is_negative neg.ftz.f32 	%magnitude, %value;
	.loc	2	236	0
	lg2.approx.ftz.f32 	%log2v, %magnitude;
	mul.ftz.f32 	%scaled, %log2v, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%result, %scaled;
	.loc	2	234	0
	// Put the sign back for inputs that were negative.
	@%is_negative neg.ftz.f32 	%result, %result;
	st.param.f32 	[__cudaretf__Z13ToLinearColorf], %result;
	ret;
$LDWend__Z13ToLinearColorf:
	} // _Z13ToLinearColorf

	// FromLinearColor(float)
	// Sign-preserving power function with exponent 0.454545 (0f3ee8ba2e):
	//     v <  0 :  -(2 ^ (0.454545 * log2(-v)))
	//     else   :    2 ^ (0.454545 * log2(v))
	// log2 / exp2 are the approximate hardware forms; subnormals are flushed (.ftz).
	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)
	{
	.reg .f32 %value, %magnitude, %log2v, %scaled, %result;
	.reg .pred %is_negative;
	.loc	2	239	0
$LDWbegin__Z15FromLinearColorf:
	ld.param.f32 	%value, [__cudaparmf1__Z15FromLinearColorf];
	setp.lt.ftz.f32 	%is_negative, %value, 0f00000000;
	// Work on -v for negative inputs and on v itself otherwise.
	mov.f32 	%magnitude, %value;
	.loc	2	242	0
	@%is_negative neg.ftz.f32 	%magnitude, %value;
	.loc	2	244	0
	lg2.approx.ftz.f32 	%log2v, %magnitude;
	mul.ftz.f32 	%scaled, %log2v, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%result, %scaled;
	.loc	2	242	0
	// Put the sign back for inputs that were negative.
	@%is_negative neg.ftz.f32 	%result, %result;
	st.param.f32 	[__cudaretf__Z15FromLinearColorf], %result;
	ret;
$LDWend__Z15FromLinearColorf:
	} // _Z15FromLinearColorf

	// PremultiplyLinearizePixel(PixelRGB)
	// Input and result are 16-byte aggregates of four f32 components at byte
	// offsets 0/4/8/12. Component 3 is clamped to [0, 1] (%f9). Each of
	// components 0, 1, 2 goes through an inlined copy of the sign-preserving
	// power curve of _Z13ToLinearColorf (exponent 2.2, built from
	// lg2.approx / ex2.approx) and is then multiplied by %f9. The result's
	// component 3 is %f9 itself.
	// Each of the three power blocks writes its merge register on both branch
	// arms (%f16, %f26, %f36) before reaching the join label.
	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])
	{
	.reg .f32 %f<47>;
	.reg .pred %p<5>;
	.loc	2	252	0
$LDWbegin__Z25PremultiplyLinearizePixel8PixelRGB:
	// %f2, %f4, %f6, %f8 = components 0..3 of the input
	ld.param.f32 	%f1, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+12];
	mov.f32 	%f8, %f7;
	.loc	2	254	0
	// %f9 = component 3 saturated to [0, 1]
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	.loc	2	255	0
	// ---- component 0 -> %f16 ----
	mov.f32 	%f10, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p1, %f2, %f10;
	@!%p1 bra 	$Lt_12_4098;
	.loc	2	234	0
	// negative input: -(2 ^ (2.2 * log2(-c0)))
	neg.ftz.f32 	%f11, %f2;
	lg2.approx.ftz.f32 	%f12, %f11;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f15, %f14;
	neg.ftz.f32 	%f16, %f15;
	bra.uni 	$LDWendi___log2f_189_5;
$Lt_12_4098:
	.loc	2	236	0
	// non-negative input: 2 ^ (2.2 * log2(c0))
	lg2.approx.ftz.f32 	%f17, %f2;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f16, %f19;
$LDWendi___log2f_189_5:
	.loc	2	256	0
	// ---- component 1 -> %f26 ----
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f4, %f20;
	@!%p2 bra 	$Lt_12_4610;
	.loc	2	234	0
	neg.ftz.f32 	%f21, %f4;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_189_3;
$Lt_12_4610:
	.loc	2	236	0
	lg2.approx.ftz.f32 	%f27, %f4;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_189_3:
	.loc	2	257	0
	// ---- component 2 -> %f36 ----
	mov.f32 	%f30, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f6, %f30;
	@!%p3 bra 	$Lt_12_5122;
	.loc	2	234	0
	neg.ftz.f32 	%f31, %f6;
	lg2.approx.ftz.f32 	%f32, %f31;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f35, %f34;
	neg.ftz.f32 	%f36, %f35;
	bra.uni 	$LDWendi___log2f_189_1;
$Lt_12_5122:
	.loc	2	236	0
	lg2.approx.ftz.f32 	%f37, %f6;
	mov.f32 	%f38, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f36, %f39;
$LDWendi___log2f_189_1:
	.loc	2	259	0
	// Scale the three converted components by %f9 and write the result:
	// offsets 0/4/8 <- components 0/1/2, offset 12 <- %f9.
	mul.ftz.f32 	%f40, %f36, %f9;
	mul.ftz.f32 	%f41, %f26, %f9;
	mul.ftz.f32 	%f42, %f16, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+0], %f42;
	mov.f32 	%f43, %f41;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+4], %f43;
	mov.f32 	%f44, %f40;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+8], %f44;
	mov.f32 	%f45, %f9;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+12], %f45;
	ret;
$LDWend__Z25PremultiplyLinearizePixel8PixelRGB:
	} // _Z25PremultiplyLinearizePixel8PixelRGB

	// UnpremultiplyUnlinearizePixel(PixelRGB)
	// Input and result are 16-byte aggregates of four f32 components at byte
	// offsets 0/4/8/12.
	// Stage 1 (inlined copy of _Z18UnpremultiplyPixel8PixelRGB): component 3 is
	//   clamped to [0, 1]; if (clamped - 8e-6) <= 0 all four working values
	//   become 0, otherwise components 0..2 are multiplied by the approximate
	//   reciprocal of the clamped value.
	// Stage 2 (three inlined copies of _Z15FromLinearColorf): each of the three
	//   working components goes through the sign-preserving power curve with
	//   exponent 0.454545.
	// The result's component 3 is %f10 (the clamped value, or 0 on the zero path).
	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])
	{
	.reg .f32 %f<53>;
	.reg .pred %p<6>;
	.loc	2	263	0
$LDWbegin__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	// %f2, %f4, %f6, %f8 = components 0..3 of the input
	ld.param.f32 	%f1, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12];
	mov.f32 	%f8, %f7;
	.loc	2	208	0
	// %f9 = component 3 saturated to [0, 1]; take the zero path when
	// %f9 - 8e-6 <= 0.
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	mov.f32 	%f10, %f9;
	mov.f32 	%f11, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%f12, %f9, %f11;
	mov.f32 	%f13, 0f00000000;    	// 0
	setp.le.ftz.f32 	%p1, %f12, %f13;
	@%p1 bra 	$Lt_13_5122;
	.loc	2	213	0
	// %f14 = approximate 1 / %f9; scale components 2, 1, 0 into %f15, %f16, %f17
	rcp.approx.ftz.f32 	%f14, %f9;
	mul.ftz.f32 	%f15, %f14, %f6;
	.loc	2	214	0
	mul.ftz.f32 	%f16, %f14, %f4;
	.loc	2	215	0
	mul.ftz.f32 	%f17, %f14, %f2;
	bra.uni 	$Lt_13_4866;
$Lt_13_5122:
	.loc	2	219	0
	// zero path: all four working values are 0
	mov.f32 	%f15, 0f00000000;    	// 0
	mov.f32 	%f16, 0f00000000;    	// 0
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f10, 0f00000000;    	// 0
$Lt_13_4866:
	.loc	2	266	0
	// ---- component 0 (%f17) -> %f24 ----
	mov.f32 	%f18, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f17, %f18;
	@!%p2 bra 	$Lt_13_5378;
	.loc	2	242	0
	// negative value: -(2 ^ (0.454545 * log2(-v)))
	neg.ftz.f32 	%f19, %f17;
	lg2.approx.ftz.f32 	%f20, %f19;
	mov.f32 	%f21, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f22, %f20, %f21;
	ex2.approx.ftz.f32 	%f23, %f22;
	neg.ftz.f32 	%f24, %f23;
	bra.uni 	$LDWendi___log2f_190_5;
$Lt_13_5378:
	.loc	2	244	0
	// non-negative value: 2 ^ (0.454545 * log2(v))
	lg2.approx.ftz.f32 	%f25, %f17;
	mov.f32 	%f26, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f27, %f25, %f26;
	ex2.approx.ftz.f32 	%f24, %f27;
$LDWendi___log2f_190_5:
	.loc	2	267	0
	// ---- component 1 (%f16) -> %f34 ----
	mov.f32 	%f28, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f16, %f28;
	@!%p3 bra 	$Lt_13_5890;
	.loc	2	242	0
	neg.ftz.f32 	%f29, %f16;
	lg2.approx.ftz.f32 	%f30, %f29;
	mov.f32 	%f31, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f32, %f30, %f31;
	ex2.approx.ftz.f32 	%f33, %f32;
	neg.ftz.f32 	%f34, %f33;
	bra.uni 	$LDWendi___log2f_190_3;
$Lt_13_5890:
	.loc	2	244	0
	lg2.approx.ftz.f32 	%f35, %f16;
	mov.f32 	%f36, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f37, %f35, %f36;
	ex2.approx.ftz.f32 	%f34, %f37;
$LDWendi___log2f_190_3:
	.loc	2	268	0
	// ---- component 2 (%f15) -> %f44 ----
	mov.f32 	%f38, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p4, %f15, %f38;
	@!%p4 bra 	$Lt_13_6402;
	.loc	2	242	0
	neg.ftz.f32 	%f39, %f15;
	lg2.approx.ftz.f32 	%f40, %f39;
	mov.f32 	%f41, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f42, %f40, %f41;
	ex2.approx.ftz.f32 	%f43, %f42;
	neg.ftz.f32 	%f44, %f43;
	bra.uni 	$LDWendi___log2f_190_1;
$Lt_13_6402:
	.loc	2	244	0
	lg2.approx.ftz.f32 	%f45, %f15;
	mov.f32 	%f46, 0f3ee8ba2e;    	// 0.454545
	mul.ftz.f32 	%f47, %f45, %f46;
	ex2.approx.ftz.f32 	%f44, %f47;
$LDWendi___log2f_190_1:
	.loc	2	269	0
	// Write the result: offsets 0/4/8 <- converted components 0/1/2,
	// offset 12 <- %f10.
	mov.f32 	%f48, %f24;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0], %f48;
	mov.f32 	%f49, %f34;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4], %f49;
	mov.f32 	%f50, %f44;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8], %f50;
	mov.f32 	%f51, %f10;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12], %f51;
	ret;
$LDWend__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	} // _Z29UnpremultiplyUnlinearizePixel8PixelRGB

	// PremultiplyLinearize(float4)
	// float4 overload; the instruction sequence is the same as that of
	// _Z25PremultiplyLinearizePixel8PixelRGB apart from label names and .loc
	// annotations. Component 3 (byte offset 12) is clamped to [0, 1] (%f9);
	// each of components 0, 1, 2 goes through the sign-preserving power curve
	// with exponent 2.2 (lg2.approx / ex2.approx) and is multiplied by %f9.
	// The result's component 3 is %f9 itself.
	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])
	{
	.reg .f32 %f<47>;
	.reg .pred %p<5>;
	.loc	2	277	0
$LDWbegin__Z20PremultiplyLinearize6float4:
	// %f2, %f4, %f6, %f8 = components 0..3 of the input
	ld.param.f32 	%f1, [__cudaparmf1__Z20PremultiplyLinearize6float4+0];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf1__Z20PremultiplyLinearize6float4+4];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf1__Z20PremultiplyLinearize6float4+8];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf1__Z20PremultiplyLinearize6float4+12];
	mov.f32 	%f8, %f7;
	.loc	2	254	0
	// %f9 = component 3 saturated to [0, 1]
	cvt.ftz.sat.f32.f32 	%f9, %f8;
	.loc	2	255	0
	// ---- component 0 -> %f16 ----
	mov.f32 	%f10, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p1, %f2, %f10;
	@!%p1 bra 	$Lt_14_4098;
	.loc	2	234	0
	// negative input: -(2 ^ (2.2 * log2(-c0)))
	neg.ftz.f32 	%f11, %f2;
	lg2.approx.ftz.f32 	%f12, %f11;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f15, %f14;
	neg.ftz.f32 	%f16, %f15;
	bra.uni 	$LDWendi___log2f_191_5;
$Lt_14_4098:
	.loc	2	236	0
	// non-negative input: 2 ^ (2.2 * log2(c0))
	lg2.approx.ftz.f32 	%f17, %f2;
	mov.f32 	%f18, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f19, %f17, %f18;
	ex2.approx.ftz.f32 	%f16, %f19;
$LDWendi___log2f_191_5:
	.loc	2	256	0
	// ---- component 1 -> %f26 ----
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p2, %f4, %f20;
	@!%p2 bra 	$Lt_14_4610;
	.loc	2	234	0
	neg.ftz.f32 	%f21, %f4;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_191_3;
$Lt_14_4610:
	.loc	2	236	0
	lg2.approx.ftz.f32 	%f27, %f4;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_191_3:
	.loc	2	257	0
	// ---- component 2 -> %f36 ----
	mov.f32 	%f30, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p3, %f6, %f30;
	@!%p3 bra 	$Lt_14_5122;
	.loc	2	234	0
	neg.ftz.f32 	%f31, %f6;
	lg2.approx.ftz.f32 	%f32, %f31;
	mov.f32 	%f33, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f34, %f32, %f33;
	ex2.approx.ftz.f32 	%f35, %f34;
	neg.ftz.f32 	%f36, %f35;
	bra.uni 	$LDWendi___log2f_191_1;
$Lt_14_5122:
	.loc	2	236	0
	lg2.approx.ftz.f32 	%f37, %f6;
	mov.f32 	%f38, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f39, %f37, %f38;
	ex2.approx.ftz.f32 	%f36, %f39;
$LDWendi___log2f_191_1:
	.loc	2	259	0
	// Scale the three converted components by %f9 and write the result:
	// offsets 0/4/8 <- components 0/1/2, offset 12 <- %f9.
	mul.ftz.f32 	%f40, %f36, %f9;
	mul.ftz.f32 	%f41, %f26, %f9;
	.loc	2	278	0
	mul.ftz.f32 	%f42, %f16, %f9;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+0], %f42;
	mov.f32 	%f43, %f41;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+4], %f43;
	mov.f32 	%f44, %f40;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+8], %f44;
	mov.f32 	%f45, %f9;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+12], %f45;
	ret;
$LDWend__Z20PremultiplyLinearize6float4:
	} // _Z20PremultiplyLinearize6float4

	// _Z24UnpremultiplyUnlinearize6float4
	//
	// Input : one 16-byte parameter holding four f32 values at byte offsets
	//         0, 4, 8 and 12.  The value at +12 is the divisor ("alpha" below).
	// Output: one 16-byte return parameter with the same layout.
	//
	// Steps:
	//   1. alpha = saturate(in[+12]) to [0, 1].
	//   2. If alpha - 8e-6 <= 0, all four results become 0.  Otherwise the
	//      three leading values are multiplied by rcp.approx(alpha).
	//   3. Each of the three values v is mapped to
	//          sign(v) * ex2(lg2(|v|) * 0.454545)
	//      where "negative" is decided by (v < 0) with flush-to-zero
	//      semantics, so -0 and NaN take the non-negated path.
	//   4. The (possibly zeroed) saturated alpha is returned at +12.
	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])
	{
	.reg .f32 %in0, %in1, %in2, %inA;
	.reg .f32 %alpha, %biased, %invA;
	.reg .f32 %lin0, %lin1, %lin2;
	.reg .f32 %mag0, %mag1, %mag2;
	.reg .f32 %pw0, %pw1, %pw2;
	.reg .f32 %out0, %out1, %out2;
	.reg .pred %clear, %neg0, %neg1, %neg2;
	.loc	2	284	0
$LDWbegin__Z24UnpremultiplyUnlinearize6float4:
	// Unpack the four packed input values.
	ld.param.f32 	%in0, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+0];
	ld.param.f32 	%in1, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+4];
	ld.param.f32 	%in2, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+8];
	ld.param.f32 	%inA, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+12];
	.loc	2	208	0
	// Clamp alpha, then test it against the 8e-6 threshold.
	cvt.ftz.sat.f32.f32 	%alpha, %inA;
	add.ftz.f32 	%biased, %alpha, 0fb70637bd;    	// alpha + (-8e-006)
	setp.le.ftz.f32 	%clear, %biased, 0f00000000;
	.loc	2	213	0
	// Alpha above threshold: scale by the approximate reciprocal.
	@!%clear rcp.approx.ftz.f32 	%invA, %alpha;
	@!%clear mul.ftz.f32 	%lin2, %invA, %in2;
	.loc	2	214	0
	@!%clear mul.ftz.f32 	%lin1, %invA, %in1;
	.loc	2	215	0
	@!%clear mul.ftz.f32 	%lin0, %invA, %in0;
	.loc	2	219	0
	// Alpha at or below threshold: everything, including alpha, is zero.
	@%clear mov.f32 	%lin2, 0f00000000;
	@%clear mov.f32 	%lin1, 0f00000000;
	@%clear mov.f32 	%lin0, 0f00000000;
	@%clear mov.f32 	%alpha, 0f00000000;
	.loc	2	266	0
	// Value 0: signed power with exponent 0.454545.
	setp.lt.ftz.f32 	%neg0, %lin0, 0f00000000;
	mov.f32 	%mag0, %lin0;
	.loc	2	242	0
	@%neg0 neg.ftz.f32 	%mag0, %lin0;
	lg2.approx.ftz.f32 	%pw0, %mag0;
	mul.ftz.f32 	%pw0, %pw0, 0f3ee8ba2e;    	// * 0.454545
	ex2.approx.ftz.f32 	%out0, %pw0;
	@%neg0 neg.ftz.f32 	%out0, %out0;
$LDWendi___log2f_192_5:
	.loc	2	267	0
	// Value 1: same mapping.
	setp.lt.ftz.f32 	%neg1, %lin1, 0f00000000;
	mov.f32 	%mag1, %lin1;
	.loc	2	242	0
	@%neg1 neg.ftz.f32 	%mag1, %lin1;
	lg2.approx.ftz.f32 	%pw1, %mag1;
	mul.ftz.f32 	%pw1, %pw1, 0f3ee8ba2e;    	// * 0.454545
	ex2.approx.ftz.f32 	%out1, %pw1;
	@%neg1 neg.ftz.f32 	%out1, %out1;
$LDWendi___log2f_192_3:
	.loc	2	268	0
	// Value 2: same mapping.
	setp.lt.ftz.f32 	%neg2, %lin2, 0f00000000;
	mov.f32 	%mag2, %lin2;
	.loc	2	242	0
	@%neg2 neg.ftz.f32 	%mag2, %lin2;
	lg2.approx.ftz.f32 	%pw2, %mag2;
	mul.ftz.f32 	%pw2, %pw2, 0f3ee8ba2e;    	// * 0.454545
	ex2.approx.ftz.f32 	%out2, %pw2;
	@%neg2 neg.ftz.f32 	%out2, %out2;
$LDWendi___log2f_192_1:
	.loc	2	285	0
	// Pack the results in the same order as the input.
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+0], %out0;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+4], %out1;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+8], %out2;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+12], %alpha;
	ret;
$LDWend__Z24UnpremultiplyUnlinearize6float4:
	} // _Z24UnpremultiplyUnlinearize6float4

	// VerticalRecursiveGaussianGRAYF32_kernel
	//
	// One thread per image column: x = ctaid.x * ntid.x + tid.x.
	// Threads with x >= width exit immediately.
	//
	// Parameters:
	//   pOut, pIn      global pointers to f32 samples.
	//   width, height  image size in samples; height is the trip count of
	//                  both loops.
	//   pitch_f        row stride in f32 elements (scaled by 4 for bytes).
	//   n_plus0/1, neg_d_plus1/2        coefficients of the top-down pass.
	//   n_minus1/2, neg_d_minus1/2      coefficients of the bottom-up pass.
	//
	// Pass 1 (rows 0 .. height-1), all history initialised to zero:
	//   y[i] = n_plus0*in[i] + n_plus1*in[i-1]
	//        + neg_d_plus1*y[i-1] + neg_d_plus2*y[i-2]
	//   pOut[i] = y[i]
	// Pass 2 (rows height-1 .. 0), all history initialised to zero:
	//   y[i] = n_minus1*in[i+1] + n_minus2*in[i+2]
	//        + neg_d_minus1*y[i+1] + neg_d_minus2*y[i+2]
	//   pOut[i] += y[i]
	//
	// NOTE(review): this file is compiler output; the column guard below was
	// corrected here and the CUDA source presumably needs the same change
	// (x >= width instead of x > width) - TODO confirm.
	.entry VerticalRecursiveGaussianGRAYF32_kernel (
		.param .u64 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pOut,
		.param .u64 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pIn,
		.param .s32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_width,
		.param .s32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_height,
		.param .s32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pitch_f,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_plus0,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_plus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_minus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_minus2)
	{
	.reg .u32 %r<21>;
	.reg .u64 %rd<17>;
	.reg .f32 %f<25>;
	.reg .pred %p<7>;
	.loc	18	30	0
$LDWbegin_VerticalRecursiveGaussianGRAYF32_kernel:
	.loc	18	33	0
	// %r5 = x, the column handled by this thread.
	cvt.s32.u32 	%r1, %ctaid.x;
	cvt.s32.u32 	%r2, %ntid.x;
	mul.lo.s32 	%r3, %r1, %r2;
	mov.u32 	%r4, %tid.x;
	add.u32 	%r5, %r3, %r4;
	ld.param.s32 	%r6, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_width];
	// Continue only when width > x.  A non-strict compare here would let the
	// thread with x == width process element index `width` of every row,
	// which is outside the image (and, when pitch_f == width, is column 0 of
	// the following row).
	setp.gt.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_16_3074;
	bra.uni 	$LBB14_VerticalRecursiveGaussianGRAYF32_kernel;
$Lt_16_3074:
	.loc	18	48	0
	// %r7 = running element index; stays x when height <= 0.
	mov.s32 	%r7, %r5;
	ld.param.s32 	%r8, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_height];
	mov.u32 	%r9, 0;
	setp.le.s32 	%p2, %r8, %r9;
	@%p2 bra 	$Lt_16_5634;
	mov.s32 	%r10, %r8;
	ld.param.s32 	%r11, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pitch_f];
	cvt.s64.s32 	%rd1, %r11;
	cvt.s64.s32 	%rd2, %r5;
	// %rd3 = byte offset of column x; %rd4 = row stride in bytes.
	mul.wide.s32 	%rd3, %r5, 4;
	mul.wide.s32 	%rd4, %r11, 4;
	ld.param.u64 	%rd5, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pIn];
	add.u64 	%rd6, %rd5, %rd3;
	ld.param.u64 	%rd7, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pOut];
	add.u64 	%rd8, %rd7, %rd3;
	ld.param.f32 	%f1, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_plus2];
	ld.param.f32 	%f2, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_plus1];
	ld.param.f32 	%f3, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_plus0];
	ld.param.f32 	%f4, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_plus1];
	// Pass 1 state: %r12 = row, %f7 = previous input,
	// %f6 = previous output, %f5 = output before that.
	mov.s32 	%r12, 0;
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r13, %r10;
$Lt_16_4098:
 //<loop> Loop body line 48, nesting depth: 1, estimated iterations: unknown
	.loc	18	52	0
	ld.global.f32 	%f8, [%rd6+0];
	.loc	18	53	0
	// %f12 = n_plus0*in + n_plus1*prevIn + neg_d_plus1*prevOut + neg_d_plus2*prevOut2
	mul.ftz.f32 	%f9, %f4, %f7;
	fma.rn.ftz.f32 	%f10, %f3, %f8, %f9;
	fma.rn.ftz.f32 	%f11, %f2, %f6, %f10;
	fma.rn.ftz.f32 	%f12, %f1, %f5, %f11;
	.loc	18	55	0
	mov.f32 	%f7, %f8;
	.loc	18	56	0
	mov.f32 	%f5, %f6;
	.loc	18	57	0
	mov.f32 	%f6, %f12;
	.loc	18	59	0
	st.global.f32 	[%rd8+0], %f12;
	// Advance one row in both images.
	add.s32 	%r12, %r12, 1;
	add.u64 	%rd8, %rd8, %rd4;
	add.u64 	%rd6, %rd6, %rd4;
	setp.ne.s32 	%p3, %r8, %r12;
	@%p3 bra 	$Lt_16_4098;
	// After pass 1: row counter = height, index = x + pitch_f*height
	// (one row past the last; pass 2 steps back before each access).
	mov.s32 	%r12, %r8;
	mul.lo.s32 	%r14, %r11, %r8;
	add.s32 	%r7, %r5, %r14;
	bra.uni 	$Lt_16_3586;
$Lt_16_5634:
	// height <= 0: nothing was processed.
	mov.s32 	%r12, 0;
$Lt_16_3586:
	// Skip pass 2 when there are no rows.
	mov.u32 	%r15, 0;
	setp.le.s32 	%p4, %r12, %r15;
	@%p4 bra 	$LBB14_VerticalRecursiveGaussianGRAYF32_kernel;
	mov.s32 	%r16, %r12;
	ld.param.s32 	%r17, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pitch_f];
	cvt.s64.s32 	%rd9, %r17;
	cvt.s64.s32 	%rd10, %r7;
	mul.wide.s32 	%rd11, %r7, 4;
	mul.wide.s32 	%rd4, %r17, 4;
	ld.param.u64 	%rd12, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pIn];
	add.u64 	%rd13, %rd12, %rd11;
	ld.param.u64 	%rd14, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_pOut];
	add.u64 	%rd15, %rd14, %rd11;
	ld.param.f32 	%f13, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_minus2];
	ld.param.f32 	%f14, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_neg_d_minus1];
	ld.param.f32 	%f15, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_minus1];
	ld.param.f32 	%f16, [__cudaparm_VerticalRecursiveGaussianGRAYF32_kernel_n_minus2];
	// Pass 2 state: %f7 = input one row below, %f17 = input two rows below,
	// %f6 = pass-2 output one row below, %f5 = pass-2 output two rows below.
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r18, %r16;
$Lt_16_5122:
 //<loop> Loop body line 59, nesting depth: 1, estimated iterations: unknown
	.loc	18	73	0
	sub.s32 	%r12, %r12, 1;
	.loc	18	74	0
	// Step up one row before touching memory.
	sub.u64 	%rd15, %rd15, %rd4;
	sub.u64 	%rd13, %rd13, %rd4;
	.loc	18	76	0
	// %f21 uses only rows below the current one; the current input is
	// loaded afterwards and feeds the next iteration.
	mul.ftz.f32 	%f18, %f16, %f17;
	fma.rn.ftz.f32 	%f19, %f15, %f7, %f18;
	fma.rn.ftz.f32 	%f20, %f14, %f6, %f19;
	fma.rn.ftz.f32 	%f21, %f13, %f5, %f20;
	.loc	18	78	0
	mov.f32 	%f17, %f7;
	.loc	18	79	0
	ld.global.f32 	%f7, [%rd13+0];
	.loc	18	80	0
	mov.f32 	%f5, %f6;
	.loc	18	81	0
	mov.f32 	%f6, %f21;
	.loc	18	83	0
	// Accumulate onto the pass-1 result already stored in pOut.
	ld.global.f32 	%f22, [%rd15+0];
	add.ftz.f32 	%f23, %f22, %f21;
	st.global.f32 	[%rd15+0], %f23;
	mov.u32 	%r19, 0;
	setp.ne.s32 	%p5, %r12, %r19;
	@%p5 bra 	$Lt_16_5122;
$LBB14_VerticalRecursiveGaussianGRAYF32_kernel:
	.loc	18	85	0
	exit;
$LDWend_VerticalRecursiveGaussianGRAYF32_kernel:
	} // VerticalRecursiveGaussianGRAYF32_kernel

	// HorizontalRecursiveGaussianGrayF32_kernel
	//
	// Row-wise counterpart of the vertical kernel, working on tiles that are
	// 32 columns wide and staged through shared memory.
	//
	// Parameters: pOut / pIn are global f32 pointers; width and height are
	// compared against column / row indices; pitch_f is the row stride in
	// f32 elements; the eight f32 values are the filter coefficients.
	//
	// Row mapping: y = ctaid.y * 32 + tid.y, and each thread also services
	// rows y+8, y+16 and y+24.
	// NOTE(review): this addressing only covers a 32x32 tile if the block is
	// launched as 32 x 8 threads - confirm against the host launch code.
	//
	// Shared tile: 4224 bytes, addressed as f32[(tid.y + 8k) * 33 + tid.x]
	// (byte offsets 0 / 1056 / 2112 / 3168 select k = 0..3).
	//
	// Pass 1 walks tiles left to right and stores the filtered tile to pOut.
	// Pass 2 walks tiles right to left, filters pIn again with the "minus"
	// coefficients and adds the result onto pOut.
	// In both passes the recurrence is executed only by threads with
	// tid.y == 0; thread tid.x filters tile row tid.x and keeps the filter
	// history in registers from one tile to the next.
	.entry HorizontalRecursiveGaussianGrayF32_kernel (
		.param .u64 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut,
		.param .u64 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_width,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_height,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_plus0,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_plus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_minus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_minus2)
	{
	.reg .u32 %r<164>;
	.reg .u64 %rd<114>;
	.reg .f32 %f<55>;
	.reg .pred %p<29>;
	.shared .align 4 .b8 __cuda___cuda_local_var_93192_34_non_const_smem124[4224];
	.loc	18	94	0
$LDWbegin_HorizontalRecursiveGaussianGrayF32_kernel:
	.loc	18	113	0
	// width <= 0: skip pass 1 entirely.
	ld.param.s32 	%r1, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_width];
	mov.u32 	%r2, 0;
	setp.le.s32 	%p1, %r1, %r2;
	@%p1 bra 	$Lt_17_37634;
	add.s32 	%r3, %r1, 31;
	shr.s32 	%r4, %r3, 31;
	mov.s32 	%r5, 31;
	and.b32 	%r6, %r4, %r5;
	add.s32 	%r7, %r6, %r3;
	shr.s32 	%r8, %r7, 5;
	// %r13 = y (first row of this thread), %r11 = tid.y.
	// %p2 = (tid.y == 0) selects the threads that run the recurrence.
	// %p3..%p6 = row y, y+8, y+16, y+24 is inside the image (< height).
	cvt.s32.u32 	%r9, %ctaid.y;
	mul.lo.s32 	%r10, %r9, 32;
	mov.u32 	%r11, %tid.y;
	mov.u32 	%r12, 0;
	setp.eq.u32 	%p2, %r11, %r12;
	add.u32 	%r13, %r10, %r11;
	ld.param.s32 	%r14, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_height];
	setp.gt.s32 	%p3, %r14, %r13;
	add.s32 	%r15, %r13, 8;
	add.s32 	%r16, %r13, 16;
	add.s32 	%r17, %r13, 24;
	setp.lt.s32 	%p4, %r15, %r14;
	setp.lt.s32 	%p5, %r16, %r14;
	setp.lt.s32 	%p6, %r17, %r14;
	// %r18 = first column of the current tile.
	// Pass 1 history: %f3 = previous input, %f2 = previous output,
	// %f1 = output before that.  %rd1 = base address of the shared tile.
	mov.s32 	%r18, 0;
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.u64 	%rd1, __cuda___cuda_local_var_93192_34_non_const_smem124;
	mov.s32 	%r19, %r8;
$Lt_17_16642:
 //<loop> Loop body line 113, nesting depth: 1, estimated iterations: unknown
	.loc	18	123	0
	// ---- Pass 1, stage A: copy the tile from pIn into shared memory. ----
	// Row y: columns at or beyond width (unsigned compare) are stored as 0.
	@!%p3 bra 	$Lt_17_17410;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r22, %r20, %r21;
	cvt.s64.s32 	%rd2, %r22;
	mul.wide.s32 	%rd3, %r22, 4;
	add.u64 	%rd4, %rd1, %rd3;
	add.u32 	%r23, %r18, %r21;
	setp.le.u32 	%p7, %r1, %r23;
	@%p7 bra 	$Lt_17_17666;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	131	0
	ld.param.u64 	%rd5, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	ld.param.s32 	%r24, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	mul.lo.s32 	%r25, %r24, %r13;
	add.s32 	%r26, %r18, %r25;
	add.u32 	%r27, %r21, %r26;
	cvt.s64.s32 	%rd6, %r27;
	mul.wide.s32 	%rd7, %r27, 4;
	add.u64 	%rd8, %rd5, %rd7;
	ld.global.f32 	%f4, [%rd8+0];
	st.shared.f32 	[%rd4+0], %f4;
	bra.uni 	$Lt_17_17410;
$Lt_17_17666:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	133	0
	mov.f32 	%f5, 0f00000000;     	// 0
	st.shared.f32 	[%rd4+0], %f5;
$Lt_17_17410:
$Lt_17_16898:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+8: global index gains 8*pitch_f, shared offset gains 1056 bytes.
	@!%p4 bra 	$Lt_17_18434;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r28, %r20, %r21;
	cvt.s64.s32 	%rd9, %r28;
	mul.wide.s32 	%rd10, %r28, 4;
	add.u64 	%rd4, %rd1, %rd10;
	add.u32 	%r29, %r18, %r21;
	setp.le.u32 	%p8, %r1, %r29;
	@%p8 bra 	$Lt_17_18690;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	131	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	ld.param.u64 	%rd11, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	shl.b32 	%r31, %r30, 3;
	mul.lo.s32 	%r32, %r30, %r13;
	add.s32 	%r33, %r18, %r32;
	add.u32 	%r34, %r21, %r33;
	add.s32 	%r35, %r31, %r34;
	cvt.s64.s32 	%rd12, %r35;
	mul.wide.s32 	%rd13, %r35, 4;
	add.u64 	%rd14, %rd11, %rd13;
	ld.global.f32 	%f6, [%rd14+0];
	st.shared.f32 	[%rd4+1056], %f6;
	bra.uni 	$Lt_17_18434;
$Lt_17_18690:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	133	0
	mov.f32 	%f7, 0f00000000;     	// 0
	st.shared.f32 	[%rd4+1056], %f7;
$Lt_17_18434:
$Lt_17_17922:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+16: global index gains 16*pitch_f, shared offset 2112 bytes.
	@!%p5 bra 	$Lt_17_19458;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r36, %r20, %r21;
	cvt.s64.s32 	%rd15, %r36;
	mul.wide.s32 	%rd16, %r36, 4;
	add.u64 	%rd4, %rd1, %rd16;
	add.u32 	%r37, %r18, %r21;
	setp.le.u32 	%p9, %r1, %r37;
	@%p9 bra 	$Lt_17_19714;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	131	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	ld.param.u64 	%rd17, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	mul.lo.s32 	%r39, %r30, %r13;
	add.s32 	%r40, %r18, %r39;
	add.u32 	%r41, %r21, %r40;
	add.s32 	%r42, %r38, %r41;
	add.s32 	%r43, %r38, %r42;
	cvt.s64.s32 	%rd18, %r43;
	mul.wide.s32 	%rd19, %r43, 4;
	add.u64 	%rd20, %rd17, %rd19;
	ld.global.f32 	%f8, [%rd20+0];
	st.shared.f32 	[%rd4+2112], %f8;
	bra.uni 	$Lt_17_19458;
$Lt_17_19714:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	133	0
	mov.f32 	%f9, 0f00000000;     	// 0
	st.shared.f32 	[%rd4+2112], %f9;
$Lt_17_19458:
$Lt_17_18946:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+24: global index gains 24*pitch_f, shared offset 3168 bytes.
	@!%p6 bra 	$Lt_17_20482;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r44, %r20, %r21;
	cvt.s64.s32 	%rd21, %r44;
	mul.wide.s32 	%rd22, %r44, 4;
	add.u64 	%rd4, %rd1, %rd22;
	add.u32 	%r45, %r18, %r21;
	setp.le.u32 	%p10, %r1, %r45;
	@%p10 bra 	$Lt_17_20738;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	131	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	ld.param.u64 	%rd23, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	add.s32 	%r46, %r38, %r38;
	mul.lo.s32 	%r47, %r30, %r13;
	add.s32 	%r48, %r18, %r47;
	add.u32 	%r49, %r21, %r48;
	add.s32 	%r50, %r38, %r49;
	add.s32 	%r51, %r46, %r50;
	cvt.s64.s32 	%rd24, %r51;
	mul.wide.s32 	%rd25, %r51, 4;
	add.u64 	%rd26, %rd23, %rd25;
	ld.global.f32 	%f10, [%rd26+0];
	st.shared.f32 	[%rd4+3168], %f10;
	bra.uni 	$Lt_17_20482;
$Lt_17_20738:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	133	0
	mov.f32 	%f11, 0f00000000;    	// 0
	st.shared.f32 	[%rd4+3168], %f11;
$Lt_17_20482:
$Lt_17_19970:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	138	0
	// Tile fully staged before any thread filters it.
	bar.sync 	0;
	// ---- Pass 1, stage B: recurrence, threads with tid.y == 0 only. ----
	@!%p2 bra 	$Lt_17_20994;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	145	0
	// Thread tid.x filters shared elements tid.x*33 .. tid.x*33 + 31 in place.
	mov.u32 	%r52, %tid.x;
	mul.lo.s32 	%r53, %r52, 33;
	mov.s32 	%r54, %r53;
	add.s32 	%r55, %r53, 32;
	cvt.s64.s32 	%rd27, %r53;
	mul.wide.s32 	%rd28, %r53, 4;
	add.u64 	%rd29, %rd1, %rd28;
	ld.param.f32 	%f12, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_plus2];
	ld.param.f32 	%f13, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_plus1];
	ld.param.f32 	%f14, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_plus0];
	ld.param.f32 	%f15, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_plus1];
$Lt_17_22018:
 //<loop> Loop body line 145, nesting depth: 2, iterations: 32
	.loc	18	148	0
	ld.shared.f32 	%f16, [%rd29+0];
	.loc	18	149	0
	// %f20 = n_plus0*in + n_plus1*prevIn + neg_d_plus1*prevOut + neg_d_plus2*prevOut2
	mul.ftz.f32 	%f17, %f15, %f3;
	fma.rn.ftz.f32 	%f18, %f14, %f16, %f17;
	fma.rn.ftz.f32 	%f19, %f13, %f2, %f18;
	fma.rn.ftz.f32 	%f20, %f12, %f1, %f19;
	.loc	18	151	0
	mov.f32 	%f3, %f16;
	.loc	18	152	0
	mov.f32 	%f1, %f2;
	.loc	18	153	0
	mov.f32 	%f2, %f20;
	.loc	18	155	0
	st.shared.f32 	[%rd29+0], %f20;
	add.s32 	%r54, %r54, 1;
	add.u64 	%rd29, %rd29, 4;
	setp.ne.s32 	%p11, %r54, %r55;
	@%p11 bra 	$Lt_17_22018;
$Lt_17_20994:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	160	0
	// Filtered tile complete before it is copied out.
	bar.sync 	0;
	.loc	18	164	0
	// ---- Pass 1, stage C: store the tile to pOut (in-range elements only). ----
	@!%p3 bra 	$Lt_17_23042;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mov.u32 	%r21, %tid.x;
	add.u32 	%r56, %r18, %r21;
	setp.le.u32 	%p12, %r1, %r56;
	@%p12 bra 	$Lt_17_23042;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	172	0
	mul.lo.s32 	%r57, %r11, 33;
	add.u32 	%r58, %r21, %r57;
	cvt.s64.s32 	%rd30, %r58;
	mul.wide.s32 	%rd31, %r58, 4;
	add.u64 	%rd32, %rd1, %rd31;
	ld.shared.f32 	%f21, [%rd32+0];
	ld.param.u64 	%rd33, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	ld.param.s32 	%r59, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	mul.lo.s32 	%r60, %r59, %r13;
	add.s32 	%r61, %r18, %r60;
	add.u32 	%r62, %r21, %r61;
	cvt.s64.s32 	%rd34, %r62;
	mul.wide.s32 	%rd35, %r62, 4;
	add.u64 	%rd36, %rd33, %rd35;
	st.global.f32 	[%rd36+0], %f21;
$Lt_17_23042:
$Lt_17_22530:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+8.
	@!%p4 bra 	$Lt_17_24066;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mov.u32 	%r21, %tid.x;
	add.u32 	%r63, %r18, %r21;
	setp.le.u32 	%p13, %r1, %r63;
	@%p13 bra 	$Lt_17_24066;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	mul.lo.s32 	%r64, %r11, 33;
	add.u32 	%r65, %r21, %r64;
	cvt.s64.s32 	%rd37, %r65;
	mul.wide.s32 	%rd38, %r65, 4;
	add.u64 	%rd39, %rd1, %rd38;
	ld.shared.f32 	%f22, [%rd39+1056];
	ld.param.u64 	%rd40, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	shl.b32 	%r66, %r30, 3;
	mul.lo.s32 	%r67, %r30, %r13;
	add.s32 	%r68, %r18, %r67;
	add.u32 	%r69, %r21, %r68;
	add.s32 	%r70, %r66, %r69;
	cvt.s64.s32 	%rd41, %r70;
	mul.wide.s32 	%rd42, %r70, 4;
	add.u64 	%rd43, %rd40, %rd42;
	st.global.f32 	[%rd43+0], %f22;
$Lt_17_24066:
$Lt_17_23554:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+16.
	@!%p5 bra 	$Lt_17_25090;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mov.u32 	%r21, %tid.x;
	add.u32 	%r71, %r18, %r21;
	setp.le.u32 	%p14, %r1, %r71;
	@%p14 bra 	$Lt_17_25090;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	mul.lo.s32 	%r72, %r11, 33;
	add.u32 	%r73, %r21, %r72;
	cvt.s64.s32 	%rd44, %r73;
	mul.wide.s32 	%rd45, %r73, 4;
	add.u64 	%rd46, %rd1, %rd45;
	ld.shared.f32 	%f23, [%rd46+2112];
	ld.param.u64 	%rd47, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	mul.lo.s32 	%r74, %r30, %r13;
	add.s32 	%r75, %r18, %r74;
	add.u32 	%r76, %r21, %r75;
	add.s32 	%r77, %r38, %r76;
	add.s32 	%r78, %r38, %r77;
	cvt.s64.s32 	%rd48, %r78;
	mul.wide.s32 	%rd49, %r78, 4;
	add.u64 	%rd50, %rd47, %rd49;
	st.global.f32 	[%rd50+0], %f23;
$Lt_17_25090:
$Lt_17_24578:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	// Row y+24.
	@!%p6 bra 	$Lt_17_26114;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	mov.u32 	%r21, %tid.x;
	add.u32 	%r79, %r18, %r21;
	setp.le.u32 	%p15, %r1, %r79;
	@%p15 bra 	$Lt_17_26114;
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	mul.lo.s32 	%r80, %r11, 33;
	add.u32 	%r81, %r21, %r80;
	cvt.s64.s32 	%rd51, %r81;
	mul.wide.s32 	%rd52, %r81, 4;
	add.u64 	%rd53, %rd1, %rd52;
	ld.shared.f32 	%f24, [%rd53+3168];
	ld.param.u64 	%rd54, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	add.s32 	%r82, %r38, %r38;
	mul.lo.s32 	%r83, %r30, %r13;
	add.s32 	%r84, %r18, %r83;
	add.u32 	%r85, %r21, %r84;
	add.s32 	%r86, %r38, %r85;
	add.s32 	%r87, %r82, %r86;
	cvt.s64.s32 	%rd55, %r87;
	mul.wide.s32 	%rd56, %r87, 4;
	add.u64 	%rd57, %rd54, %rd56;
	st.global.f32 	[%rd57+0], %f24;
$Lt_17_26114:
$Lt_17_25602:
 //<loop> Part of loop body line 113, head labeled $Lt_17_16642
	.loc	18	178	0
	// Shared tile free for reuse; move to the next tile while it starts
	// before width.
	bar.sync 	0;
	add.s32 	%r18, %r18, 32;
	setp.gt.s32 	%p16, %r1, %r18;
	@%p16 bra 	$Lt_17_16642;
	bra.uni 	$Lt_17_16130;
$Lt_17_37634:
	// width <= 0 entry: no tiles were processed.
	mov.s32 	%r18, 0;
	mov.u64 	%rd1, __cuda___cuda_local_var_93192_34_non_const_smem124;
$Lt_17_16130:
	// ---- Pass 2 set-up. ----
	// %r18 here is the first multiple of 32 that is >= width (or 0), so the
	// loop below is skipped when nothing was processed in pass 1.
	mov.u32 	%r88, 0;
	setp.le.s32 	%p17, %r18, %r88;
	@%p17 bra 	$Lt_17_26882;
	add.s32 	%r89, %r18, 31;
	shr.s32 	%r90, %r89, 31;
	mov.s32 	%r91, 31;
	and.b32 	%r92, %r90, %r91;
	add.s32 	%r93, %r92, %r89;
	shr.s32 	%r94, %r93, 5;
	// Row index and row/tid.y predicates are recomputed exactly as in pass 1.
	cvt.s32.u32 	%r95, %ctaid.y;
	mul.lo.s32 	%r96, %r95, 32;
	mov.u32 	%r11, %tid.y;
	mov.u32 	%r97, 0;
	setp.eq.u32 	%p2, %r11, %r97;
	add.u32 	%r13, %r96, %r11;
	ld.param.s32 	%r98, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_height];
	setp.gt.s32 	%p3, %r98, %r13;
	add.s32 	%r99, %r13, 8;
	add.s32 	%r100, %r13, 16;
	add.s32 	%r101, %r13, 24;
	setp.lt.s32 	%p4, %r99, %r98;
	setp.lt.s32 	%p5, %r100, %r98;
	setp.lt.s32 	%p6, %r101, %r98;
	// Pass 2 history, reset to zero: %f3 = input one column to the right,
	// %f25 = input two columns to the right, %f2 / %f1 = pass-2 outputs one
	// and two columns to the right.
	mov.f32 	%f25, 0f00000000;    	// 0
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.s32 	%r102, %r94;
$Lt_17_27394:
 //<loop> Loop body line 178, nesting depth: 1, estimated iterations: unknown
	.loc	18	191	0
	// Step one tile to the left.
	sub.s32 	%r18, %r18, 32;
	.loc	18	194	0
	// ---- Pass 2, stage A: stage the pIn tile again (same scheme as pass 1). ----
	@!%p3 bra 	$Lt_17_28162;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r103, %r20, %r21;
	cvt.s64.s32 	%rd58, %r103;
	mul.wide.s32 	%rd59, %r103, 4;
	add.u64 	%rd4, %rd1, %rd59;
	add.u32 	%r104, %r18, %r21;
	setp.le.u32 	%p18, %r1, %r104;
	@%p18 bra 	$Lt_17_28418;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	202	0
	ld.param.u64 	%rd60, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	ld.param.s32 	%r105, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	mul.lo.s32 	%r106, %r105, %r13;
	add.s32 	%r107, %r18, %r106;
	add.u32 	%r108, %r21, %r107;
	cvt.s64.s32 	%rd61, %r108;
	mul.wide.s32 	%rd62, %r108, 4;
	add.u64 	%rd63, %rd60, %rd62;
	ld.global.f32 	%f26, [%rd63+0];
	st.shared.f32 	[%rd4+0], %f26;
	bra.uni 	$Lt_17_28162;
$Lt_17_28418:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	204	0
	mov.f32 	%f27, 0f00000000;    	// 0
	st.shared.f32 	[%rd4+0], %f27;
$Lt_17_28162:
$Lt_17_27650:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+8.
	@!%p4 bra 	$Lt_17_29186;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r109, %r20, %r21;
	cvt.s64.s32 	%rd64, %r109;
	mul.wide.s32 	%rd65, %r109, 4;
	add.u64 	%rd4, %rd1, %rd65;
	add.u32 	%r110, %r18, %r21;
	setp.le.u32 	%p19, %r1, %r110;
	@%p19 bra 	$Lt_17_29442;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	202	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	ld.param.u64 	%rd66, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	shl.b32 	%r111, %r30, 3;
	mul.lo.s32 	%r112, %r30, %r13;
	add.s32 	%r113, %r18, %r112;
	add.u32 	%r114, %r21, %r113;
	add.s32 	%r115, %r111, %r114;
	cvt.s64.s32 	%rd67, %r115;
	mul.wide.s32 	%rd68, %r115, 4;
	add.u64 	%rd69, %rd66, %rd68;
	ld.global.f32 	%f28, [%rd69+0];
	st.shared.f32 	[%rd4+1056], %f28;
	bra.uni 	$Lt_17_29186;
$Lt_17_29442:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	204	0
	mov.f32 	%f29, 0f00000000;    	// 0
	st.shared.f32 	[%rd4+1056], %f29;
$Lt_17_29186:
$Lt_17_28674:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+16.
	@!%p5 bra 	$Lt_17_30210;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r116, %r20, %r21;
	cvt.s64.s32 	%rd70, %r116;
	mul.wide.s32 	%rd71, %r116, 4;
	add.u64 	%rd4, %rd1, %rd71;
	add.u32 	%r117, %r18, %r21;
	setp.le.u32 	%p20, %r1, %r117;
	@%p20 bra 	$Lt_17_30466;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	202	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	ld.param.u64 	%rd72, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	mul.lo.s32 	%r118, %r30, %r13;
	add.s32 	%r119, %r18, %r118;
	add.u32 	%r120, %r21, %r119;
	add.s32 	%r121, %r38, %r120;
	add.s32 	%r122, %r38, %r121;
	cvt.s64.s32 	%rd73, %r122;
	mul.wide.s32 	%rd74, %r122, 4;
	add.u64 	%rd75, %rd72, %rd74;
	ld.global.f32 	%f30, [%rd75+0];
	st.shared.f32 	[%rd4+2112], %f30;
	bra.uni 	$Lt_17_30210;
$Lt_17_30466:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	204	0
	mov.f32 	%f31, 0f00000000;    	// 0
	st.shared.f32 	[%rd4+2112], %f31;
$Lt_17_30210:
$Lt_17_29698:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+24.
	@!%p6 bra 	$Lt_17_31234;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mul.lo.s32 	%r20, %r11, 33;
	mov.u32 	%r21, %tid.x;
	add.u32 	%r123, %r20, %r21;
	cvt.s64.s32 	%rd76, %r123;
	mul.wide.s32 	%rd77, %r123, 4;
	add.u64 	%rd4, %rd1, %rd77;
	add.u32 	%r124, %r18, %r21;
	setp.le.u32 	%p21, %r1, %r124;
	@%p21 bra 	$Lt_17_31490;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	202	0
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	ld.param.u64 	%rd78, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pIn];
	add.s32 	%r125, %r38, %r38;
	mul.lo.s32 	%r126, %r30, %r13;
	add.s32 	%r127, %r18, %r126;
	add.u32 	%r128, %r21, %r127;
	add.s32 	%r129, %r38, %r128;
	add.s32 	%r130, %r125, %r129;
	cvt.s64.s32 	%rd79, %r130;
	mul.wide.s32 	%rd80, %r130, 4;
	add.u64 	%rd81, %rd78, %rd80;
	ld.global.f32 	%f32, [%rd81+0];
	st.shared.f32 	[%rd4+3168], %f32;
	bra.uni 	$Lt_17_31234;
$Lt_17_31490:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	204	0
	mov.f32 	%f33, 0f00000000;    	// 0
	st.shared.f32 	[%rd4+3168], %f33;
$Lt_17_31234:
$Lt_17_30722:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	209	0
	// Tile fully staged before any thread filters it.
	bar.sync 	0;
	// ---- Pass 2, stage B: right-to-left recurrence, tid.y == 0 only. ----
	@!%p2 bra 	$Lt_17_31746;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	214	0
	// Start at the last element (index tid.x*33 + 31) of tile row tid.x.
	mov.u32 	%r131, %tid.x;
	mul.lo.s32 	%r132, %r131, 33;
	add.s32 	%r133, %r132, 31;
	cvt.s64.s32 	%rd82, %r133;
	mul.wide.s32 	%rd83, %r133, 4;
	add.u64 	%rd84, %rd1, %rd83;
	ld.param.f32 	%f34, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_minus2];
	ld.param.f32 	%f35, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_neg_d_minus1];
	ld.param.f32 	%f36, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_minus1];
	ld.param.f32 	%f37, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_n_minus2];
	mov.s32 	%r134, 31;
$Lt_17_32770:
 //<loop> Loop body line 214, nesting depth: 1, iterations: 32
	.loc	18	217	0
	// %f41 = n_minus1*in[+1] + n_minus2*in[+2] + neg_d_minus1*out[+1] + neg_d_minus2*out[+2];
	// the current input is read afterwards and feeds the next iteration.
	mul.ftz.f32 	%f38, %f37, %f25;
	fma.rn.ftz.f32 	%f39, %f36, %f3, %f38;
	fma.rn.ftz.f32 	%f40, %f35, %f2, %f39;
	fma.rn.ftz.f32 	%f41, %f34, %f1, %f40;
	.loc	18	219	0
	mov.f32 	%f25, %f3;
	.loc	18	220	0
	ld.shared.f32 	%f3, [%rd84+0];
	.loc	18	221	0
	mov.f32 	%f1, %f2;
	.loc	18	222	0
	mov.f32 	%f2, %f41;
	.loc	18	224	0
	st.shared.f32 	[%rd84+0], %f41;
	.loc	18	225	0
	sub.u64 	%rd84, %rd84, 4;
	sub.s32 	%r134, %r134, 1;
	mov.u32 	%r135, -1;
	setp.ne.s32 	%p22, %r134, %r135;
	@%p22 bra 	$Lt_17_32770;
$Lt_17_31746:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	228	0
	// Filtered tile complete before it is accumulated.
	bar.sync 	0;
	.loc	18	231	0
	// ---- Pass 2, stage C: pOut += filtered tile (in-range elements only). ----
	@!%p3 bra 	$Lt_17_33794;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mov.u32 	%r21, %tid.x;
	add.u32 	%r136, %r18, %r21;
	setp.le.u32 	%p23, %r1, %r136;
	@%p23 bra 	$Lt_17_33794;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	239	0
	ld.param.s32 	%r137, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	mul.lo.s32 	%r138, %r137, %r13;
	ld.param.u64 	%rd85, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	add.s32 	%r139, %r138, %r18;
	add.u32 	%r140, %r21, %r139;
	cvt.s64.s32 	%rd86, %r140;
	mul.wide.s32 	%rd87, %r140, 4;
	add.u64 	%rd88, %rd85, %rd87;
	ld.global.f32 	%f42, [%rd88+0];
	mul.lo.s32 	%r141, %r11, 33;
	add.u32 	%r142, %r21, %r141;
	cvt.s64.s32 	%rd89, %r142;
	mul.wide.s32 	%rd90, %r142, 4;
	add.u64 	%rd91, %rd1, %rd90;
	ld.shared.f32 	%f43, [%rd91+0];
	add.ftz.f32 	%f44, %f42, %f43;
	st.global.f32 	[%rd88+0], %f44;
$Lt_17_33794:
$Lt_17_33282:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+8.
	@!%p4 bra 	$Lt_17_34818;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mov.u32 	%r21, %tid.x;
	add.u32 	%r143, %r18, %r21;
	setp.le.u32 	%p24, %r1, %r143;
	@%p24 bra 	$Lt_17_34818;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	mul.lo.s32 	%r138, %r30, %r13;
	add.s32 	%r144, %r138, %r18;
	ld.param.u64 	%rd92, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	add.u32 	%r145, %r144, %r21;
	add.s32 	%r146, %r38, %r145;
	cvt.s64.s32 	%rd93, %r146;
	mul.wide.s32 	%rd94, %r146, 4;
	add.u64 	%rd95, %rd92, %rd94;
	ld.global.f32 	%f45, [%rd95+0];
	mul.lo.s32 	%r147, %r11, 33;
	add.u32 	%r148, %r21, %r147;
	cvt.s64.s32 	%rd96, %r148;
	mul.wide.s32 	%rd97, %r148, 4;
	add.u64 	%rd98, %rd1, %rd97;
	ld.shared.f32 	%f46, [%rd98+1056];
	add.ftz.f32 	%f47, %f45, %f46;
	st.global.f32 	[%rd95+0], %f47;
$Lt_17_34818:
$Lt_17_34306:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+16.
	@!%p5 bra 	$Lt_17_35842;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mov.u32 	%r21, %tid.x;
	add.u32 	%r149, %r18, %r21;
	setp.le.u32 	%p25, %r1, %r149;
	@%p25 bra 	$Lt_17_35842;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	mul.lo.s32 	%r138, %r30, %r13;
	add.s32 	%r144, %r138, %r18;
	add.u32 	%r150, %r144, %r21;
	ld.param.u64 	%rd99, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	add.s32 	%r151, %r38, %r150;
	add.s32 	%r152, %r38, %r151;
	cvt.s64.s32 	%rd100, %r152;
	mul.wide.s32 	%rd101, %r152, 4;
	add.u64 	%rd102, %rd99, %rd101;
	ld.global.f32 	%f48, [%rd102+0];
	mul.lo.s32 	%r153, %r11, 33;
	add.u32 	%r154, %r21, %r153;
	cvt.s64.s32 	%rd103, %r154;
	mul.wide.s32 	%rd104, %r154, 4;
	add.u64 	%rd105, %rd1, %rd104;
	ld.shared.f32 	%f49, [%rd105+2112];
	add.ftz.f32 	%f50, %f48, %f49;
	st.global.f32 	[%rd102+0], %f50;
$Lt_17_35842:
$Lt_17_35330:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	// Row y+24.
	@!%p6 bra 	$Lt_17_36866;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	mov.u32 	%r21, %tid.x;
	add.u32 	%r155, %r18, %r21;
	setp.le.u32 	%p26, %r1, %r155;
	@%p26 bra 	$Lt_17_36866;
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	ld.param.s32 	%r30, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pitch_f];
	shl.b32 	%r38, %r30, 3;
	mul.lo.s32 	%r138, %r30, %r13;
	add.s32 	%r144, %r138, %r18;
	add.u32 	%r156, %r144, %r21;
	add.s32 	%r157, %r38, %r156;
	ld.param.u64 	%rd106, [__cudaparm_HorizontalRecursiveGaussianGrayF32_kernel_pOut];
	add.s32 	%r158, %r157, %r38;
	add.s32 	%r159, %r38, %r158;
	cvt.s64.s32 	%rd107, %r159;
	mul.wide.s32 	%rd108, %r159, 4;
	add.u64 	%rd109, %rd106, %rd108;
	ld.global.f32 	%f51, [%rd109+0];
	mul.lo.s32 	%r160, %r11, 33;
	add.u32 	%r161, %r21, %r160;
	cvt.s64.s32 	%rd110, %r161;
	mul.wide.s32 	%rd111, %r161, 4;
	add.u64 	%rd112, %rd1, %rd111;
	ld.shared.f32 	%f52, [%rd112+3168];
	add.ftz.f32 	%f53, %f51, %f52;
	st.global.f32 	[%rd109+0], %f53;
$Lt_17_36866:
$Lt_17_36354:
 //<loop> Part of loop body line 178, head labeled $Lt_17_27394
	.loc	18	245	0
	// Shared tile free for reuse; continue while tiles remain to the left.
	bar.sync 	0;
	mov.u32 	%r162, 0;
	setp.gt.s32 	%p27, %r18, %r162;
	@%p27 bra 	$Lt_17_27394;
$Lt_17_26882:
	.loc	18	247	0
	exit;
$LDWend_HorizontalRecursiveGaussianGrayF32_kernel:
	} // HorizontalRecursiveGaussianGrayF32_kernel
	// Constant tables whose names reference "601". Each is declared as 36 raw
	// bytes (.b8) with 4-byte alignment. The decoded values in the comments assume
	// every 4-byte group is one little-endian IEEE-754 f32; values are rounded.
	// Showing the nine words as three rows of three is for readability only --
	// the code that indexes these tables is not visible here; TODO confirm layout.
	//
	// kRGB32f_To_601YPbPr:
	//    0.299      0.587      0.114
	//   -0.168736  -0.331264   0.5
	//    0.5       -0.418688  -0.081312
	.const .align 4 .b8 kRGB32f_To_601YPbPr[36] = {135,22,153,62,162,69,22,63,213,120,233,61,33,201,44,190,111,155,169,190,0,0,0,63,0,0,0,63,70,94,214,190,232,134,166,189};
	// k601YPbPr_To_RGB32f:
	//    1.0   0.0        1.402
	//    1.0  -0.344136  -0.714136
	//    1.0   1.772      0.0
	.const .align 4 .b8 k601YPbPr_To_RGB32f[36] = {0,0,128,63,0,0,0,0,188,116,179,63,0,0,128,63,152,50,176,190,158,209,54,191,0,0,128,63,229,208,226,63,0,0,0,0};
	// kRGB32f_To_601YCbCr:
	//    65.481   128.553    24.966
	//   -37.797   -74.203   112.0
	//   112.0     -93.786   -18.214
	.const .align 4 .b8 kRGB32f_To_601YCbCr[36] = {70,246,130,66,145,141,0,67,94,186,199,65,33,48,23,194,240,103,148,194,0,0,224,66,0,0,224,66,111,146,187,194,70,182,145,193};
	// k601YCbCr_To_RGB32f:
	//    0.00456621   0.0          0.00625893
	//    0.00456621  -0.00153396  -0.00318811
	//    0.00456621   0.00791071   0.0
	// NOTE(review): every other word here equals the matching word of
	// k601YCbCr_To_RGB8u divided by 255, but word 4 does not
	// (-0.391762 / 255 ~= -0.00153632, stored value ~= -0.00153396).
	// Possibly a slip in the source constant -- confirm before relying on it.
	.const .align 4 .b8 k601YCbCr_To_RGB32f[36] = {37,160,149,59,0,0,0,0,182,23,205,59,37,160,149,59,40,15,201,186,156,239,80,187,37,160,149,59,236,155,1,60,0,0,0,0};
	// kRGB8u_To_601YCbCr:
	//    0.256789   0.504129   0.097906
	//   -0.148223  -0.290992   0.439215
	//    0.439215  -0.367789  -0.071426
	.const .align 4 .b8 kRGB8u_To_601YCbCr[36] = {219,121,131,62,152,14,1,63,18,131,200,61,174,199,23,190,238,252,148,190,197,224,224,62,197,224,224,62,217,78,188,190,174,71,146,189};
	// k601YCbCr_To_RGB8u:
	//    1.164383   0.0        1.596027
	//    1.164383  -0.391762  -0.812968
	//    1.164383   2.017231   0.0
	.const .align 4 .b8 k601YCbCr_To_RGB8u[36] = {127,10,149,63,0,0,0,0,160,74,204,63,127,10,149,63,254,148,200,190,184,30,80,191,127,10,149,63,78,26,1,64,0,0,0,0};
	// kRGB8u_To_601YCbCrFullRange:
	//    0.299      0.587      0.114
	//   -0.168074  -0.329965   0.498039
	//    0.498039  -0.417047  -0.080992
	.const .align 4 .b8 kRGB8u_To_601YCbCrFullRange[36] = {135,22,153,62,162,69,22,63,213,120,233,61,166,27,44,190,39,241,168,190,250,254,254,62,250,254,254,62,43,135,213,190,59,223,165,189};
	// k601YCbCrFullRange_To_RGB8u:
	//    1.0   0.0        1.396523
	//    1.0  -0.342793  -0.711348
	//    1.0   1.765078   0.0
	.const .align 4 .b8 k601YCbCrFullRange_To_RGB8u[36] = {0,0,128,63,0,0,0,0,72,193,178,63,0,0,128,63,143,130,175,190,225,26,54,191,0,0,128,63,20,238,225,63,0,0,0,0};
	// kRGB32f_To_601YCbCrFullRange:
	//    76.245   149.685    29.07
	//   -42.859   -84.141   127.0
	//   127.0    -106.347   -20.653
	.const .align 4 .b8 kRGB32f_To_601YCbCrFullRange[36] = {113,125,152,66,92,175,21,67,92,143,232,65,158,111,43,194,49,72,168,194,0,0,254,66,0,0,254,66,170,177,212,194,88,57,165,193};
	// k601YCbCrFullRange_To_RGB32f:
	//    0.00392157   0.0          0.00547656
	//    0.00392157  -0.00134429  -0.00278960
	//    0.00392157   0.00692188   0.0
	.const .align 4 .b8 k601YCbCrFullRange_To_RGB32f[36] = {129,128,128,59,0,0,0,0,188,116,179,59,129,128,128,59,194,50,176,186,179,209,54,187,129,128,128,59,229,208,226,59,0,0,0,0};
	// Constant tables whose names reference "709". Each is declared as 36 raw
	// bytes (.b8) with 4-byte alignment. The decoded values in the comments assume
	// every 4-byte group is one little-endian IEEE-754 f32; values are rounded.
	// The three-by-three grouping is for readability only -- the code that
	// indexes these tables is not visible here; TODO confirm layout.
	//
	// kRGB32f_To_709YPbPr:
	//    0.2126     0.7152     0.0722
	//   -0.114572  -0.385428   0.5
	//    0.5       -0.454153  -0.045847
	.const .align 4 .b8 kRGB32f_To_709YPbPr[36] = {208,179,89,62,89,23,55,63,152,221,147,61,186,164,234,189,210,86,197,190,0,0,0,63,0,0,0,63,190,134,232,190,16,202,59,189};
	// k709YPbPr_To_RGB32f:
	//    1.0   0.0        1.5748
	//    1.0  -0.187324  -0.468124
	//    1.0   1.8556     0.0
	.const .align 4 .b8 k709YPbPr_To_RGB32f[36] = {0,0,128,63,0,0,0,0,12,147,201,63,0,0,128,63,221,209,63,190,243,173,239,190,0,0,128,63,77,132,237,63,0,0,0,0};
	// kRGB32f_To_709YCbCr:
	//    46.559   156.629    15.812
	//   -25.664   -86.336   112.0
	//   112.0    -101.730   -10.27
	.const .align 4 .b8 kRGB32f_To_709YCbCr[36] = {106,60,58,66,6,161,28,67,244,253,124,65,223,79,205,193,8,172,172,194,0,0,224,66,0,0,224,66,195,117,203,194,236,81,36,193};
	// k709YCbCr_To_RGB32f:
	//    0.00456621   0.0          0.00703036
	//    0.00456621  -0.00083627  -0.00208984
	//    0.00456621   0.00828393   0.0
	.const .align 4 .b8 k709YCbCr_To_RGB32f[36] = {37,160,149,59,0,0,0,0,239,94,230,59,37,160,149,59,33,57,91,186,178,245,8,187,37,160,149,59,82,185,7,60,0,0,0,0};
	// kRGB8u_To_709YCbCr: 36 raw bytes (.b8), 4-byte aligned. Decoded as nine
	// little-endian IEEE-754 f32 words (rounded; the three-by-three grouping is
	// for readability only -- TODO confirm layout against the reading code):
	//    0.182586   0.614231   0.062008
	//   -0.100633  -0.338570   0.439215
	//    0.439215  -0.398941  -0.040273
	//
	// FIX: word 3 was stored as bytes 147,24,206,61 (+0.100633), i.e. with the
	// sign bit clear. The same position is negative in every sibling
	// RGB-to-YCbCr/YPbPr table in this file (e.g. -25.664 in
	// kRGB32f_To_709YCbCr), and with a positive value the second group of three
	// sums to ~+0.2013 instead of ~0. Only the sign bit is changed (61 -> 189);
	// the magnitude is left exactly as it was.
	// NOTE(review): this file is compiler output, so the same correction must
	// be made in the CUDA source that defines this constant.
	.const .align 4 .b8 kRGB8u_To_709YCbCr[36] = {207,247,58,62,53,62,29,63,231,251,125,61,147,24,206,189,23,89,173,190,197,224,224,62,197,224,224,62,12,66,204,190,195,245,36,189};
	// Each table below is 36 raw bytes (.b8), 4-byte aligned. Decoded values
	// assume every 4-byte group is one little-endian IEEE-754 f32 (rounded).
	// The three-by-three grouping is for readability only -- the code that
	// indexes these tables is not visible here; TODO confirm layout.
	//
	// k709YCbCr_To_RGB8u:
	//    1.164383   0.0        1.792742
	//    1.164383  -0.213250  -0.532910
	//    1.164383   2.112402   0.0
	.const .align 4 .b8 k709YCbCr_To_RGB8u[36] = {127,10,149,63,0,0,0,0,147,120,229,63,127,10,149,63,53,94,90,190,205,108,8,191,127,10,149,63,154,49,7,64,0,0,0,0};
	// k709YCbCr_To_601YCbCr:
	//    1.0   0.099312   0.191700
	//    0.0   0.989854  -0.110653
	//    0.0  -0.072453   0.983398
	.const .align 4 .b8 k709YCbCr_To_601YCbCr[36] = {0,0,128,63,23,100,203,61,1,77,68,62,0,0,0,0,18,103,125,63,10,158,226,189,0,0,0,0,61,98,148,189,249,191,123,63};
	// k601YCbCr_To_709YCbCr:
	//    1.0  -0.115550  -0.207938
	//    0.0   1.018640   0.114618
	//    0.0   0.075049   1.025327
	.const .align 4 .b8 k601YCbCr_To_709YCbCr[36] = {0,0,128,63,122,165,236,189,179,237,84,190,0,0,0,0,204,98,130,63,216,188,234,61,0,0,0,0,74,179,153,61,234,61,131,63};
	// Two 12-byte tables (.b8, 4-byte aligned). Read as three little-endian
	// IEEE-754 f32 words each, they decode exactly to the values shown.
	// How they are combined with the conversion tables is not visible in this
	// part of the file -- presumably per-channel offsets; verify against the users.
	//
	// kYCbCrOffset:          16.0  128.0  128.0
	.const .align 4 .b8 kYCbCrOffset[12] = {0,0,128,65,0,0,0,67,0,0,0,67};
	// kYCbCrFullRangeOffset:  0.0  128.0  128.0
	.const .align 4 .b8 kYCbCrFullRangeOffset[12] = {0,0,0,0,0,0,0,67,0,0,0,67};
	// Each table below is 36 raw bytes (.b8), 4-byte aligned. Decoded values
	// assume every 4-byte group is one little-endian IEEE-754 f32 (rounded).
	// The three-by-three grouping is for readability only -- the code that
	// indexes these tables is not visible here; TODO confirm layout.
	//
	// kRGB32f_To_YIQ:
	//    0.299      0.587      0.114
	//    0.595716  -0.274453  -0.321263
	//    0.211456  -0.522591   0.311135
	.const .align 4 .b8 kRGB32f_To_YIQ[36] = {135,22,153,62,162,69,22,63,213,120,233,61,216,128,24,63,27,133,140,190,149,124,164,190,236,135,88,62,134,200,5,191,22,77,159,62};
	// kYIQ_To_RGB32f:
	//    1.0   0.9563   0.6210
	//    1.0  -0.2721  -0.6474
	//    1.0  -1.1070   1.7046
	.const .align 4 .b8 kYIQ_To_RGB32f[36] = {0,0,128,63,20,208,116,63,219,249,30,63,0,0,128,63,177,80,139,190,2,188,37,191,0,0,128,63,45,178,141,191,85,48,218,63};

