	.version 2.2
	.target sm_20
	// compiled with ../../../External/3rdParty/NVIDIA/CUDA/win/bin/../open64/lib//be.exe
	// nvopencc 3.2 built on 2010-11-04

	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])

	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])

	.visible .func (.param .align 16 .b8 __cudaretf__Z9ReadPixelPK6float417DevicePixelFormati[16]) _Z9ReadPixelPK6float417DevicePixelFormati (.param .u64 __cudaparmf1__Z9ReadPixelPK6float417DevicePixelFormati, .param .u32 __cudaparmf2__Z9ReadPixelPK6float417DevicePixelFormati, .param .s32 __cudaparmf3__Z9ReadPixelPK6float417DevicePixelFormati)

	.visible .func _Z10WritePixel6float417DevicePixelFormatPS_i (.param .align 16 .b8 __cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i[16], .param .u32 __cudaparmf2__Z10WritePixel6float417DevicePixelFormatPS_i, .param .u64 __cudaparmf3__Z10WritePixel6float417DevicePixelFormatPS_i, .param .s32 __cudaparmf4__Z10WritePixel6float417DevicePixelFormatPS_i)

	.visible .func _Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff (.param .u64 __cudaparmf1__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf2__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf3__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf4__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u64 __cudaparmf5__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf6__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf7__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf8__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u32 __cudaparmf9__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf10__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf11__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf12__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf13__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf14__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf15__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf16__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf17__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf18__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff)

	.visible .func _Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff (.param .u64 __cudaparmf1__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf2__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf3__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf4__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u64 __cudaparmf5__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf6__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf7__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf8__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u32 __cudaparmf9__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf10__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf11__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf12__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf13__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf14__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf15__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf16__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf17__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf18__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff)

	//-----------------------------------------------------------
	// Compiling C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00002374_00000000-11_GaussianBlur.cpp3.i (C:/Users/dvaeng/AppData/Local/Temp/ccBI#.a15676)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	"C:/Users/dvaeng/AppData/Local/Temp/tmpxft_00002374_00000000-10_GaussianBlur.cudafe2.gpu"
	.file	2	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/PixelFormat.h"
	.file	3	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/PixelRGB.h"
	.file	4	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
	.file	5	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\crt/device_runtime.h"
	.file	6	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\host_defines.h"
	.file	7	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\builtin_types.h"
	.file	8	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_types.h"
	.file	9	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\driver_types.h"
	.file	10	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_types.h"
	.file	11	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_types.h"
	.file	12	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\vector_types.h"
	.file	13	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\builtin_types.h"
	.file	14	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\host_defines.h"
	.file	15	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\device_launch_parameters.h"
	.file	16	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\crt\storage_class.h"
	.file	17	"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
	.file	18	"c:\Mulder64\shared\adobe\MediaCore\GPUFoundation\API\Inc\GPUFoundation/KernelSupport/Utils.h"
	.file	19	"c:/Mulder64/shared/adobe/MediaCore/GPUFoundation/Src/ImageProcessing/GaussianBlur.cu"
	.file	20	"c:\Mulder64\shared\adobe\MediaCore\External\3rdParty\NVIDIA\CUDA\win\include\common_functions.h"
	.file	21	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions.h"
	.file	22	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_constants.h"
	.file	23	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\device_functions.h"
	.file	24	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_11_atomic_functions.h"
	.file	25	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_12_atomic_functions.h"
	.file	26	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_13_double_functions.h"
	.file	27	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_atomic_functions.h"
	.file	28	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\sm_20_intrinsics.h"
	.file	29	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\surface_functions.h"
	.file	30	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\texture_fetch_functions.h"
	.file	31	"c:\mulder64\shared\adobe\mediacore\external\3rdparty\nvidia\cuda\win\include\math_functions_dbl_ptx3.h"


	.visible .func (.param .s32 __cudaretf__Z15IntegerMultiplyii) _Z15IntegerMultiplyii (.param .s32 __cudaparmf1__Z15IntegerMultiplyii, .param .s32 __cudaparmf2__Z15IntegerMultiplyii)
	{
	// IntegerMultiply(int, int)
	//   In : parmf1 = left factor, parmf2 = right factor (both .s32)
	//   Out: retf   = low 32 bits of the signed product (mul.lo wraps on overflow)
	.reg .s32 %lhs;
	.reg .s32 %rhs;
	.reg .s32 %prod;
	.loc	18	60	0
$LDWbegin__Z15IntegerMultiplyii:
	ld.param.s32 	%lhs, [__cudaparmf1__Z15IntegerMultiplyii];
	ld.param.s32 	%rhs, [__cudaparmf2__Z15IntegerMultiplyii];
	.loc	18	64	0
	mul.lo.s32 	%prod, %lhs, %rhs;
	st.param.s32 	[__cudaretf__Z15IntegerMultiplyii], %prod;
	ret;
$LDWend__Z15IntegerMultiplyii:
	} // _Z15IntegerMultiplyii

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelXv) _Z17Standard2DKernelXv ()
	{
	// Standard2DKernelX()
	//   Out: retf = %ctaid.x * %ntid.x + %tid.x  (low 32 bits)
	//   Takes no parameters; the result depends only on the x components of
	//   the block index, block size and thread index special registers.
	.reg .u32 %cta;
	.reg .u32 %ctasize;
	.reg .u32 %thread;
	.reg .u32 %index;
	.loc	18	73	0
$LDWbegin__Z17Standard2DKernelXv:
	.loc	18	74	0
	mov.u32 	%cta, %ctaid.x;
	mov.u32 	%ctasize, %ntid.x;
	mov.u32 	%thread, %tid.x;
	// index = cta * ctasize + thread, truncated to 32 bits
	mad.lo.u32 	%index, %cta, %ctasize, %thread;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelXv], %index;
	ret;
$LDWend__Z17Standard2DKernelXv:
	} // _Z17Standard2DKernelXv

	.visible .func (.param .s32 __cudaretf__Z17Standard2DKernelYv) _Z17Standard2DKernelYv ()
	{
	// Standard2DKernelY()
	//   Out: retf = %ctaid.y * %ntid.y + %tid.y  (low 32 bits)
	//   Takes no parameters; the result depends only on the y components of
	//   the block index, block size and thread index special registers.
	.reg .u32 %cta;
	.reg .u32 %ctasize;
	.reg .u32 %thread;
	.reg .u32 %index;
	.loc	18	77	0
$LDWbegin__Z17Standard2DKernelYv:
	.loc	18	78	0
	mov.u32 	%cta, %ctaid.y;
	mov.u32 	%ctasize, %ntid.y;
	mov.u32 	%thread, %tid.y;
	// index = cta * ctasize + thread, truncated to 32 bits
	mad.lo.u32 	%index, %cta, %ctasize, %thread;
	st.param.s32 	[__cudaretf__Z17Standard2DKernelYv], %index;
	ret;
$LDWend__Z17Standard2DKernelYv:
	} // _Z17Standard2DKernelYv

	.visible .func (.param .align 16 .b8 __cudaretf__Z13Half4ToFloat47ushort4[16]) _Z13Half4ToFloat47ushort4 (.param .align 8 .b8 __cudaparmf1__Z13Half4ToFloat47ushort4[8])
	{
	// Half4ToFloat4(ushort4)
	//   In : parmf1 = four 16-bit values at byte offsets 0, 2, 4, 6
	//   Out: retf   = four f32 values at byte offsets 0, 4, 8, 12; element i
	//                 is input element i reinterpreted as f16 and widened to
	//                 f32 (subnormals flushed to zero by .ftz)
	.reg .u32 %raw<4>;
	.reg .b32 %half<4>;
	.reg .f32 %wide<4>;
	.loc	18	86	0
$LDWbegin__Z13Half4ToFloat47ushort4:
	// 16-bit loads zero-extend into the 32-bit registers.
	ld.param.u16 	%raw0, [__cudaparmf1__Z13Half4ToFloat47ushort4+0];
	ld.param.u16 	%raw1, [__cudaparmf1__Z13Half4ToFloat47ushort4+2];
	ld.param.u16 	%raw2, [__cudaparmf1__Z13Half4ToFloat47ushort4+4];
	ld.param.u16 	%raw3, [__cudaparmf1__Z13Half4ToFloat47ushort4+6];
	.loc	18	87	0
	// cvt with an .f16 source type needs a bit-typed register, hence the
	// copy into %half<i> before each conversion.
	mov.b32 	%half0, %raw0;
	cvt.ftz.f32.f16 	%wide0, %half0;
	mov.b32 	%half1, %raw1;
	cvt.ftz.f32.f16 	%wide1, %half1;
	mov.b32 	%half2, %raw2;
	cvt.ftz.f32.f16 	%wide2, %half2;
	mov.b32 	%half3, %raw3;
	cvt.ftz.f32.f16 	%wide3, %half3;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+0], %wide0;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+4], %wide1;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+8], %wide2;
	st.param.f32 	[__cudaretf__Z13Half4ToFloat47ushort4+12], %wide3;
	ret;
$LDWend__Z13Half4ToFloat47ushort4:
	} // _Z13Half4ToFloat47ushort4

	.visible .func (.param .align 8 .b8 __cudaretf__Z13Float4ToHalf46float4[8]) _Z13Float4ToHalf46float4 (.param .align 16 .b8 __cudaparmf1__Z13Float4ToHalf46float4[16])
	{
	// Float4ToHalf4(float4)
	//   In : parmf1 = four f32 values at byte offsets 0, 4, 8, 12
	//   Out: retf   = four 16-bit values at byte offsets 0, 2, 4, 6; element i
	//                 is input element i converted to f16 with
	//                 round-to-nearest-even (.rn) and flush-to-zero (.ftz)
	.reg .f32 %wide<4>;
	.reg .b32 %half<4>;
	.reg .u32 %raw<4>;
	.loc	18	95	0
$LDWbegin__Z13Float4ToHalf46float4:
	ld.param.f32 	%wide0, [__cudaparmf1__Z13Float4ToHalf46float4+0];
	ld.param.f32 	%wide1, [__cudaparmf1__Z13Float4ToHalf46float4+4];
	ld.param.f32 	%wide2, [__cudaparmf1__Z13Float4ToHalf46float4+8];
	ld.param.f32 	%wide3, [__cudaparmf1__Z13Float4ToHalf46float4+12];
	.loc	18	96	0
	// The f16 result lands in a bit-typed register, is copied to an integer
	// register, and only its low 16 bits are written by st.param.u16.
	cvt.rn.ftz.f16.f32 	%half0, %wide0;
	mov.b32 	%raw0, %half0;
	cvt.rn.ftz.f16.f32 	%half1, %wide1;
	mov.b32 	%raw1, %half1;
	cvt.rn.ftz.f16.f32 	%half2, %wide2;
	mov.b32 	%raw2, %half2;
	cvt.rn.ftz.f16.f32 	%half3, %wide3;
	mov.b32 	%raw3, %half3;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+0], %raw0;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+2], %raw1;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+4], %raw2;
	st.param.u16 	[__cudaretf__Z13Float4ToHalf46float4+6], %raw3;
	ret;
$LDWend__Z13Float4ToHalf46float4:
	} // _Z13Float4ToHalf46float4

	// Mix3(unsigned int&, unsigned int&, unsigned int&)
	//   In : parmf1, parmf2, parmf3 = 64-bit addresses of three u32 words,
	//        called a, b and c in the comments below. They are accessed with
	//        generic-address ld/st (no state-space qualifier).
	//   Out: all three words are updated in place; retf = final value of c.
	//
	// The body is nine rounds of the form
	//        x -= y;  x -= z;  x ^= (z shifted);
	// cycling x through a, b, c three times. The shift schedule
	// (>>13, <<8, >>13, >>12, <<16, >>5, >>3, <<10, >>15) matches Bob Jenkins'
	// 96-bit mix() -- NOTE(review): confirm against Utils.h.
	//
	// Every intermediate value is stored back and the other words are
	// re-loaded before each use. Presumably the compiler could not prove the
	// three references do not alias; the load/store order is therefore
	// observable and must not be rearranged.
	.visible .func (.param .u32 __cudaretf__Z4Mix3RjS_S_) _Z4Mix3RjS_S_ (.param .u64 __cudaparmf1__Z4Mix3RjS_S_, .param .u64 __cudaparmf2__Z4Mix3RjS_S_, .param .u64 __cudaparmf3__Z4Mix3RjS_S_)
	{
	.reg .u32 %r<75>;
	.reg .u64 %rd<8>;
	.loc	18	138	0
$LDWbegin__Z4Mix3RjS_S_:
	// %rd2 = &a, %rd4 = &b, %rd6 = &c
	ld.param.u64 	%rd1, [__cudaparmf1__Z4Mix3RjS_S_];
	mov.s64 	%rd2, %rd1;
	ld.param.u64 	%rd3, [__cudaparmf2__Z4Mix3RjS_S_];
	mov.s64 	%rd4, %rd3;
	ld.param.u64 	%rd5, [__cudaparmf3__Z4Mix3RjS_S_];
	mov.s64 	%rd6, %rd5;
	.loc	18	139	0
	// round 1: a -= b; a -= c; a ^= c >> 13
	ld.u32 	%r1, [%rd2+0];
	ld.u32 	%r2, [%rd4+0];
	sub.u32 	%r3, %r1, %r2;
	st.u32 	[%rd2+0], %r3;
	ld.u32 	%r4, [%rd6+0];
	sub.u32 	%r5, %r3, %r4;
	st.u32 	[%rd2+0], %r5;
	ld.u32 	%r6, [%rd6+0];
	shr.u32 	%r7, %r6, 13;
	xor.b32 	%r8, %r5, %r7;
	st.u32 	[%rd2+0], %r8;
	.loc	18	140	0
	// round 2: b -= c; b -= a; b ^= a << 8
	ld.u32 	%r9, [%rd4+0];
	ld.u32 	%r10, [%rd6+0];
	sub.u32 	%r11, %r9, %r10;
	st.u32 	[%rd4+0], %r11;
	ld.u32 	%r12, [%rd2+0];
	sub.u32 	%r13, %r11, %r12;
	st.u32 	[%rd4+0], %r13;
	ld.u32 	%r14, [%rd2+0];
	shl.b32 	%r15, %r14, 8;
	xor.b32 	%r16, %r13, %r15;
	st.u32 	[%rd4+0], %r16;
	.loc	18	141	0
	// round 3: c -= a; c -= b; c ^= b >> 13
	ld.u32 	%r17, [%rd6+0];
	ld.u32 	%r18, [%rd2+0];
	sub.u32 	%r19, %r17, %r18;
	st.u32 	[%rd6+0], %r19;
	ld.u32 	%r20, [%rd4+0];
	sub.u32 	%r21, %r19, %r20;
	st.u32 	[%rd6+0], %r21;
	ld.u32 	%r22, [%rd4+0];
	shr.u32 	%r23, %r22, 13;
	xor.b32 	%r24, %r21, %r23;
	st.u32 	[%rd6+0], %r24;
	.loc	18	142	0
	// round 4: a -= b; a -= c; a ^= c >> 12
	ld.u32 	%r25, [%rd2+0];
	ld.u32 	%r26, [%rd4+0];
	sub.u32 	%r27, %r25, %r26;
	st.u32 	[%rd2+0], %r27;
	ld.u32 	%r28, [%rd6+0];
	sub.u32 	%r29, %r27, %r28;
	st.u32 	[%rd2+0], %r29;
	ld.u32 	%r30, [%rd6+0];
	shr.u32 	%r31, %r30, 12;
	xor.b32 	%r32, %r29, %r31;
	st.u32 	[%rd2+0], %r32;
	.loc	18	143	0
	// round 5: b -= c; b -= a; b ^= a << 16
	ld.u32 	%r33, [%rd4+0];
	ld.u32 	%r34, [%rd6+0];
	sub.u32 	%r35, %r33, %r34;
	st.u32 	[%rd4+0], %r35;
	ld.u32 	%r36, [%rd2+0];
	sub.u32 	%r37, %r35, %r36;
	st.u32 	[%rd4+0], %r37;
	ld.u32 	%r38, [%rd2+0];
	shl.b32 	%r39, %r38, 16;
	xor.b32 	%r40, %r37, %r39;
	st.u32 	[%rd4+0], %r40;
	.loc	18	144	0
	// round 6: c -= a; c -= b; c ^= b >> 5
	ld.u32 	%r41, [%rd6+0];
	ld.u32 	%r42, [%rd2+0];
	sub.u32 	%r43, %r41, %r42;
	st.u32 	[%rd6+0], %r43;
	ld.u32 	%r44, [%rd4+0];
	sub.u32 	%r45, %r43, %r44;
	st.u32 	[%rd6+0], %r45;
	ld.u32 	%r46, [%rd4+0];
	shr.u32 	%r47, %r46, 5;
	xor.b32 	%r48, %r45, %r47;
	st.u32 	[%rd6+0], %r48;
	.loc	18	145	0
	// round 7: a -= b; a -= c; a ^= c >> 3
	ld.u32 	%r49, [%rd2+0];
	ld.u32 	%r50, [%rd4+0];
	sub.u32 	%r51, %r49, %r50;
	st.u32 	[%rd2+0], %r51;
	ld.u32 	%r52, [%rd6+0];
	sub.u32 	%r53, %r51, %r52;
	st.u32 	[%rd2+0], %r53;
	ld.u32 	%r54, [%rd6+0];
	shr.u32 	%r55, %r54, 3;
	xor.b32 	%r56, %r53, %r55;
	st.u32 	[%rd2+0], %r56;
	.loc	18	146	0
	// round 8: b -= c; b -= a; b ^= a << 10
	ld.u32 	%r57, [%rd4+0];
	ld.u32 	%r58, [%rd6+0];
	sub.u32 	%r59, %r57, %r58;
	st.u32 	[%rd4+0], %r59;
	ld.u32 	%r60, [%rd2+0];
	sub.u32 	%r61, %r59, %r60;
	st.u32 	[%rd4+0], %r61;
	ld.u32 	%r62, [%rd2+0];
	shl.b32 	%r63, %r62, 10;
	xor.b32 	%r64, %r61, %r63;
	st.u32 	[%rd4+0], %r64;
	.loc	18	147	0
	// round 9: c -= a; c -= b; c ^= b >> 15
	ld.u32 	%r65, [%rd6+0];
	ld.u32 	%r66, [%rd2+0];
	sub.u32 	%r67, %r65, %r66;
	st.u32 	[%rd6+0], %r67;
	ld.u32 	%r68, [%rd4+0];
	sub.u32 	%r69, %r67, %r68;
	st.u32 	[%rd6+0], %r69;
	ld.u32 	%r70, [%rd4+0];
	shr.u32 	%r71, %r70, 15;
	xor.b32 	%r72, %r69, %r71;
	st.u32 	[%rd6+0], %r72;
	.loc	18	148	0
	// Return the value just written to c (taken from the register, not re-loaded).
	mov.s32 	%r73, %r72;
	st.param.u32 	[__cudaretf__Z4Mix3RjS_S_], %r73;
	ret;
$LDWend__Z4Mix3RjS_S_:
	} // _Z4Mix3RjS_S_

	.visible .func (.param .s32 __cudaretf__Z4Randj) _Z4Randj (.param .u32 __cudaparmf1__Z4Randj)
	{
	// Rand(unsigned int)
	//   In : parmf1 = 32-bit seed
	//   Out: retf   = (hi << 7) ^ lo, a value in [0, 32767], where
	//          hi = ((seed * 1103515245 + 12345)        >> 16) & 255
	//          lo = ((seed * -1029531031 - 740551042)   >> 16) & 255
	//        All arithmetic wraps modulo 2^32.
	.reg .u32 %seed;
	.reg .u32 %hi;
	.reg .u32 %lo;
	.loc	18	152	0
$LDWbegin__Z4Randj:
	ld.param.u32 	%seed, [__cudaparmf1__Z4Randj];
	.loc	18	163	0
	// High field: bits 16..23 of the first affine step, moved up by 7.
	mul.lo.u32 	%hi, %seed, 1103515245;
	add.u32 	%hi, %hi, 12345;
	shr.u32 	%hi, %hi, 16;
	and.b32 	%hi, %hi, 255;
	shl.b32 	%hi, %hi, 7;
	// Low field: bits 16..23 of the second affine step.
	mul.lo.u32 	%lo, %seed, -1029531031;
	sub.u32 	%lo, %lo, 740551042;
	shr.u32 	%lo, %lo, 16;
	and.b32 	%lo, %lo, 255;
	xor.b32 	%hi, %hi, %lo;
	st.param.s32 	[__cudaretf__Z4Randj], %hi;
	ret;
$LDWend__Z4Randj:
	} // _Z4Randj

	.visible .func (.param .s32 __cudaretf__Z6Rand2Djjj) _Z6Rand2Djjj (.param .u32 __cudaparmf1__Z6Rand2Djjj, .param .u32 __cudaparmf2__Z6Rand2Djjj, .param .u32 __cudaparmf3__Z6Rand2Djjj)
	{
	// Rand2D(unsigned int, unsigned int, unsigned int)
	//   In : parmf1, parmf2, parmf3 = three 32-bit words (a, b, c below)
	//   Out: retf = value in [0, 32767]
	//   Runs nine subtract/subtract/xor-shift rounds over (a, b, c) in
	//   registers, then applies to the final c the same two affine steps and
	//   8-bit field extraction that _Z4Randj performs.
	.reg .u32 %a;
	.reg .u32 %b;
	.reg .u32 %c;
	.reg .u32 %t;
	.reg .u32 %hi;
	.reg .u32 %lo;
	.loc	18	169	0
$LDWbegin__Z6Rand2Djjj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Djjj];
	ld.param.u32 	%b, [__cudaparmf2__Z6Rand2Djjj];
	ld.param.u32 	%c, [__cudaparmf3__Z6Rand2Djjj];
	.loc	18	139	0
	// a -= b; a -= c; a ^= c >> 13
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 13;
	xor.b32 	%a, %a, %t;
	.loc	18	140	0
	// b -= c; b -= a; b ^= a << 8
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 8;
	xor.b32 	%b, %b, %t;
	.loc	18	141	0
	// c -= a; c -= b; c ^= b >> 13
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 13;
	xor.b32 	%c, %c, %t;
	.loc	18	142	0
	// a -= b; a -= c; a ^= c >> 12
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 12;
	xor.b32 	%a, %a, %t;
	.loc	18	143	0
	// b -= c; b -= a; b ^= a << 16
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 16;
	xor.b32 	%b, %b, %t;
	.loc	18	144	0
	// c -= a; c -= b; c ^= b >> 5
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 5;
	xor.b32 	%c, %c, %t;
	.loc	18	145	0
	// a -= b; a -= c; a ^= c >> 3
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 3;
	xor.b32 	%a, %a, %t;
	.loc	18	146	0
	// b -= c; b -= a; b ^= a << 10
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 10;
	xor.b32 	%b, %b, %t;
	.loc	18	147	0
	// c -= a; c -= b; c ^= b >> 15
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 15;
	xor.b32 	%c, %c, %t;
	.loc	18	170	0
	// Fold the mixed word c down to the 15-bit result.
	mul.lo.u32 	%hi, %c, 1103515245;
	add.u32 	%hi, %hi, 12345;
	shr.u32 	%hi, %hi, 16;
	and.b32 	%hi, %hi, 255;
	shl.b32 	%hi, %hi, 7;
	mul.lo.u32 	%lo, %c, -1029531031;
	sub.u32 	%lo, %lo, 740551042;
	shr.u32 	%lo, %lo, 16;
	and.b32 	%lo, %lo, 255;
	xor.b32 	%hi, %hi, %lo;
	st.param.s32 	[__cudaretf__Z6Rand2Djjj], %hi;
	ret;
$LDWend__Z6Rand2Djjj:
	} // _Z6Rand2Djjj

	.visible .func (.param .s32 __cudaretf__Z6Rand2Dj) _Z6Rand2Dj (.param .u32 __cudaparmf1__Z6Rand2Dj)
	{
	// Rand2D(unsigned int)
	//   In : parmf1 = 32-bit seed
	//   Out: retf   = value in [0, 32767]
	//   Seeds three words as
	//        a = seed
	//        b = %ctaid.x * %ntid.x + %tid.x
	//        c = %ctaid.y * %ntid.y + %tid.y
	//   then runs the same nine mixing rounds and final fold as the
	//   three-argument overload _Z6Rand2Djjj.
	.reg .u32 %a;
	.reg .u32 %b;
	.reg .u32 %c;
	.reg .u32 %t;
	.reg .u32 %u;
	.reg .u32 %hi;
	.reg .u32 %lo;
	.loc	18	175	0
$LDWbegin__Z6Rand2Dj:
	ld.param.u32 	%a, [__cudaparmf1__Z6Rand2Dj];
	.loc	18	143	0
	// b = x index, c = y index (low 32 bits of block * size + thread)
	mov.u32 	%t, %ctaid.x;
	mov.u32 	%u, %ntid.x;
	mov.u32 	%b, %tid.x;
	mad.lo.u32 	%b, %t, %u, %b;
	mov.u32 	%t, %ctaid.y;
	mov.u32 	%u, %ntid.y;
	mov.u32 	%c, %tid.y;
	mad.lo.u32 	%c, %t, %u, %c;
	// a -= b; a -= c; a ^= c >> 13
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 13;
	xor.b32 	%a, %a, %t;
	// b -= c; b -= a; b ^= a << 8
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 8;
	xor.b32 	%b, %b, %t;
	// c -= a; c -= b; c ^= b >> 13
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 13;
	xor.b32 	%c, %c, %t;
	// a -= b; a -= c; a ^= c >> 12
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 12;
	xor.b32 	%a, %a, %t;
	// b -= c; b -= a; b ^= a << 16
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 16;
	xor.b32 	%b, %b, %t;
	.loc	18	144	0
	// c -= a; c -= b; c ^= b >> 5
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 5;
	xor.b32 	%c, %c, %t;
	.loc	18	145	0
	// a -= b; a -= c; a ^= c >> 3
	sub.u32 	%a, %a, %b;
	sub.u32 	%a, %a, %c;
	shr.u32 	%t, %c, 3;
	xor.b32 	%a, %a, %t;
	.loc	18	146	0
	// b -= c; b -= a; b ^= a << 10
	sub.u32 	%b, %b, %c;
	sub.u32 	%b, %b, %a;
	shl.b32 	%t, %a, 10;
	xor.b32 	%b, %b, %t;
	.loc	18	147	0
	// c -= a; c -= b; c ^= b >> 15
	sub.u32 	%c, %c, %a;
	sub.u32 	%c, %c, %b;
	shr.u32 	%t, %b, 15;
	xor.b32 	%c, %c, %t;
	.loc	18	176	0
	// Fold the mixed word c down to the 15-bit result.
	mul.lo.u32 	%hi, %c, 1103515245;
	add.u32 	%hi, %hi, 12345;
	shr.u32 	%hi, %hi, 16;
	and.b32 	%hi, %hi, 255;
	shl.b32 	%hi, %hi, 7;
	mul.lo.u32 	%lo, %c, -1029531031;
	sub.u32 	%lo, %lo, 740551042;
	shr.u32 	%lo, %lo, 16;
	and.b32 	%lo, %lo, 255;
	xor.b32 	%hi, %hi, %lo;
	st.param.s32 	[__cudaretf__Z6Rand2Dj], %hi;
	ret;
$LDWend__Z6Rand2Dj:
	} // _Z6Rand2Dj

	.visible .func (.param .align 16 .b8 __cudaretf__Z18UnpremultiplyPixel8PixelRGB[16]) _Z18UnpremultiplyPixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z18UnpremultiplyPixel8PixelRGB[16])
	{
	// UnpremultiplyPixel(PixelRGB)
	//   In : parmf1 = four f32 values at byte offsets 0, 4, 8, 12. The value
	//        at +12 is the divisor (presumably alpha -- confirm in PixelRGB.h).
	//   Out: retf   = four f32 values, same layout.
	//   The divisor is clamped to [0, 1]. If clamped - 8e-6 <= 0, all four
	//   outputs are 0. Otherwise the first three values are multiplied by the
	//   approximate reciprocal of the clamped divisor and the clamped divisor
	//   is returned at +12.
	.reg .f32 %ch<3>;
	.reg .f32 %alpha;
	.reg .f32 %biased;
	.reg .f32 %inv;
	.reg .f32 %out<4>;
	.reg .pred %tiny;
	.loc	3	206	0
$LDWbegin__Z18UnpremultiplyPixel8PixelRGB:
	ld.param.f32 	%ch0, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z18UnpremultiplyPixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	add.ftz.f32 	%biased, %alpha, 0fb70637bd;    	// alpha - 8e-006
	setp.le.ftz.f32 	%tiny, %biased, 0f00000000;
	.loc	3	219	0
	// Default result: everything zero (kept when %tiny is set).
	mov.f32 	%out0, 0f00000000;
	mov.f32 	%out1, 0f00000000;
	mov.f32 	%out2, 0f00000000;
	mov.f32 	%out3, 0f00000000;
	.loc	3	213	0
	// Divisor is large enough: scale the three values by 1/alpha.
	@!%tiny rcp.approx.ftz.f32 	%inv, %alpha;
	@!%tiny mul.ftz.f32 	%out2, %inv, %ch2;
	.loc	3	214	0
	@!%tiny mul.ftz.f32 	%out1, %inv, %ch1;
	.loc	3	215	0
	@!%tiny mul.ftz.f32 	%out0, %inv, %ch0;
	@!%tiny mov.f32 	%out3, %alpha;
	.loc	3	224	0
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+0], %out0;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+4], %out1;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+8], %out2;
	st.param.f32 	[__cudaretf__Z18UnpremultiplyPixel8PixelRGB+12], %out3;
	ret;
$LDWend__Z18UnpremultiplyPixel8PixelRGB:
	} // _Z18UnpremultiplyPixel8PixelRGB

	.visible .func (.param .f32 __cudaretf__Z13ToLinearColorf) _Z13ToLinearColorf (.param .f32 __cudaparmf1__Z13ToLinearColorf)
	{
	// ToLinearColor(float)
	//   In : parmf1 = value v
	//   Out: retf   = 2^(2.2 * log2(v))       when v is not less than 0
	//                 -(2^(2.2 * log2(-v)))   when v < 0
	//   i.e. a sign-preserving power curve with exponent 2.2, evaluated with
	//   the approximate lg2/ex2 instructions and flush-to-zero.
	.reg .f32 %value;
	.reg .f32 %power;
	.reg .pred %negative;
	.loc	3	231	0
$LDWbegin__Z13ToLinearColorf:
	ld.param.f32 	%value, [__cudaparmf1__Z13ToLinearColorf];
	setp.lt.ftz.f32 	%negative, %value, 0f00000000;
	.loc	3	234	0
	// Work on the magnitude; the sign is restored after the power.
	@%negative neg.ftz.f32 	%value, %value;
	.loc	3	236	0
	lg2.approx.ftz.f32 	%power, %value;
	mul.ftz.f32 	%power, %power, 0f400ccccd;     	// 2.2
	ex2.approx.ftz.f32 	%value, %power;
	@%negative neg.ftz.f32 	%value, %value;
	st.param.f32 	[__cudaretf__Z13ToLinearColorf], %value;
	ret;
$LDWend__Z13ToLinearColorf:
	} // _Z13ToLinearColorf

	.visible .func (.param .f32 __cudaretf__Z15FromLinearColorf) _Z15FromLinearColorf (.param .f32 __cudaparmf1__Z15FromLinearColorf)
	{
	// FromLinearColor(float)
	//   In : parmf1 = value v
	//   Out: retf   = 2^(0.454545 * log2(v))       when v is not less than 0
	//                 -(2^(0.454545 * log2(-v)))   when v < 0
	//   i.e. a sign-preserving power curve with exponent 0.454545 (about
	//   1/2.2), evaluated with the approximate lg2/ex2 instructions and
	//   flush-to-zero.
	.reg .f32 %value;
	.reg .f32 %power;
	.reg .pred %negative;
	.loc	3	239	0
$LDWbegin__Z15FromLinearColorf:
	ld.param.f32 	%value, [__cudaparmf1__Z15FromLinearColorf];
	setp.lt.ftz.f32 	%negative, %value, 0f00000000;
	.loc	3	242	0
	// Work on the magnitude; the sign is restored after the power.
	@%negative neg.ftz.f32 	%value, %value;
	.loc	3	244	0
	lg2.approx.ftz.f32 	%power, %value;
	mul.ftz.f32 	%power, %power, 0f3ee8ba2e;     	// 0.454545
	ex2.approx.ftz.f32 	%value, %power;
	@%negative neg.ftz.f32 	%value, %value;
	st.param.f32 	[__cudaretf__Z15FromLinearColorf], %value;
	ret;
$LDWend__Z15FromLinearColorf:
	} // _Z15FromLinearColorf

	.visible .func (.param .align 16 .b8 __cudaretf__Z25PremultiplyLinearizePixel8PixelRGB[16]) _Z25PremultiplyLinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB[16])
	{
	// PremultiplyLinearizePixel(PixelRGB)
	//   In : parmf1 = four f32 values at byte offsets 0, 4, 8, 12. The value
	//        at +12 is the multiplier (presumably alpha -- confirm in
	//        PixelRGB.h).
	//   Out: retf   = four f32 values, same layout.
	//   Each of the first three values goes through the sign-preserving 2.2
	//   power curve (same sequence as _Z13ToLinearColorf) and is then
	//   multiplied by the multiplier clamped to [0, 1]. The clamped
	//   multiplier itself is returned at +12.
	.reg .f32 %ch<3>;
	.reg .f32 %alpha;
	.reg .f32 %power;
	.reg .pred %neg<3>;
	.loc	3	252	0
$LDWbegin__Z25PremultiplyLinearizePixel8PixelRGB:
	ld.param.f32 	%ch0, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z25PremultiplyLinearizePixel8PixelRGB+12];
	.loc	3	254	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	.loc	3	255	0
	// value at +0: sign(v) * |v|^2.2
	setp.lt.ftz.f32 	%neg0, %ch0, 0f00000000;
	@%neg0 neg.ftz.f32 	%ch0, %ch0;
	lg2.approx.ftz.f32 	%power, %ch0;
	mul.ftz.f32 	%power, %power, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch0, %power;
	@%neg0 neg.ftz.f32 	%ch0, %ch0;
	.loc	3	256	0
	// value at +4
	setp.lt.ftz.f32 	%neg1, %ch1, 0f00000000;
	@%neg1 neg.ftz.f32 	%ch1, %ch1;
	lg2.approx.ftz.f32 	%power, %ch1;
	mul.ftz.f32 	%power, %power, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch1, %power;
	@%neg1 neg.ftz.f32 	%ch1, %ch1;
	.loc	3	257	0
	// value at +8
	setp.lt.ftz.f32 	%neg2, %ch2, 0f00000000;
	@%neg2 neg.ftz.f32 	%ch2, %ch2;
	lg2.approx.ftz.f32 	%power, %ch2;
	mul.ftz.f32 	%power, %power, 0f400ccccd;    	// 2.2
	ex2.approx.ftz.f32 	%ch2, %power;
	@%neg2 neg.ftz.f32 	%ch2, %ch2;
	.loc	3	259	0
	// Scale by the clamped multiplier and write the result.
	mul.ftz.f32 	%ch0, %ch0, %alpha;
	mul.ftz.f32 	%ch1, %ch1, %alpha;
	mul.ftz.f32 	%ch2, %ch2, %alpha;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+0], %ch0;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+4], %ch1;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+8], %ch2;
	st.param.f32 	[__cudaretf__Z25PremultiplyLinearizePixel8PixelRGB+12], %alpha;
	ret;
$LDWend__Z25PremultiplyLinearizePixel8PixelRGB:
	} // _Z25PremultiplyLinearizePixel8PixelRGB

	.visible .func (.param .align 16 .b8 __cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16]) _Z29UnpremultiplyUnlinearizePixel8PixelRGB (.param .align 16 .b8 __cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB[16])
	{
	// UnpremultiplyUnlinearizePixel(PixelRGB)
	//   In : parmf1 = four f32 values at byte offsets 0, 4, 8, 12. The value
	//        at +12 is the divisor (presumably alpha -- confirm in PixelRGB.h).
	//   Out: retf   = four f32 values, same layout.
	//   Step 1 (same sequence as _Z18UnpremultiplyPixel8PixelRGB): clamp the
	//   divisor to [0, 1]; if clamped - 8e-6 <= 0 zero all four values,
	//   otherwise multiply the first three by the approximate reciprocal.
	//   Step 2 (same sequence as _Z15FromLinearColorf): run each of the first
	//   three values through the sign-preserving 0.454545 power curve.
	.reg .f32 %ch<3>;
	.reg .f32 %alpha;
	.reg .f32 %alpha_out;
	.reg .f32 %biased;
	.reg .f32 %inv;
	.reg .f32 %power;
	.reg .pred %tiny;
	.reg .pred %neg<3>;
	.loc	3	263	0
$LDWbegin__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	ld.param.f32 	%ch0, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0];
	ld.param.f32 	%ch1, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4];
	ld.param.f32 	%ch2, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8];
	ld.param.f32 	%alpha, [__cudaparmf1__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %alpha;
	mov.f32 	%alpha_out, %alpha;
	add.ftz.f32 	%biased, %alpha, 0fb70637bd;    	// alpha - 8e-006
	setp.le.ftz.f32 	%tiny, %biased, 0f00000000;
	.loc	3	213	0
	// Divisor is large enough: scale the three values by 1/alpha.
	@!%tiny rcp.approx.ftz.f32 	%inv, %alpha;
	@!%tiny mul.ftz.f32 	%ch2, %inv, %ch2;
	.loc	3	214	0
	@!%tiny mul.ftz.f32 	%ch1, %inv, %ch1;
	.loc	3	215	0
	@!%tiny mul.ftz.f32 	%ch0, %inv, %ch0;
	.loc	3	219	0
	// Divisor is (almost) zero: the whole pixel becomes zero.
	@%tiny mov.f32 	%ch2, 0f00000000;
	@%tiny mov.f32 	%ch1, 0f00000000;
	@%tiny mov.f32 	%ch0, 0f00000000;
	@%tiny mov.f32 	%alpha_out, 0f00000000;
	.loc	3	266	0
	// value at +0: sign(v) * |v|^0.454545
	setp.lt.ftz.f32 	%neg0, %ch0, 0f00000000;
	@%neg0 neg.ftz.f32 	%ch0, %ch0;
	lg2.approx.ftz.f32 	%power, %ch0;
	mul.ftz.f32 	%power, %power, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch0, %power;
	@%neg0 neg.ftz.f32 	%ch0, %ch0;
	.loc	3	267	0
	// value at +4
	setp.lt.ftz.f32 	%neg1, %ch1, 0f00000000;
	@%neg1 neg.ftz.f32 	%ch1, %ch1;
	lg2.approx.ftz.f32 	%power, %ch1;
	mul.ftz.f32 	%power, %power, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch1, %power;
	@%neg1 neg.ftz.f32 	%ch1, %ch1;
	.loc	3	268	0
	// value at +8
	setp.lt.ftz.f32 	%neg2, %ch2, 0f00000000;
	@%neg2 neg.ftz.f32 	%ch2, %ch2;
	lg2.approx.ftz.f32 	%power, %ch2;
	mul.ftz.f32 	%power, %power, 0f3ee8ba2e;    	// 0.454545
	ex2.approx.ftz.f32 	%ch2, %power;
	@%neg2 neg.ftz.f32 	%ch2, %ch2;
	.loc	3	269	0
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+0], %ch0;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+4], %ch1;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+8], %ch2;
	st.param.f32 	[__cudaretf__Z29UnpremultiplyUnlinearizePixel8PixelRGB+12], %alpha_out;
	ret;
$LDWend__Z29UnpremultiplyUnlinearizePixel8PixelRGB:
	} // _Z29UnpremultiplyUnlinearizePixel8PixelRGB

	.visible .func (.param .align 16 .b8 __cudaretf__Z20PremultiplyLinearize6float4[16]) _Z20PremultiplyLinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z20PremultiplyLinearize6float4[16])
	{
	// PremultiplyLinearize(float4): takes four packed f32 components
	// (byte offsets +0, +4, +8, +12 of the 16-byte parameter) and returns
	// four f32 components in the same layout.
	//   alpha      = saturate(in[+12])            (cvt.sat clamps to [0,1])
	//   out[+0..8] = signed_pow(in[+0..8], 2.2) * alpha
	//   out[+12]   = alpha
	// signed_pow(v, e) is evaluated as ex2(lg2(v) * e) using the approximate
	// hardware units; when v < 0 the magnitude is raised to the power and
	// the result is negated so the sign of the input is preserved.
	.reg .f32 	%in_x, %in_y, %in_z, %in_w;	// raw input components
	.reg .f32 	%alpha;				// saturated +12 component
	.reg .f32 	%mag, %lg, %pw;			// scratch for the pow sequence
	.reg .f32 	%lin_x, %lin_y, %lin_z;		// components after pow(.., 2.2)
	.reg .f32 	%out_x, %out_y, %out_z;		// components after alpha multiply
	.reg .pred 	%is_neg;			// current component < 0
	.loc	3	277	0
$LDWbegin__Z20PremultiplyLinearize6float4:
	ld.param.f32 	%in_x, [__cudaparmf1__Z20PremultiplyLinearize6float4+0];
	ld.param.f32 	%in_y, [__cudaparmf1__Z20PremultiplyLinearize6float4+4];
	ld.param.f32 	%in_z, [__cudaparmf1__Z20PremultiplyLinearize6float4+8];
	ld.param.f32 	%in_w, [__cudaparmf1__Z20PremultiplyLinearize6float4+12];
	.loc	3	254	0
	cvt.ftz.sat.f32.f32 	%alpha, %in_w;
	// ---- component at +0 ----
	.loc	3	255	0
	setp.lt.ftz.f32 	%is_neg, %in_x, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_14_4098;
	.loc	3	234	0
	neg.ftz.f32 	%mag, %in_x;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%lin_x, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_191_5;
$Lt_14_4098:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lg, %in_x;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%lin_x, %lg;
$LDWendi___log2f_191_5:
	// ---- component at +4 ----
	.loc	3	256	0
	setp.lt.ftz.f32 	%is_neg, %in_y, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_14_4610;
	.loc	3	234	0
	neg.ftz.f32 	%mag, %in_y;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%lin_y, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_191_3;
$Lt_14_4610:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lg, %in_y;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%lin_y, %lg;
$LDWendi___log2f_191_3:
	// ---- component at +8 ----
	.loc	3	257	0
	setp.lt.ftz.f32 	%is_neg, %in_z, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_14_5122;
	.loc	3	234	0
	neg.ftz.f32 	%mag, %in_z;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%lin_z, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_191_1;
$Lt_14_5122:
	.loc	3	236	0
	lg2.approx.ftz.f32 	%lg, %in_z;
	mul.ftz.f32 	%lg, %lg, 0f400ccccd;		// 2.2
	ex2.approx.ftz.f32 	%lin_z, %lg;
$LDWendi___log2f_191_1:
	// Scale the three converted components by the clamped alpha.
	.loc	3	259	0
	mul.ftz.f32 	%out_z, %lin_z, %alpha;
	mul.ftz.f32 	%out_y, %lin_y, %alpha;
	.loc	3	278	0
	mul.ftz.f32 	%out_x, %lin_x, %alpha;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+0], %out_x;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+4], %out_y;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+8], %out_z;
	st.param.f32 	[__cudaretf__Z20PremultiplyLinearize6float4+12], %alpha;
	ret;
$LDWend__Z20PremultiplyLinearize6float4:
	} // _Z20PremultiplyLinearize6float4

	.visible .func (.param .align 16 .b8 __cudaretf__Z24UnpremultiplyUnlinearize6float4[16]) _Z24UnpremultiplyUnlinearize6float4 (.param .align 16 .b8 __cudaparmf1__Z24UnpremultiplyUnlinearize6float4[16])
	{
	// UnpremultiplyUnlinearize(float4): takes four packed f32 components
	// (byte offsets +0, +4, +8, +12 of the 16-byte parameter) and returns
	// four f32 components in the same layout.
	//   alpha = saturate(in[+12])
	//   if (alpha - 8e-6 <= 0)  -> all four outputs are 0
	//   else  out[+0..8] = signed_pow(in[+0..8] * rcp(alpha), 0.454545)
	//         out[+12]   = alpha
	// signed_pow(v, e) is evaluated as ex2(lg2(v) * e) using the approximate
	// hardware units; when v < 0 the magnitude is raised to the power and
	// the result is negated so the sign of the input is preserved.
	.reg .f32 	%in_x, %in_y, %in_z, %in_w;	// raw input components
	.reg .f32 	%alpha, %alpha_out;		// clamped alpha / value returned at +12
	.reg .f32 	%biased, %inv_alpha;		// alpha - 8e-6, 1/alpha
	.reg .f32 	%str_x, %str_y, %str_z;		// components after dividing by alpha
	.reg .f32 	%mag, %lg, %pw;			// scratch for the pow sequence
	.reg .f32 	%out_x, %out_y, %out_z;		// final colour components
	.reg .pred 	%alpha_tiny, %is_neg;
	.loc	3	284	0
$LDWbegin__Z24UnpremultiplyUnlinearize6float4:
	ld.param.f32 	%in_x, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+0];
	ld.param.f32 	%in_y, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+4];
	ld.param.f32 	%in_z, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+8];
	ld.param.f32 	%in_w, [__cudaparmf1__Z24UnpremultiplyUnlinearize6float4+12];
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%alpha, %in_w;
	mov.f32 	%alpha_out, %alpha;
	// Treat alpha at or below 8e-6 as fully transparent to avoid dividing by ~0.
	add.ftz.f32 	%biased, %alpha, 0fb70637bd;	// -8e-006
	setp.le.ftz.f32 	%alpha_tiny, %biased, 0f00000000;	// 0
	@%alpha_tiny bra 	$Lt_15_5122;
	.loc	3	213	0
	rcp.approx.ftz.f32 	%inv_alpha, %alpha;
	mul.ftz.f32 	%str_z, %inv_alpha, %in_z;
	.loc	3	214	0
	mul.ftz.f32 	%str_y, %inv_alpha, %in_y;
	.loc	3	215	0
	mul.ftz.f32 	%str_x, %inv_alpha, %in_x;
	bra.uni 	$Lt_15_4866;
$Lt_15_5122:
	// Transparent pixel: every component, including alpha, becomes 0.
	.loc	3	219	0
	mov.f32 	%str_z, 0f00000000;		// 0
	mov.f32 	%str_y, 0f00000000;		// 0
	mov.f32 	%str_x, 0f00000000;		// 0
	mov.f32 	%alpha_out, 0f00000000;		// 0
$Lt_15_4866:
	// ---- component at +0 ----
	.loc	3	266	0
	setp.lt.ftz.f32 	%is_neg, %str_x, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_15_5378;
	.loc	3	242	0
	neg.ftz.f32 	%mag, %str_x;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%out_x, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_192_5;
$Lt_15_5378:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%lg, %str_x;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%out_x, %lg;
$LDWendi___log2f_192_5:
	// ---- component at +4 ----
	.loc	3	267	0
	setp.lt.ftz.f32 	%is_neg, %str_y, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_15_5890;
	.loc	3	242	0
	neg.ftz.f32 	%mag, %str_y;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%out_y, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_192_3;
$Lt_15_5890:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%lg, %str_y;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%out_y, %lg;
$LDWendi___log2f_192_3:
	// ---- component at +8 ----
	.loc	3	268	0
	setp.lt.ftz.f32 	%is_neg, %str_z, 0f00000000;	// 0
	@!%is_neg bra 	$Lt_15_6402;
	.loc	3	242	0
	neg.ftz.f32 	%mag, %str_z;
	lg2.approx.ftz.f32 	%lg, %mag;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%pw, %lg;
	neg.ftz.f32 	%out_z, %pw;			// restore the sign
	bra.uni 	$LDWendi___log2f_192_1;
$Lt_15_6402:
	.loc	3	244	0
	lg2.approx.ftz.f32 	%lg, %str_z;
	mul.ftz.f32 	%lg, %lg, 0f3ee8ba2e;		// 0.454545
	ex2.approx.ftz.f32 	%out_z, %lg;
$LDWendi___log2f_192_1:
	.loc	3	285	0
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+0], %out_x;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+4], %out_y;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+8], %out_z;
	st.param.f32 	[__cudaretf__Z24UnpremultiplyUnlinearize6float4+12], %alpha_out;
	ret;
$LDWend__Z24UnpremultiplyUnlinearize6float4:
	} // _Z24UnpremultiplyUnlinearize6float4

	.visible .func (.param .align 16 .b8 __cudaretf__Z9ReadPixelPK6float417DevicePixelFormati[16]) _Z9ReadPixelPK6float417DevicePixelFormati (.param .u64 __cudaparmf1__Z9ReadPixelPK6float417DevicePixelFormati, .param .u32 __cudaparmf2__Z9ReadPixelPK6float417DevicePixelFormati, .param .s32 __cudaparmf3__Z9ReadPixelPK6float417DevicePixelFormati)
	{
	// ReadPixel(base, format, index): loads one four-component pixel and
	// returns it as four f32 values.
	//   parmf1 : 64-bit base address of the pixel buffer
	//   parmf2 : pixel format selector
	//   parmf3 : signed 32-bit pixel index (sign-extended to 64 bits)
	// format == 0 : pixel is four f16 values (8 bytes per pixel), each
	//               widened to f32.
	// format != 0 : pixel is four f32 values (16 bytes per pixel).
	.reg .u32 	%fmt, %idx;
	.reg .u32 	%h_x, %h_y, %h_z, %h_w;		// raw 16-bit half patterns
	.reg .u64 	%base, %idx64, %offset, %addr;
	.reg .f32 	%px_x, %px_y, %px_z, %px_w;	// pixel returned to the caller
	.reg .pred 	%not_half;
	.loc	19	33	0
$LDWbegin__Z9ReadPixelPK6float417DevicePixelFormati:
	ld.param.u64 	%base, [__cudaparmf1__Z9ReadPixelPK6float417DevicePixelFormati];
	ld.param.u32 	%fmt, [__cudaparmf2__Z9ReadPixelPK6float417DevicePixelFormati];
	ld.param.u32 	%idx, [__cudaparmf3__Z9ReadPixelPK6float417DevicePixelFormati];
	cvt.s64.s32 	%idx64, %idx;
	setp.ne.s32 	%not_half, %fmt, 0;
	@%not_half bra 	$Lt_16_1026;
	// ---- half path: 8 bytes per pixel ----
	.loc	19	36	0
	mul.lo.u64 	%offset, %idx64, 8;
	add.u64 	%addr, %base, %offset;
	ld.v4.u16 	{%h_x,%h_y,%h_z,%h_w}, [%addr+0];
	.loc	18	87	0
	{ .reg .b32 %half_bits;
	mov.b32		%half_bits, %h_y;
	cvt.ftz.f32.f16	%px_y, %half_bits; }
	{ .reg .b32 %half_bits;
	mov.b32		%half_bits, %h_z;
	cvt.ftz.f32.f16	%px_z, %half_bits; }
	{ .reg .b32 %half_bits;
	mov.b32		%half_bits, %h_w;
	cvt.ftz.f32.f16	%px_w, %half_bits; }
	.loc	19	36	0
	{ .reg .b32 %half_bits;
	mov.b32		%half_bits, %h_x;
	cvt.ftz.f32.f16	%px_x, %half_bits; }
	bra.uni 	$LBB4__Z9ReadPixelPK6float417DevicePixelFormati;
$Lt_16_1026:
	// ---- float path: 16 bytes per pixel ----
	.loc	19	38	0
	mul.lo.u64 	%offset, %idx64, 16;
	add.u64 	%addr, %base, %offset;
	ld.v4.f32 	{%px_x,%px_y,%px_z,%px_w}, [%addr+0];
$LBB4__Z9ReadPixelPK6float417DevicePixelFormati:
	// Both paths converge here with the pixel held as four f32 values.
	st.param.f32 	[__cudaretf__Z9ReadPixelPK6float417DevicePixelFormati+0], %px_x;
	st.param.f32 	[__cudaretf__Z9ReadPixelPK6float417DevicePixelFormati+4], %px_y;
	st.param.f32 	[__cudaretf__Z9ReadPixelPK6float417DevicePixelFormati+8], %px_z;
	st.param.f32 	[__cudaretf__Z9ReadPixelPK6float417DevicePixelFormati+12], %px_w;
	ret;
$LDWend__Z9ReadPixelPK6float417DevicePixelFormati:
	} // _Z9ReadPixelPK6float417DevicePixelFormati

	.visible .func _Z10WritePixel6float417DevicePixelFormatPS_i (.param .align 16 .b8 __cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i[16], .param .u32 __cudaparmf2__Z10WritePixel6float417DevicePixelFormatPS_i, .param .u64 __cudaparmf3__Z10WritePixel6float417DevicePixelFormatPS_i, .param .s32 __cudaparmf4__Z10WritePixel6float417DevicePixelFormatPS_i)
	{
	// WritePixel(pixel, format, base, index): stores one four-component
	// pixel; nothing is returned.
	//   parmf1 : the pixel as four packed f32 values (+0, +4, +8, +12)
	//   parmf2 : pixel format selector
	//   parmf3 : 64-bit base address of the destination buffer
	//   parmf4 : signed 32-bit pixel index (sign-extended to 64 bits)
	// format == 0 : each component is rounded to nearest f16 and the pixel
	//               is stored as 8 bytes.
	// format != 0 : the four f32 values are stored unchanged as 16 bytes.
	.reg .u32 	%fmt, %idx;
	.reg .u32 	%h_x, %h_y, %h_z, %h_w;		// 16-bit half patterns to store
	.reg .u64 	%base, %idx64, %offset, %addr;
	.reg .f32 	%px_x, %px_y, %px_z, %px_w;	// incoming pixel
	.reg .pred 	%not_half;
	.loc	19	45	0
$LDWbegin__Z10WritePixel6float417DevicePixelFormatPS_i:
	ld.param.f32 	%px_x, [__cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i+0];
	ld.param.f32 	%px_y, [__cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i+4];
	ld.param.f32 	%px_z, [__cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i+8];
	ld.param.f32 	%px_w, [__cudaparmf1__Z10WritePixel6float417DevicePixelFormatPS_i+12];
	ld.param.u32 	%fmt, [__cudaparmf2__Z10WritePixel6float417DevicePixelFormatPS_i];
	ld.param.u64 	%base, [__cudaparmf3__Z10WritePixel6float417DevicePixelFormatPS_i];
	ld.param.u32 	%idx, [__cudaparmf4__Z10WritePixel6float417DevicePixelFormatPS_i];
	cvt.s64.s32 	%idx64, %idx;
	setp.ne.s32 	%not_half, %fmt, 0;
	@%not_half bra 	$Lt_17_1282;
	// ---- half path: 8 bytes per pixel ----
	.loc	19	48	0
	mul.lo.u64 	%offset, %idx64, 8;
	add.u64 	%addr, %base, %offset;
	{ .reg .b32 %half_bits;
	cvt.rn.ftz.f16.f32	%half_bits, %px_x;
	mov.b32		%h_x, %half_bits; }
	{ .reg .b32 %half_bits;
	cvt.rn.ftz.f16.f32	%half_bits, %px_y;
	mov.b32		%h_y, %half_bits; }
	{ .reg .b32 %half_bits;
	cvt.rn.ftz.f16.f32	%half_bits, %px_z;
	mov.b32		%h_z, %half_bits; }
	{ .reg .b32 %half_bits;
	cvt.rn.ftz.f16.f32	%half_bits, %px_w;
	mov.b32		%h_w, %half_bits; }
	st.v4.u16 	[%addr+0], {%h_x,%h_y,%h_z,%h_w};
	bra.uni 	$Lt_17_1026;
$Lt_17_1282:
	// ---- float path: 16 bytes per pixel ----
	.loc	19	52	0
	mul.lo.u64 	%offset, %idx64, 16;
	add.u64 	%addr, %base, %offset;
	st.v4.f32 	[%addr+0], {%px_x,%px_y,%px_z,%px_w};
$Lt_17_1026:
	.loc	19	54	0
	ret;
$LDWend__Z10WritePixel6float417DevicePixelFormatPS_i:
	} // _Z10WritePixel6float417DevicePixelFormatPS_i
	// 2048-byte, 4-byte-aligned shared-memory scratch buffer. The function
	// that follows (_Z25VerticalRecursiveGaussian...) takes its address with
	// cvta.shared and stores f32 values at byte offsets +0, +512, +1024 and
	// +1536 from a per-thread position, i.e. it appears to be used as four
	// 512-byte planes, one per pixel component -- TODO confirm against the
	// CUDA source.
	.shared .align 4 .b8 __cuda_local_var_302754_34_non_const_smem__1[2048];

	.visible .func _Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff (.param .u64 __cudaparmf1__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf2__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf3__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf4__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u64 __cudaparmf5__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf6__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf7__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf8__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u32 __cudaparmf9__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf10__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf11__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf12__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf13__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf14__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf15__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf16__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf17__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf18__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff)
	{
	.reg .u32 %r<132>;
	.reg .u64 %rd<40>;
	.reg .f32 %f<185>;
	.reg .pred %p<46>;
	.loc	19	95	0
$LDWbegin__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff:
	ld.param.u64 	%rd1, [__cudaparmf1__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s64 	%rd2, %rd1;
	ld.param.u32 	%r1, [__cudaparmf2__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r2, %r1;
	ld.param.u32 	%r3, [__cudaparmf3__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r4, %r3;
	ld.param.u32 	%r5, [__cudaparmf4__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r6, %r5;
	ld.param.u64 	%rd3, [__cudaparmf5__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s64 	%rd4, %rd3;
	ld.param.u32 	%r7, [__cudaparmf6__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r8, %r7;
	ld.param.u32 	%r9, [__cudaparmf7__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r10, %r9;
	ld.param.u32 	%r11, [__cudaparmf8__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r12, %r11;
	ld.param.u32 	%r13, [__cudaparmf9__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r14, %r13;
	ld.param.u32 	%r15, [__cudaparmf10__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	cvt.s8.s32 	%r16, %r15;
	ld.param.f32 	%f1, [__cudaparmf11__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf12__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf13__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf14__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f8, %f7;
	ld.param.f32 	%f9, [__cudaparmf15__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f10, %f9;
	ld.param.f32 	%f11, [__cudaparmf16__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f12, %f11;
	ld.param.f32 	%f13, [__cudaparmf17__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f14, %f13;
	ld.param.f32 	%f15, [__cudaparmf18__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f16, %f15;
	.loc	19	102	0
	cvt.s32.u32 	%r17, %tid.y;
	mov.s32 	%r18, %r17;
	.loc	19	106	0
	sub.s32 	%r19, %r12, %r6;
	shr.s32 	%r20, %r19, 1;
	sub.s32 	%r21, %r17, %r20;
	mov.s32 	%r22, %r21;
	cvt.s32.u32 	%r23, %ctaid.x;
	cvt.s32.u32 	%r24, %ntid.x;
	mul.lo.s32 	%r25, %r23, %r24;
	mov.u32 	%r26, %tid.x;
	add.u32 	%r27, %r25, %r26;
	setp.gt.s32 	%p1, %r10, %r27;
	@%p1 bra 	$Lt_18_32002;
	bra.uni 	$LBB93__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff;
$Lt_18_32002:
	.loc	19	125	0
	sub.s32 	%r28, %r10, %r4;
	shr.s32 	%r29, %r28, 1;
	sub.s32 	%r30, %r27, %r29;
	mul.lo.s32 	%r31, %r21, %r2;
	add.s32 	%r32, %r30, %r31;
	.loc	19	126	0
	mul.lo.s32 	%r33, %r8, %r17;
	add.s32 	%r34, %r27, %r33;
	mov.u32 	%r35, 0;
	setp.le.s32 	%p2, %r12, %r35;
	@%p2 bra 	$Lt_18_47362;
	cvta.shared.u64 	%rd5, __cuda_local_var_302754_34_non_const_smem__1;
	add.s32 	%r36, %r12, 3;
	shr.s32 	%r37, %r36, 31;
	mov.s32 	%r38, 3;
	and.b32 	%r39, %r37, %r38;
	add.s32 	%r40, %r39, %r36;
	shr.s32 	%r41, %r40, 2;
	shl.b32 	%r42, %r2, 2;
	mul.lo.s32 	%r43, %r17, 32;
	mul.lo.s32 	%r44, %r17, 128;
	mov.s32 	%r45, 0;
	setp.ne.s32 	%p3, %r16, %r45;
	mov.pred 	%p4, %p3;
	mov.pred 	%p5, %p6;
	shl.b32 	%r46, %r8, 2;
	add.u32 	%r47, %r43, %r26;
	add.u32 	%r48, %r44, %r26;
	selp.s32 	%r49, 1, 0, %p4;
	cvt.s64.s32 	%rd6, %r47;
	cvt.s64.s32 	%rd7, %r48;
	mul.wide.s32 	%rd8, %r47, 4;
	mul.wide.s32 	%rd9, %r48, 4;
	add.u64 	%rd10, %rd8, %rd5;
	add.u64 	%rd11, %rd9, %rd5;
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f18, 0f00000000;    	// 0
	mov.f32 	%f19, 0f00000000;    	// 0
	mov.s32 	%r50, 0;
	mov.s32 	%r51, %r41;
$Lt_18_33026:
 //<loop> Loop body line 126, nesting depth: 1, estimated iterations: unknown
	.loc	19	130	0
	mov.u32 	%r52, 0;
	setp.lt.s32 	%p7, %r22, %r52;
	@%p7 bra 	$Lt_18_48130;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	setp.le.s32 	%p8, %r6, %r22;
	@%p8 bra 	$Lt_18_48130;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	mov.u32 	%r53, 0;
	setp.lt.s32 	%p9, %r30, %r53;
	@%p9 bra 	$Lt_18_48130;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	setp.ge.s32 	%p10, %r30, %r4;
	@%p10 bra 	$Lt_18_48130;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	132	0
	cvt.s64.s32 	%rd12, %r32;
	mov.u32 	%r54, 0;
	setp.ne.s32 	%p11, %r14, %r54;
	@%p11 bra 	$Lt_18_33282;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	36	0
	mul.lo.u64 	%rd13, %rd12, 8;
	add.u64 	%rd14, %rd2, %rd13;
	ld.v4.u16 	{%r55,%r56,%r57,%r58}, [%rd14+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r55;
	cvt.ftz.f32.f16	%f20, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f21, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f22, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f23, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_195_17;
$Lt_18_33282:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	38	0
	mul.lo.u64 	%rd15, %rd12, 16;
	add.u64 	%rd16, %rd2, %rd15;
	ld.v4.f32 	{%f20,%f21,%f22,%f23}, [%rd16+0];
$LDWendi__Z13Half4ToFloat47ushort4_195_17:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	255	0
	mov.f32 	%f24, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p12, %f20, %f24;
	@!%p12 bra 	$Lt_18_33794;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	234	0
	neg.ftz.f32 	%f25, %f20;
	lg2.approx.ftz.f32 	%f26, %f25;
	mov.f32 	%f27, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f28, %f26, %f27;
	ex2.approx.ftz.f32 	%f29, %f28;
	neg.ftz.f32 	%f30, %f29;
	bra.uni 	$LDWendi___log2f_195_23;
$Lt_18_33794:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f31, %f20;
	mov.f32 	%f32, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f33, %f31, %f32;
	ex2.approx.ftz.f32 	%f30, %f33;
$LDWendi___log2f_195_23:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	256	0
	mov.f32 	%f34, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p13, %f21, %f34;
	@!%p13 bra 	$Lt_18_34306;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	234	0
	neg.ftz.f32 	%f35, %f21;
	lg2.approx.ftz.f32 	%f36, %f35;
	mov.f32 	%f37, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f38, %f36, %f37;
	ex2.approx.ftz.f32 	%f39, %f38;
	neg.ftz.f32 	%f40, %f39;
	bra.uni 	$LDWendi___log2f_195_21;
$Lt_18_34306:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f41, %f21;
	mov.f32 	%f42, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f43, %f41, %f42;
	ex2.approx.ftz.f32 	%f40, %f43;
$LDWendi___log2f_195_21:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	257	0
	mov.f32 	%f44, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p14, %f22, %f44;
	@!%p14 bra 	$Lt_18_34818;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	234	0
	neg.ftz.f32 	%f45, %f22;
	lg2.approx.ftz.f32 	%f46, %f45;
	mov.f32 	%f47, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f48, %f46, %f47;
	ex2.approx.ftz.f32 	%f49, %f48;
	neg.ftz.f32 	%f50, %f49;
	bra.uni 	$LDWendi___log2f_195_19;
$Lt_18_34818:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f51, %f22;
	mov.f32 	%f52, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f53, %f51, %f52;
	ex2.approx.ftz.f32 	%f50, %f53;
$LDWendi___log2f_195_19:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	132	0
	cvt.ftz.sat.f32.f32 	%f54, %f23;
	mul.ftz.f32 	%f55, %f30, %f54;
	mul.ftz.f32 	%f56, %f40, %f54;
	mul.ftz.f32 	%f57, %f50, %f54;
	mov.f32 	%f58, %f54;
	bra.uni 	$L_18_29442;
$Lt_18_48130:
$L_18_29698:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	134	0
	mov.f32 	%f58, 0f00000000;    	// 0
	mov.f32 	%f57, 0f00000000;    	// 0
	mov.f32 	%f56, 0f00000000;    	// 0
	mov.f32 	%f55, 0f00000000;    	// 0
$L_18_29442:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	136	0
	st.f32 	[%rd10+0], %f55;
	.loc	19	137	0
	st.f32 	[%rd10+512], %f56;
	.loc	19	138	0
	st.f32 	[%rd10+1024], %f57;
	.loc	19	139	0
	st.f32 	[%rd10+1536], %f58;
	.loc	19	141	0
	bar.sync 	0;
	.loc	19	149	0
	ld.f32 	%f59, [%rd11+0];
	mov.s32 	%r59, 0;
	set.eq.u32.s32 	%r60, %r50, %r59;
	neg.s32 	%r61, %r60;
	and.b32 	%r62, %r61, %r49;
	mov.u32 	%r63, 0;
	setp.eq.s32 	%p15, %r62, %r63;
	@%p15 bra 	$Lt_18_35330;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	152	0
	mov.f32 	%f60, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f61, %f59, %f60;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_35330:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	155	0
	mul.ftz.f32 	%f62, %f4, %f19;
	fma.rn.ftz.f32 	%f63, %f2, %f59, %f62;
	fma.rn.ftz.f32 	%f64, %f6, %f18, %f63;
	fma.rn.ftz.f32 	%f65, %f8, %f17, %f64;
	.loc	19	158	0
	mov.f32 	%f17, %f18;
	.loc	19	159	0
	mov.f32 	%f18, %f65;
	.loc	19	161	0
	st.f32 	[%rd11+0], %f65;
	.loc	19	149	0
	ld.f32 	%f66, [%rd11+128];
	mov.s32 	%r64, -1;
	set.eq.u32.s32 	%r65, %r50, %r64;
	neg.s32 	%r66, %r65;
	and.b32 	%r67, %r66, %r49;
	mov.u32 	%r68, 0;
	setp.eq.s32 	%p16, %r67, %r68;
	@%p16 bra 	$Lt_18_35842;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	152	0
	mov.f32 	%f67, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f61, %f66, %f67;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_35842:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	155	0
	mul.ftz.f32 	%f68, %f59, %f4;
	fma.rn.ftz.f32 	%f69, %f2, %f66, %f68;
	fma.rn.ftz.f32 	%f70, %f6, %f18, %f69;
	fma.rn.ftz.f32 	%f65, %f8, %f17, %f70;
	.loc	19	158	0
	mov.f32 	%f17, %f18;
	.loc	19	159	0
	mov.f32 	%f18, %f65;
	.loc	19	161	0
	st.f32 	[%rd11+128], %f65;
	.loc	19	149	0
	ld.f32 	%f71, [%rd11+256];
	mov.s32 	%r69, -2;
	set.eq.u32.s32 	%r70, %r50, %r69;
	neg.s32 	%r71, %r70;
	and.b32 	%r72, %r71, %r49;
	mov.u32 	%r73, 0;
	setp.eq.s32 	%p17, %r72, %r73;
	@%p17 bra 	$Lt_18_36354;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	152	0
	mov.f32 	%f72, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f61, %f71, %f72;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_36354:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	155	0
	mul.ftz.f32 	%f73, %f66, %f4;
	fma.rn.ftz.f32 	%f74, %f2, %f71, %f73;
	fma.rn.ftz.f32 	%f75, %f6, %f18, %f74;
	fma.rn.ftz.f32 	%f65, %f8, %f17, %f75;
	.loc	19	158	0
	mov.f32 	%f17, %f18;
	.loc	19	159	0
	mov.f32 	%f18, %f65;
	.loc	19	161	0
	st.f32 	[%rd11+256], %f65;
	.loc	19	149	0
	ld.f32 	%f76, [%rd11+384];
	mov.s32 	%r74, -3;
	set.eq.u32.s32 	%r75, %r50, %r74;
	neg.s32 	%r76, %r75;
	and.b32 	%r77, %r76, %r49;
	mov.u32 	%r78, 0;
	setp.eq.s32 	%p18, %r77, %r78;
	@%p18 bra 	$Lt_18_36866;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	152	0
	mov.f32 	%f77, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f61, %f76, %f77;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_36866:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	155	0
	mul.ftz.f32 	%f78, %f71, %f4;
	fma.rn.ftz.f32 	%f79, %f2, %f76, %f78;
	fma.rn.ftz.f32 	%f80, %f6, %f18, %f79;
	fma.rn.ftz.f32 	%f65, %f8, %f17, %f80;
	.loc	19	157	0
	mov.f32 	%f19, %f76;
	.loc	19	158	0
	mov.f32 	%f17, %f18;
	.loc	19	159	0
	mov.f32 	%f18, %f65;
	.loc	19	161	0
	st.f32 	[%rd11+384], %f65;
	.loc	19	164	0
	bar.sync 	0;
	.loc	19	167	0
	setp.le.s32 	%p19, %r12, %r18;
	@%p19 bra 	$Lt_18_37890;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	170	0
	ld.f32 	%f55, [%rd10+0];
	.loc	19	171	0
	ld.f32 	%f56, [%rd10+512];
	.loc	19	172	0
	ld.f32 	%f57, [%rd10+1024];
	.loc	19	173	0
	ld.f32 	%f58, [%rd10+1536];
	.loc	19	174	0
	cvt.s64.s32 	%rd17, %r34;
	mov.u32 	%r79, 0;
	setp.ne.s32 	%p20, %r14, %r79;
	@%p20 bra 	$Lt_18_38146;
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	48	0
	mul.lo.u64 	%rd18, %rd17, 8;
	add.u64 	%rd19, %rd4, %rd18;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f55;
	mov.b32		%r80, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f56;
	mov.b32		%r81, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f57;
	mov.b32		%r82, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f58;
	mov.b32		%r83, %b1; }
	st.v4.u16 	[%rd19+0], {%r80,%r81,%r82,%r83};
	bra.uni 	$Lt_18_37890;
$Lt_18_38146:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	52	0
	mul.lo.u64 	%rd20, %rd17, 16;
	add.u64 	%rd21, %rd4, %rd20;
	st.v4.f32 	[%rd21+0], {%f55,%f56,%f57,%f58};
$Lt_18_37890:
$Lt_18_37378:
 //<loop> Part of loop body line 126, head labeled $Lt_18_33026
	.loc	19	176	0
	bar.sync 	0;
	.loc	19	178	0
	add.s32 	%r50, %r50, 4;
	.loc	19	179	0
	add.s32 	%r22, %r22, 4;
	.loc	19	180	0
	add.s32 	%r18, %r18, 4;
	.loc	19	181	0
	add.s32 	%r32, %r42, %r32;
	.loc	19	182	0
	add.s32 	%r34, %r46, %r34;
	setp.gt.s32 	%p21, %r12, %r50;
	@%p21 bra 	$Lt_18_33026;
	bra.uni 	$Lt_18_32514;
$Lt_18_47362:
	mov.s32 	%r50, 0;
	cvta.shared.u64 	%rd5, __cuda_local_var_302754_34_non_const_smem__1;
$Lt_18_32514:
	mov.u32 	%r84, 0;
	setp.le.s32 	%p22, %r50, %r84;
	@%p22 bra 	$LBB93__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff;
	add.s32 	%r85, %r50, 3;
	shr.s32 	%r86, %r85, 31;
	mov.s32 	%r87, 3;
	and.b32 	%r88, %r86, %r87;
	add.s32 	%r89, %r88, %r85;
	shr.s32 	%r90, %r89, 2;
	shl.b32 	%r42, %r2, 2;
	mul.lo.s32 	%r91, %r17, 32;
	mul.lo.s32 	%r92, %r17, 128;
	sub.s32 	%r93, %r12, 4;
	mov.s32 	%r94, 0;
	setp.ne.s32 	%p23, %r16, %r94;
	mov.pred 	%p24, %p23;
	mov.pred 	%p25, %p6;
	sub.s32 	%r95, %r12, 3;
	sub.s32 	%r96, %r12, 2;
	sub.s32 	%r97, %r12, 1;
	mul.lo.s32 	%r98, %r2, %r22;
	mul.lo.s32 	%r99, %r2, -4;
	add.u32 	%r100, %r91, %r26;
	add.u32 	%r101, %r92, %r26;
	selp.s32 	%r49, 1, 0, %p24;
	cvt.s64.s32 	%rd22, %r100;
	cvt.s64.s32 	%rd23, %r101;
	add.s32 	%r102, %r98, %r30;
	mul.wide.s32 	%rd24, %r100, 4;
	mul.wide.s32 	%rd25, %r101, 4;
	add.u64 	%rd10, %rd24, %rd5;
	add.u64 	%rd11, %rd25, %rd5;
	mov.f32 	%f81, 0f00000000;    	// 0
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f18, 0f00000000;    	// 0
	mov.f32 	%f19, 0f00000000;    	// 0
	mov.s32 	%r103, %r90;
$Lt_18_39170:
 //<loop> Loop body line 182, nesting depth: 1, estimated iterations: unknown
	.loc	19	198	0
	sub.s32 	%r50, %r50, 4;
	.loc	19	199	0
	sub.s32 	%r22, %r22, 4;
	add.s32 	%r102, %r99, %r102;
	.loc	19	200	0
	sub.s32 	%r18, %r18, 4;
	.loc	19	201	0
	sub.s32 	%r32, %r32, %r42;
	.loc	19	198	0
	mov.u32 	%r104, 0;
	setp.lt.s32 	%p26, %r22, %r104;
	@%p26 bra 	$Lt_18_49410;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	setp.le.s32 	%p27, %r6, %r22;
	@%p27 bra 	$Lt_18_49410;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	mov.u32 	%r105, 0;
	setp.lt.s32 	%p28, %r30, %r105;
	@%p28 bra 	$Lt_18_49410;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	setp.ge.s32 	%p29, %r30, %r4;
	@%p29 bra 	$Lt_18_49410;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	208	0
	cvt.s64.s32 	%rd12, %r32;
	mov.u32 	%r106, 0;
	setp.ne.s32 	%p30, %r14, %r106;
	@%p30 bra 	$Lt_18_39426;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	36	0
	mul.lo.u64 	%rd26, %rd12, 8;
	add.u64 	%rd27, %rd2, %rd26;
	ld.v4.u16 	{%r55,%r56,%r57,%r58}, [%rd27+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r55;
	cvt.ftz.f32.f16	%f82, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f83, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f84, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f85, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_195_9;
$Lt_18_39426:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	38	0
	mul.lo.u64 	%rd28, %rd12, 16;
	add.u64 	%rd29, %rd2, %rd28;
	ld.v4.f32 	{%f82,%f83,%f84,%f85}, [%rd29+0];
$LDWendi__Z13Half4ToFloat47ushort4_195_9:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	255	0
	mov.f32 	%f86, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p31, %f82, %f86;
	@!%p31 bra 	$Lt_18_39938;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	234	0
	neg.ftz.f32 	%f87, %f82;
	lg2.approx.ftz.f32 	%f88, %f87;
	mov.f32 	%f89, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f90, %f88, %f89;
	ex2.approx.ftz.f32 	%f91, %f90;
	neg.ftz.f32 	%f30, %f91;
	bra.uni 	$LDWendi___log2f_195_15;
$Lt_18_39938:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f92, %f82;
	mov.f32 	%f93, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f94, %f92, %f93;
	ex2.approx.ftz.f32 	%f30, %f94;
$LDWendi___log2f_195_15:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	256	0
	mov.f32 	%f95, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p32, %f83, %f95;
	@!%p32 bra 	$Lt_18_40450;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	234	0
	neg.ftz.f32 	%f96, %f83;
	lg2.approx.ftz.f32 	%f97, %f96;
	mov.f32 	%f98, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f99, %f97, %f98;
	ex2.approx.ftz.f32 	%f100, %f99;
	neg.ftz.f32 	%f40, %f100;
	bra.uni 	$LDWendi___log2f_195_13;
$Lt_18_40450:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f101, %f83;
	mov.f32 	%f102, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f103, %f101, %f102;
	ex2.approx.ftz.f32 	%f40, %f103;
$LDWendi___log2f_195_13:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	257	0
	mov.f32 	%f104, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p33, %f84, %f104;
	@!%p33 bra 	$Lt_18_40962;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	234	0
	neg.ftz.f32 	%f105, %f84;
	lg2.approx.ftz.f32 	%f106, %f105;
	mov.f32 	%f107, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f108, %f106, %f107;
	ex2.approx.ftz.f32 	%f109, %f108;
	neg.ftz.f32 	%f50, %f109;
	bra.uni 	$LDWendi___log2f_195_11;
$Lt_18_40962:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f110, %f84;
	mov.f32 	%f111, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f112, %f110, %f111;
	ex2.approx.ftz.f32 	%f50, %f112;
$LDWendi___log2f_195_11:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	208	0
	cvt.ftz.sat.f32.f32 	%f113, %f85;
	mul.ftz.f32 	%f55, %f30, %f113;
	mul.ftz.f32 	%f56, %f40, %f113;
	mul.ftz.f32 	%f57, %f50, %f113;
	mov.f32 	%f58, %f113;
	bra.uni 	$L_18_30722;
$Lt_18_49410:
$L_18_30978:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	210	0
	mov.f32 	%f58, 0f00000000;    	// 0
	mov.f32 	%f57, 0f00000000;    	// 0
	mov.f32 	%f56, 0f00000000;    	// 0
	mov.f32 	%f55, 0f00000000;    	// 0
$L_18_30722:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	212	0
	st.f32 	[%rd10+0], %f55;
	.loc	19	213	0
	st.f32 	[%rd10+512], %f56;
	.loc	19	214	0
	st.f32 	[%rd10+1024], %f57;
	.loc	19	215	0
	st.f32 	[%rd10+1536], %f58;
	.loc	19	217	0
	bar.sync 	0;
	.loc	19	223	0
	ld.f32 	%f114, [%rd11+384];
	set.eq.u32.s32 	%r107, %r93, %r50;
	neg.s32 	%r108, %r107;
	and.b32 	%r109, %r108, %r49;
	mov.u32 	%r110, 0;
	setp.eq.s32 	%p34, %r109, %r110;
	@%p34 bra 	$Lt_18_41474;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	226	0
	mov.f32 	%f115, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f61, %f114, %f115;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_41474:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	228	0
	mov.f32 	%f116, %f19;
	mul.ftz.f32 	%f117, %f12, %f81;
	fma.rn.ftz.f32 	%f118, %f10, %f116, %f117;
	fma.rn.ftz.f32 	%f119, %f14, %f18, %f118;
	fma.rn.ftz.f32 	%f65, %f16, %f17, %f119;
	.loc	19	232	0
	mov.f32 	%f17, %f18;
	.loc	19	233	0
	mov.f32 	%f18, %f65;
	.loc	19	235	0
	st.f32 	[%rd11+384], %f65;
	.loc	19	223	0
	ld.f32 	%f120, [%rd11+256];
	set.eq.u32.s32 	%r111, %r95, %r50;
	neg.s32 	%r112, %r111;
	and.b32 	%r113, %r112, %r49;
	mov.u32 	%r114, 0;
	setp.eq.s32 	%p35, %r113, %r114;
	@%p35 bra 	$Lt_18_41986;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	226	0
	mov.f32 	%f121, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f61, %f120, %f121;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_41986:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	228	0
	mul.ftz.f32 	%f122, %f116, %f12;
	fma.rn.ftz.f32 	%f123, %f10, %f114, %f122;
	fma.rn.ftz.f32 	%f124, %f14, %f18, %f123;
	fma.rn.ftz.f32 	%f65, %f16, %f17, %f124;
	.loc	19	232	0
	mov.f32 	%f17, %f18;
	.loc	19	233	0
	mov.f32 	%f18, %f65;
	.loc	19	235	0
	st.f32 	[%rd11+256], %f65;
	.loc	19	223	0
	ld.f32 	%f125, [%rd11+128];
	set.eq.u32.s32 	%r115, %r96, %r50;
	neg.s32 	%r116, %r115;
	and.b32 	%r117, %r116, %r49;
	mov.u32 	%r118, 0;
	setp.eq.s32 	%p36, %r117, %r118;
	@%p36 bra 	$Lt_18_42498;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	226	0
	mov.f32 	%f126, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f61, %f125, %f126;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_42498:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	228	0
	mul.ftz.f32 	%f127, %f114, %f12;
	fma.rn.ftz.f32 	%f128, %f10, %f120, %f127;
	fma.rn.ftz.f32 	%f129, %f14, %f18, %f128;
	fma.rn.ftz.f32 	%f65, %f16, %f17, %f129;
	.loc	19	232	0
	mov.f32 	%f17, %f18;
	.loc	19	233	0
	mov.f32 	%f18, %f65;
	.loc	19	235	0
	st.f32 	[%rd11+128], %f65;
	.loc	19	223	0
	ld.f32 	%f76, [%rd11+0];
	set.eq.u32.s32 	%r119, %r97, %r50;
	neg.s32 	%r120, %r119;
	and.b32 	%r121, %r120, %r49;
	mov.u32 	%r122, 0;
	setp.eq.s32 	%p37, %r121, %r122;
	@%p37 bra 	$Lt_18_43010;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	226	0
	mov.f32 	%f130, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f61, %f76, %f130;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_18_43010:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	228	0
	mul.ftz.f32 	%f131, %f120, %f12;
	fma.rn.ftz.f32 	%f132, %f10, %f125, %f131;
	fma.rn.ftz.f32 	%f133, %f14, %f18, %f132;
	fma.rn.ftz.f32 	%f65, %f16, %f17, %f133;
	.loc	19	230	0
	mov.f32 	%f81, %f125;
	.loc	19	231	0
	mov.f32 	%f19, %f76;
	.loc	19	232	0
	mov.f32 	%f17, %f18;
	.loc	19	233	0
	mov.f32 	%f18, %f65;
	.loc	19	235	0
	st.f32 	[%rd11+0], %f65;
	.loc	19	238	0
	bar.sync 	0;
	.loc	19	242	0
	mov.s32 	%r32, %r102;
	.loc	19	244	0
	setp.le.s32 	%p38, %r12, %r18;
	@%p38 bra 	$Lt_18_46594;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	247	0
	mov.s32 	%r123, 0;
	setp.eq.s32 	%p39, %r14, %r123;
	mul.lo.s32 	%r124, %r8, %r18;
	add.s32 	%r125, %r27, %r124;
	cvt.s64.s32 	%rd30, %r125;
	@!%p39 bra 	$Lt_18_44034;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	36	0
	mul.lo.u64 	%rd31, %rd30, 8;
	add.u64 	%rd32, %rd4, %rd31;
	ld.v4.u16 	{%r55,%r56,%r57,%r58}, [%rd32+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r55;
	cvt.ftz.f32.f16	%f134, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f135, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f136, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f137, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_195_7;
$Lt_18_44034:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	38	0
	mul.lo.u64 	%rd33, %rd30, 16;
	add.u64 	%rd34, %rd4, %rd33;
	ld.v4.f32 	{%f134,%f135,%f136,%f137}, [%rd34+0];
$LDWendi__Z13Half4ToFloat47ushort4_195_7:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	252	0
	ld.f32 	%f138, [%rd10+1536];
	add.ftz.f32 	%f58, %f138, %f137;
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%f139, %f58;
	mov.f32 	%f140, %f139;
	mov.f32 	%f141, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f142, %f139, %f141;
	mov.f32 	%f143, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p40, %f142, %f143;
	@%p40 bra 	$Lt_18_44802;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f144, %f139;
	ld.f32 	%f145, [%rd10+1024];
	add.ftz.f32 	%f146, %f145, %f136;
	mul.ftz.f32 	%f147, %f144, %f146;
	.loc	3	214	0
	ld.f32 	%f148, [%rd10+512];
	add.ftz.f32 	%f149, %f148, %f135;
	mul.ftz.f32 	%f150, %f144, %f149;
	.loc	3	215	0
	ld.f32 	%f151, [%rd10+0];
	add.ftz.f32 	%f152, %f151, %f134;
	mul.ftz.f32 	%f153, %f144, %f152;
	bra.uni 	$Lt_18_44546;
$Lt_18_44802:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	219	0
	mov.f32 	%f147, 0f00000000;   	// 0
	mov.f32 	%f150, 0f00000000;   	// 0
	mov.f32 	%f153, 0f00000000;   	// 0
	mov.f32 	%f140, 0f00000000;   	// 0
$Lt_18_44546:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	266	0
	mov.f32 	%f154, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p41, %f153, %f154;
	@!%p41 bra 	$Lt_18_45058;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	242	0
	neg.ftz.f32 	%f155, %f153;
	lg2.approx.ftz.f32 	%f156, %f155;
	mov.f32 	%f157, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f158, %f156, %f157;
	ex2.approx.ftz.f32 	%f159, %f158;
	neg.ftz.f32 	%f160, %f159;
	bra.uni 	$LDWendi___log2f_195_5;
$Lt_18_45058:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f161, %f153;
	mov.f32 	%f162, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f163, %f161, %f162;
	ex2.approx.ftz.f32 	%f160, %f163;
$LDWendi___log2f_195_5:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	267	0
	mov.f32 	%f164, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p42, %f150, %f164;
	@!%p42 bra 	$Lt_18_45570;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	242	0
	neg.ftz.f32 	%f165, %f150;
	lg2.approx.ftz.f32 	%f166, %f165;
	mov.f32 	%f167, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f168, %f166, %f167;
	ex2.approx.ftz.f32 	%f169, %f168;
	neg.ftz.f32 	%f170, %f169;
	bra.uni 	$LDWendi___log2f_195_3;
$Lt_18_45570:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f171, %f150;
	mov.f32 	%f172, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f173, %f171, %f172;
	ex2.approx.ftz.f32 	%f170, %f173;
$LDWendi___log2f_195_3:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	268	0
	mov.f32 	%f174, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p43, %f147, %f174;
	@!%p43 bra 	$Lt_18_46082;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	242	0
	neg.ftz.f32 	%f175, %f147;
	lg2.approx.ftz.f32 	%f176, %f175;
	mov.f32 	%f177, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f178, %f176, %f177;
	ex2.approx.ftz.f32 	%f179, %f178;
	neg.ftz.f32 	%f180, %f179;
	bra.uni 	$LDWendi___log2f_195_1;
$Lt_18_46082:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f181, %f147;
	mov.f32 	%f182, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f183, %f181, %f182;
	ex2.approx.ftz.f32 	%f180, %f183;
$LDWendi___log2f_195_1:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	254	0
	@!%p39 bra 	$Lt_18_46850;
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	48	0
	mul.lo.u64 	%rd35, %rd30, 8;
	add.u64 	%rd36, %rd4, %rd35;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f160;
	mov.b32		%r126, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f170;
	mov.b32		%r127, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f180;
	mov.b32		%r128, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f140;
	mov.b32		%r129, %b1; }
	st.v4.u16 	[%rd36+0], {%r126,%r127,%r128,%r129};
	bra.uni 	$Lt_18_46594;
$Lt_18_46850:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	52	0
	mul.lo.u64 	%rd37, %rd30, 16;
	add.u64 	%rd38, %rd4, %rd37;
	st.v4.f32 	[%rd38+0], {%f160,%f170,%f180,%f140};
$Lt_18_46594:
$Lt_18_43522:
 //<loop> Part of loop body line 182, head labeled $Lt_18_39170
	.loc	19	257	0
	bar.sync 	0;
	mov.u32 	%r130, 0;
	setp.gt.s32 	%p44, %r50, %r130;
	@%p44 bra 	$Lt_18_39170;
$LBB93__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff:
	.loc	19	261	0
	ret;
$LDWend__Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff:
	} // _Z25VerticalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff
	// Module-scope shared-memory buffer of 4224 bytes, 4-byte aligned.
	// In the code visible here it is used by _Z27HorizontalRecursiveGaussian...,
	// which obtains its generic address with cvta.shared and accesses it as f32.
	// Elements are indexed as (tid.y*33 + tid.x + 132*k) * 4 bytes, where k is the
	// inner load-loop counter, and four values per index are stored/loaded at byte
	// offsets +0, +1056, +2112 and +3168 (4 * 1056 = 4224, the full buffer).
	// NOTE(review): the 33-float row stride looks like 32 columns plus one pad
	// element to avoid bank conflicts — confirm against the CUDA source.
	.shared .align 4 .b8 __cuda_local_var_302956_34_non_const_smem__0[4224];

	.visible .func _Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff (.param .u64 __cudaparmf1__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf2__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf3__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf4__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u64 __cudaparmf5__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf6__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf7__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf8__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .u32 __cudaparmf9__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .s32 __cudaparmf10__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf11__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf12__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf13__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf14__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf15__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf16__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf17__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff, .param .f32 __cudaparmf18__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff)
	{
	.reg .u32 %r<153>;
	.reg .u64 %rd<57>;
	.reg .f32 %f<156>;
	.reg .pred %p<51>;
	.loc	19	297	0
$LDWbegin__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff:
	ld.param.u64 	%rd1, [__cudaparmf1__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s64 	%rd2, %rd1;
	ld.param.u32 	%r1, [__cudaparmf2__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r2, %r1;
	ld.param.u32 	%r3, [__cudaparmf3__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r4, %r3;
	ld.param.u32 	%r5, [__cudaparmf4__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r6, %r5;
	ld.param.u64 	%rd3, [__cudaparmf5__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s64 	%rd4, %rd3;
	ld.param.u32 	%r7, [__cudaparmf6__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r8, %r7;
	ld.param.u32 	%r9, [__cudaparmf7__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r10, %r9;
	ld.param.u32 	%r11, [__cudaparmf8__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r12, %r11;
	ld.param.u32 	%r13, [__cudaparmf9__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.s32 	%r14, %r13;
	ld.param.u32 	%r15, [__cudaparmf10__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	cvt.s8.s32 	%r16, %r15;
	ld.param.f32 	%f1, [__cudaparmf11__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f2, %f1;
	ld.param.f32 	%f3, [__cudaparmf12__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f4, %f3;
	ld.param.f32 	%f5, [__cudaparmf13__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f6, %f5;
	ld.param.f32 	%f7, [__cudaparmf14__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f8, %f7;
	ld.param.f32 	%f9, [__cudaparmf15__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f10, %f9;
	ld.param.f32 	%f11, [__cudaparmf16__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f12, %f11;
	ld.param.f32 	%f13, [__cudaparmf17__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f14, %f13;
	ld.param.f32 	%f15, [__cudaparmf18__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff];
	mov.f32 	%f16, %f15;
	.loc	19	318	0
	mov.u32 	%r17, 0;
	setp.le.s32 	%p1, %r10, %r17;
	@%p1 bra 	$Lt_19_61442;
	mov.u32 	%r18, %tid.y;
	shl.b32 	%r19, %r18, 4;
	mov.u32 	%r20, %tid.x;
	add.u32 	%r21, %r19, %r20;
	cvt.s32.u32 	%r22, %ctaid.y;
	mul.lo.s32 	%r23, %r22, 8;
	sub.s32 	%r24, %r12, %r6;
	add.u32 	%r25, %r23, %r18;
	shr.s32 	%r26, %r24, 1;
	sub.s32 	%r27, %r25, %r26;
	add.s32 	%r28, %r10, 31;
	shr.s32 	%r29, %r28, 31;
	mov.s32 	%r30, 31;
	and.b32 	%r31, %r29, %r30;
	add.s32 	%r32, %r31, %r28;
	shr.s32 	%r33, %r32, 5;
	mov.s32 	%r34, %r20;
	add.u32 	%r35, %r10, %r20;
	sub.s32 	%r36, %r10, %r4;
	mul.lo.u32 	%r37, %r18, 32;
	shr.s32 	%r38, %r36, 1;
	add.u32 	%r39, %r37, %r20;
	mov.s32 	%r40, 31;
	setp.le.s32 	%p2, %r39, %r40;
	setp.lt.s32 	%p3, %r27, %r6;
	add.s32 	%r41, %r27, 8;
	add.s32 	%r42, %r27, 4;
	setp.lt.s32 	%p4, %r42, %r6;
	mov.s32 	%r43, 0;
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f18, 0f00000000;    	// 0
	mov.f32 	%f19, 0f00000000;    	// 0
	cvta.shared.u64 	%rd5, __cuda_local_var_302956_34_non_const_smem__0;
	mov.s32 	%r44, %r33;
$Lt_19_40450:
 //<loop> Loop body line 318, nesting depth: 1, estimated iterations: unknown
	.loc	19	335	0
	sub.s32 	%r45, %r34, %r38;
	mov.s32 	%r46, %r27;
	mov.s32 	%r47, 0;
$Lt_19_41218:
 //<loop> Loop body line 335, nesting depth: 2, iterations: 2
	setp.ge.s32 	%p5, %r46, %r6;
	@%p5 bra 	$Lt_19_41474;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	342	0
	mov.u32 	%r48, 0;
	setp.lt.s32 	%p6, %r27, %r48;
	@%p6 bra 	$Lt_19_62210;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	@!%p3 bra 	$Lt_19_62210;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	mov.u32 	%r49, 0;
	setp.lt.s32 	%p7, %r45, %r49;
	@%p7 bra 	$Lt_19_62210;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	setp.ge.s32 	%p8, %r45, %r4;
	@%p8 bra 	$Lt_19_62210;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	343	0
	mul.lo.s32 	%r50, %r2, %r47;
	mul.lo.s32 	%r51, %r27, %r2;
	add.s32 	%r52, %r45, %r51;
	mul.lo.s32 	%r53, %r50, 4;
	add.s32 	%r54, %r52, %r53;
	cvt.s64.s32 	%rd6, %r54;
	mov.u32 	%r55, 0;
	setp.ne.s32 	%p9, %r14, %r55;
	@%p9 bra 	$Lt_19_41986;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	36	0
	mul.lo.u64 	%rd7, %rd6, 8;
	add.u64 	%rd8, %rd2, %rd7;
	ld.v4.u16 	{%r56,%r57,%r58,%r59}, [%rd8+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f20, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f21, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f22, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r59;
	cvt.ftz.f32.f16	%f23, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_196_17;
$Lt_19_41986:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	38	0
	mul.lo.u64 	%rd9, %rd6, 16;
	add.u64 	%rd10, %rd2, %rd9;
	ld.v4.f32 	{%f20,%f21,%f22,%f23}, [%rd10+0];
$LDWendi__Z13Half4ToFloat47ushort4_196_17:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	255	0
	mov.f32 	%f24, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p10, %f20, %f24;
	@!%p10 bra 	$Lt_19_42498;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	234	0
	neg.ftz.f32 	%f25, %f20;
	lg2.approx.ftz.f32 	%f26, %f25;
	mov.f32 	%f27, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f28, %f26, %f27;
	ex2.approx.ftz.f32 	%f29, %f28;
	neg.ftz.f32 	%f30, %f29;
	bra.uni 	$LDWendi___log2f_196_23;
$Lt_19_42498:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f31, %f20;
	mov.f32 	%f32, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f33, %f31, %f32;
	ex2.approx.ftz.f32 	%f30, %f33;
$LDWendi___log2f_196_23:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	256	0
	mov.f32 	%f34, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p11, %f21, %f34;
	@!%p11 bra 	$Lt_19_43010;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	234	0
	neg.ftz.f32 	%f35, %f21;
	lg2.approx.ftz.f32 	%f36, %f35;
	mov.f32 	%f37, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f38, %f36, %f37;
	ex2.approx.ftz.f32 	%f39, %f38;
	neg.ftz.f32 	%f40, %f39;
	bra.uni 	$LDWendi___log2f_196_21;
$Lt_19_43010:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f41, %f21;
	mov.f32 	%f42, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f43, %f41, %f42;
	ex2.approx.ftz.f32 	%f40, %f43;
$LDWendi___log2f_196_21:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	257	0
	mov.f32 	%f44, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p12, %f22, %f44;
	@!%p12 bra 	$Lt_19_43522;
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	234	0
	neg.ftz.f32 	%f45, %f22;
	lg2.approx.ftz.f32 	%f46, %f45;
	mov.f32 	%f47, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f48, %f46, %f47;
	ex2.approx.ftz.f32 	%f49, %f48;
	neg.ftz.f32 	%f50, %f49;
	bra.uni 	$LDWendi___log2f_196_19;
$Lt_19_43522:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f51, %f22;
	mov.f32 	%f52, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f53, %f51, %f52;
	ex2.approx.ftz.f32 	%f50, %f53;
$LDWendi___log2f_196_19:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	343	0
	cvt.ftz.sat.f32.f32 	%f54, %f23;
	mul.ftz.f32 	%f55, %f30, %f54;
	mul.ftz.f32 	%f56, %f40, %f54;
	mul.ftz.f32 	%f57, %f50, %f54;
	mov.f32 	%f58, %f54;
	bra.uni 	$L_19_37378;
$Lt_19_62210:
$L_19_37634:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	345	0
	mov.f32 	%f57, 0f00000000;    	// 0
	mov.f32 	%f56, 0f00000000;    	// 0
	mov.f32 	%f55, 0f00000000;    	// 0
	mov.f32 	%f58, 0f00000000;    	// 0
$L_19_37378:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	.loc	19	348	0
	mul.lo.s32 	%r60, %r18, 33;
	mul.lo.s32 	%r61, %r47, 132;
	add.u32 	%r62, %r60, %r20;
	add.s32 	%r63, %r61, %r62;
	cvt.s64.s32 	%rd11, %r63;
	mul.wide.s32 	%rd12, %r63, 4;
	add.u64 	%rd13, %rd5, %rd12;
	st.f32 	[%rd13+0], %f55;
	.loc	19	349	0
	st.f32 	[%rd13+1056], %f56;
	.loc	19	350	0
	st.f32 	[%rd13+2112], %f57;
	.loc	19	351	0
	st.f32 	[%rd13+3168], %f58;
$Lt_19_41474:
 //<loop> Part of loop body line 335, head labeled $Lt_19_41218
	add.s32 	%r47, %r47, 1;
	add.s32 	%r46, %r46, 4;
	setp.ne.s32 	%p13, %r46, %r41;
	@%p13 bra 	$Lt_19_41218;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	358	0
	bar.sync 	0;
	@!%p2 bra 	$Lt_19_44290;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	364	0
	mov.s32 	%r64, 0;
	setp.ne.s32 	%p14, %r16, %r64;
	mov.pred 	%p15, %p14;
	mov.pred 	%p16, %p17;
	neg.s32 	%r65, %r43;
	selp.s32 	%r66, 1, 0, %p15;
	mul.lo.s32 	%r67, %r21, 33;
	mov.s32 	%r68, %r67;
	add.s32 	%r69, %r67, 32;
	add.s32 	%r70, %r65, %r67;
	cvt.s64.s32 	%rd14, %r67;
	mul.wide.s32 	%rd15, %r67, 4;
	add.u64 	%rd16, %rd5, %rd15;
$Lt_19_45314:
 //<loop> Loop body line 364, nesting depth: 2, iterations: 32
	.loc	19	367	0
	ld.f32 	%f59, [%rd16+0];
	set.eq.u32.s32 	%r71, %r68, %r70;
	neg.s32 	%r72, %r71;
	and.b32 	%r73, %r72, %r66;
	mov.u32 	%r74, 0;
	setp.eq.s32 	%p18, %r73, %r74;
	@%p18 bra 	$Lt_19_45570;
 //<loop> Part of loop body line 364, head labeled $Lt_19_45314
	.loc	19	370	0
	mov.f32 	%f60, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f61, %f59, %f60;
	mov.f32 	%f17, %f61;
	mov.f32 	%f18, %f61;
$Lt_19_45570:
 //<loop> Part of loop body line 364, head labeled $Lt_19_45314
	.loc	19	372	0
	mul.ftz.f32 	%f62, %f4, %f19;
	fma.rn.ftz.f32 	%f63, %f2, %f59, %f62;
	fma.rn.ftz.f32 	%f64, %f6, %f18, %f63;
	fma.rn.ftz.f32 	%f65, %f8, %f17, %f64;
	.loc	19	374	0
	mov.f32 	%f19, %f59;
	.loc	19	375	0
	mov.f32 	%f17, %f18;
	.loc	19	376	0
	mov.f32 	%f18, %f65;
	.loc	19	378	0
	st.f32 	[%rd16+0], %f65;
	add.s32 	%r68, %r68, 1;
	add.u64 	%rd16, %rd16, 4;
	setp.ne.s32 	%p19, %r68, %r69;
	@%p19 bra 	$Lt_19_45314;
$Lt_19_44290:
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	383	0
	bar.sync 	0;
	.loc	19	388	0
	@!%p3 bra 	$Lt_19_47362;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	395	0
	mul.lo.s32 	%r75, %r18, 33;
	add.u32 	%r76, %r20, %r75;
	cvt.s64.s32 	%rd17, %r76;
	mul.wide.s32 	%rd18, %r76, 4;
	add.u64 	%rd19, %rd5, %rd18;
	ld.f32 	%f55, [%rd19+0];
	.loc	19	396	0
	ld.f32 	%f56, [%rd19+1056];
	.loc	19	397	0
	ld.f32 	%f57, [%rd19+2112];
	.loc	19	398	0
	ld.f32 	%f58, [%rd19+3168];
	setp.le.s32 	%p20, %r10, %r34;
	@%p20 bra 	$Lt_19_47362;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	402	0
	mul.lo.s32 	%r77, %r8, %r25;
	add.s32 	%r78, %r34, %r77;
	cvt.s64.s32 	%rd20, %r78;
	mov.u32 	%r79, 0;
	setp.ne.s32 	%p21, %r14, %r79;
	@%p21 bra 	$Lt_19_47618;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	48	0
	mul.lo.u64 	%rd21, %rd20, 8;
	add.u64 	%rd22, %rd4, %rd21;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f55;
	mov.b32		%r80, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f56;
	mov.b32		%r81, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f57;
	mov.b32		%r82, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f58;
	mov.b32		%r83, %b1; }
	st.v4.u16 	[%rd22+0], {%r80,%r81,%r82,%r83};
	bra.uni 	$Lt_19_47362;
$Lt_19_47618:
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	52	0
	mul.lo.u64 	%rd23, %rd20, 16;
	add.u64 	%rd24, %rd4, %rd23;
	st.v4.f32 	[%rd24+0], {%f55,%f56,%f57,%f58};
$Lt_19_47362:
$Lt_19_46850:
$Lt_19_46338:
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	402	0
	@!%p4 bra 	$Lt_19_48898;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	395	0
	mul.lo.s32 	%r84, %r18, 33;
	add.u32 	%r85, %r20, %r84;
	cvt.s64.s32 	%rd25, %r85;
	mul.wide.s32 	%rd26, %r85, 4;
	add.u64 	%rd27, %rd5, %rd26;
	ld.f32 	%f55, [%rd27+528];
	.loc	19	396	0
	ld.f32 	%f56, [%rd27+1584];
	.loc	19	397	0
	ld.f32 	%f57, [%rd27+2640];
	.loc	19	398	0
	ld.f32 	%f58, [%rd27+3696];
	setp.le.s32 	%p22, %r10, %r34;
	@%p22 bra 	$Lt_19_48898;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	402	0
	mul.lo.s32 	%r86, %r8, %r25;
	add.s32 	%r87, %r86, %r34;
	mul.lo.s32 	%r88, %r8, 4;
	add.s32 	%r89, %r87, %r88;
	cvt.s64.s32 	%rd28, %r89;
	mov.u32 	%r90, 0;
	setp.ne.s32 	%p23, %r14, %r90;
	@%p23 bra 	$Lt_19_49154;
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	48	0
	mul.lo.u64 	%rd29, %rd28, 8;
	add.u64 	%rd30, %rd4, %rd29;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f55;
	mov.b32		%r91, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f56;
	mov.b32		%r92, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f57;
	mov.b32		%r93, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f58;
	mov.b32		%r94, %b1; }
	st.v4.u16 	[%rd30+0], {%r91,%r92,%r93,%r94};
	bra.uni 	$Lt_19_48898;
$Lt_19_49154:
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	52	0
	mul.lo.u64 	%rd31, %rd28, 16;
	add.u64 	%rd32, %rd4, %rd31;
	st.v4.f32 	[%rd32+0], {%f55,%f56,%f57,%f58};
$Lt_19_48898:
$Lt_19_48386:
$Lt_19_47874:
 //<loop> Part of loop body line 318, head labeled $Lt_19_40450
	.loc	19	409	0
	bar.sync 	0;
	add.s32 	%r43, %r43, 32;
	add.u32 	%r34, %r34, 32;
	setp.lt.s32 	%p24, %r34, %r35;
	@%p24 bra 	$Lt_19_40450;
	bra.uni 	$Lt_19_39938;
$Lt_19_61442:
	mov.s32 	%r43, 0;
	cvta.shared.u64 	%rd5, __cuda_local_var_302956_34_non_const_smem__0;
$Lt_19_39938:
	mov.u32 	%r95, 0;
	setp.le.s32 	%p25, %r43, %r95;
	@%p25 bra 	$Lt_19_49666;
	cvt.s32.u32 	%r96, %ctaid.y;
	mul.lo.s32 	%r97, %r96, 8;
	mov.u32 	%r18, %tid.y;
	add.u32 	%r25, %r97, %r18;
	shl.b32 	%r98, %r18, 4;
	mov.u32 	%r20, %tid.x;
	add.u32 	%r21, %r98, %r20;
	sub.s32 	%r99, %r12, %r6;
	shr.s32 	%r100, %r99, 1;
	sub.s32 	%r27, %r25, %r100;
	add.s32 	%r101, %r43, 31;
	shr.s32 	%r102, %r101, 31;
	mov.s32 	%r103, 31;
	and.b32 	%r104, %r102, %r103;
	add.s32 	%r105, %r104, %r101;
	shr.s32 	%r106, %r105, 5;
	add.u32 	%r34, %r43, %r20;
	sub.s32 	%r107, %r10, %r4;
	mul.lo.u32 	%r108, %r18, 32;
	shr.s32 	%r38, %r107, 1;
	add.u32 	%r109, %r108, %r20;
	add.s32 	%r110, %r25, 8;
	mov.s32 	%r111, 31;
	setp.le.s32 	%p2, %r109, %r111;
	add.s32 	%r41, %r27, 8;
	mov.f32 	%f66, 0f00000000;    	// 0
	mov.f32 	%f17, 0f00000000;    	// 0
	mov.f32 	%f18, 0f00000000;    	// 0
	mov.f32 	%f19, 0f00000000;    	// 0
	mov.s32 	%r112, %r106;
$Lt_19_50178:
 //<loop> Loop body line 409, nesting depth: 1, estimated iterations: unknown
	.loc	19	424	0
	sub.s32 	%r43, %r43, 32;
	sub.u32 	%r34, %r34, 32;
	.loc	19	432	0
	sub.s32 	%r45, %r34, %r38;
	mov.s32 	%r113, %r27;
	mov.s32 	%r114, 0;
$Lt_19_50946:
 //<loop> Loop body line 432, nesting depth: 1, iterations: 2
	setp.ge.s32 	%p26, %r113, %r6;
	@%p26 bra 	$Lt_19_51202;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	439	0
	mov.u32 	%r115, 0;
	setp.lt.s32 	%p27, %r27, %r115;
	@%p27 bra 	$Lt_19_63490;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	setp.ge.s32 	%p28, %r27, %r6;
	@%p28 bra 	$Lt_19_63490;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	mov.u32 	%r116, 0;
	setp.lt.s32 	%p29, %r45, %r116;
	@%p29 bra 	$Lt_19_63490;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	setp.ge.s32 	%p30, %r45, %r4;
	@%p30 bra 	$Lt_19_63490;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	440	0
	mul.lo.s32 	%r117, %r2, %r114;
	mul.lo.s32 	%r118, %r27, %r2;
	add.s32 	%r119, %r45, %r118;
	mul.lo.s32 	%r120, %r117, 4;
	add.s32 	%r121, %r119, %r120;
	cvt.s64.s32 	%rd33, %r121;
	mov.u32 	%r122, 0;
	setp.ne.s32 	%p31, %r14, %r122;
	@%p31 bra 	$Lt_19_51714;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	36	0
	mul.lo.u64 	%rd34, %rd33, 8;
	add.u64 	%rd35, %rd2, %rd34;
	ld.v4.u16 	{%r56,%r57,%r58,%r59}, [%rd35+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f67, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f68, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f69, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r59;
	cvt.ftz.f32.f16	%f70, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_196_9;
$Lt_19_51714:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	38	0
	mul.lo.u64 	%rd36, %rd33, 16;
	add.u64 	%rd37, %rd2, %rd36;
	ld.v4.f32 	{%f67,%f68,%f69,%f70}, [%rd37+0];
$LDWendi__Z13Half4ToFloat47ushort4_196_9:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	255	0
	mov.f32 	%f71, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p32, %f67, %f71;
	@!%p32 bra 	$Lt_19_52226;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	234	0
	neg.ftz.f32 	%f72, %f67;
	lg2.approx.ftz.f32 	%f73, %f72;
	mov.f32 	%f74, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f75, %f73, %f74;
	ex2.approx.ftz.f32 	%f76, %f75;
	neg.ftz.f32 	%f30, %f76;
	bra.uni 	$LDWendi___log2f_196_15;
$Lt_19_52226:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f77, %f67;
	mov.f32 	%f78, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f79, %f77, %f78;
	ex2.approx.ftz.f32 	%f30, %f79;
$LDWendi___log2f_196_15:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	256	0
	mov.f32 	%f80, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p33, %f68, %f80;
	@!%p33 bra 	$Lt_19_52738;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	234	0
	neg.ftz.f32 	%f81, %f68;
	lg2.approx.ftz.f32 	%f82, %f81;
	mov.f32 	%f83, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f84, %f82, %f83;
	ex2.approx.ftz.f32 	%f85, %f84;
	neg.ftz.f32 	%f40, %f85;
	bra.uni 	$LDWendi___log2f_196_13;
$Lt_19_52738:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f86, %f68;
	mov.f32 	%f87, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f88, %f86, %f87;
	ex2.approx.ftz.f32 	%f40, %f88;
$LDWendi___log2f_196_13:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	257	0
	mov.f32 	%f89, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p34, %f69, %f89;
	@!%p34 bra 	$Lt_19_53250;
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	234	0
	neg.ftz.f32 	%f90, %f69;
	lg2.approx.ftz.f32 	%f91, %f90;
	mov.f32 	%f92, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f93, %f91, %f92;
	ex2.approx.ftz.f32 	%f94, %f93;
	neg.ftz.f32 	%f50, %f94;
	bra.uni 	$LDWendi___log2f_196_11;
$Lt_19_53250:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f95, %f69;
	mov.f32 	%f96, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f97, %f95, %f96;
	ex2.approx.ftz.f32 	%f50, %f97;
$LDWendi___log2f_196_11:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	440	0
	cvt.ftz.sat.f32.f32 	%f98, %f70;
	mul.ftz.f32 	%f55, %f30, %f98;
	mul.ftz.f32 	%f56, %f40, %f98;
	mul.ftz.f32 	%f57, %f50, %f98;
	mov.f32 	%f58, %f98;
	bra.uni 	$L_19_38658;
$Lt_19_63490:
$L_19_38914:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	442	0
	mov.f32 	%f57, 0f00000000;    	// 0
	mov.f32 	%f56, 0f00000000;    	// 0
	mov.f32 	%f55, 0f00000000;    	// 0
	mov.f32 	%f58, 0f00000000;    	// 0
$L_19_38658:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	.loc	19	445	0
	mul.lo.s32 	%r60, %r18, 33;
	mul.lo.s32 	%r123, %r114, 132;
	add.u32 	%r124, %r60, %r20;
	add.s32 	%r125, %r123, %r124;
	cvt.s64.s32 	%rd38, %r125;
	mul.wide.s32 	%rd39, %r125, 4;
	add.u64 	%rd40, %rd5, %rd39;
	st.f32 	[%rd40+0], %f55;
	.loc	19	446	0
	st.f32 	[%rd40+1056], %f56;
	.loc	19	447	0
	st.f32 	[%rd40+2112], %f57;
	.loc	19	448	0
	st.f32 	[%rd40+3168], %f58;
$Lt_19_51202:
 //<loop> Part of loop body line 432, head labeled $Lt_19_50946
	add.s32 	%r114, %r114, 1;
	add.s32 	%r113, %r113, 4;
	setp.ne.s32 	%p35, %r113, %r41;
	@%p35 bra 	$Lt_19_50946;
 //<loop> Part of loop body line 409, head labeled $Lt_19_50178
	.loc	19	455	0
	bar.sync 	0;
	@!%p2 bra 	$Lt_19_54018;
 //<loop> Part of loop body line 409, head labeled $Lt_19_50178
	.loc	19	461	0
	mul.lo.s32 	%r126, %r21, 33;
	add.s32 	%r127, %r126, 31;
	sub.s32 	%r128, %r10, %r43;
	mov.s32 	%r129, 0;
	setp.ne.s32 	%p36, %r16, %r129;
	mov.pred 	%p37, %p36;
	mov.pred 	%p38, %p17;
	sub.s32 	%r130, %r128, 1;
	selp.s32 	%r66, 1, 0, %p37;
	cvt.s64.s32 	%rd41, %r127;
	mul.wide.s32 	%rd42, %r127, 4;
	add.u64 	%rd43, %rd5, %rd42;
	mov.s32 	%r131, 31;
$Lt_19_55042:
 //<loop> Loop body line 461, nesting depth: 1, iterations: 32
	.loc	19	464	0
	ld.f32 	%f59, [%rd43+0];
	set.eq.u32.s32 	%r132, %r130, %r131;
	neg.s32 	%r133, %r132;
	and.b32 	%r134, %r133, %r66;
	mov.u32 	%r135, 0;
	setp.eq.s32 	%p39, %r134, %r135;
	@%p39 bra 	$Lt_19_55298;
 //<loop> Part of loop body line 461, head labeled $Lt_19_55042
	.loc	19	467	0
	mov.f32 	%f99, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f100, %f59, %f99;
	mov.f32 	%f17, %f100;
	mov.f32 	%f18, %f100;
$Lt_19_55298:
 //<loop> Part of loop body line 461, head labeled $Lt_19_55042
	.loc	19	469	0
	mul.ftz.f32 	%f101, %f12, %f66;
	fma.rn.ftz.f32 	%f102, %f10, %f19, %f101;
	fma.rn.ftz.f32 	%f103, %f14, %f18, %f102;
	fma.rn.ftz.f32 	%f104, %f16, %f17, %f103;
	.loc	19	471	0
	mov.f32 	%f66, %f19;
	.loc	19	472	0
	mov.f32 	%f19, %f59;
	.loc	19	473	0
	mov.f32 	%f17, %f18;
	.loc	19	474	0
	mov.f32 	%f18, %f104;
	.loc	19	476	0
	st.f32 	[%rd43+0], %f104;
	.loc	19	477	0
	sub.u64 	%rd43, %rd43, 4;
	sub.s32 	%r131, %r131, 1;
	mov.u32 	%r136, -1;
	setp.ne.s32 	%p40, %r131, %r136;
	@%p40 bra 	$Lt_19_55042;
$Lt_19_54018:
 //<loop> Part of loop body line 409, head labeled $Lt_19_50178
	.loc	19	481	0
	bar.sync 	0;
	.loc	19	486	0
	mov.s32 	%r137, %r25;
	mov.s32 	%r138, 0;
$Lt_19_56578:
 //<loop> Loop body line 486, nesting depth: 1, iterations: 2
	setp.ge.s32 	%p41, %r137, %r12;
	@%p41 bra 	$Lt_19_60418;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	setp.le.s32 	%p42, %r10, %r34;
	@%p42 bra 	$Lt_19_60418;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	496	0
	mov.s32 	%r139, 0;
	setp.eq.s32 	%p43, %r14, %r139;
	mul.lo.s32 	%r140, %r8, %r138;
	mul.lo.s32 	%r141, %r8, %r25;
	add.s32 	%r142, %r141, %r34;
	mul.lo.s32 	%r143, %r140, 4;
	add.s32 	%r144, %r142, %r143;
	cvt.s64.s32 	%rd44, %r144;
	@!%p43 bra 	$Lt_19_57858;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	36	0
	mul.lo.u64 	%rd45, %rd44, 8;
	add.u64 	%rd46, %rd4, %rd45;
	ld.v4.u16 	{%r56,%r57,%r58,%r59}, [%rd46+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r56;
	cvt.ftz.f32.f16	%f105, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r57;
	cvt.ftz.f32.f16	%f106, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r58;
	cvt.ftz.f32.f16	%f107, %b1; }
	{ .reg .b32 %b1;
	mov.b32		%b1, %r59;
	cvt.ftz.f32.f16	%f108, %b1; }
	bra.uni 	$LDWendi__Z13Half4ToFloat47ushort4_196_7;
$Lt_19_57858:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	38	0
	mul.lo.u64 	%rd47, %rd44, 16;
	add.u64 	%rd48, %rd4, %rd47;
	ld.v4.f32 	{%f105,%f106,%f107,%f108}, [%rd48+0];
$LDWendi__Z13Half4ToFloat47ushort4_196_7:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	502	0
	mul.lo.s32 	%r60, %r18, 33;
	mul.lo.s32 	%r145, %r138, 132;
	add.u32 	%r146, %r60, %r20;
	add.s32 	%r147, %r145, %r146;
	cvt.s64.s32 	%rd49, %r147;
	mul.wide.s32 	%rd50, %r147, 4;
	add.u64 	%rd51, %rd5, %rd50;
	ld.f32 	%f109, [%rd51+3168];
	add.ftz.f32 	%f58, %f109, %f108;
	.loc	3	208	0
	cvt.ftz.sat.f32.f32 	%f110, %f58;
	mov.f32 	%f111, %f110;
	mov.f32 	%f112, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f113, %f110, %f112;
	mov.f32 	%f114, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p44, %f113, %f114;
	@%p44 bra 	$Lt_19_58626;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f115, %f110;
	ld.f32 	%f116, [%rd51+2112];
	add.ftz.f32 	%f117, %f116, %f107;
	mul.ftz.f32 	%f118, %f115, %f117;
	.loc	3	214	0
	ld.f32 	%f119, [%rd51+1056];
	add.ftz.f32 	%f120, %f119, %f106;
	mul.ftz.f32 	%f121, %f115, %f120;
	.loc	3	215	0
	ld.f32 	%f122, [%rd51+0];
	add.ftz.f32 	%f123, %f122, %f105;
	mul.ftz.f32 	%f124, %f115, %f123;
	bra.uni 	$Lt_19_58370;
$Lt_19_58626:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	219	0
	mov.f32 	%f118, 0f00000000;   	// 0
	mov.f32 	%f121, 0f00000000;   	// 0
	mov.f32 	%f124, 0f00000000;   	// 0
	mov.f32 	%f111, 0f00000000;   	// 0
$Lt_19_58370:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	266	0
	mov.f32 	%f125, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p45, %f124, %f125;
	@!%p45 bra 	$Lt_19_58882;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	242	0
	neg.ftz.f32 	%f126, %f124;
	lg2.approx.ftz.f32 	%f127, %f126;
	mov.f32 	%f128, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f129, %f127, %f128;
	ex2.approx.ftz.f32 	%f130, %f129;
	neg.ftz.f32 	%f131, %f130;
	bra.uni 	$LDWendi___log2f_196_5;
$Lt_19_58882:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f132, %f124;
	mov.f32 	%f133, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f134, %f132, %f133;
	ex2.approx.ftz.f32 	%f131, %f134;
$LDWendi___log2f_196_5:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	267	0
	mov.f32 	%f135, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p46, %f121, %f135;
	@!%p46 bra 	$Lt_19_59394;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	242	0
	neg.ftz.f32 	%f136, %f121;
	lg2.approx.ftz.f32 	%f137, %f136;
	mov.f32 	%f138, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f139, %f137, %f138;
	ex2.approx.ftz.f32 	%f140, %f139;
	neg.ftz.f32 	%f141, %f140;
	bra.uni 	$LDWendi___log2f_196_3;
$Lt_19_59394:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f142, %f121;
	mov.f32 	%f143, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f144, %f142, %f143;
	ex2.approx.ftz.f32 	%f141, %f144;
$LDWendi___log2f_196_3:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	268	0
	mov.f32 	%f145, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p47, %f118, %f145;
	@!%p47 bra 	$Lt_19_59906;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	242	0
	neg.ftz.f32 	%f146, %f118;
	lg2.approx.ftz.f32 	%f147, %f146;
	mov.f32 	%f148, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f149, %f147, %f148;
	ex2.approx.ftz.f32 	%f150, %f149;
	neg.ftz.f32 	%f151, %f150;
	bra.uni 	$LDWendi___log2f_196_1;
$Lt_19_59906:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f152, %f118;
	mov.f32 	%f153, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f154, %f152, %f153;
	ex2.approx.ftz.f32 	%f151, %f154;
$LDWendi___log2f_196_1:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	504	0
	@!%p43 bra 	$Lt_19_60674;
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	48	0
	mul.lo.u64 	%rd52, %rd44, 8;
	add.u64 	%rd53, %rd4, %rd52;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f131;
	mov.b32		%r148, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f141;
	mov.b32		%r149, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f151;
	mov.b32		%r150, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f111;
	mov.b32		%r151, %b1; }
	st.v4.u16 	[%rd53+0], {%r148,%r149,%r150,%r151};
	bra.uni 	$Lt_19_60418;
$Lt_19_60674:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	52	0
	mul.lo.u64 	%rd54, %rd44, 16;
	add.u64 	%rd55, %rd4, %rd54;
	st.v4.f32 	[%rd55+0], {%f131,%f141,%f151,%f111};
$Lt_19_60418:
$Lt_19_57346:
$Lt_19_56834:
 //<loop> Part of loop body line 486, head labeled $Lt_19_56578
	.loc	19	504	0
	add.s32 	%r138, %r138, 1;
	add.s32 	%r137, %r137, 4;
	setp.ne.s32 	%p48, %r137, %r110;
	@%p48 bra 	$Lt_19_56578;
 //<loop> Part of loop body line 409, head labeled $Lt_19_50178
	.loc	19	512	0
	bar.sync 	0;
	setp.gt.s32 	%p49, %r34, %r20;
	@%p49 bra 	$Lt_19_50178;
$Lt_19_49666:
	.loc	19	514	0
	ret;
$LDWend__Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff:
	} // _Z27HorizontalRecursiveGaussianPK6float4iiiPS_iii17DevicePixelFormatbffffffff

	.entry VerticalRecursiveGaussianRGBAF16_kernel (
		.param .u64 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pIn,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_pitch,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_width,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_height,
		.param .u64 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pOut,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_pitch,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_width,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_height,
		.param .s8 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_plus0,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_plus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_minus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_minus2)
	{
	.reg .u32 %r<120>;
	.reg .u64 %rd<29>;
	.reg .f32 %f<177>;
	.reg .pred %p<42>;
	.loc	19	534	0
$LDWbegin_VerticalRecursiveGaussianRGBAF16_kernel:
	.loc	19	102	0
	cvt.s32.u32 	%r1, %tid.y;
	mov.s32 	%r2, %r1;
	.loc	19	106	0
	ld.param.s32 	%r3, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_height];
	ld.param.s32 	%r4, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_height];
	sub.s32 	%r5, %r4, %r3;
	shr.s32 	%r6, %r5, 1;
	sub.s32 	%r7, %r1, %r6;
	mov.s32 	%r8, %r7;
	cvt.s32.u32 	%r9, %ctaid.x;
	cvt.s32.u32 	%r10, %ntid.x;
	mul.lo.s32 	%r11, %r9, %r10;
	mov.u32 	%r12, %tid.x;
	add.u32 	%r13, %r11, %r12;
	ld.param.s32 	%r14, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_width];
	setp.le.s32 	%p1, %r14, %r13;
	@%p1 bra 	$Lt_20_37890;
	.loc	19	125	0
	ld.param.s32 	%r15, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_width];
	sub.s32 	%r16, %r14, %r15;
	shr.s32 	%r17, %r16, 1;
	sub.s32 	%r18, %r13, %r17;
	ld.param.s32 	%r19, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_src_pitch];
	mul.lo.s32 	%r20, %r7, %r19;
	add.s32 	%r21, %r18, %r20;
	.loc	19	126	0
	ld.param.s32 	%r22, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_dest_pitch];
	mul.lo.s32 	%r23, %r22, %r1;
	add.s32 	%r24, %r13, %r23;
	mov.u32 	%r25, 0;
	setp.le.s32 	%p2, %r4, %r25;
	@%p2 bra 	$Lt_20_45314;
	mov.u64 	%rd1, __cuda_local_var_302754_34_non_const_smem__1;
	add.s32 	%r26, %r4, 3;
	shr.s32 	%r27, %r26, 31;
	mov.s32 	%r28, 3;
	and.b32 	%r29, %r27, %r28;
	add.s32 	%r30, %r29, %r26;
	shr.s32 	%r31, %r30, 2;
	shl.b32 	%r32, %r19, 2;
	mul.lo.s32 	%r33, %r1, 32;
	mul.lo.s32 	%r34, %r1, 128;
	ld.param.s8 	%r35, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels];
	mov.s32 	%r36, 0;
	setp.ne.s32 	%p3, %r35, %r36;
	mov.pred 	%p4, %p3;
	mov.pred 	%p5, %p6;
	shl.b32 	%r37, %r22, 2;
	add.u32 	%r38, %r33, %r12;
	add.u32 	%r39, %r34, %r12;
	selp.s32 	%r40, 1, 0, %p4;
	cvt.s64.s32 	%rd2, %r38;
	cvt.s64.s32 	%rd3, %r39;
	mul.wide.s32 	%rd4, %r38, 4;
	mul.wide.s32 	%rd5, %r39, 4;
	add.u64 	%rd6, %rd4, %rd1;
	add.u64 	%rd7, %rd5, %rd1;
	ld.param.f32 	%f1, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_plus2];
	ld.param.f32 	%f2, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_plus1];
	ld.param.f32 	%f3, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_plus0];
	ld.param.f32 	%f4, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_plus1];
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r41, 0;
	mov.s32 	%r42, %r31;
$Lt_20_33282:
 //<loop> Loop body line 126, nesting depth: 1, estimated iterations: unknown
	.loc	19	130	0
	mov.u32 	%r43, 0;
	setp.lt.s32 	%p7, %r8, %r43;
	@%p7 bra 	$Lt_20_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	setp.ge.s32 	%p8, %r8, %r3;
	@%p8 bra 	$Lt_20_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	mov.u32 	%r44, 0;
	setp.lt.s32 	%p9, %r18, %r44;
	@%p9 bra 	$Lt_20_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	setp.ge.s32 	%p10, %r18, %r15;
	@%p10 bra 	$Lt_20_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	36	0
	ld.param.u64 	%rd8, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pIn];
	cvt.s64.s32 	%rd9, %r21;
	mul.wide.s32 	%rd10, %r21, 8;
	add.u64 	%rd11, %rd8, %rd10;
	ld.global.v4.u16 	{%r45,%r46,%r47,%r48}, [%rd11+0];
	.loc	3	255	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f8, %b1; }
	mov.f32 	%f9, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p11, %f8, %f9;
	@!%p11 bra 	$Lt_20_33538;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	234	0
	neg.ftz.f32 	%f10, %f8;
	lg2.approx.ftz.f32 	%f11, %f10;
	mov.f32 	%f12, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f13, %f11, %f12;
	ex2.approx.ftz.f32 	%f14, %f13;
	neg.ftz.f32 	%f15, %f14;
	bra.uni 	$LDWendi___log2f_197_24;
$Lt_20_33538:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f16, %f8;
	mov.f32 	%f17, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f18, %f16, %f17;
	ex2.approx.ftz.f32 	%f15, %f18;
$LDWendi___log2f_197_24:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	256	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f19, %b1; }
	mov.f32 	%f20, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p12, %f19, %f20;
	@!%p12 bra 	$Lt_20_34050;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	234	0
	neg.ftz.f32 	%f21, %f19;
	lg2.approx.ftz.f32 	%f22, %f21;
	mov.f32 	%f23, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f24, %f22, %f23;
	ex2.approx.ftz.f32 	%f25, %f24;
	neg.ftz.f32 	%f26, %f25;
	bra.uni 	$LDWendi___log2f_197_22;
$Lt_20_34050:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f27, %f19;
	mov.f32 	%f28, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f29, %f27, %f28;
	ex2.approx.ftz.f32 	%f26, %f29;
$LDWendi___log2f_197_22:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	257	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f30, %b1; }
	mov.f32 	%f31, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p13, %f30, %f31;
	@!%p13 bra 	$Lt_20_34562;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	234	0
	neg.ftz.f32 	%f32, %f30;
	lg2.approx.ftz.f32 	%f33, %f32;
	mov.f32 	%f34, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f35, %f33, %f34;
	ex2.approx.ftz.f32 	%f36, %f35;
	neg.ftz.f32 	%f37, %f36;
	bra.uni 	$LDWendi___log2f_197_20;
$Lt_20_34562:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f38, %f30;
	mov.f32 	%f39, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f40, %f38, %f39;
	ex2.approx.ftz.f32 	%f37, %f40;
$LDWendi___log2f_197_20:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	132	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r48;
	cvt.ftz.f32.f16	%f41, %b1; }
	cvt.ftz.sat.f32.f32 	%f42, %f41;
	mul.ftz.f32 	%f43, %f15, %f42;
	mul.ftz.f32 	%f44, %f26, %f42;
	mul.ftz.f32 	%f45, %f37, %f42;
	mov.f32 	%f46, %f42;
	bra.uni 	$L_20_30210;
$Lt_20_46082:
$L_20_30466:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	134	0
	mov.f32 	%f46, 0f00000000;    	// 0
	mov.f32 	%f45, 0f00000000;    	// 0
	mov.f32 	%f44, 0f00000000;    	// 0
	mov.f32 	%f43, 0f00000000;    	// 0
$L_20_30210:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	136	0
	st.shared.f32 	[%rd6+0], %f43;
	.loc	19	137	0
	st.shared.f32 	[%rd6+512], %f44;
	.loc	19	138	0
	st.shared.f32 	[%rd6+1024], %f45;
	.loc	19	139	0
	st.shared.f32 	[%rd6+1536], %f46;
	.loc	19	141	0
	bar.sync 	0;
	.loc	19	149	0
	ld.shared.f32 	%f47, [%rd7+0];
	mov.s32 	%r49, 0;
	set.eq.u32.s32 	%r50, %r41, %r49;
	neg.s32 	%r51, %r50;
	and.b32 	%r52, %r51, %r40;
	mov.u32 	%r53, 0;
	setp.eq.s32 	%p14, %r52, %r53;
	@%p14 bra 	$Lt_20_35074;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	152	0
	mov.f32 	%f48, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f47, %f48;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_35074:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	155	0
	mul.ftz.f32 	%f50, %f4, %f7;
	fma.rn.ftz.f32 	%f51, %f47, %f3, %f50;
	fma.rn.ftz.f32 	%f52, %f6, %f2, %f51;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f52;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+0], %f53;
	.loc	19	149	0
	ld.shared.f32 	%f54, [%rd7+128];
	mov.s32 	%r54, -1;
	set.eq.u32.s32 	%r55, %r41, %r54;
	neg.s32 	%r56, %r55;
	and.b32 	%r57, %r56, %r40;
	mov.u32 	%r58, 0;
	setp.eq.s32 	%p15, %r57, %r58;
	@%p15 bra 	$Lt_20_35586;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	152	0
	mov.f32 	%f55, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f54, %f55;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_35586:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	155	0
	mul.ftz.f32 	%f56, %f47, %f4;
	fma.rn.ftz.f32 	%f57, %f54, %f3, %f56;
	fma.rn.ftz.f32 	%f58, %f6, %f2, %f57;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f58;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+128], %f53;
	.loc	19	149	0
	ld.shared.f32 	%f59, [%rd7+256];
	mov.s32 	%r59, -2;
	set.eq.u32.s32 	%r60, %r41, %r59;
	neg.s32 	%r61, %r60;
	and.b32 	%r62, %r61, %r40;
	mov.u32 	%r63, 0;
	setp.eq.s32 	%p16, %r62, %r63;
	@%p16 bra 	$Lt_20_36098;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	152	0
	mov.f32 	%f60, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f59, %f60;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_36098:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	155	0
	mul.ftz.f32 	%f61, %f54, %f4;
	fma.rn.ftz.f32 	%f62, %f59, %f3, %f61;
	fma.rn.ftz.f32 	%f63, %f6, %f2, %f62;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f63;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+256], %f53;
	.loc	19	149	0
	ld.shared.f32 	%f64, [%rd7+384];
	mov.s32 	%r64, -3;
	set.eq.u32.s32 	%r65, %r41, %r64;
	neg.s32 	%r66, %r65;
	and.b32 	%r67, %r66, %r40;
	mov.u32 	%r68, 0;
	setp.eq.s32 	%p17, %r67, %r68;
	@%p17 bra 	$Lt_20_36610;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	152	0
	mov.f32 	%f65, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f64, %f65;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_36610:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	155	0
	mul.ftz.f32 	%f66, %f59, %f4;
	fma.rn.ftz.f32 	%f67, %f64, %f3, %f66;
	fma.rn.ftz.f32 	%f68, %f6, %f2, %f67;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f68;
	.loc	19	157	0
	mov.f32 	%f7, %f64;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+384], %f53;
	.loc	19	164	0
	bar.sync 	0;
	.loc	19	167	0
	setp.ge.s32 	%p18, %r2, %r4;
	@%p18 bra 	$Lt_20_37122;
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	48	0
	ld.param.u64 	%rd12, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pOut];
	cvt.s64.s32 	%rd13, %r24;
	mul.wide.s32 	%rd14, %r24, 8;
	add.u64 	%rd15, %rd12, %rd14;
	ld.shared.f32 	%f69, [%rd6+0];
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f69;
	mov.b32		%r69, %b1; }
	ld.shared.f32 	%f70, [%rd6+512];
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f70;
	mov.b32		%r70, %b1; }
	ld.shared.f32 	%f71, [%rd6+1024];
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f71;
	mov.b32		%r71, %b1; }
	ld.shared.f32 	%f72, [%rd6+1536];
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f72;
	mov.b32		%r72, %b1; }
	st.global.v4.u16 	[%rd15+0], {%r69,%r70,%r71,%r72};
$Lt_20_37122:
 //<loop> Part of loop body line 126, head labeled $Lt_20_33282
	.loc	19	176	0
	bar.sync 	0;
	.loc	19	178	0
	add.s32 	%r41, %r41, 4;
	.loc	19	179	0
	add.s32 	%r8, %r8, 4;
	.loc	19	180	0
	add.s32 	%r2, %r2, 4;
	.loc	19	181	0
	add.s32 	%r21, %r32, %r21;
	.loc	19	182	0
	add.s32 	%r24, %r37, %r24;
	setp.gt.s32 	%p19, %r4, %r41;
	@%p19 bra 	$Lt_20_33282;
	bra.uni 	$Lt_20_32770;
$Lt_20_45314:
	mov.s32 	%r41, 0;
	mov.u64 	%rd1, __cuda_local_var_302754_34_non_const_smem__1;
$Lt_20_32770:
	mov.u32 	%r73, 0;
	setp.le.s32 	%p20, %r41, %r73;
	@%p20 bra 	$Lt_20_37890;
	add.s32 	%r74, %r41, 3;
	shr.s32 	%r75, %r74, 31;
	mov.s32 	%r76, 3;
	and.b32 	%r77, %r75, %r76;
	add.s32 	%r78, %r77, %r74;
	shr.s32 	%r79, %r78, 2;
	shl.b32 	%r32, %r19, 2;
	mul.lo.s32 	%r80, %r1, 32;
	mul.lo.s32 	%r81, %r1, 128;
	sub.s32 	%r82, %r4, 4;
	ld.param.s8 	%r83, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels];
	mov.s32 	%r84, 0;
	setp.ne.s32 	%p21, %r83, %r84;
	mov.pred 	%p22, %p21;
	mov.pred 	%p23, %p6;
	sub.s32 	%r85, %r4, 3;
	sub.s32 	%r86, %r4, 2;
	sub.s32 	%r87, %r4, 1;
	mul.lo.s32 	%r88, %r8, %r19;
	mul.lo.s32 	%r89, %r19, -4;
	add.u32 	%r90, %r80, %r12;
	add.u32 	%r91, %r81, %r12;
	selp.s32 	%r40, 1, 0, %p22;
	cvt.s64.s32 	%rd16, %r90;
	cvt.s64.s32 	%rd17, %r91;
	add.s32 	%r92, %r88, %r18;
	mul.wide.s32 	%rd18, %r90, 4;
	mul.wide.s32 	%rd19, %r91, 4;
	add.u64 	%rd6, %rd18, %rd1;
	add.u64 	%rd7, %rd19, %rd1;
	ld.param.f32 	%f73, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_minus2];
	ld.param.f32 	%f74, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_neg_d_minus1];
	ld.param.f32 	%f75, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_minus1];
	ld.param.f32 	%f76, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_n_minus2];
	mov.f32 	%f77, 0f00000000;    	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r93, %r79;
$Lt_20_38402:
 //<loop> Loop body line 182, nesting depth: 1, estimated iterations: unknown
	.loc	19	198	0
	sub.s32 	%r41, %r41, 4;
	.loc	19	199	0
	sub.s32 	%r8, %r8, 4;
	add.s32 	%r92, %r89, %r92;
	.loc	19	200	0
	sub.s32 	%r2, %r2, 4;
	.loc	19	201	0
	sub.s32 	%r21, %r21, %r32;
	.loc	19	198	0
	mov.u32 	%r94, 0;
	setp.lt.s32 	%p24, %r8, %r94;
	@%p24 bra 	$Lt_20_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	setp.ge.s32 	%p25, %r8, %r3;
	@%p25 bra 	$Lt_20_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	mov.u32 	%r95, 0;
	setp.lt.s32 	%p26, %r18, %r95;
	@%p26 bra 	$Lt_20_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	setp.ge.s32 	%p27, %r18, %r15;
	@%p27 bra 	$Lt_20_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	36	0
	ld.param.u64 	%rd20, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pIn];
	cvt.s64.s32 	%rd21, %r21;
	mul.wide.s32 	%rd22, %r21, 8;
	add.u64 	%rd23, %rd20, %rd22;
	ld.global.v4.u16 	{%r45,%r46,%r47,%r48}, [%rd23+0];
	.loc	3	255	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f8, %b1; }
	mov.f32 	%f78, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p28, %f8, %f78;
	@!%p28 bra 	$Lt_20_38658;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	234	0
	neg.ftz.f32 	%f79, %f8;
	lg2.approx.ftz.f32 	%f80, %f79;
	mov.f32 	%f81, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f82, %f80, %f81;
	ex2.approx.ftz.f32 	%f83, %f82;
	neg.ftz.f32 	%f15, %f83;
	bra.uni 	$LDWendi___log2f_197_16;
$Lt_20_38658:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f84, %f8;
	mov.f32 	%f85, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f86, %f84, %f85;
	ex2.approx.ftz.f32 	%f15, %f86;
$LDWendi___log2f_197_16:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	256	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f19, %b1; }
	mov.f32 	%f87, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p29, %f19, %f87;
	@!%p29 bra 	$Lt_20_39170;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	234	0
	neg.ftz.f32 	%f88, %f19;
	lg2.approx.ftz.f32 	%f89, %f88;
	mov.f32 	%f90, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f91, %f89, %f90;
	ex2.approx.ftz.f32 	%f92, %f91;
	neg.ftz.f32 	%f26, %f92;
	bra.uni 	$LDWendi___log2f_197_14;
$Lt_20_39170:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f93, %f19;
	mov.f32 	%f94, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f95, %f93, %f94;
	ex2.approx.ftz.f32 	%f26, %f95;
$LDWendi___log2f_197_14:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	257	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f30, %b1; }
	mov.f32 	%f96, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p30, %f30, %f96;
	@!%p30 bra 	$Lt_20_39682;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	234	0
	neg.ftz.f32 	%f97, %f30;
	lg2.approx.ftz.f32 	%f98, %f97;
	mov.f32 	%f99, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f100, %f98, %f99;
	ex2.approx.ftz.f32 	%f101, %f100;
	neg.ftz.f32 	%f37, %f101;
	bra.uni 	$LDWendi___log2f_197_12;
$Lt_20_39682:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f102, %f30;
	mov.f32 	%f103, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f104, %f102, %f103;
	ex2.approx.ftz.f32 	%f37, %f104;
$LDWendi___log2f_197_12:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	208	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r48;
	cvt.ftz.f32.f16	%f41, %b1; }
	cvt.ftz.sat.f32.f32 	%f105, %f41;
	mul.ftz.f32 	%f43, %f15, %f105;
	mul.ftz.f32 	%f44, %f26, %f105;
	mul.ftz.f32 	%f45, %f37, %f105;
	mov.f32 	%f46, %f105;
	bra.uni 	$L_20_31490;
$Lt_20_47362:
$L_20_31746:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	210	0
	mov.f32 	%f46, 0f00000000;    	// 0
	mov.f32 	%f45, 0f00000000;    	// 0
	mov.f32 	%f44, 0f00000000;    	// 0
	mov.f32 	%f43, 0f00000000;    	// 0
$L_20_31490:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	212	0
	st.shared.f32 	[%rd6+0], %f43;
	.loc	19	213	0
	st.shared.f32 	[%rd6+512], %f44;
	.loc	19	214	0
	st.shared.f32 	[%rd6+1024], %f45;
	.loc	19	215	0
	st.shared.f32 	[%rd6+1536], %f46;
	.loc	19	217	0
	bar.sync 	0;
	.loc	19	223	0
	ld.shared.f32 	%f106, [%rd7+384];
	set.eq.u32.s32 	%r96, %r82, %r41;
	neg.s32 	%r97, %r96;
	and.b32 	%r98, %r97, %r40;
	mov.u32 	%r99, 0;
	setp.eq.s32 	%p31, %r98, %r99;
	@%p31 bra 	$Lt_20_40194;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	226	0
	mov.f32 	%f107, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f106, %f107;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_40194:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	228	0
	mov.f32 	%f108, %f7;
	mul.ftz.f32 	%f109, %f76, %f77;
	fma.rn.ftz.f32 	%f110, %f108, %f75, %f109;
	fma.rn.ftz.f32 	%f111, %f6, %f74, %f110;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f111;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+384], %f53;
	.loc	19	223	0
	ld.shared.f32 	%f112, [%rd7+256];
	set.eq.u32.s32 	%r100, %r85, %r41;
	neg.s32 	%r101, %r100;
	and.b32 	%r102, %r101, %r40;
	mov.u32 	%r103, 0;
	setp.eq.s32 	%p32, %r102, %r103;
	@%p32 bra 	$Lt_20_40706;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	226	0
	mov.f32 	%f113, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f112, %f113;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_40706:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	228	0
	mul.ftz.f32 	%f114, %f108, %f76;
	fma.rn.ftz.f32 	%f115, %f106, %f75, %f114;
	fma.rn.ftz.f32 	%f116, %f6, %f74, %f115;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f116;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+256], %f53;
	.loc	19	223	0
	ld.shared.f32 	%f117, [%rd7+128];
	set.eq.u32.s32 	%r104, %r86, %r41;
	neg.s32 	%r105, %r104;
	and.b32 	%r106, %r105, %r40;
	mov.u32 	%r107, 0;
	setp.eq.s32 	%p33, %r106, %r107;
	@%p33 bra 	$Lt_20_41218;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	226	0
	mov.f32 	%f118, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f117, %f118;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_41218:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	228	0
	mul.ftz.f32 	%f119, %f106, %f76;
	fma.rn.ftz.f32 	%f120, %f112, %f75, %f119;
	fma.rn.ftz.f32 	%f121, %f6, %f74, %f120;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f121;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+128], %f53;
	.loc	19	223	0
	ld.shared.f32 	%f64, [%rd7+0];
	set.eq.u32.s32 	%r108, %r87, %r41;
	neg.s32 	%r109, %r108;
	and.b32 	%r110, %r109, %r40;
	mov.u32 	%r111, 0;
	setp.eq.s32 	%p34, %r110, %r111;
	@%p34 bra 	$Lt_20_41730;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	226	0
	mov.f32 	%f122, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f64, %f122;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_20_41730:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	228	0
	mul.ftz.f32 	%f123, %f112, %f76;
	fma.rn.ftz.f32 	%f124, %f117, %f75, %f123;
	fma.rn.ftz.f32 	%f125, %f6, %f74, %f124;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f125;
	.loc	19	230	0
	mov.f32 	%f77, %f117;
	.loc	19	231	0
	mov.f32 	%f7, %f64;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+0], %f53;
	.loc	19	238	0
	bar.sync 	0;
	.loc	19	242	0
	mov.s32 	%r21, %r92;
	.loc	19	244	0
	setp.ge.s32 	%p35, %r2, %r4;
	@%p35 bra 	$Lt_20_42242;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	36	0
	ld.param.u64 	%rd24, [__cudaparm_VerticalRecursiveGaussianRGBAF16_kernel_pOut];
	mul.lo.s32 	%r112, %r2, %r22;
	add.s32 	%r113, %r13, %r112;
	cvt.s64.s32 	%rd25, %r113;
	mul.wide.s32 	%rd26, %r113, 8;
	add.u64 	%rd27, %rd24, %rd26;
	ld.global.v4.u16 	{%r45,%r46,%r47,%r48}, [%rd27+0];
	.loc	3	208	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r48;
	cvt.ftz.f32.f16	%f41, %b1; }
	ld.shared.f32 	%f126, [%rd6+1536];
	add.ftz.f32 	%f127, %f41, %f126;
	cvt.ftz.sat.f32.f32 	%f128, %f127;
	mov.f32 	%f129, %f128;
	mov.f32 	%f130, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f131, %f128, %f130;
	mov.f32 	%f132, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p36, %f131, %f132;
	@%p36 bra 	$Lt_20_43010;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f133, %f128;
	ld.shared.f32 	%f134, [%rd6+1024];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f135, %b1; }
	add.ftz.f32 	%f136, %f134, %f135;
	mul.ftz.f32 	%f137, %f133, %f136;
	.loc	3	214	0
	ld.shared.f32 	%f138, [%rd6+512];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f139, %b1; }
	add.ftz.f32 	%f140, %f138, %f139;
	mul.ftz.f32 	%f141, %f133, %f140;
	.loc	3	215	0
	ld.shared.f32 	%f142, [%rd6+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f143, %b1; }
	add.ftz.f32 	%f144, %f142, %f143;
	mul.ftz.f32 	%f145, %f133, %f144;
	bra.uni 	$Lt_20_42754;
$Lt_20_43010:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	219	0
	mov.f32 	%f137, 0f00000000;   	// 0
	mov.f32 	%f141, 0f00000000;   	// 0
	mov.f32 	%f145, 0f00000000;   	// 0
	mov.f32 	%f129, 0f00000000;   	// 0
$Lt_20_42754:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	266	0
	mov.f32 	%f146, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p37, %f145, %f146;
	@!%p37 bra 	$Lt_20_43266;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	242	0
	neg.ftz.f32 	%f147, %f145;
	lg2.approx.ftz.f32 	%f148, %f147;
	mov.f32 	%f149, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f150, %f148, %f149;
	ex2.approx.ftz.f32 	%f151, %f150;
	neg.ftz.f32 	%f152, %f151;
	bra.uni 	$LDWendi___log2f_197_6;
$Lt_20_43266:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f153, %f145;
	mov.f32 	%f154, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f155, %f153, %f154;
	ex2.approx.ftz.f32 	%f152, %f155;
$LDWendi___log2f_197_6:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	267	0
	mov.f32 	%f156, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p38, %f141, %f156;
	@!%p38 bra 	$Lt_20_43778;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	242	0
	neg.ftz.f32 	%f157, %f141;
	lg2.approx.ftz.f32 	%f158, %f157;
	mov.f32 	%f159, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f160, %f158, %f159;
	ex2.approx.ftz.f32 	%f161, %f160;
	neg.ftz.f32 	%f162, %f161;
	bra.uni 	$LDWendi___log2f_197_4;
$Lt_20_43778:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f163, %f141;
	mov.f32 	%f164, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f165, %f163, %f164;
	ex2.approx.ftz.f32 	%f162, %f165;
$LDWendi___log2f_197_4:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	268	0
	mov.f32 	%f166, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p39, %f137, %f166;
	@!%p39 bra 	$Lt_20_44290;
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	242	0
	neg.ftz.f32 	%f167, %f137;
	lg2.approx.ftz.f32 	%f168, %f167;
	mov.f32 	%f169, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f170, %f168, %f169;
	ex2.approx.ftz.f32 	%f171, %f170;
	neg.ftz.f32 	%f172, %f171;
	bra.uni 	$LDWendi___log2f_197_2;
$Lt_20_44290:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f173, %f137;
	mov.f32 	%f174, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f175, %f173, %f174;
	ex2.approx.ftz.f32 	%f172, %f175;
$LDWendi___log2f_197_2:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	48	0
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f152;
	mov.b32		%r114, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f162;
	mov.b32		%r115, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f172;
	mov.b32		%r116, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f129;
	mov.b32		%r117, %b1; }
	st.global.v4.u16 	[%rd27+0], {%r114,%r115,%r116,%r117};
$Lt_20_42242:
 //<loop> Part of loop body line 182, head labeled $Lt_20_38402
	.loc	19	257	0
	bar.sync 	0;
	mov.u32 	%r118, 0;
	setp.gt.s32 	%p40, %r41, %r118;
	@%p40 bra 	$Lt_20_38402;
$Lt_20_37890:
$LDWendi__Z15IntegerMultiplyii_197_1:
	.loc	19	555	0
	exit;
$LDWend_VerticalRecursiveGaussianRGBAF16_kernel:
	} // VerticalRecursiveGaussianRGBAF16_kernel

	// ---------------------------------------------------------------------
	// VerticalRecursiveGaussianRGBAF32_kernel  (compiler-generated PTX)
	//
	// Two-pass recursive filter along the row direction of an image whose
	// pixels are four f32 values (16 bytes, see the *16 address scaling).
	//
	//   Pass 1 (loop $Lt_21_33282): rows are visited upward, 4 per iteration.
	//     Each thread loads one source pixel (zero when outside the source
	//     rectangle), raises components 0..2 to the power 2.2 with the sign
	//     preserved (lg2/ex2 approximations), multiplies them by the 4th
	//     component clamped to [0,1], and stages the result in shared memory.
	//     The causal recurrence
	//       y[i] = n_plus0*x[i] + n_plus1*x[i-1]
	//            + neg_d_plus1*y[i-1] + neg_d_plus2*y[i-2]
	//     is run in place over the 4 staged rows and the tile is written to pOut.
	//
	//   Pass 2 (loop $Lt_21_38402): rows are visited downward, 4 per iteration.
	//     The same source pixels are reloaded and converted, the anti-causal
	//     recurrence
	//       y[i] = n_minus1*x[i+1] + n_minus2*x[i+2]
	//            + neg_d_minus1*y[i+1] + neg_d_minus2*y[i+2]
	//     is run in place, the result is added to what pass 1 left in pOut,
	//     components 0..2 are scaled by the reciprocal of the clamped summed
	//     4th component, raised to the power 0.454545 (sign preserved) and the
	//     pixel is stored back to pOut.
	//
	// Addressing facts visible below:
	//   * column        = ctaid.x * ntid.x + tid.x
	//   * source row/col = dest row/col - ((dest_size - src_size) >> 1)
	//   * src_pitch / dest_pitch are multiplied by a row index and the sum is
	//     scaled by 16, i.e. they are used as strides counted in pixels.
	//   * shared staging: slot (tid.y*32 + tid.x) floats from the base, with the
	//     four components 512 bytes (128 floats) apart; the filter phase reads
	//     base + (tid.y*128 + tid.x) floats at row offsets 0/128/256/384 bytes,
	//     so tid.y selects the component plane there.
	//     NOTE(review): this layout presumably assumes a 32 x 4 thread block -
	//     confirm against the host launch code.
	// ---------------------------------------------------------------------
	.entry VerticalRecursiveGaussianRGBAF32_kernel (
		.param .u64 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pIn,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_pitch,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_width,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_height,
		.param .u64 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pOut,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_pitch,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_width,
		.param .s32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_height,
		.param .s8 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_plus0,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_plus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_plus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_minus2,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_minus1,
		.param .f32 __cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_minus2)
	{
	.reg .u32 %r<108>;
	.reg .u64 %rd<29>;
	.reg .f32 %f<182>;
	.reg .pred %p<42>;
	.loc	19	573	0
$LDWbegin_VerticalRecursiveGaussianRGBAF32_kernel:
	.loc	19	102	0
	// %r2 = running destination row, starts at tid.y.
	cvt.s32.u32 	%r1, %tid.y;
	mov.s32 	%r2, %r1;
	.loc	19	106	0
	// %r8 = running source row = tid.y - ((dest_height - src_height) >> 1).
	ld.param.s32 	%r3, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_height];
	ld.param.s32 	%r4, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_height];
	sub.s32 	%r5, %r4, %r3;
	shr.s32 	%r6, %r5, 1;
	sub.s32 	%r7, %r1, %r6;
	mov.s32 	%r8, %r7;
	// %r13 = destination column = ctaid.x * ntid.x + tid.x.
	cvt.s32.u32 	%r9, %ctaid.x;
	cvt.s32.u32 	%r10, %ntid.x;
	mul.lo.s32 	%r11, %r9, %r10;
	mov.u32 	%r12, %tid.x;
	add.u32 	%r13, %r11, %r12;
	// Threads whose column is >= dest_width leave the kernel here, before any
	// bar.sync is reached.
	// NOTE(review): the remaining threads of the block still execute barriers -
	// confirm this early exit is intended for partially covered blocks.
	ld.param.s32 	%r14, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_width];
	setp.le.s32 	%p1, %r14, %r13;
	@%p1 bra 	$Lt_21_37890;
	.loc	19	125	0
	// %r18 = source column; %r21 = source pixel index = src_col + src_row * src_pitch.
	ld.param.s32 	%r15, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_width];
	sub.s32 	%r16, %r14, %r15;
	shr.s32 	%r17, %r16, 1;
	sub.s32 	%r18, %r13, %r17;
	ld.param.s32 	%r19, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_src_pitch];
	mul.lo.s32 	%r20, %r7, %r19;
	add.s32 	%r21, %r18, %r20;
	.loc	19	126	0
	// %r24 = destination pixel index = column + tid.y * dest_pitch.
	ld.param.s32 	%r22, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_dest_pitch];
	mul.lo.s32 	%r23, %r22, %r1;
	add.s32 	%r24, %r13, %r23;
	// dest_height <= 0: skip pass 1 (the path through $Lt_21_45314 then exits).
	mov.u32 	%r25, 0;
	setp.le.s32 	%p2, %r4, %r25;
	@%p2 bra 	$Lt_21_45314;
	// ---- Pass 1 setup ----------------------------------------------------
	// %rd1 = base of the staging buffer (accessed with ld/st.shared below).
	mov.u64 	%rd1, __cuda_local_var_302754_34_non_const_smem__1;
	// %r31 = (dest_height + 3) / 4 with signed round-toward-zero; it is copied
	// to %r42, which is not read again in this kernel.
	add.s32 	%r26, %r4, 3;
	shr.s32 	%r27, %r26, 31;
	mov.s32 	%r28, 3;
	and.b32 	%r29, %r27, %r28;
	add.s32 	%r30, %r29, %r26;
	shr.s32 	%r31, %r30, 2;
	// %r32 = 4 * src_pitch: source index advance per 4-row iteration.
	shl.b32 	%r32, %r19, 2;
	mul.lo.s32 	%r33, %r1, 32;
	mul.lo.s32 	%r34, %r1, 128;
	// %r40 = 1 when repeat_edge_pixels != 0, else 0.
	ld.param.s8 	%r35, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels];
	mov.s32 	%r36, 0;
	setp.ne.s32 	%p3, %r35, %r36;
	mov.pred 	%p4, %p3;
	// NOTE(review): %p6 is never written in this kernel and %p5 is never read;
	// this looks like a dead copy left by the compiler.
	mov.pred 	%p5, %p6;
	// %r37 = 4 * dest_pitch: destination index advance per 4-row iteration.
	shl.b32 	%r37, %r22, 2;
	add.u32 	%r38, %r33, %r12;
	add.u32 	%r39, %r34, %r12;
	selp.s32 	%r40, 1, 0, %p4;
	cvt.s64.s32 	%rd2, %r38;
	cvt.s64.s32 	%rd3, %r39;
	// %rd6 = base + 4*(tid.y*32  + tid.x): this thread's pixel staging slot.
	// %rd7 = base + 4*(tid.y*128 + tid.x): this thread's filter column.
	mul.wide.s32 	%rd4, %r38, 4;
	mul.wide.s32 	%rd5, %r39, 4;
	add.u64 	%rd6, %rd4, %rd1;
	add.u64 	%rd7, %rd5, %rd1;
	// Causal coefficients: %f1 = neg_d_plus2, %f2 = neg_d_plus1,
	// %f3 = n_plus0, %f4 = n_plus1.
	ld.param.f32 	%f1, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_plus2];
	ld.param.f32 	%f2, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_plus1];
	ld.param.f32 	%f3, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_plus0];
	ld.param.f32 	%f4, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_plus1];
	// Filter state carried across iterations:
	// %f5 = y[i-2], %f6 = y[i-1], %f7 = x[i-1]; %r41 = row counter (0, 4, 8, ...).
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r41, 0;
	mov.s32 	%r42, %r31;
$Lt_21_33282:
 //<loop> Loop body line 126, nesting depth: 1, estimated iterations: unknown
	.loc	19	130	0
	// Source bounds test: 0 <= row < src_height and 0 <= col < src_width,
	// otherwise the staged pixel is all zeros ($Lt_21_46082).
	mov.u32 	%r43, 0;
	setp.lt.s32 	%p7, %r8, %r43;
	@%p7 bra 	$Lt_21_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	setp.ge.s32 	%p8, %r8, %r3;
	@%p8 bra 	$Lt_21_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	mov.u32 	%r44, 0;
	setp.lt.s32 	%p9, %r18, %r44;
	@%p9 bra 	$Lt_21_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	setp.ge.s32 	%p10, %r18, %r15;
	@%p10 bra 	$Lt_21_46082;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	38	0
	// Load the 16-byte source pixel at pIn + 16 * %r21.
	ld.param.u64 	%rd8, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pIn];
	cvt.s64.s32 	%rd9, %r21;
	mul.wide.s32 	%rd10, %r21, 16;
	add.u64 	%rd11, %rd8, %rd10;
	ld.global.v4.f32 	{%f8,%f9,%f10,%f11}, [%rd11+0];
	.loc	3	255	0
	// Component 0: %f18 = sign(x) * 2^(2.2 * log2(|x|)).
	mov.f32 	%f12, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p11, %f8, %f12;
	@!%p11 bra 	$Lt_21_33538;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	234	0
	neg.ftz.f32 	%f13, %f8;
	lg2.approx.ftz.f32 	%f14, %f13;
	mov.f32 	%f15, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f16, %f14, %f15;
	ex2.approx.ftz.f32 	%f17, %f16;
	neg.ftz.f32 	%f18, %f17;
	bra.uni 	$LDWendi___log2f_198_24;
$Lt_21_33538:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f19, %f8;
	mov.f32 	%f20, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f21, %f19, %f20;
	ex2.approx.ftz.f32 	%f18, %f21;
$LDWendi___log2f_198_24:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	256	0
	// Component 1: same sign-preserving power 2.2 -> %f28.
	mov.f32 	%f22, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p12, %f9, %f22;
	@!%p12 bra 	$Lt_21_34050;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	234	0
	neg.ftz.f32 	%f23, %f9;
	lg2.approx.ftz.f32 	%f24, %f23;
	mov.f32 	%f25, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f26, %f24, %f25;
	ex2.approx.ftz.f32 	%f27, %f26;
	neg.ftz.f32 	%f28, %f27;
	bra.uni 	$LDWendi___log2f_198_22;
$Lt_21_34050:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f29, %f9;
	mov.f32 	%f30, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f31, %f29, %f30;
	ex2.approx.ftz.f32 	%f28, %f31;
$LDWendi___log2f_198_22:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	257	0
	// Component 2: same sign-preserving power 2.2 -> %f38.
	mov.f32 	%f32, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p13, %f10, %f32;
	@!%p13 bra 	$Lt_21_34562;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	234	0
	neg.ftz.f32 	%f33, %f10;
	lg2.approx.ftz.f32 	%f34, %f33;
	mov.f32 	%f35, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f36, %f34, %f35;
	ex2.approx.ftz.f32 	%f37, %f36;
	neg.ftz.f32 	%f38, %f37;
	bra.uni 	$LDWendi___log2f_198_20;
$Lt_21_34562:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f39, %f10;
	mov.f32 	%f40, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f41, %f39, %f40;
	ex2.approx.ftz.f32 	%f38, %f41;
$LDWendi___log2f_198_20:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	132	0
	// %f42 = 4th component clamped to [0,1]; components 0..2 are multiplied by it.
	cvt.ftz.sat.f32.f32 	%f42, %f11;
	mul.ftz.f32 	%f43, %f18, %f42;
	mul.ftz.f32 	%f44, %f28, %f42;
	mul.ftz.f32 	%f45, %f38, %f42;
	mov.f32 	%f46, %f42;
	bra.uni 	$L_21_30210;
$Lt_21_46082:
$L_21_30466:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	134	0
	// Out-of-bounds source position: stage an all-zero pixel.
	mov.f32 	%f46, 0f00000000;    	// 0
	mov.f32 	%f45, 0f00000000;    	// 0
	mov.f32 	%f44, 0f00000000;    	// 0
	mov.f32 	%f43, 0f00000000;    	// 0
$L_21_30210:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	136	0
	// Stage the four components in planes 512 bytes apart.
	st.shared.f32 	[%rd6+0], %f43;
	.loc	19	137	0
	st.shared.f32 	[%rd6+512], %f44;
	.loc	19	138	0
	st.shared.f32 	[%rd6+1024], %f45;
	.loc	19	139	0
	st.shared.f32 	[%rd6+1536], %f46;
	.loc	19	141	0
	// Barrier: staging stores complete before the filter reads other threads' slots.
	bar.sync 	0;
	.loc	19	149	0
	// ---- Causal tap, tile row 0 (offset +0) ------------------------------
	// When repeat_edge_pixels is set and %r41 == 0 (first row), both feedback
	// terms are seeded with 0.5 * input.
	ld.shared.f32 	%f47, [%rd7+0];
	mov.s32 	%r45, 0;
	set.eq.u32.s32 	%r46, %r41, %r45;
	neg.s32 	%r47, %r46;
	and.b32 	%r48, %r47, %r40;
	mov.u32 	%r49, 0;
	setp.eq.s32 	%p14, %r48, %r49;
	@%p14 bra 	$Lt_21_35074;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	152	0
	mov.f32 	%f48, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f47, %f48;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_35074:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	155	0
	// %f53 = n_plus0*x[i] + n_plus1*x[i-1] + neg_d_plus1*y[i-1] + neg_d_plus2*y[i-2]
	mul.ftz.f32 	%f50, %f4, %f7;
	fma.rn.ftz.f32 	%f51, %f47, %f3, %f50;
	fma.rn.ftz.f32 	%f52, %f6, %f2, %f51;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f52;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+0], %f53;
	.loc	19	149	0
	// ---- Causal tap, tile row 1 (offset +128) ----------------------------
	// The edge test here is %r41 == -1; %r41 only takes the values 0, 4, 8, ...
	// in this loop, so the seeding branch below is not expected to run.
	ld.shared.f32 	%f54, [%rd7+128];
	mov.s32 	%r50, -1;
	set.eq.u32.s32 	%r51, %r41, %r50;
	neg.s32 	%r52, %r51;
	and.b32 	%r53, %r52, %r40;
	mov.u32 	%r54, 0;
	setp.eq.s32 	%p15, %r53, %r54;
	@%p15 bra 	$Lt_21_35586;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	152	0
	mov.f32 	%f55, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f54, %f55;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_35586:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	155	0
	// Previous input is %f47 (row 0) kept in a register.
	mul.ftz.f32 	%f56, %f47, %f4;
	fma.rn.ftz.f32 	%f57, %f54, %f3, %f56;
	fma.rn.ftz.f32 	%f58, %f6, %f2, %f57;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f58;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+128], %f53;
	.loc	19	149	0
	// ---- Causal tap, tile row 2 (offset +256); edge test is %r41 == -2 ----
	ld.shared.f32 	%f59, [%rd7+256];
	mov.s32 	%r55, -2;
	set.eq.u32.s32 	%r56, %r41, %r55;
	neg.s32 	%r57, %r56;
	and.b32 	%r58, %r57, %r40;
	mov.u32 	%r59, 0;
	setp.eq.s32 	%p16, %r58, %r59;
	@%p16 bra 	$Lt_21_36098;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	152	0
	mov.f32 	%f60, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f59, %f60;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_36098:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	155	0
	mul.ftz.f32 	%f61, %f54, %f4;
	fma.rn.ftz.f32 	%f62, %f59, %f3, %f61;
	fma.rn.ftz.f32 	%f63, %f6, %f2, %f62;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f63;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+256], %f53;
	.loc	19	149	0
	// ---- Causal tap, tile row 3 (offset +384); edge test is %r41 == -3 ----
	ld.shared.f32 	%f64, [%rd7+384];
	mov.s32 	%r60, -3;
	set.eq.u32.s32 	%r61, %r41, %r60;
	neg.s32 	%r62, %r61;
	and.b32 	%r63, %r62, %r40;
	mov.u32 	%r64, 0;
	setp.eq.s32 	%p17, %r63, %r64;
	@%p17 bra 	$Lt_21_36610;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	152	0
	mov.f32 	%f65, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f64, %f65;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_36610:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	155	0
	mul.ftz.f32 	%f66, %f59, %f4;
	fma.rn.ftz.f32 	%f67, %f64, %f3, %f66;
	fma.rn.ftz.f32 	%f68, %f6, %f2, %f67;
	fma.rn.ftz.f32 	%f53, %f5, %f1, %f68;
	.loc	19	157	0
	// Carry the last input of this tile into the next iteration as x[i-1].
	mov.f32 	%f7, %f64;
	.loc	19	158	0
	mov.f32 	%f5, %f6;
	.loc	19	159	0
	mov.f32 	%f6, %f53;
	.loc	19	161	0
	st.shared.f32 	[%rd7+384], %f53;
	.loc	19	164	0
	// Barrier: all filtered values are in shared memory before being read back
	// through the per-pixel slot %rd6.
	bar.sync 	0;
	.loc	19	167	0
	// Rows at or beyond dest_height are not written.
	setp.ge.s32 	%p18, %r2, %r4;
	@%p18 bra 	$Lt_21_37122;
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	52	0
	// Store the causal result for this pixel to pOut + 16 * %r24.
	ld.param.u64 	%rd12, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pOut];
	cvt.s64.s32 	%rd13, %r24;
	mul.wide.s32 	%rd14, %r24, 16;
	add.u64 	%rd15, %rd12, %rd14;
	ld.shared.f32 	%f69, [%rd6+0];
	ld.shared.f32 	%f70, [%rd6+512];
	ld.shared.f32 	%f71, [%rd6+1024];
	ld.shared.f32 	%f72, [%rd6+1536];
	st.global.v4.f32 	[%rd15+0], {%f69,%f70,%f71,%f72};
$Lt_21_37122:
 //<loop> Part of loop body line 126, head labeled $Lt_21_33282
	.loc	19	176	0
	// Barrier before the staging buffer is overwritten by the next iteration.
	bar.sync 	0;
	.loc	19	178	0
	// Advance by 4 rows: counter, source row, destination row and both indices.
	add.s32 	%r41, %r41, 4;
	.loc	19	179	0
	add.s32 	%r8, %r8, 4;
	.loc	19	180	0
	add.s32 	%r2, %r2, 4;
	.loc	19	181	0
	add.s32 	%r21, %r32, %r21;
	.loc	19	182	0
	add.s32 	%r24, %r37, %r24;
	// Continue while the row counter is below dest_height.
	setp.gt.s32 	%p19, %r4, %r41;
	@%p19 bra 	$Lt_21_33282;
	bra.uni 	$Lt_21_32770;
$Lt_21_45314:
	// Reached only when dest_height <= 0: set up the values pass 2 expects.
	mov.s32 	%r41, 0;
	mov.u64 	%rd1, __cuda_local_var_302754_34_non_const_smem__1;
$Lt_21_32770:
	// ---- Pass 2 setup ----------------------------------------------------
	// Nothing to do when the row counter is <= 0.
	mov.u32 	%r65, 0;
	setp.le.s32 	%p20, %r41, %r65;
	@%p20 bra 	$Lt_21_37890;
	// %r71 = (%r41 + 3) / 4; it is copied to %r85, which is not read again.
	add.s32 	%r66, %r41, 3;
	shr.s32 	%r67, %r66, 31;
	mov.s32 	%r68, 3;
	and.b32 	%r69, %r67, %r68;
	add.s32 	%r70, %r69, %r66;
	shr.s32 	%r71, %r70, 2;
	shl.b32 	%r32, %r19, 2;
	mul.lo.s32 	%r72, %r1, 32;
	mul.lo.s32 	%r73, %r1, 128;
	// %r74/%r77/%r78/%r79 = dest_height - 4/3/2/1, compared against %r41 below
	// to detect the last destination row for each of the four tile rows.
	sub.s32 	%r74, %r4, 4;
	ld.param.s8 	%r75, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels];
	mov.s32 	%r76, 0;
	setp.ne.s32 	%p21, %r75, %r76;
	mov.pred 	%p22, %p21;
	// NOTE(review): same dead copy of the never-written %p6 as in pass 1 setup;
	// %p23 is never read.
	mov.pred 	%p23, %p6;
	sub.s32 	%r77, %r4, 3;
	sub.s32 	%r78, %r4, 2;
	sub.s32 	%r79, %r4, 1;
	// %r84 = source pixel index recomputed from the current source row %r8;
	// %r81 = -4 * src_pitch is its per-iteration step.
	mul.lo.s32 	%r80, %r8, %r19;
	mul.lo.s32 	%r81, %r19, -4;
	add.u32 	%r82, %r72, %r12;
	add.u32 	%r83, %r73, %r12;
	selp.s32 	%r40, 1, 0, %p22;
	cvt.s64.s32 	%rd16, %r82;
	cvt.s64.s32 	%rd17, %r83;
	add.s32 	%r84, %r80, %r18;
	// %rd6 / %rd7 are recomputed with the same formulas as in pass 1.
	mul.wide.s32 	%rd18, %r82, 4;
	mul.wide.s32 	%rd19, %r83, 4;
	add.u64 	%rd6, %rd18, %rd1;
	add.u64 	%rd7, %rd19, %rd1;
	// Anti-causal coefficients: %f73 = neg_d_minus2, %f74 = neg_d_minus1,
	// %f75 = n_minus1, %f76 = n_minus2.
	ld.param.f32 	%f73, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_minus2];
	ld.param.f32 	%f74, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_neg_d_minus1];
	ld.param.f32 	%f75, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_minus1];
	ld.param.f32 	%f76, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_n_minus2];
	// Filter state: %f77 = x[i+2], %f7 = x[i+1], %f6 = y[i+1], %f5 = y[i+2].
	mov.f32 	%f77, 0f00000000;    	// 0
	mov.f32 	%f5, 0f00000000;     	// 0
	mov.f32 	%f6, 0f00000000;     	// 0
	mov.f32 	%f7, 0f00000000;     	// 0
	mov.s32 	%r85, %r71;
$Lt_21_38402:
 //<loop> Loop body line 182, nesting depth: 1, estimated iterations: unknown
	.loc	19	198	0
	// Step back by 4 rows before processing (counter, rows and source indices).
	sub.s32 	%r41, %r41, 4;
	.loc	19	199	0
	sub.s32 	%r8, %r8, 4;
	add.s32 	%r84, %r81, %r84;
	.loc	19	200	0
	sub.s32 	%r2, %r2, 4;
	.loc	19	201	0
	sub.s32 	%r21, %r21, %r32;
	.loc	19	198	0
	// Same source bounds test as pass 1; out of bounds -> zero pixel.
	mov.u32 	%r86, 0;
	setp.lt.s32 	%p24, %r8, %r86;
	@%p24 bra 	$Lt_21_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	setp.ge.s32 	%p25, %r8, %r3;
	@%p25 bra 	$Lt_21_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	mov.u32 	%r87, 0;
	setp.lt.s32 	%p26, %r18, %r87;
	@%p26 bra 	$Lt_21_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	setp.ge.s32 	%p27, %r18, %r15;
	@%p27 bra 	$Lt_21_47362;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	38	0
	// Reload the source pixel at pIn + 16 * %r21.
	ld.param.u64 	%rd20, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pIn];
	cvt.s64.s32 	%rd21, %r21;
	mul.wide.s32 	%rd22, %r21, 16;
	add.u64 	%rd23, %rd20, %rd22;
	ld.global.v4.f32 	{%f78,%f79,%f80,%f81}, [%rd23+0];
	.loc	3	255	0
	// Components 0..2: sign-preserving power 2.2 -> %f18, %f28, %f38.
	mov.f32 	%f82, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p28, %f78, %f82;
	@!%p28 bra 	$Lt_21_38658;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	234	0
	neg.ftz.f32 	%f83, %f78;
	lg2.approx.ftz.f32 	%f84, %f83;
	mov.f32 	%f85, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f86, %f84, %f85;
	ex2.approx.ftz.f32 	%f87, %f86;
	neg.ftz.f32 	%f18, %f87;
	bra.uni 	$LDWendi___log2f_198_16;
$Lt_21_38658:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f88, %f78;
	mov.f32 	%f89, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f90, %f88, %f89;
	ex2.approx.ftz.f32 	%f18, %f90;
$LDWendi___log2f_198_16:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	256	0
	mov.f32 	%f91, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p29, %f79, %f91;
	@!%p29 bra 	$Lt_21_39170;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	234	0
	neg.ftz.f32 	%f92, %f79;
	lg2.approx.ftz.f32 	%f93, %f92;
	mov.f32 	%f94, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f95, %f93, %f94;
	ex2.approx.ftz.f32 	%f96, %f95;
	neg.ftz.f32 	%f28, %f96;
	bra.uni 	$LDWendi___log2f_198_14;
$Lt_21_39170:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f97, %f79;
	mov.f32 	%f98, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f99, %f97, %f98;
	ex2.approx.ftz.f32 	%f28, %f99;
$LDWendi___log2f_198_14:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	257	0
	mov.f32 	%f100, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p30, %f80, %f100;
	@!%p30 bra 	$Lt_21_39682;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	234	0
	neg.ftz.f32 	%f101, %f80;
	lg2.approx.ftz.f32 	%f102, %f101;
	mov.f32 	%f103, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f104, %f102, %f103;
	ex2.approx.ftz.f32 	%f105, %f104;
	neg.ftz.f32 	%f38, %f105;
	bra.uni 	$LDWendi___log2f_198_12;
$Lt_21_39682:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f106, %f80;
	mov.f32 	%f107, 0f400ccccd;   	// 2.2
	mul.ftz.f32 	%f108, %f106, %f107;
	ex2.approx.ftz.f32 	%f38, %f108;
$LDWendi___log2f_198_12:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	208	0
	// Multiply components 0..2 by the 4th component clamped to [0,1].
	cvt.ftz.sat.f32.f32 	%f109, %f81;
	mul.ftz.f32 	%f43, %f18, %f109;
	mul.ftz.f32 	%f44, %f28, %f109;
	mul.ftz.f32 	%f45, %f38, %f109;
	mov.f32 	%f46, %f109;
	bra.uni 	$L_21_31490;
$Lt_21_47362:
$L_21_31746:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	210	0
	mov.f32 	%f46, 0f00000000;    	// 0
	mov.f32 	%f45, 0f00000000;    	// 0
	mov.f32 	%f44, 0f00000000;    	// 0
	mov.f32 	%f43, 0f00000000;    	// 0
$L_21_31490:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	212	0
	// Stage the converted pixel, then synchronise before filtering.
	st.shared.f32 	[%rd6+0], %f43;
	.loc	19	213	0
	st.shared.f32 	[%rd6+512], %f44;
	.loc	19	214	0
	st.shared.f32 	[%rd6+1024], %f45;
	.loc	19	215	0
	st.shared.f32 	[%rd6+1536], %f46;
	.loc	19	217	0
	bar.sync 	0;
	.loc	19	223	0
	// ---- Anti-causal tap, tile row 3 (offset +384) -----------------------
	// Edge seeding when repeat_edge_pixels is set and %r41 == dest_height - 4,
	// i.e. this tile row is destination row dest_height - 1.
	ld.shared.f32 	%f110, [%rd7+384];
	set.eq.u32.s32 	%r88, %r74, %r41;
	neg.s32 	%r89, %r88;
	and.b32 	%r90, %r89, %r40;
	mov.u32 	%r91, 0;
	setp.eq.s32 	%p31, %r90, %r91;
	@%p31 bra 	$Lt_21_40194;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	226	0
	mov.f32 	%f111, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f110, %f111;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_40194:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	228	0
	// %f53 = n_minus1*x[i+1] + n_minus2*x[i+2] + neg_d_minus1*y[i+1] + neg_d_minus2*y[i+2]
	// (the current input %f110 is not part of this sum).
	mov.f32 	%f112, %f7;
	mul.ftz.f32 	%f113, %f76, %f77;
	fma.rn.ftz.f32 	%f114, %f112, %f75, %f113;
	fma.rn.ftz.f32 	%f115, %f6, %f74, %f114;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f115;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+384], %f53;
	.loc	19	223	0
	// ---- Anti-causal tap, tile row 2 (offset +256); edge: %r41 == dest_height - 3
	ld.shared.f32 	%f116, [%rd7+256];
	set.eq.u32.s32 	%r92, %r77, %r41;
	neg.s32 	%r93, %r92;
	and.b32 	%r94, %r93, %r40;
	mov.u32 	%r95, 0;
	setp.eq.s32 	%p32, %r94, %r95;
	@%p32 bra 	$Lt_21_40706;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	226	0
	mov.f32 	%f117, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f116, %f117;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_40706:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	228	0
	// x[i+1] = %f110 (row 3 input), x[i+2] = %f112 (input carried from the previous tile).
	mul.ftz.f32 	%f118, %f112, %f76;
	fma.rn.ftz.f32 	%f119, %f110, %f75, %f118;
	fma.rn.ftz.f32 	%f120, %f6, %f74, %f119;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f120;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+256], %f53;
	.loc	19	223	0
	// ---- Anti-causal tap, tile row 1 (offset +128); edge: %r41 == dest_height - 2
	ld.shared.f32 	%f121, [%rd7+128];
	set.eq.u32.s32 	%r96, %r78, %r41;
	neg.s32 	%r97, %r96;
	and.b32 	%r98, %r97, %r40;
	mov.u32 	%r99, 0;
	setp.eq.s32 	%p33, %r98, %r99;
	@%p33 bra 	$Lt_21_41218;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	226	0
	mov.f32 	%f122, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f121, %f122;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_41218:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	228	0
	mul.ftz.f32 	%f123, %f110, %f76;
	fma.rn.ftz.f32 	%f124, %f116, %f75, %f123;
	fma.rn.ftz.f32 	%f125, %f6, %f74, %f124;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f125;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+128], %f53;
	.loc	19	223	0
	// ---- Anti-causal tap, tile row 0 (offset +0); edge: %r41 == dest_height - 1
	ld.shared.f32 	%f64, [%rd7+0];
	set.eq.u32.s32 	%r100, %r79, %r41;
	neg.s32 	%r101, %r100;
	and.b32 	%r102, %r101, %r40;
	mov.u32 	%r103, 0;
	setp.eq.s32 	%p34, %r102, %r103;
	@%p34 bra 	$Lt_21_41730;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	226	0
	mov.f32 	%f126, 0f3f000000;   	// 0.5
	mul.ftz.f32 	%f49, %f64, %f126;
	mov.f32 	%f5, %f49;
	mov.f32 	%f6, %f49;
$Lt_21_41730:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	228	0
	mul.ftz.f32 	%f127, %f116, %f76;
	fma.rn.ftz.f32 	%f128, %f121, %f75, %f127;
	fma.rn.ftz.f32 	%f129, %f6, %f74, %f128;
	fma.rn.ftz.f32 	%f53, %f5, %f73, %f129;
	.loc	19	230	0
	// Carry the two lowest inputs of this tile into the next iteration:
	// %f77 <- row 1 input (x[i+2]), %f7 <- row 0 input (x[i+1]).
	mov.f32 	%f77, %f121;
	.loc	19	231	0
	mov.f32 	%f7, %f64;
	.loc	19	232	0
	mov.f32 	%f5, %f6;
	.loc	19	233	0
	mov.f32 	%f6, %f53;
	.loc	19	235	0
	st.shared.f32 	[%rd7+0], %f53;
	.loc	19	238	0
	// Barrier: anti-causal results are complete before the per-pixel read-back.
	bar.sync 	0;
	.loc	19	242	0
	// Resynchronise the source index with the recomputed value in %r84.
	mov.s32 	%r21, %r84;
	.loc	19	244	0
	// Rows at or beyond dest_height are not written.
	setp.ge.s32 	%p35, %r2, %r4;
	@%p35 bra 	$Lt_21_42242;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	38	0
	// %rd27 = pOut + 16 * (column + dest_row * dest_pitch): pass-1 result for this pixel.
	ld.param.u64 	%rd24, [__cudaparm_VerticalRecursiveGaussianRGBAF32_kernel_pOut];
	mul.lo.s32 	%r104, %r2, %r22;
	add.s32 	%r105, %r13, %r104;
	cvt.s64.s32 	%rd25, %r105;
	mul.wide.s32 	%rd26, %r105, 16;
	add.u64 	%rd27, %rd24, %rd26;
	ld.global.f32 	%f130, [%rd27+12];
	.loc	3	208	0
	// %f133 = clamp(pass-1 4th component + pass-2 4th component, 0, 1).
	ld.shared.f32 	%f131, [%rd6+1536];
	add.ftz.f32 	%f132, %f131, %f130;
	cvt.ftz.sat.f32.f32 	%f133, %f132;
	mov.f32 	%f134, %f133;
	// If the clamped sum is <= 8e-6 the whole output pixel becomes zero.
	mov.f32 	%f135, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f136, %f133, %f135;
	mov.f32 	%f137, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p36, %f136, %f137;
	@%p36 bra 	$Lt_21_43010;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	213	0
	// Components 0..2: (pass-1 value + pass-2 value) * (1 / clamped sum).
	rcp.approx.ftz.f32 	%f138, %f133;
	ld.global.v4.f32 	{%f139,%f140,%f141,_}, [%rd27+0];
	ld.shared.f32 	%f142, [%rd6+1024];
	add.ftz.f32 	%f143, %f141, %f142;
	mul.ftz.f32 	%f144, %f138, %f143;
	.loc	3	214	0
	ld.shared.f32 	%f145, [%rd6+512];
	add.ftz.f32 	%f146, %f140, %f145;
	mul.ftz.f32 	%f147, %f138, %f146;
	.loc	3	215	0
	ld.shared.f32 	%f148, [%rd6+0];
	add.ftz.f32 	%f149, %f139, %f148;
	mul.ftz.f32 	%f150, %f138, %f149;
	bra.uni 	$Lt_21_42754;
$Lt_21_43010:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	219	0
	mov.f32 	%f144, 0f00000000;   	// 0
	mov.f32 	%f147, 0f00000000;   	// 0
	mov.f32 	%f150, 0f00000000;   	// 0
	mov.f32 	%f134, 0f00000000;   	// 0
$Lt_21_42754:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	266	0
	// Component 0: %f157 = sign(x) * 2^(0.454545 * log2(|x|)).
	mov.f32 	%f151, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p37, %f150, %f151;
	@!%p37 bra 	$Lt_21_43266;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	242	0
	neg.ftz.f32 	%f152, %f150;
	lg2.approx.ftz.f32 	%f153, %f152;
	mov.f32 	%f154, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f155, %f153, %f154;
	ex2.approx.ftz.f32 	%f156, %f155;
	neg.ftz.f32 	%f157, %f156;
	bra.uni 	$LDWendi___log2f_198_6;
$Lt_21_43266:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f158, %f150;
	mov.f32 	%f159, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f160, %f158, %f159;
	ex2.approx.ftz.f32 	%f157, %f160;
$LDWendi___log2f_198_6:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	267	0
	// Component 1: same sign-preserving power 0.454545 -> %f167.
	mov.f32 	%f161, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p38, %f147, %f161;
	@!%p38 bra 	$Lt_21_43778;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	242	0
	neg.ftz.f32 	%f162, %f147;
	lg2.approx.ftz.f32 	%f163, %f162;
	mov.f32 	%f164, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f165, %f163, %f164;
	ex2.approx.ftz.f32 	%f166, %f165;
	neg.ftz.f32 	%f167, %f166;
	bra.uni 	$LDWendi___log2f_198_4;
$Lt_21_43778:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f168, %f147;
	mov.f32 	%f169, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f170, %f168, %f169;
	ex2.approx.ftz.f32 	%f167, %f170;
$LDWendi___log2f_198_4:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	268	0
	// Component 2: same sign-preserving power 0.454545 -> %f177.
	mov.f32 	%f171, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p39, %f144, %f171;
	@!%p39 bra 	$Lt_21_44290;
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	242	0
	neg.ftz.f32 	%f172, %f144;
	lg2.approx.ftz.f32 	%f173, %f172;
	mov.f32 	%f174, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f175, %f173, %f174;
	ex2.approx.ftz.f32 	%f176, %f175;
	neg.ftz.f32 	%f177, %f176;
	bra.uni 	$LDWendi___log2f_198_2;
$Lt_21_44290:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f178, %f144;
	mov.f32 	%f179, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f180, %f178, %f179;
	ex2.approx.ftz.f32 	%f177, %f180;
$LDWendi___log2f_198_2:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	// Store the final pixel over the pass-1 value in pOut.
	st.global.v4.f32 	[%rd27+0], {%f157,%f167,%f177,%f134};
$Lt_21_42242:
 //<loop> Part of loop body line 182, head labeled $Lt_21_38402
	.loc	19	257	0
	// Barrier before the staging buffer is reused; loop while the row counter is > 0.
	bar.sync 	0;
	mov.u32 	%r106, 0;
	setp.gt.s32 	%p40, %r41, %r106;
	@%p40 bra 	$Lt_21_38402;
$Lt_21_37890:
$LDWendi__Z15IntegerMultiplyii_198_1:
	.loc	19	594	0
	exit;
$LDWend_VerticalRecursiveGaussianRGBAF32_kernel:
	} // VerticalRecursiveGaussianRGBAF32_kernel

	.entry HorizontalRecursiveGaussianRGBAF16_kernel (
		.param .u64 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pIn,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_pitch,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_width,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_height,
		.param .u64 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pOut,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_pitch,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_width,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_height,
		.param .s8 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_plus0,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_plus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_minus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_minus2)
	{
	.reg .u32 %r<144>;
	.reg .u64 %rd<44>;
	.reg .f32 %f<144>;
	.reg .pred %p<46>;
	.loc	19	612	0
$LDWbegin_HorizontalRecursiveGaussianRGBAF16_kernel:
	.loc	19	318	0
	ld.param.s32 	%r1, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_width];
	mov.u32 	%r2, 0;
	setp.le.s32 	%p1, %r1, %r2;
	@%p1 bra 	$Lt_22_59138;
	mov.u32 	%r3, %tid.y;
	shl.b32 	%r4, %r3, 4;
	mov.u32 	%r5, %tid.x;
	add.u32 	%r6, %r4, %r5;
	cvt.s32.u32 	%r7, %ctaid.y;
	mul.lo.s32 	%r8, %r7, 8;
	ld.param.s32 	%r9, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_height];
	ld.param.s32 	%r10, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_height];
	sub.s32 	%r11, %r10, %r9;
	add.u32 	%r12, %r8, %r3;
	shr.s32 	%r13, %r11, 1;
	sub.s32 	%r14, %r12, %r13;
	add.s32 	%r15, %r1, 31;
	shr.s32 	%r16, %r15, 31;
	mov.s32 	%r17, 31;
	and.b32 	%r18, %r16, %r17;
	add.s32 	%r19, %r18, %r15;
	shr.s32 	%r20, %r19, 5;
	mov.s32 	%r21, %r5;
	add.u32 	%r22, %r1, %r5;
	ld.param.s32 	%r23, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_width];
	sub.s32 	%r24, %r1, %r23;
	mul.lo.u32 	%r25, %r3, 32;
	shr.s32 	%r26, %r24, 1;
	add.u32 	%r27, %r25, %r5;
	mov.s32 	%r28, 31;
	setp.le.s32 	%p2, %r27, %r28;
	setp.lt.s32 	%p3, %r14, %r9;
	add.s32 	%r29, %r14, 8;
	add.s32 	%r30, %r14, 4;
	setp.lt.s32 	%p4, %r30, %r9;
	mov.s32 	%r31, 0;
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.u64 	%rd1, __cuda_local_var_302956_34_non_const_smem__0;
	mov.s32 	%r32, %r20;
$Lt_22_41218:
 //<loop> Loop body line 318, nesting depth: 1, estimated iterations: unknown
	.loc	19	335	0
	sub.s32 	%r33, %r21, %r26;
	mov.s32 	%r34, %r14;
	mov.s32 	%r35, 0;
$Lt_22_41986:
 //<loop> Loop body line 335, nesting depth: 2, iterations: 2
	setp.ge.s32 	%p5, %r34, %r9;
	@%p5 bra 	$Lt_22_42242;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	19	342	0
	mov.u32 	%r36, 0;
	setp.lt.s32 	%p6, %r14, %r36;
	@%p6 bra 	$Lt_22_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	@!%p3 bra 	$Lt_22_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	mov.u32 	%r37, 0;
	setp.lt.s32 	%p7, %r33, %r37;
	@%p7 bra 	$Lt_22_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	setp.ge.s32 	%p8, %r33, %r23;
	@%p8 bra 	$Lt_22_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	19	36	0
	ld.param.s32 	%r38, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_pitch];
	mul.lo.s32 	%r39, %r35, %r38;
	mul.lo.s32 	%r40, %r14, %r38;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pIn];
	add.s32 	%r41, %r33, %r40;
	mul.lo.s32 	%r42, %r39, 4;
	add.s32 	%r43, %r41, %r42;
	cvt.s64.s32 	%rd3, %r43;
	mul.wide.s32 	%rd4, %r43, 8;
	add.u64 	%rd5, %rd2, %rd4;
	ld.global.v4.u16 	{%r44,%r45,%r46,%r47}, [%rd5+0];
	.loc	3	255	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r44;
	cvt.ftz.f32.f16	%f4, %b1; }
	mov.f32 	%f5, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p9, %f4, %f5;
	@!%p9 bra 	$Lt_22_42754;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	234	0
	neg.ftz.f32 	%f6, %f4;
	lg2.approx.ftz.f32 	%f7, %f6;
	mov.f32 	%f8, 0f400ccccd;     	// 2.2
	mul.ftz.f32 	%f9, %f7, %f8;
	ex2.approx.ftz.f32 	%f10, %f9;
	neg.ftz.f32 	%f11, %f10;
	bra.uni 	$LDWendi___log2f_199_23;
$Lt_22_42754:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f12, %f4;
	mov.f32 	%f13, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f14, %f12, %f13;
	ex2.approx.ftz.f32 	%f11, %f14;
$LDWendi___log2f_199_23:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	256	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f15, %b1; }
	mov.f32 	%f16, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p10, %f15, %f16;
	@!%p10 bra 	$Lt_22_43266;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	234	0
	neg.ftz.f32 	%f17, %f15;
	lg2.approx.ftz.f32 	%f18, %f17;
	mov.f32 	%f19, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f20, %f18, %f19;
	ex2.approx.ftz.f32 	%f21, %f20;
	neg.ftz.f32 	%f22, %f21;
	bra.uni 	$LDWendi___log2f_199_21;
$Lt_22_43266:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f23, %f15;
	mov.f32 	%f24, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f25, %f23, %f24;
	ex2.approx.ftz.f32 	%f22, %f25;
$LDWendi___log2f_199_21:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	257	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f26, %b1; }
	mov.f32 	%f27, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p11, %f26, %f27;
	@!%p11 bra 	$Lt_22_43778;
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	234	0
	neg.ftz.f32 	%f28, %f26;
	lg2.approx.ftz.f32 	%f29, %f28;
	mov.f32 	%f30, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f31, %f29, %f30;
	ex2.approx.ftz.f32 	%f32, %f31;
	neg.ftz.f32 	%f33, %f32;
	bra.uni 	$LDWendi___log2f_199_19;
$Lt_22_43778:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f34, %f26;
	mov.f32 	%f35, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f36, %f34, %f35;
	ex2.approx.ftz.f32 	%f33, %f36;
$LDWendi___log2f_199_19:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	19	343	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f37, %b1; }
	cvt.ftz.sat.f32.f32 	%f38, %f37;
	mul.ftz.f32 	%f39, %f11, %f38;
	mul.ftz.f32 	%f40, %f22, %f38;
	mul.ftz.f32 	%f41, %f33, %f38;
	mov.f32 	%f42, %f38;
	bra.uni 	$L_22_38146;
$Lt_22_59906:
$L_22_38402:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	19	345	0
	mov.f32 	%f42, 0f00000000;    	// 0
	mov.f32 	%f41, 0f00000000;    	// 0
	mov.f32 	%f40, 0f00000000;    	// 0
	mov.f32 	%f39, 0f00000000;    	// 0
$L_22_38146:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	.loc	19	348	0
	mul.lo.s32 	%r48, %r3, 33;
	mul.lo.s32 	%r49, %r35, 132;
	add.u32 	%r50, %r48, %r5;
	add.s32 	%r51, %r49, %r50;
	cvt.s64.s32 	%rd6, %r51;
	mul.wide.s32 	%rd7, %r51, 4;
	add.u64 	%rd8, %rd1, %rd7;
	st.shared.f32 	[%rd8+0], %f39;
	.loc	19	349	0
	st.shared.f32 	[%rd8+1056], %f40;
	.loc	19	350	0
	st.shared.f32 	[%rd8+2112], %f41;
	.loc	19	351	0
	st.shared.f32 	[%rd8+3168], %f42;
$Lt_22_42242:
 //<loop> Part of loop body line 335, head labeled $Lt_22_41986
	add.s32 	%r35, %r35, 1;
	add.s32 	%r34, %r34, 4;
	setp.ne.s32 	%p12, %r34, %r29;
	@%p12 bra 	$Lt_22_41986;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	358	0
	bar.sync 	0;
	@!%p2 bra 	$Lt_22_44546;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	364	0
	ld.param.s8 	%r52, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels];
	mov.s32 	%r53, 0;
	setp.ne.s32 	%p13, %r52, %r53;
	mov.pred 	%p14, %p13;
	mov.pred 	%p15, %p16;
	neg.s32 	%r54, %r31;
	selp.s32 	%r55, 1, 0, %p14;
	mul.lo.s32 	%r56, %r6, 33;
	mov.s32 	%r57, %r56;
	add.s32 	%r58, %r56, 32;
	add.s32 	%r59, %r54, %r56;
	cvt.s64.s32 	%rd9, %r56;
	mul.wide.s32 	%rd10, %r56, 4;
	add.u64 	%rd11, %rd1, %rd10;
	ld.param.f32 	%f43, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_plus2];
	ld.param.f32 	%f44, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_plus1];
	ld.param.f32 	%f45, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_plus0];
	ld.param.f32 	%f46, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_plus1];
$Lt_22_45570:
 //<loop> Loop body line 364, nesting depth: 2, iterations: 32
	.loc	19	367	0
	ld.shared.f32 	%f47, [%rd11+0];
	set.eq.u32.s32 	%r60, %r57, %r59;
	neg.s32 	%r61, %r60;
	and.b32 	%r62, %r61, %r55;
	mov.u32 	%r63, 0;
	setp.eq.s32 	%p17, %r62, %r63;
	@%p17 bra 	$Lt_22_45826;
 //<loop> Part of loop body line 364, head labeled $Lt_22_45570
	.loc	19	370	0
	mov.f32 	%f48, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f47, %f48;
	mov.f32 	%f1, %f49;
	mov.f32 	%f2, %f49;
$Lt_22_45826:
 //<loop> Part of loop body line 364, head labeled $Lt_22_45570
	.loc	19	372	0
	mul.ftz.f32 	%f50, %f46, %f3;
	fma.rn.ftz.f32 	%f51, %f47, %f45, %f50;
	fma.rn.ftz.f32 	%f52, %f2, %f44, %f51;
	fma.rn.ftz.f32 	%f53, %f1, %f43, %f52;
	.loc	19	374	0
	mov.f32 	%f3, %f47;
	.loc	19	375	0
	mov.f32 	%f1, %f2;
	.loc	19	376	0
	mov.f32 	%f2, %f53;
	.loc	19	378	0
	st.shared.f32 	[%rd11+0], %f53;
	add.s32 	%r57, %r57, 1;
	add.u64 	%rd11, %rd11, 4;
	setp.ne.s32 	%p18, %r57, %r58;
	@%p18 bra 	$Lt_22_45570;
$Lt_22_44546:
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	383	0
	bar.sync 	0;
	.loc	19	388	0
	@!%p3 bra 	$Lt_22_47106;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	395	0
	mul.lo.s32 	%r64, %r3, 33;
	add.u32 	%r65, %r5, %r64;
	cvt.s64.s32 	%rd12, %r65;
	mul.wide.s32 	%rd13, %r65, 4;
	add.u64 	%rd14, %rd1, %rd13;
	ld.shared.f32 	%f39, [%rd14+0];
	.loc	19	396	0
	ld.shared.f32 	%f40, [%rd14+1056];
	.loc	19	397	0
	ld.shared.f32 	%f41, [%rd14+2112];
	.loc	19	398	0
	ld.shared.f32 	%f42, [%rd14+3168];
	setp.le.s32 	%p19, %r1, %r21;
	@%p19 bra 	$Lt_22_47106;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	48	0
	ld.param.u64 	%rd15, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pOut];
	ld.param.s32 	%r66, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_pitch];
	mul.lo.s32 	%r67, %r66, %r12;
	add.s32 	%r68, %r21, %r67;
	cvt.s64.s32 	%rd16, %r68;
	mul.wide.s32 	%rd17, %r68, 8;
	add.u64 	%rd18, %rd15, %rd17;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f39;
	mov.b32		%r69, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f40;
	mov.b32		%r70, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f41;
	mov.b32		%r71, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f42;
	mov.b32		%r72, %b1; }
	st.global.v4.u16 	[%rd18+0], {%r69,%r70,%r71,%r72};
$Lt_22_47106:
$Lt_22_46594:
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	402	0
	@!%p4 bra 	$Lt_22_48130;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	395	0
	mul.lo.s32 	%r73, %r3, 33;
	add.u32 	%r74, %r5, %r73;
	cvt.s64.s32 	%rd19, %r74;
	mul.wide.s32 	%rd20, %r74, 4;
	add.u64 	%rd21, %rd1, %rd20;
	ld.shared.f32 	%f39, [%rd21+528];
	.loc	19	396	0
	ld.shared.f32 	%f40, [%rd21+1584];
	.loc	19	397	0
	ld.shared.f32 	%f41, [%rd21+2640];
	.loc	19	398	0
	ld.shared.f32 	%f42, [%rd21+3696];
	setp.le.s32 	%p20, %r1, %r21;
	@%p20 bra 	$Lt_22_48130;
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	48	0
	ld.param.s32 	%r75, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_pitch];
	mul.lo.s32 	%r76, %r75, %r12;
	ld.param.u64 	%rd22, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pOut];
	add.s32 	%r77, %r76, %r21;
	mul.lo.s32 	%r78, %r75, 4;
	add.s32 	%r79, %r77, %r78;
	cvt.s64.s32 	%rd23, %r79;
	mul.wide.s32 	%rd24, %r79, 8;
	add.u64 	%rd25, %rd22, %rd24;
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f39;
	mov.b32		%r80, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f40;
	mov.b32		%r81, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f41;
	mov.b32		%r82, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f42;
	mov.b32		%r83, %b1; }
	st.global.v4.u16 	[%rd25+0], {%r80,%r81,%r82,%r83};
$Lt_22_48130:
$Lt_22_47618:
 //<loop> Part of loop body line 318, head labeled $Lt_22_41218
	.loc	19	409	0
	bar.sync 	0;
	add.s32 	%r31, %r31, 32;
	add.u32 	%r21, %r21, 32;
	setp.lt.s32 	%p21, %r21, %r22;
	@%p21 bra 	$Lt_22_41218;
	bra.uni 	$Lt_22_40706;
$Lt_22_59138:
	mov.s32 	%r31, 0;
	mov.u64 	%rd1, __cuda_local_var_302956_34_non_const_smem__0;
$Lt_22_40706:
	mov.u32 	%r84, 0;
	setp.le.s32 	%p22, %r31, %r84;
	@%p22 bra 	$Lt_22_48898;
	cvt.s32.u32 	%r85, %ctaid.y;
	mul.lo.s32 	%r86, %r85, 8;
	mov.u32 	%r3, %tid.y;
	add.u32 	%r12, %r86, %r3;
	shl.b32 	%r87, %r3, 4;
	mov.u32 	%r5, %tid.x;
	add.u32 	%r6, %r87, %r5;
	ld.param.s32 	%r9, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_height];
	ld.param.s32 	%r88, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_height];
	sub.s32 	%r89, %r88, %r9;
	shr.s32 	%r90, %r89, 1;
	sub.s32 	%r14, %r12, %r90;
	add.s32 	%r91, %r31, 31;
	shr.s32 	%r92, %r91, 31;
	mov.s32 	%r93, 31;
	and.b32 	%r94, %r92, %r93;
	add.s32 	%r95, %r94, %r91;
	shr.s32 	%r96, %r95, 5;
	add.u32 	%r21, %r31, %r5;
	ld.param.s32 	%r23, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_width];
	sub.s32 	%r97, %r1, %r23;
	mul.lo.u32 	%r98, %r3, 32;
	shr.s32 	%r26, %r97, 1;
	add.u32 	%r99, %r98, %r5;
	add.s32 	%r100, %r12, 8;
	mov.s32 	%r101, 31;
	setp.le.s32 	%p2, %r99, %r101;
	add.s32 	%r29, %r14, 8;
	mov.f32 	%f54, 0f00000000;    	// 0
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.s32 	%r102, %r96;
$Lt_22_49410:
 //<loop> Loop body line 409, nesting depth: 1, estimated iterations: unknown
	.loc	19	424	0
	sub.s32 	%r31, %r31, 32;
	sub.u32 	%r21, %r21, 32;
	.loc	19	432	0
	sub.s32 	%r33, %r21, %r26;
	mov.s32 	%r103, %r14;
	mov.s32 	%r104, 0;
$Lt_22_50178:
 //<loop> Loop body line 432, nesting depth: 2, iterations: 2
	setp.ge.s32 	%p23, %r103, %r9;
	@%p23 bra 	$Lt_22_50434;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	19	439	0
	mov.u32 	%r105, 0;
	setp.lt.s32 	%p24, %r14, %r105;
	@%p24 bra 	$Lt_22_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	setp.ge.s32 	%p25, %r14, %r9;
	@%p25 bra 	$Lt_22_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	mov.u32 	%r106, 0;
	setp.lt.s32 	%p26, %r33, %r106;
	@%p26 bra 	$Lt_22_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	setp.ge.s32 	%p27, %r33, %r23;
	@%p27 bra 	$Lt_22_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	19	36	0
	ld.param.s32 	%r107, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_src_pitch];
	mul.lo.s32 	%r108, %r107, %r104;
	mul.lo.s32 	%r109, %r14, %r107;
	ld.param.u64 	%rd26, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pIn];
	add.s32 	%r110, %r33, %r109;
	mul.lo.s32 	%r111, %r108, 4;
	add.s32 	%r112, %r110, %r111;
	cvt.s64.s32 	%rd27, %r112;
	mul.wide.s32 	%rd28, %r112, 8;
	add.u64 	%rd29, %rd26, %rd28;
	ld.global.v4.u16 	{%r44,%r45,%r46,%r47}, [%rd29+0];
	.loc	3	255	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r44;
	cvt.ftz.f32.f16	%f4, %b1; }
	mov.f32 	%f55, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p28, %f4, %f55;
	@!%p28 bra 	$Lt_22_50946;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	234	0
	neg.ftz.f32 	%f56, %f4;
	lg2.approx.ftz.f32 	%f57, %f56;
	mov.f32 	%f58, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f59, %f57, %f58;
	ex2.approx.ftz.f32 	%f60, %f59;
	neg.ftz.f32 	%f11, %f60;
	bra.uni 	$LDWendi___log2f_199_15;
$Lt_22_50946:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f61, %f4;
	mov.f32 	%f62, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f63, %f61, %f62;
	ex2.approx.ftz.f32 	%f11, %f63;
$LDWendi___log2f_199_15:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	256	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f15, %b1; }
	mov.f32 	%f64, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p29, %f15, %f64;
	@!%p29 bra 	$Lt_22_51458;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	234	0
	neg.ftz.f32 	%f65, %f15;
	lg2.approx.ftz.f32 	%f66, %f65;
	mov.f32 	%f67, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f68, %f66, %f67;
	ex2.approx.ftz.f32 	%f69, %f68;
	neg.ftz.f32 	%f22, %f69;
	bra.uni 	$LDWendi___log2f_199_13;
$Lt_22_51458:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f70, %f15;
	mov.f32 	%f71, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f72, %f70, %f71;
	ex2.approx.ftz.f32 	%f22, %f72;
$LDWendi___log2f_199_13:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	257	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f26, %b1; }
	mov.f32 	%f73, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p30, %f26, %f73;
	@!%p30 bra 	$Lt_22_51970;
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	234	0
	neg.ftz.f32 	%f74, %f26;
	lg2.approx.ftz.f32 	%f75, %f74;
	mov.f32 	%f76, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f77, %f75, %f76;
	ex2.approx.ftz.f32 	%f78, %f77;
	neg.ftz.f32 	%f33, %f78;
	bra.uni 	$LDWendi___log2f_199_11;
$Lt_22_51970:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f79, %f26;
	mov.f32 	%f80, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f81, %f79, %f80;
	ex2.approx.ftz.f32 	%f33, %f81;
$LDWendi___log2f_199_11:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	19	440	0
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f37, %b1; }
	cvt.ftz.sat.f32.f32 	%f82, %f37;
	mul.ftz.f32 	%f39, %f11, %f82;
	mul.ftz.f32 	%f40, %f22, %f82;
	mul.ftz.f32 	%f41, %f33, %f82;
	mov.f32 	%f42, %f82;
	bra.uni 	$L_22_39426;
$Lt_22_61186:
$L_22_39682:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	19	442	0
	mov.f32 	%f42, 0f00000000;    	// 0
	mov.f32 	%f41, 0f00000000;    	// 0
	mov.f32 	%f40, 0f00000000;    	// 0
	mov.f32 	%f39, 0f00000000;    	// 0
$L_22_39426:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	.loc	19	445	0
	mul.lo.s32 	%r48, %r3, 33;
	mul.lo.s32 	%r113, %r104, 132;
	add.u32 	%r114, %r48, %r5;
	add.s32 	%r115, %r113, %r114;
	cvt.s64.s32 	%rd30, %r115;
	mul.wide.s32 	%rd31, %r115, 4;
	add.u64 	%rd32, %rd1, %rd31;
	st.shared.f32 	[%rd32+0], %f39;
	.loc	19	446	0
	st.shared.f32 	[%rd32+1056], %f40;
	.loc	19	447	0
	st.shared.f32 	[%rd32+2112], %f41;
	.loc	19	448	0
	st.shared.f32 	[%rd32+3168], %f42;
$Lt_22_50434:
 //<loop> Part of loop body line 432, head labeled $Lt_22_50178
	add.s32 	%r104, %r104, 1;
	add.s32 	%r103, %r103, 4;
	setp.ne.s32 	%p31, %r103, %r29;
	@%p31 bra 	$Lt_22_50178;
 //<loop> Part of loop body line 409, head labeled $Lt_22_49410
	.loc	19	455	0
	bar.sync 	0;
	@!%p2 bra 	$Lt_22_52738;
 //<loop> Part of loop body line 409, head labeled $Lt_22_49410
	.loc	19	461	0
	mul.lo.s32 	%r116, %r6, 33;
	add.s32 	%r117, %r116, 31;
	sub.s32 	%r118, %r1, %r31;
	ld.param.s8 	%r119, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_repeat_edge_pixels];
	mov.s32 	%r120, 0;
	setp.ne.s32 	%p32, %r119, %r120;
	mov.pred 	%p33, %p32;
	mov.pred 	%p34, %p16;
	sub.s32 	%r121, %r118, 1;
	selp.s32 	%r55, 1, 0, %p33;
	cvt.s64.s32 	%rd33, %r117;
	mul.wide.s32 	%rd34, %r117, 4;
	add.u64 	%rd35, %rd1, %rd34;
	ld.param.f32 	%f83, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_minus2];
	ld.param.f32 	%f84, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_neg_d_minus1];
	ld.param.f32 	%f85, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_minus1];
	ld.param.f32 	%f86, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_n_minus2];
	mov.s32 	%r122, 31;
$Lt_22_53762:
 //<loop> Loop body line 461, nesting depth: 2, iterations: 32
	.loc	19	464	0
	ld.shared.f32 	%f47, [%rd35+0];
	set.eq.u32.s32 	%r123, %r121, %r122;
	neg.s32 	%r124, %r123;
	and.b32 	%r125, %r124, %r55;
	mov.u32 	%r126, 0;
	setp.eq.s32 	%p35, %r125, %r126;
	@%p35 bra 	$Lt_22_54018;
 //<loop> Part of loop body line 461, head labeled $Lt_22_53762
	.loc	19	467	0
	mov.f32 	%f87, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f88, %f47, %f87;
	mov.f32 	%f1, %f88;
	mov.f32 	%f2, %f88;
$Lt_22_54018:
 //<loop> Part of loop body line 461, head labeled $Lt_22_53762
	.loc	19	469	0
	mul.ftz.f32 	%f89, %f86, %f54;
	fma.rn.ftz.f32 	%f90, %f3, %f85, %f89;
	fma.rn.ftz.f32 	%f91, %f2, %f84, %f90;
	fma.rn.ftz.f32 	%f92, %f1, %f83, %f91;
	.loc	19	471	0
	mov.f32 	%f54, %f3;
	.loc	19	472	0
	mov.f32 	%f3, %f47;
	.loc	19	473	0
	mov.f32 	%f1, %f2;
	.loc	19	474	0
	mov.f32 	%f2, %f92;
	.loc	19	476	0
	st.shared.f32 	[%rd35+0], %f92;
	.loc	19	477	0
	sub.u64 	%rd35, %rd35, 4;
	sub.s32 	%r122, %r122, 1;
	mov.u32 	%r127, -1;
	setp.ne.s32 	%p36, %r122, %r127;
	@%p36 bra 	$Lt_22_53762;
$Lt_22_52738:
 //<loop> Part of loop body line 409, head labeled $Lt_22_49410
	.loc	19	481	0
	bar.sync 	0;
	.loc	19	486	0
	mov.s32 	%r128, %r12;
	mov.s32 	%r129, 0;
$Lt_22_55298:
 //<loop> Loop body line 486, nesting depth: 2, iterations: 2
	setp.ge.s32 	%p37, %r128, %r88;
	@%p37 bra 	$Lt_22_56066;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	setp.le.s32 	%p38, %r1, %r21;
	@%p38 bra 	$Lt_22_56066;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	19	36	0
	ld.param.s32 	%r130, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_dest_pitch];
	mul.lo.s32 	%r131, %r130, %r129;
	mul.lo.s32 	%r132, %r130, %r12;
	ld.param.u64 	%rd36, [__cudaparm_HorizontalRecursiveGaussianRGBAF16_kernel_pOut];
	add.s32 	%r133, %r132, %r21;
	mul.lo.s32 	%r134, %r131, 4;
	add.s32 	%r135, %r133, %r134;
	cvt.s64.s32 	%rd37, %r135;
	mul.wide.s32 	%rd38, %r135, 8;
	add.u64 	%rd39, %rd36, %rd38;
	ld.global.v4.u16 	{%r44,%r45,%r46,%r47}, [%rd39+0];
	.loc	3	208	0
	mul.lo.s32 	%r48, %r3, 33;
	{ .reg .b32 %b1;
	mov.b32		%b1, %r47;
	cvt.ftz.f32.f16	%f37, %b1; }
	mul.lo.s32 	%r136, %r129, 132;
	add.u32 	%r137, %r48, %r5;
	add.s32 	%r138, %r136, %r137;
	cvt.s64.s32 	%rd40, %r138;
	mul.wide.s32 	%rd41, %r138, 4;
	add.u64 	%rd42, %rd1, %rd41;
	ld.shared.f32 	%f93, [%rd42+3168];
	add.ftz.f32 	%f94, %f37, %f93;
	cvt.ftz.sat.f32.f32 	%f95, %f94;
	mov.f32 	%f96, %f95;
	mov.f32 	%f97, 0fb70637bd;    	// -8e-006
	add.ftz.f32 	%f98, %f95, %f97;
	mov.f32 	%f99, 0f00000000;    	// 0
	setp.le.ftz.f32 	%p39, %f98, %f99;
	@%p39 bra 	$Lt_22_56834;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	213	0
	rcp.approx.ftz.f32 	%f100, %f95;
	ld.shared.f32 	%f101, [%rd42+2112];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r46;
	cvt.ftz.f32.f16	%f102, %b1; }
	add.ftz.f32 	%f103, %f101, %f102;
	mul.ftz.f32 	%f104, %f100, %f103;
	.loc	3	214	0
	ld.shared.f32 	%f105, [%rd42+1056];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r45;
	cvt.ftz.f32.f16	%f106, %b1; }
	add.ftz.f32 	%f107, %f105, %f106;
	mul.ftz.f32 	%f108, %f100, %f107;
	.loc	3	215	0
	ld.shared.f32 	%f109, [%rd42+0];
	{ .reg .b32 %b1;
	mov.b32		%b1, %r44;
	cvt.ftz.f32.f16	%f110, %b1; }
	add.ftz.f32 	%f111, %f109, %f110;
	mul.ftz.f32 	%f112, %f100, %f111;
	bra.uni 	$Lt_22_56578;
$Lt_22_56834:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	219	0
	mov.f32 	%f104, 0f00000000;   	// 0
	mov.f32 	%f108, 0f00000000;   	// 0
	mov.f32 	%f112, 0f00000000;   	// 0
	mov.f32 	%f96, 0f00000000;    	// 0
$Lt_22_56578:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	266	0
	mov.f32 	%f113, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p40, %f112, %f113;
	@!%p40 bra 	$Lt_22_57090;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	242	0
	neg.ftz.f32 	%f114, %f112;
	lg2.approx.ftz.f32 	%f115, %f114;
	mov.f32 	%f116, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f117, %f115, %f116;
	ex2.approx.ftz.f32 	%f118, %f117;
	neg.ftz.f32 	%f119, %f118;
	bra.uni 	$LDWendi___log2f_199_5;
$Lt_22_57090:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f120, %f112;
	mov.f32 	%f121, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f122, %f120, %f121;
	ex2.approx.ftz.f32 	%f119, %f122;
$LDWendi___log2f_199_5:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	267	0
	mov.f32 	%f123, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p41, %f108, %f123;
	@!%p41 bra 	$Lt_22_57602;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	242	0
	neg.ftz.f32 	%f124, %f108;
	lg2.approx.ftz.f32 	%f125, %f124;
	mov.f32 	%f126, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f127, %f125, %f126;
	ex2.approx.ftz.f32 	%f128, %f127;
	neg.ftz.f32 	%f129, %f128;
	bra.uni 	$LDWendi___log2f_199_3;
$Lt_22_57602:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f130, %f108;
	mov.f32 	%f131, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f132, %f130, %f131;
	ex2.approx.ftz.f32 	%f129, %f132;
$LDWendi___log2f_199_3:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	268	0
	mov.f32 	%f133, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p42, %f104, %f133;
	@!%p42 bra 	$Lt_22_58114;
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	242	0
	neg.ftz.f32 	%f134, %f104;
	lg2.approx.ftz.f32 	%f135, %f134;
	mov.f32 	%f136, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f137, %f135, %f136;
	ex2.approx.ftz.f32 	%f138, %f137;
	neg.ftz.f32 	%f139, %f138;
	bra.uni 	$LDWendi___log2f_199_1;
$Lt_22_58114:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f140, %f104;
	mov.f32 	%f141, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f142, %f140, %f141;
	ex2.approx.ftz.f32 	%f139, %f142;
$LDWendi___log2f_199_1:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	19	48	0
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f119;
	mov.b32		%r139, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f129;
	mov.b32		%r140, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f139;
	mov.b32		%r141, %b1; }
	{ .reg .b32 %b1;
	cvt.rn.ftz.f16.f32	%b1, %f96;
	mov.b32		%r142, %b1; }
	st.global.v4.u16 	[%rd39+0], {%r139,%r140,%r141,%r142};
$Lt_22_56066:
$Lt_22_55554:
 //<loop> Part of loop body line 486, head labeled $Lt_22_55298
	.loc	19	504	0
	add.s32 	%r129, %r129, 1;
	add.s32 	%r128, %r128, 4;
	setp.ne.s32 	%p43, %r128, %r100;
	@%p43 bra 	$Lt_22_55298;
 //<loop> Part of loop body line 409, head labeled $Lt_22_49410
	.loc	19	512	0
	bar.sync 	0;
	setp.gt.s32 	%p44, %r21, %r5;
	@%p44 bra 	$Lt_22_49410;
$Lt_22_48898:
	.loc	19	633	0
	exit;
$LDWend_HorizontalRecursiveGaussianRGBAF16_kernel:
	} // HorizontalRecursiveGaussianRGBAF16_kernel

	.entry HorizontalRecursiveGaussianRGBAF32_kernel (
		.param .u64 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pIn,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_pitch,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_width,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_height,
		.param .u64 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pOut,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_pitch,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_width,
		.param .s32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_height,
		.param .s8 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_plus0,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_plus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_plus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_minus2,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_minus1,
		.param .f32 __cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_minus2)
	{
	.reg .u32 %r<126>;
	.reg .u64 %rd<37>;
	.reg .f32 %f<153>;
	.reg .pred %p<46>;
	.loc	19	651	0
$LDWbegin_HorizontalRecursiveGaussianRGBAF32_kernel:
	.loc	19	318	0
	ld.param.s32 	%r1, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_width];
	mov.u32 	%r2, 0;
	setp.le.s32 	%p1, %r1, %r2;
	@%p1 bra 	$Lt_23_59138;
	cvt.s32.u32 	%r3, %ctaid.y;
	mul.lo.s32 	%r4, %r3, 8;
	ld.param.s32 	%r5, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_height];
	ld.param.s32 	%r6, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_height];
	sub.s32 	%r7, %r6, %r5;
	mov.u32 	%r8, %tid.y;
	add.u32 	%r9, %r4, %r8;
	shr.s32 	%r10, %r7, 1;
	sub.s32 	%r11, %r9, %r10;
	shl.b32 	%r12, %r8, 4;
	mov.u32 	%r13, %tid.x;
	add.u32 	%r14, %r12, %r13;
	add.s32 	%r15, %r1, 31;
	shr.s32 	%r16, %r15, 31;
	mov.s32 	%r17, 31;
	and.b32 	%r18, %r16, %r17;
	add.s32 	%r19, %r18, %r15;
	shr.s32 	%r20, %r19, 5;
	mov.s32 	%r21, %r13;
	add.u32 	%r22, %r1, %r13;
	ld.param.s32 	%r23, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_width];
	sub.s32 	%r24, %r1, %r23;
	mul.lo.u32 	%r25, %r8, 32;
	shr.s32 	%r26, %r24, 1;
	add.u32 	%r27, %r25, %r13;
	mov.s32 	%r28, 31;
	setp.le.s32 	%p2, %r27, %r28;
	add.s32 	%r29, %r11, 8;
	mov.s32 	%r30, 0;
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	mov.u64 	%rd1, __cuda_local_var_302956_34_non_const_smem__0;
	mov.s32 	%r31, %r20;
$Lt_23_41218:
 //<loop> Loop body line 318, nesting depth: 1, estimated iterations: unknown
	.loc	19	335	0
	sub.s32 	%r32, %r21, %r26;
	mov.s32 	%r33, %r11;
	mov.s32 	%r34, 0;
$Lt_23_41986:
 //<loop> Loop body line 335, nesting depth: 2, iterations: 2
	setp.ge.s32 	%p3, %r33, %r5;
	@%p3 bra 	$Lt_23_42242;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	19	342	0
	mov.u32 	%r35, 0;
	setp.lt.s32 	%p4, %r11, %r35;
	@%p4 bra 	$Lt_23_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	setp.ge.s32 	%p5, %r11, %r5;
	@%p5 bra 	$Lt_23_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	mov.u32 	%r36, 0;
	setp.lt.s32 	%p6, %r32, %r36;
	@%p6 bra 	$Lt_23_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	setp.ge.s32 	%p7, %r32, %r23;
	@%p7 bra 	$Lt_23_59906;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	19	38	0
	ld.param.s32 	%r37, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_pitch];
	mul.lo.s32 	%r38, %r34, %r37;
	mul.lo.s32 	%r39, %r11, %r37;
	ld.param.u64 	%rd2, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pIn];
	add.s32 	%r40, %r32, %r39;
	mul.lo.s32 	%r41, %r38, 4;
	add.s32 	%r42, %r40, %r41;
	cvt.s64.s32 	%rd3, %r42;
	mul.wide.s32 	%rd4, %r42, 16;
	add.u64 	%rd5, %rd2, %rd4;
	ld.global.v4.f32 	{%f4,%f5,%f6,%f7}, [%rd5+0];
	.loc	3	255	0
	mov.f32 	%f8, 0f00000000;     	// 0
	setp.lt.ftz.f32 	%p8, %f4, %f8;
	@!%p8 bra 	$Lt_23_42754;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	234	0
	neg.ftz.f32 	%f9, %f4;
	lg2.approx.ftz.f32 	%f10, %f9;
	mov.f32 	%f11, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f12, %f10, %f11;
	ex2.approx.ftz.f32 	%f13, %f12;
	neg.ftz.f32 	%f14, %f13;
	bra.uni 	$LDWendi___log2f_200_23;
$Lt_23_42754:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f15, %f4;
	mov.f32 	%f16, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f17, %f15, %f16;
	ex2.approx.ftz.f32 	%f14, %f17;
$LDWendi___log2f_200_23:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	256	0
	mov.f32 	%f18, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p9, %f5, %f18;
	@!%p9 bra 	$Lt_23_43266;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	234	0
	neg.ftz.f32 	%f19, %f5;
	lg2.approx.ftz.f32 	%f20, %f19;
	mov.f32 	%f21, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f22, %f20, %f21;
	ex2.approx.ftz.f32 	%f23, %f22;
	neg.ftz.f32 	%f24, %f23;
	bra.uni 	$LDWendi___log2f_200_21;
$Lt_23_43266:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f25, %f5;
	mov.f32 	%f26, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f27, %f25, %f26;
	ex2.approx.ftz.f32 	%f24, %f27;
$LDWendi___log2f_200_21:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	257	0
	mov.f32 	%f28, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p10, %f6, %f28;
	@!%p10 bra 	$Lt_23_43778;
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	234	0
	neg.ftz.f32 	%f29, %f6;
	lg2.approx.ftz.f32 	%f30, %f29;
	mov.f32 	%f31, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f32, %f30, %f31;
	ex2.approx.ftz.f32 	%f33, %f32;
	neg.ftz.f32 	%f34, %f33;
	bra.uni 	$LDWendi___log2f_200_19;
$Lt_23_43778:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f35, %f6;
	mov.f32 	%f36, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f37, %f35, %f36;
	ex2.approx.ftz.f32 	%f34, %f37;
$LDWendi___log2f_200_19:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	19	343	0
	cvt.ftz.sat.f32.f32 	%f38, %f7;
	mul.ftz.f32 	%f39, %f14, %f38;
	mul.ftz.f32 	%f40, %f24, %f38;
	mul.ftz.f32 	%f41, %f34, %f38;
	mov.f32 	%f42, %f38;
	bra.uni 	$L_23_38146;
$Lt_23_59906:
$L_23_38402:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	19	345	0
	mov.f32 	%f42, 0f00000000;    	// 0
	mov.f32 	%f41, 0f00000000;    	// 0
	mov.f32 	%f40, 0f00000000;    	// 0
	mov.f32 	%f39, 0f00000000;    	// 0
$L_23_38146:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	.loc	19	348	0
	mul.lo.s32 	%r43, %r8, 33;
	mul.lo.s32 	%r44, %r34, 132;
	add.u32 	%r45, %r43, %r13;
	add.s32 	%r46, %r44, %r45;
	cvt.s64.s32 	%rd6, %r46;
	mul.wide.s32 	%rd7, %r46, 4;
	add.u64 	%rd8, %rd1, %rd7;
	st.shared.f32 	[%rd8+0], %f39;
	.loc	19	349	0
	st.shared.f32 	[%rd8+1056], %f40;
	.loc	19	350	0
	st.shared.f32 	[%rd8+2112], %f41;
	.loc	19	351	0
	st.shared.f32 	[%rd8+3168], %f42;
$Lt_23_42242:
 //<loop> Part of loop body line 335, head labeled $Lt_23_41986
	add.s32 	%r34, %r34, 1;
	add.s32 	%r33, %r33, 4;
	setp.ne.s32 	%p11, %r33, %r29;
	@%p11 bra 	$Lt_23_41986;
 //<loop> Part of loop body line 318, head labeled $Lt_23_41218
	.loc	19	358	0
	bar.sync 	0;
	// Forward recursive-filter step: a thread with %p2 set filters one row of
	// 32 floats in shared memory, walking it in ascending address order.
	// %p2 is computed above this part of the listing; threads without it skip
	// straight to the barrier at $Lt_23_44546.
	@!%p2 bra 	$Lt_23_44546;
 //<loop> Part of loop body line 318, head labeled $Lt_23_41218
	.loc	19	364	0
	// %r50 = 1 when the repeat_edge_pixels parameter is non-zero, otherwise 0.
	ld.param.s8 	%r47, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels];
	mov.s32 	%r48, 0;
	setp.ne.s32 	%p12, %r47, %r48;
	mov.pred 	%p13, %p12;
	// NOTE(review): %p15 is not written and %p14 is not read anywhere in the
	// visible part of the kernel -- looks like a compiler leftover; confirm.
	mov.pred 	%p14, %p15;
	neg.s32 	%r49, %r30;
	selp.s32 	%r50, 1, 0, %p13;
	// %r51 = index of the first float of this thread's row (rows are 33 floats
	// apart), %r52 = running index, %r53 = %r51 + 32 = end index (exclusive).
	mul.lo.s32 	%r51, %r14, 33;
	mov.s32 	%r52, %r51;
	add.s32 	%r53, %r51, 32;
	// %r54 = %r51 - %r30: the running index at which the edge seeding below
	// fires, i.e. the iteration k (0..31) for which k + %r30 == 0.
	add.s32 	%r54, %r49, %r51;
	// %rd9 is not used in the visible code.
	cvt.s64.s32 	%rd9, %r51;
	// %rd11 = %rd1 + 4 * %r51: byte address of the row start. %rd1 is set
	// above this part of the listing -- presumably the shared buffer base.
	mul.wide.s32 	%rd10, %r51, 4;
	add.u64 	%rd11, %rd1, %rd10;
	// Filter coefficients, loaded once before the loop.
	ld.param.f32 	%f43, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_plus2];
	ld.param.f32 	%f44, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_plus1];
	ld.param.f32 	%f45, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_plus0];
	ld.param.f32 	%f46, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_plus1];
$Lt_23_45570:
 //<loop> Loop body line 364, nesting depth: 2, iterations: 32
	.loc	19	367	0
	// %f47 = current input sample.
	ld.shared.f32 	%f47, [%rd11+0];
	// set.eq yields all-ones on equality, neg turns that into 1, and the AND
	// with %r50 leaves 1 only when the edge index is hit AND repeat_edge_pixels
	// is enabled. A zero result skips the seeding.
	set.eq.u32.s32 	%r55, %r52, %r54;
	neg.s32 	%r56, %r55;
	and.b32 	%r57, %r56, %r50;
	mov.u32 	%r58, 0;
	setp.eq.s32 	%p16, %r57, %r58;
	@%p16 bra 	$Lt_23_45826;
 //<loop> Part of loop body line 364, head labeled $Lt_23_45570
	.loc	19	370	0
	// Edge seeding: both output-history registers become 0.5 * current sample.
	mov.f32 	%f48, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f49, %f47, %f48;
	mov.f32 	%f1, %f49;
	mov.f32 	%f2, %f49;
$Lt_23_45826:
 //<loop> Part of loop body line 364, head labeled $Lt_23_45570
	.loc	19	372	0
	// %f53 = n_plus1 * %f3 + n_plus0 * %f47 + neg_d_plus1 * %f2 + neg_d_plus2 * %f1
	// where %f3 = previous input, %f2 = previous output, %f1 = output before that.
	mul.ftz.f32 	%f50, %f46, %f3;
	fma.rn.ftz.f32 	%f51, %f47, %f45, %f50;
	fma.rn.ftz.f32 	%f52, %f2, %f44, %f51;
	fma.rn.ftz.f32 	%f53, %f1, %f43, %f52;
	.loc	19	374	0
	// Shift the history: the current input becomes the previous input ...
	mov.f32 	%f3, %f47;
	.loc	19	375	0
	// ... the previous output becomes the older output ...
	mov.f32 	%f1, %f2;
	.loc	19	376	0
	// ... and the new result becomes the previous output.
	mov.f32 	%f2, %f53;
	.loc	19	378	0
	// The result overwrites the input sample in place.
	st.shared.f32 	[%rd11+0], %f53;
	// Advance one float; the loop ends once the running index reaches %r53.
	add.s32 	%r52, %r52, 1;
	add.u64 	%rd11, %rd11, 4;
	setp.ne.s32 	%p17, %r52, %r53;
	@%p17 bra 	$Lt_23_45570;
$Lt_23_44546:
 //<loop> Part of loop body line 318, head labeled $Lt_23_41218
	.loc	19	383	0
	bar.sync 	0;
	.loc	19	388	0
	// Copy-out loop: each iteration gathers four floats from shared memory and
	// writes them as one 16-byte vector to pOut. %r59 is a row counter that
	// starts at %r11 and advances by 4; %r60 counts iterations from 0.
	// %r1, %r5, %r8, %r9, %r11, %r13, %r21 and %r29 are set above this part of
	// the listing for this pass.
	mov.s32 	%r59, %r11;
	mov.s32 	%r60, 0;
$Lt_23_47106:
 //<loop> Loop body line 388, nesting depth: 2, iterations: 2
	// Skip the store when the row counter has reached %r5 ...
	setp.ge.s32 	%p18, %r59, %r5;
	@%p18 bra 	$Lt_23_47874;
 //<loop> Part of loop body line 388, head labeled $Lt_23_47106
	// ... or when the column %r21 is not below %r1.
	setp.le.s32 	%p19, %r1, %r21;
	@%p19 bra 	$Lt_23_47874;
 //<loop> Part of loop body line 388, head labeled $Lt_23_47106
	.loc	19	52	0
	// Shared index  %r66 = %r8 * 33 + %r13 + %r60 * 132   (in floats).
	// Global index  %r69 = dest_pitch * %r9 + %r21 + 4 * %r60 * dest_pitch
	//               (in 16-byte elements, see the mul.wide by 16 below).
	mul.lo.s32 	%r43, %r8, 33;
	ld.param.s32 	%r61, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_pitch];
	mul.lo.s32 	%r62, %r60, %r61;
	add.u32 	%r63, %r43, %r13;
	mul.lo.s32 	%r64, %r61, %r9;
	mul.lo.s32 	%r65, %r60, 132;
	add.s32 	%r66, %r63, %r65;
	// %rd12 is not used in the visible code.
	cvt.s64.s32 	%rd12, %r66;
	mul.wide.s32 	%rd13, %r66, 4;
	add.u64 	%rd14, %rd1, %rd13;
	ld.param.u64 	%rd15, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pOut];
	add.s32 	%r67, %r64, %r21;
	mul.lo.s32 	%r68, %r62, 4;
	add.s32 	%r69, %r67, %r68;
	// %rd16 is not used in the visible code.
	cvt.s64.s32 	%rd16, %r69;
	mul.wide.s32 	%rd17, %r69, 16;
	add.u64 	%rd18, %rd15, %rd17;
	// The four components live in separate shared-memory planes that are
	// 1056 bytes (264 floats) apart; they are re-interleaved by the vector store.
	ld.shared.f32 	%f54, [%rd14+0];
	ld.shared.f32 	%f55, [%rd14+1056];
	ld.shared.f32 	%f56, [%rd14+2112];
	ld.shared.f32 	%f57, [%rd14+3168];
	st.global.v4.f32 	[%rd18+0], {%f54,%f55,%f56,%f57};
$Lt_23_47874:
$Lt_23_47362:
 //<loop> Part of loop body line 388, head labeled $Lt_23_47106
	.loc	19	402	0
	// Next iteration: counter + 1, row + 4; stop when the row counter equals %r29.
	add.s32 	%r60, %r60, 1;
	add.s32 	%r59, %r59, 4;
	setp.ne.s32 	%p20, %r59, %r29;
	@%p20 bra 	$Lt_23_47106;
 //<loop> Part of loop body line 318, head labeled $Lt_23_41218
	.loc	19	409	0
	bar.sync 	0;
	add.s32 	%r30, %r30, 32;
	add.u32 	%r21, %r21, 32;
	setp.lt.s32 	%p21, %r21, %r22;
	@%p21 bra 	$Lt_23_41218;
	bra.uni 	$Lt_23_40706;
$Lt_23_59138:
	mov.s32 	%r30, 0;
	mov.u64 	%rd1, __cuda_local_var_302956_34_non_const_smem__0;
// Setup for the second outer loop ($Lt_23_49410), which steps %r30 and %r21
// downwards in units of 32. Everything computed here is loop-invariant.
$Lt_23_40706:
	// Nothing to do when %r30 <= 0: jump to the exit label.
	mov.u32 	%r70, 0;
	setp.le.s32 	%p22, %r30, %r70;
	@%p22 bra 	$Lt_23_48898;
	// %r8 = tid.y, %r13 = tid.x, %r9 = ctaid.y * 8 + tid.y,
	// %r14 = tid.y * 16 + tid.x.
	cvt.s32.u32 	%r71, %ctaid.y;
	mul.lo.s32 	%r72, %r71, 8;
	mov.u32 	%r8, %tid.y;
	add.u32 	%r9, %r72, %r8;
	shl.b32 	%r73, %r8, 4;
	mov.u32 	%r13, %tid.x;
	add.u32 	%r14, %r73, %r13;
	// %r5 = src_height, %r74 = dest_height,
	// %r11 = %r9 - ((dest_height - src_height) >> 1)   (arithmetic shift).
	ld.param.s32 	%r5, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_height];
	ld.param.s32 	%r74, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_height];
	sub.s32 	%r75, %r74, %r5;
	shr.s32 	%r76, %r75, 1;
	sub.s32 	%r11, %r9, %r76;
	// %r82 = (%r30 + 31) / 32 as a signed division truncating toward zero:
	// the sign mask adds 31 to negative values before the shift by 5.
	add.s32 	%r77, %r30, 31;
	shr.s32 	%r78, %r77, 31;
	mov.s32 	%r79, 31;
	and.b32 	%r80, %r78, %r79;
	add.s32 	%r81, %r80, %r77;
	shr.s32 	%r82, %r81, 5;
	// %r21 = %r30 + tid.x.
	add.u32 	%r21, %r30, %r13;
	// %r23 = src_width, %r26 = (%r1 - src_width) >> 1. %r1 is set above this
	// part of the listing.
	ld.param.s32 	%r23, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_width];
	sub.s32 	%r83, %r1, %r23;
	mul.lo.u32 	%r84, %r8, 32;
	shr.s32 	%r26, %r83, 1;
	// %p2 = (tid.y * 32 + tid.x) <= 31; it gates the per-row filter loop.
	add.u32 	%r85, %r84, %r13;
	// %r86 = %r9 + 8 and %r29 = %r11 + 8 are the end values of the two
	// step-by-4 row loops inside the outer loop.
	add.s32 	%r86, %r9, 8;
	mov.s32 	%r87, 31;
	setp.le.s32 	%p2, %r85, %r87;
	add.s32 	%r29, %r11, 8;
	// Filter history registers start at zero.
	mov.f32 	%f58, 0f00000000;    	// 0
	mov.f32 	%f1, 0f00000000;     	// 0
	mov.f32 	%f2, 0f00000000;     	// 0
	mov.f32 	%f3, 0f00000000;     	// 0
	// %r88 (copy of %r82) is not read in the visible code.
	mov.s32 	%r88, %r82;
// Head of the outer loop (its back-edge is the branch to $Lt_23_49410 further
// down). Each pass first moves 32 columns down, then the inner loop
// $Lt_23_50178 loads pixels from pIn, transforms them and stores the four
// components into separate shared-memory planes.
$Lt_23_49410:
 //<loop> Loop body line 409, nesting depth: 1, estimated iterations: unknown
	.loc	19	424	0
	sub.s32 	%r30, %r30, 32;
	sub.u32 	%r21, %r21, 32;
	.loc	19	432	0
	// %r32 = %r21 - %r26: column used for the source read.
	// %r89 = row counter starting at %r11 (step 4), %r90 = iteration index.
	sub.s32 	%r32, %r21, %r26;
	mov.s32 	%r89, %r11;
	mov.s32 	%r90, 0;
$Lt_23_50178:
 //<loop> Loop body line 432, nesting depth: 1, iterations: 2
	// Rows at or beyond %r5 skip the whole body, including the shared store.
	setp.ge.s32 	%p23, %r89, %r5;
	@%p23 bra 	$Lt_23_50434;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	19	439	0
	// Bounds test: 0 <= %r11 < %r5 and 0 <= %r32 < %r23, otherwise the pixel
	// is replaced by zeros at $Lt_23_61186.
	// NOTE(review): the row test uses the base row %r11, not the per-iteration
	// counter %r89 -- confirm against the CUDA source that this is intended.
	mov.u32 	%r91, 0;
	setp.lt.s32 	%p24, %r11, %r91;
	@%p24 bra 	$Lt_23_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	setp.ge.s32 	%p25, %r11, %r5;
	@%p25 bra 	$Lt_23_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	mov.u32 	%r92, 0;
	setp.lt.s32 	%p26, %r32, %r92;
	@%p26 bra 	$Lt_23_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	setp.ge.s32 	%p27, %r32, %r23;
	@%p27 bra 	$Lt_23_61186;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	19	38	0
	// Element index %r98 = %r32 + %r11 * src_pitch + 4 * %r90 * src_pitch,
	// scaled by 16 bytes and added to pIn; one 4-float vector is loaded.
	ld.param.s32 	%r93, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_src_pitch];
	mul.lo.s32 	%r94, %r93, %r90;
	mul.lo.s32 	%r95, %r11, %r93;
	ld.param.u64 	%rd19, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pIn];
	add.s32 	%r96, %r32, %r95;
	mul.lo.s32 	%r97, %r94, 4;
	add.s32 	%r98, %r96, %r97;
	// %rd20 is not used in the visible code.
	cvt.s64.s32 	%rd20, %r98;
	mul.wide.s32 	%rd21, %r98, 16;
	add.u64 	%rd22, %rd19, %rd21;
	ld.global.v4.f32 	{%f59,%f60,%f61,%f62}, [%rd22+0];
	.loc	3	255	0
	// The first three components each get a sign-preserving power of 2.2,
	// computed as ex2(2.2 * lg2(|x|)) with the approximate instructions and
	// negated again when x < 0. Results: %f14, %f24, %f34.
	// (Presumably the inlined linearize helper declared at the top of the
	// file -- confirm against source file 3.)
	mov.f32 	%f63, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p28, %f59, %f63;
	@!%p28 bra 	$Lt_23_50946;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	234	0
	neg.ftz.f32 	%f64, %f59;
	lg2.approx.ftz.f32 	%f65, %f64;
	mov.f32 	%f66, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f67, %f65, %f66;
	ex2.approx.ftz.f32 	%f68, %f67;
	neg.ftz.f32 	%f14, %f68;
	bra.uni 	$LDWendi___log2f_200_15;
$Lt_23_50946:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f69, %f59;
	mov.f32 	%f70, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f71, %f69, %f70;
	ex2.approx.ftz.f32 	%f14, %f71;
$LDWendi___log2f_200_15:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	256	0
	// Same transform for the second component (%f60 -> %f24).
	mov.f32 	%f72, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p29, %f60, %f72;
	@!%p29 bra 	$Lt_23_51458;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	234	0
	neg.ftz.f32 	%f73, %f60;
	lg2.approx.ftz.f32 	%f74, %f73;
	mov.f32 	%f75, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f76, %f74, %f75;
	ex2.approx.ftz.f32 	%f77, %f76;
	neg.ftz.f32 	%f24, %f77;
	bra.uni 	$LDWendi___log2f_200_13;
$Lt_23_51458:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f78, %f60;
	mov.f32 	%f79, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f80, %f78, %f79;
	ex2.approx.ftz.f32 	%f24, %f80;
$LDWendi___log2f_200_13:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	257	0
	// Same transform for the third component (%f61 -> %f34).
	mov.f32 	%f81, 0f00000000;    	// 0
	setp.lt.ftz.f32 	%p30, %f61, %f81;
	@!%p30 bra 	$Lt_23_51970;
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	234	0
	neg.ftz.f32 	%f82, %f61;
	lg2.approx.ftz.f32 	%f83, %f82;
	mov.f32 	%f84, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f85, %f83, %f84;
	ex2.approx.ftz.f32 	%f86, %f85;
	neg.ftz.f32 	%f34, %f86;
	bra.uni 	$LDWendi___log2f_200_11;
$Lt_23_51970:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	3	236	0
	lg2.approx.ftz.f32 	%f87, %f61;
	mov.f32 	%f88, 0f400ccccd;    	// 2.2
	mul.ftz.f32 	%f89, %f87, %f88;
	ex2.approx.ftz.f32 	%f34, %f89;
$LDWendi___log2f_200_11:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	19	440	0
	// %f90 = fourth component clamped to [0, 1] (presumably alpha); the three
	// transformed components are multiplied by it and it is kept as %f42.
	cvt.ftz.sat.f32.f32 	%f90, %f62;
	mul.ftz.f32 	%f39, %f14, %f90;
	mul.ftz.f32 	%f40, %f24, %f90;
	mul.ftz.f32 	%f41, %f34, %f90;
	mov.f32 	%f42, %f90;
	bra.uni 	$L_23_39426;
$Lt_23_61186:
$L_23_39682:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	19	442	0
	// Out-of-bounds source position: all four components are zero.
	mov.f32 	%f42, 0f00000000;    	// 0
	mov.f32 	%f41, 0f00000000;    	// 0
	mov.f32 	%f40, 0f00000000;    	// 0
	mov.f32 	%f39, 0f00000000;    	// 0
$L_23_39426:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	.loc	19	445	0
	// Shared index %r101 = %r8 * 33 + %r13 + %r90 * 132 (in floats); the four
	// components go to planes 1056 bytes (264 floats) apart.
	mul.lo.s32 	%r43, %r8, 33;
	mul.lo.s32 	%r99, %r90, 132;
	add.u32 	%r100, %r43, %r13;
	add.s32 	%r101, %r99, %r100;
	// %rd23 is not used in the visible code.
	cvt.s64.s32 	%rd23, %r101;
	mul.wide.s32 	%rd24, %r101, 4;
	add.u64 	%rd25, %rd1, %rd24;
	st.shared.f32 	[%rd25+0], %f39;
	.loc	19	446	0
	st.shared.f32 	[%rd25+1056], %f40;
	.loc	19	447	0
	st.shared.f32 	[%rd25+2112], %f41;
	.loc	19	448	0
	st.shared.f32 	[%rd25+3168], %f42;
$Lt_23_50434:
 //<loop> Part of loop body line 432, head labeled $Lt_23_50178
	// Next iteration: index + 1, row + 4; stop when the row counter equals %r29.
	add.s32 	%r90, %r90, 1;
	add.s32 	%r89, %r89, 4;
	setp.ne.s32 	%p31, %r89, %r29;
	@%p31 bra 	$Lt_23_50178;
 //<loop> Part of loop body line 409, head labeled $Lt_23_49410
	.loc	19	455	0
	bar.sync 	0;
	// Backward recursive-filter step: a thread with %p2 set filters one row of
	// 32 floats in shared memory, walking it from its last element down to its
	// first. Threads without %p2 skip to the barrier at $Lt_23_52738.
	@!%p2 bra 	$Lt_23_52738;
 //<loop> Part of loop body line 409, head labeled $Lt_23_49410
	.loc	19	461	0
	// %r103 = %r14 * 33 + 31: index of the last float of this thread's row.
	mul.lo.s32 	%r102, %r14, 33;
	add.s32 	%r103, %r102, 31;
	sub.s32 	%r104, %r1, %r30;
	// %r50 = 1 when the repeat_edge_pixels parameter is non-zero, otherwise 0.
	ld.param.s8 	%r105, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_repeat_edge_pixels];
	mov.s32 	%r106, 0;
	setp.ne.s32 	%p32, %r105, %r106;
	mov.pred 	%p33, %p32;
	// NOTE(review): %p15 is not written and %p34 is not read anywhere in the
	// visible part of the kernel -- looks like a compiler leftover; confirm.
	mov.pred 	%p34, %p15;
	// %r107 = %r1 - %r30 - 1: the loop counter value at which the edge seeding
	// below fires, i.e. the position k for which %r30 + k == %r1 - 1.
	sub.s32 	%r107, %r104, 1;
	selp.s32 	%r50, 1, 0, %p33;
	// %rd26 is not used in the visible code.
	cvt.s64.s32 	%rd26, %r103;
	// %rd28 = %rd1 + 4 * %r103: byte address of the row's last element.
	mul.wide.s32 	%rd27, %r103, 4;
	add.u64 	%rd28, %rd1, %rd27;
	// Filter coefficients, loaded once before the loop.
	ld.param.f32 	%f91, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_minus2];
	ld.param.f32 	%f92, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_neg_d_minus1];
	ld.param.f32 	%f93, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_minus1];
	ld.param.f32 	%f94, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_n_minus2];
	// %r108 counts 31, 30, ..., 0.
	mov.s32 	%r108, 31;
$Lt_23_53762:
 //<loop> Loop body line 461, nesting depth: 1, iterations: 32
	.loc	19	464	0
	// %f47 = current input sample.
	ld.shared.f32 	%f47, [%rd28+0];
	// set.eq yields all-ones on equality, neg turns that into 1, and the AND
	// with %r50 leaves 1 only when the edge position is hit AND
	// repeat_edge_pixels is enabled. A zero result skips the seeding.
	set.eq.u32.s32 	%r109, %r107, %r108;
	neg.s32 	%r110, %r109;
	and.b32 	%r111, %r110, %r50;
	mov.u32 	%r112, 0;
	setp.eq.s32 	%p35, %r111, %r112;
	@%p35 bra 	$Lt_23_54018;
 //<loop> Part of loop body line 461, head labeled $Lt_23_53762
	.loc	19	467	0
	// Edge seeding: both output-history registers become 0.5 * current sample.
	mov.f32 	%f95, 0f3f000000;    	// 0.5
	mul.ftz.f32 	%f96, %f47, %f95;
	mov.f32 	%f1, %f96;
	mov.f32 	%f2, %f96;
$Lt_23_54018:
 //<loop> Part of loop body line 461, head labeled $Lt_23_53762
	.loc	19	469	0
	// %f100 = n_minus2 * %f58 + n_minus1 * %f3 + neg_d_minus1 * %f2 + neg_d_minus2 * %f1
	// where %f3 / %f58 are the two previously visited inputs and %f2 / %f1 the
	// two previous outputs. The current sample %f47 does not enter the sum.
	mul.ftz.f32 	%f97, %f94, %f58;
	fma.rn.ftz.f32 	%f98, %f3, %f93, %f97;
	fma.rn.ftz.f32 	%f99, %f2, %f92, %f98;
	fma.rn.ftz.f32 	%f100, %f1, %f91, %f99;
	.loc	19	471	0
	// Shift the input history ...
	mov.f32 	%f58, %f3;
	.loc	19	472	0
	mov.f32 	%f3, %f47;
	.loc	19	473	0
	// ... and the output history.
	mov.f32 	%f1, %f2;
	.loc	19	474	0
	mov.f32 	%f2, %f100;
	.loc	19	476	0
	// The result overwrites the input sample in place.
	st.shared.f32 	[%rd28+0], %f100;
	.loc	19	477	0
	// Step one float back; the loop ends when the counter reaches -1.
	sub.u64 	%rd28, %rd28, 4;
	sub.s32 	%r108, %r108, 1;
	mov.u32 	%r113, -1;
	setp.ne.s32 	%p36, %r108, %r113;
	@%p36 bra 	$Lt_23_53762;
$Lt_23_52738:
 //<loop> Part of loop body line 409, head labeled $Lt_23_49410
	.loc	19	481	0
	bar.sync 	0;
	.loc	19	486	0
	// Accumulate-and-store loop: each iteration adds the four shared-memory
	// values of one pixel to the pixel already stored in pOut, divides the
	// first three components by the summed fourth one, applies a
	// sign-preserving power of 1/2.2 and writes the pixel back to pOut.
	// %r114 is a row counter that starts at %r9 and advances by 4; %r115
	// counts iterations from 0.
	mov.s32 	%r114, %r9;
	mov.s32 	%r115, 0;
$Lt_23_55298:
 //<loop> Loop body line 486, nesting depth: 1, iterations: 2
	// Skip when the row counter has reached %r74 (dest_height) ...
	setp.ge.s32 	%p37, %r114, %r74;
	@%p37 bra 	$Lt_23_56066;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	// ... or when the column %r21 is not below %r1.
	setp.le.s32 	%p38, %r1, %r21;
	@%p38 bra 	$Lt_23_56066;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	19	38	0
	// Element index %r121 = dest_pitch * %r9 + %r21 + 4 * %r115 * dest_pitch,
	// scaled by 16 bytes and added to pOut. %f101 = stored fourth component
	// (byte offset 12; presumably alpha).
	ld.param.s32 	%r116, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_dest_pitch];
	mul.lo.s32 	%r117, %r116, %r115;
	mul.lo.s32 	%r118, %r116, %r9;
	ld.param.u64 	%rd29, [__cudaparm_HorizontalRecursiveGaussianRGBAF32_kernel_pOut];
	add.s32 	%r119, %r118, %r21;
	mul.lo.s32 	%r120, %r117, 4;
	add.s32 	%r121, %r119, %r120;
	// %rd30 is not used in the visible code.
	cvt.s64.s32 	%rd30, %r121;
	mul.wide.s32 	%rd31, %r121, 16;
	add.u64 	%rd32, %rd29, %rd31;
	ld.global.f32 	%f101, [%rd32+12];
	.loc	3	208	0
	// Shared index %r124 = %r8 * 33 + %r13 + %r115 * 132 (in floats); the
	// fourth-component plane sits at byte offset 3168.
	mul.lo.s32 	%r43, %r8, 33;
	mul.lo.s32 	%r122, %r115, 132;
	add.u32 	%r123, %r43, %r13;
	add.s32 	%r124, %r122, %r123;
	// %rd33 is not used in the visible code.
	cvt.s64.s32 	%rd33, %r124;
	mul.wide.s32 	%rd34, %r124, 4;
	add.u64 	%rd35, %rd1, %rd34;
	ld.shared.f32 	%f102, [%rd35+3168];
	// %f104 = (shared + stored fourth component) clamped to [0, 1]; %f105 keeps
	// a copy for the final store.
	add.ftz.f32 	%f103, %f102, %f101;
	cvt.ftz.sat.f32.f32 	%f104, %f103;
	mov.f32 	%f105, %f104;
	// If %f104 - 8e-6 <= 0 the division is skipped and the pixel becomes zero.
	mov.f32 	%f106, 0fb70637bd;   	// -8e-006
	add.ftz.f32 	%f107, %f104, %f106;
	mov.f32 	%f108, 0f00000000;   	// 0
	setp.le.ftz.f32 	%p39, %f107, %f108;
	@%p39 bra 	$Lt_23_56834;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	213	0
	// %f109 = approximate 1 / %f104. The stored pixel's first three components
	// are reloaded (the fourth is discarded via `_`), added to the matching
	// shared-memory planes and scaled by %f109:
	//   %f115 <- third component, %f118 <- second, %f121 <- first.
	rcp.approx.ftz.f32 	%f109, %f104;
	ld.global.v4.f32 	{%f110,%f111,%f112,_}, [%rd32+0];
	ld.shared.f32 	%f113, [%rd35+2112];
	add.ftz.f32 	%f114, %f112, %f113;
	mul.ftz.f32 	%f115, %f109, %f114;
	.loc	3	214	0
	ld.shared.f32 	%f116, [%rd35+1056];
	add.ftz.f32 	%f117, %f111, %f116;
	mul.ftz.f32 	%f118, %f109, %f117;
	.loc	3	215	0
	ld.shared.f32 	%f119, [%rd35+0];
	add.ftz.f32 	%f120, %f119, %f110;
	mul.ftz.f32 	%f121, %f109, %f120;
	bra.uni 	$Lt_23_56578;
$Lt_23_56834:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	219	0
	// Fourth component too small: the whole pixel is zero.
	mov.f32 	%f115, 0f00000000;   	// 0
	mov.f32 	%f118, 0f00000000;   	// 0
	mov.f32 	%f121, 0f00000000;   	// 0
	mov.f32 	%f105, 0f00000000;   	// 0
$Lt_23_56578:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	266	0
	// Each of the three components gets a sign-preserving power of 0.454545
	// (1/2.2), computed as ex2(0.454545 * lg2(|x|)) with the approximate
	// instructions and negated again when x < 0.
	// First component: %f121 -> %f128.
	mov.f32 	%f122, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p40, %f121, %f122;
	@!%p40 bra 	$Lt_23_57090;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	242	0
	neg.ftz.f32 	%f123, %f121;
	lg2.approx.ftz.f32 	%f124, %f123;
	mov.f32 	%f125, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f126, %f124, %f125;
	ex2.approx.ftz.f32 	%f127, %f126;
	neg.ftz.f32 	%f128, %f127;
	bra.uni 	$LDWendi___log2f_200_5;
$Lt_23_57090:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f129, %f121;
	mov.f32 	%f130, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f131, %f129, %f130;
	ex2.approx.ftz.f32 	%f128, %f131;
$LDWendi___log2f_200_5:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	267	0
	// Second component: %f118 -> %f138.
	mov.f32 	%f132, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p41, %f118, %f132;
	@!%p41 bra 	$Lt_23_57602;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	242	0
	neg.ftz.f32 	%f133, %f118;
	lg2.approx.ftz.f32 	%f134, %f133;
	mov.f32 	%f135, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f136, %f134, %f135;
	ex2.approx.ftz.f32 	%f137, %f136;
	neg.ftz.f32 	%f138, %f137;
	bra.uni 	$LDWendi___log2f_200_3;
$Lt_23_57602:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f139, %f118;
	mov.f32 	%f140, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f141, %f139, %f140;
	ex2.approx.ftz.f32 	%f138, %f141;
$LDWendi___log2f_200_3:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	268	0
	// Third component: %f115 -> %f148.
	mov.f32 	%f142, 0f00000000;   	// 0
	setp.lt.ftz.f32 	%p42, %f115, %f142;
	@!%p42 bra 	$Lt_23_58114;
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	242	0
	neg.ftz.f32 	%f143, %f115;
	lg2.approx.ftz.f32 	%f144, %f143;
	mov.f32 	%f145, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f146, %f144, %f145;
	ex2.approx.ftz.f32 	%f147, %f146;
	neg.ftz.f32 	%f148, %f147;
	bra.uni 	$LDWendi___log2f_200_1;
$Lt_23_58114:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	3	244	0
	lg2.approx.ftz.f32 	%f149, %f115;
	mov.f32 	%f150, 0f3ee8ba2e;   	// 0.454545
	mul.ftz.f32 	%f151, %f149, %f150;
	ex2.approx.ftz.f32 	%f148, %f151;
$LDWendi___log2f_200_1:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	// Write the finished pixel back to the address it was read from.
	st.global.v4.f32 	[%rd32+0], {%f128,%f138,%f148,%f105};
$Lt_23_56066:
$Lt_23_55554:
 //<loop> Part of loop body line 486, head labeled $Lt_23_55298
	.loc	19	504	0
	// Next iteration: index + 1, row + 4; stop when the row counter equals %r86.
	add.s32 	%r115, %r115, 1;
	add.s32 	%r114, %r114, 4;
	setp.ne.s32 	%p43, %r114, %r86;
	@%p43 bra 	$Lt_23_55298;
 //<loop> Part of loop body line 409, head labeled $Lt_23_49410
	.loc	19	512	0
	bar.sync 	0;
	setp.gt.s32 	%p44, %r21, %r13;
	@%p44 bra 	$Lt_23_49410;
$Lt_23_48898:
	.loc	19	672	0
	exit;
$LDWend_HorizontalRecursiveGaussianRGBAF32_kernel:
	} // HorizontalRecursiveGaussianRGBAF32_kernel

