Skip to content

Commit 78ac958

Browse files
committed
Add missing intrinsics to cuda headers.
LLVM prevents the sm_32_intrinsics.hpp header from being included with a #define __SM_32_INTRINSICS_HPP__. It also provides drop-in replacements of the functions defined in the CUDA header. One issue is that some intrinsics were added after the replacement was written, and thus have no replacement, breaking code that calls them (Raft is one example). This CL backport the code from sm_32_intrinsics.hpp for the missing intrinsics.
1 parent 24d730b commit 78ac958

File tree

1 file changed

+225
-0
lines changed

1 file changed

+225
-0
lines changed

clang/lib/Headers/__clang_cuda_intrinsics.h

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,231 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479
return ret;
480480
}
481481

482+
#define INTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
483+
inline __device__ decl_type func_name(const decl_type *ptr) { \
484+
internal_type ret; \
485+
asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr)); \
486+
return (decl_type)ret; \
487+
}
488+
489+
#define INTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
490+
inline __device__ decl_type func_name(const decl_type *ptr) { \
491+
decl_type ret; \
492+
internal_type tmp; \
493+
asm(asm_op" {%0,%1}, [%2];" \
494+
: asm_type(tmp.x), asm_type(tmp.y) \
495+
: "l"(ptr)); \
496+
using element_type = decltype(ret.x); \
497+
ret.x = (element_type)(tmp.x); \
498+
ret.y = (element_type)tmp.y; \
499+
return ret; \
500+
}
501+
502+
#define INTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
503+
inline __device__ decl_type func_name(const decl_type *ptr) { \
504+
decl_type ret; \
505+
internal_type tmp; \
506+
asm(asm_op" {%0,%1,%2,%3}, [%4];" \
507+
: asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
508+
: "l"(ptr)); \
509+
using element_type = decltype(ret.x); \
510+
ret.x = (element_type)tmp.x; \
511+
ret.y = (element_type)tmp.y; \
512+
ret.z = (element_type)tmp.z; \
513+
ret.w = (element_type)tmp.w; \
514+
return ret; \
515+
}
516+
517+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r");
518+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r");
519+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h");
520+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r");
521+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, "=l");
522+
523+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r");
524+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r");
525+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h");
526+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h");
527+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r");
528+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r");
529+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l");
530+
531+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, "=r");
532+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short, "=h");
533+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, "=r");
534+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long, unsigned long long, "=l");
535+
536+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r");
537+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r");
538+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h");
539+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h");
540+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r");
541+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r");
542+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, "=l");
543+
544+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f");
545+
INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d");
546+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f");
547+
INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f");
548+
INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d");
549+
550+
#define MINTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
551+
inline __device__ decl_type func_name(const decl_type *ptr) { \
552+
internal_type ret; \
553+
asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr) : "memory"); \
554+
return (decl_type)ret; \
555+
}
556+
557+
#define MINTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
558+
inline __device__ decl_type func_name(const decl_type *ptr) { \
559+
decl_type ret; \
560+
internal_type tmp; \
561+
asm(asm_op" {%0,%1}, [%2];" \
562+
: asm_type(tmp.x), asm_type(tmp.y) \
563+
: "l"(ptr) : "memory"); \
564+
using element_type = decltype(ret.x); \
565+
ret.x = (element_type)tmp.x; \
566+
ret.y = (element_type)tmp.y; \
567+
return ret; \
568+
}
569+
570+
#define MINTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
571+
inline __device__ decl_type func_name(const decl_type *ptr) { \
572+
decl_type ret; \
573+
internal_type tmp; \
574+
asm(asm_op" {%0,%1,%2,%3}, [%4];" \
575+
: asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
576+
: "l"(ptr) : "memory"); \
577+
using element_type = decltype(ret.x); \
578+
ret.x = (element_type)tmp.x; \
579+
ret.y = (element_type)tmp.y; \
580+
ret.z = (element_type)tmp.z; \
581+
ret.w = (element_type)tmp.w; \
582+
return ret; \
583+
}
584+
585+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, "=r");
586+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
587+
"=h");
588+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, "=r");
589+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
590+
unsigned long long, "=l");
591+
592+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2, "=r");
593+
MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, "=r");
594+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, "=h");
595+
MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, "=h");
596+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, "=r");
597+
MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, "=r");
598+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, "=l");
599+
600+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f");
601+
MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d");
602+
603+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, "=f");
604+
MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, "=f");
605+
MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, "=f");
606+
607+
608+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r");
609+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r");
610+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h");
611+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r");
612+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, "=l");
613+
614+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r");
615+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r");
616+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h");
617+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h");
618+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r");
619+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r");
620+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l");
621+
622+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, "=r");
623+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
624+
"=h");
625+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, "=r");
626+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
627+
unsigned long long, "=l");
628+
629+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r");
630+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r");
631+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h");
632+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h");
633+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r");
634+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r");
635+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, "=l");
636+
637+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f");
638+
INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d");
639+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f");
640+
INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f");
641+
INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d");
642+
643+
#define INTRINSIC_STORE(func_name, asm_op, decl_type, internal_type, asm_type) \
644+
inline __device__ void func_name(decl_type *ptr, decl_type value) { \
645+
internal_type tmp = (internal_type)value; \
646+
asm(asm_op" [%0], %1;" ::"l"(ptr), asm_type(tmp) : "memory"); \
647+
}
648+
649+
#define INTRINSIC_STORE2(func_name, asm_op, decl_type, internal_type, asm_type) \
650+
inline __device__ void func_name(decl_type *ptr, decl_type value) { \
651+
internal_type tmp; \
652+
using element_type = decltype(tmp.x); \
653+
tmp.x = (element_type)(value.x); \
654+
tmp.y = (element_type)(value.y); \
655+
asm(asm_op" [%0], {%1,%2};" ::"l"(ptr), asm_type(tmp.x), asm_type(tmp.y) \
656+
: "memory"); \
657+
}
658+
659+
#define INTRINSIC_STORE4(func_name, asm_op, decl_type, internal_type, asm_type) \
660+
inline __device__ void func_name(decl_type *ptr, decl_type value) { \
661+
internal_type tmp; \
662+
using element_type = decltype(tmp.x); \
663+
tmp.x = (element_type)(value.x); \
664+
tmp.y = (element_type)(value.y); \
665+
tmp.z = (element_type)(value.z); \
666+
tmp.w = (element_type)(value.w); \
667+
asm(asm_op" [%0], {%1,%2,%3,%4};" ::"l"(ptr), asm_type(tmp.x), \
668+
asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
669+
: "memory"); \
670+
}
671+
672+
INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
673+
INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
674+
INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
675+
INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
676+
INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
677+
678+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
679+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
680+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
681+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
682+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
683+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
684+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
685+
686+
INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
687+
INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
688+
"h");
689+
INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
690+
INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
691+
unsigned long long, "l");
692+
693+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
694+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
695+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
696+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
697+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
698+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
699+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
700+
701+
INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
702+
INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
703+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
704+
INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
705+
INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
706+
482707
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483708

484709
#if CUDA_VERSION >= 11000

0 commit comments

Comments
 (0)