@@ -479,6 +479,231 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
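+// The INTRINSIC_LOAD* macros below stamp out inline __device__ overloads that
+// wrap a single PTX ld instruction. 'asm_op' is the PTX opcode (including its
+// cache-operator suffix), 'internal_type' is the type the value is loaded as,
+// and 'asm_type' is the inline-asm register constraint for that type ("=r",
+// "=h", "=l", "=f", "=d").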
+#define INTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    internal_type ret;                                                        \
+    asm(asm_op " %0, [%1];" : asm_type(ret) : "l"(ptr));                      \
+    return (decl_type)ret;                                                    \
+  }
+
+#define INTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type)\
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    decl_type ret;                                                            \
+    internal_type tmp;                                                        \
+    asm(asm_op " {%0,%1}, [%2];"                                              \
+        : asm_type(tmp.x), asm_type(tmp.y)                                    \
+        : "l"(ptr));                                                          \
+    using element_type = decltype(ret.x);                                     \
+    ret.x = (element_type)(tmp.x);                                            \
+    ret.y = (element_type)(tmp.y);                                            \
+    return ret;                                                               \
+  }
+
+#define INTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type)\
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    decl_type ret;                                                            \
+    internal_type tmp;                                                        \
+    asm(asm_op " {%0,%1,%2,%3}, [%4];"                                        \
+        : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w)  \
+        : "l"(ptr));                                                          \
+    using element_type = decltype(ret.x);                                     \
+    ret.x = (element_type)(tmp.x);                                            \
+    ret.y = (element_type)(tmp.y);                                            \
+    ret.z = (element_type)(tmp.z);                                            \
+    ret.w = (element_type)(tmp.w);                                            \
+    return ret;                                                               \
+  }
+
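+// __ldcg: load with the .cg cache operator, which caches at the global (L2)
+// level and bypasses L1.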
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l");
+
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, "=l");
+
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d");
+
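+// The MINTRINSIC_LOAD* variants are identical except for a "memory" clobber:
+// __ldcv uses the .cv operator (don't cache; fetch again from memory on every
+// load), so the asm must not be reordered or merged across other memory
+// operations.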
+#define MINTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type)\
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    internal_type ret;                                                        \
+    asm(asm_op " %0, [%1];" : asm_type(ret) : "l"(ptr) : "memory");           \
+    return (decl_type)ret;                                                    \
+  }
+
+#define MINTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type,         \
+                         asm_type)                                            \
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    decl_type ret;                                                            \
+    internal_type tmp;                                                        \
+    asm(asm_op " {%0,%1}, [%2];"                                              \
+        : asm_type(tmp.x), asm_type(tmp.y)                                    \
+        : "l"(ptr) : "memory");                                               \
+    using element_type = decltype(ret.x);                                     \
+    ret.x = (element_type)(tmp.x);                                            \
+    ret.y = (element_type)(tmp.y);                                            \
+    return ret;                                                               \
+  }
+
+#define MINTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type,         \
+                         asm_type)                                            \
+  inline __device__ decl_type func_name(const decl_type *ptr) {               \
+    decl_type ret;                                                            \
+    internal_type tmp;                                                        \
+    asm(asm_op " {%0,%1,%2,%3}, [%4];"                                        \
+        : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w)  \
+        : "l"(ptr) : "memory");                                               \
+    using element_type = decltype(ret.x);                                     \
+    ret.x = (element_type)(tmp.x);                                            \
+    ret.y = (element_type)(tmp.y);                                            \
+    ret.z = (element_type)(tmp.z);                                            \
+    ret.w = (element_type)(tmp.w);                                            \
+    return ret;                                                               \
+  }
+
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short, "=h");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long, unsigned long long, "=l");
+
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, "=h");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, "=h");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, "=l");
+
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d");
+
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, "=f");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, "=f");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, "=d");
+
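+// __ldcs: load with the .cs (cache-streaming) operator, which marks the data
+// evict-first for values expected to be touched only once or twice.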
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l");
+
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, "=l");
+
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d");
+
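+// The INTRINSIC_STORE* macros mirror the loads: they convert the value to
+// 'internal_type', then emit a single PTX st instruction with a "memory"
+// clobber so the store is not moved past other memory accesses.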
+#define INTRINSIC_STORE(func_name, asm_op, decl_type, internal_type, asm_type)\
+  inline __device__ void func_name(decl_type *ptr, decl_type value) {         \
+    internal_type tmp = (internal_type)value;                                 \
+    asm(asm_op " [%0], %1;" ::"l"(ptr), asm_type(tmp) : "memory");            \
+  }
+
+#define INTRINSIC_STORE2(func_name, asm_op, decl_type, internal_type,         \
+                         asm_type)                                            \
+  inline __device__ void func_name(decl_type *ptr, decl_type value) {         \
+    internal_type tmp;                                                        \
+    using element_type = decltype(tmp.x);                                     \
+    tmp.x = (element_type)(value.x);                                          \
+    tmp.y = (element_type)(value.y);                                          \
+    asm(asm_op " [%0], {%1,%2};" ::"l"(ptr), asm_type(tmp.x), asm_type(tmp.y) \
+        : "memory");                                                          \
+  }
+
+#define INTRINSIC_STORE4(func_name, asm_op, decl_type, internal_type,         \
+                         asm_type)                                            \
+  inline __device__ void func_name(decl_type *ptr, decl_type value) {         \
+    internal_type tmp;                                                        \
+    using element_type = decltype(tmp.x);                                     \
+    tmp.x = (element_type)(value.x);                                          \
+    tmp.y = (element_type)(value.y);                                          \
+    tmp.z = (element_type)(value.z);                                          \
+    tmp.w = (element_type)(value.w);                                          \
+    asm(asm_op " [%0], {%1,%2,%3,%4};" ::"l"(ptr), asm_type(tmp.x),           \
+        asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w)                     \
+        : "memory");                                                          \
+  }
+
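+// __stwt: store with the .wt (write-through) operator, writing through the
+// L2 cache to system memory.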
+INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
+INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
+INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
+INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
+INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
+
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
+
+INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
+INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short, "h");
+INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
+INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long, unsigned long long, "l");
+
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uint2, "r");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
+
+INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
+INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
+INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
+INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
+
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 #if CUDA_VERSION >= 11000
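A minimal usage sketch (not part of this commit; the kernel and its parameters are hypothetical) showing how the generated overloads are called from device code, assuming this header is picked up through the usual CUDA wrapper headers:

__global__ void scale_kernel(const float4 *in, float4 *out, float s, int n) {
  // Stream each element through L2 once (.cs) and write the result
  // through the cache hierarchy (.wt).
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float4 v = __ldcs(&in[i]); // ld.global.cs.v4.f32
    v.x *= s;
    v.y *= s;
    v.z *= s;
    v.w *= s;
    __stwt(&out[i], v); // st.global.wt.v4.f32
  }
}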