diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index 69a8e61bd481b5caeedfa46991dcfcb8a9f0e9e6..566a451c21b83c34090fcc8a8a150fe42d4eef17 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -119,6 +119,8 @@
 #define BLOCK_ELLIPSE 128
 #define MAX_SHARED_TYPES 11
 
+#if (__CUDACC_VER_MAJOR__ < 9)
+
 #ifdef _SINGLE_SINGLE
 #define shfl_xor __shfl_xor
 #else
@@ -132,6 +134,25 @@ ucl_inline double shfl_xor(double var, int laneMask, int width) {
 }
 #endif
 
+#else
+
+#ifdef _SINGLE_SINGLE
+ucl_inline double shfl_xor(double var, int laneMask, int width) {
+  return __shfl_xor_sync(0xffffffff, var, laneMask, width);
+}
+#else
+ucl_inline double shfl_xor(double var, int laneMask, int width) {
+  int2 tmp;
+  tmp.x = __double2hiint(var);
+  tmp.y = __double2loint(var);
+  tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width);
+  tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width);
+  return __hiloint2double(tmp.x,tmp.y);
+}
+#endif
+
+#endif
+
 #endif
 
 #endif