diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc index 0f7adaf24a..934ccbada6 100644 --- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc +++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc @@ -69,7 +69,7 @@ __global__ void concat_variable_kernel( IntType num_inputs = input_ptr_data.size; // verbose declaration needed due to template - extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + extern __shared__ __align__(sizeof(T) > 16 ? sizeof(T) : 16) unsigned char smem[]; IntType* smem_col_scan = reinterpret_cast<IntType*>(smem); if (useSmem) { diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 94989089ec..1d26d4bacb 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -172,7 +172,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( const DepthwiseArgs args, const T* input, const T* filter, T* output) { assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.x depths. - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + extern __shared__ __align__(sizeof(T) > 16 ? sizeof(T) : 16) unsigned char shared_memory[]; T* const shared_data = reinterpret_cast<T*>(shared_memory); const int num_batches = args.batch; @@ -452,7 +452,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( const DepthwiseArgs args, const T* input, const T* filter, T* output) { assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.z depths. - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + extern __shared__ __align__(sizeof(T) > 16 ? 
sizeof(T) : 16) unsigned char shared_memory[]; T* const shared_data = reinterpret_cast<T*>(shared_memory); const int num_batches = args.batch; @@ -1118,7 +1118,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( const DepthwiseArgs args, const T* output, const T* input, T* filter) { assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); // Holds block plus halo and filter data for blockDim.x depths. - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + extern __shared__ __align__(sizeof(T) > 16 ? sizeof(T) : 16) unsigned char shared_memory[]; T* const shared_data = reinterpret_cast<T*>(shared_memory); const int num_batches = args.batch; @@ -1388,7 +1388,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( const DepthwiseArgs args, const T* output, const T* input, T* filter) { assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x)); // Holds block plus halo and filter data for blockDim.z depths. - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + extern __shared__ __align__(sizeof(T) > 16 ? sizeof(T) : 16) unsigned char shared_memory[]; T* const shared_data = reinterpret_cast<T*>(shared_memory); const int num_batches = args.batch; diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc index 393818730b..58a1294005 100644 --- a/tensorflow/core/kernels/split_lib_gpu.cu.cc +++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc @@ -121,7 +121,7 @@ __global__ void split_v_kernel(const T* input_ptr, int num_outputs = output_ptr_data.size; // verbose declaration needed due to template - extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + extern __shared__ __align__(sizeof(T) > 16 ? 
sizeof(T) : 16) unsigned char smem[]; IntType* smem_col_scan = reinterpret_cast<IntType*>(smem); if (useSmem) { diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0ce5cda517..d4dc2235ac 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -330,11 +330,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "protobuf_archive", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", - "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://mirror.bazel.build/github.com/dinever/protobuf/archive/188578878eff18c2148baba0e116d87ce8f49410.tar.gz", + "https://github.com/dinever/protobuf/archive/188578878eff18c2148baba0e116d87ce8f49410.tar.gz", ], - sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", - strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", + sha256 = "7a1d96ccdf7131535828cad737a76fd65ed766e9511e468d0daa3cc4f3db5175", + strip_prefix = "protobuf-188578878eff18c2148baba0e116d87ce8f49410", ) # We need to import the protobuf library under the names com_google_protobuf diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 2a37c65bc7..43446dd99b 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -110,7 +110,7 @@ cc_library( ".", "cuda/include", ], - linkopts = ["-lgomp"], + #linkopts = ["-lgomp"], linkstatic = 1, visibility = ["//visibility:public"], )