Cuda summation code error

I am newbie to cuda. I am using numba trying to add some arrays. the code is
from future import division
from numba import cuda, float32
import numpy as np
import math

TPB = 16
def fast_matmul(A, C):

sA = cuda.shared.array(shape=(1, TPB), dtype=float32)
sD = cuda.shared.array(shape=(1, TPB), dtype=float32)

thread_idx_x = cuda.threadIdx.x
thread_idx_y = cuda.threadIdx.y
totla_No_of_threads_x = cuda.blockDim.x
totla_No_of_threads_y = cuda.blockDim.y
block_idx_x = cuda.blockIdx.x
block_idx_y = cuda.blockIdx.y

x, y = cuda.grid(2)

if x >= A.shape[1]: #and y >= C.shape[1]:

s = 0
index_1 = 1
for i in range(int(A.shape[1] / TPB)):
    sA[thread_idx_x, thread_idx_y] = A[x, thread_idx_y + i * TPB]

    if thread_idx_y <= (totla_No_of_threads_y-index_1):
        sD[thread_idx_x, thread_idx_y] = sA[thread_idx_x, (thread_idx_y +index_1)] - sA[thread_idx_x, thread_idx_y]
    for s in range(totla_No_of_threads_y//2):
        if thread_idx_y < s:
            sD[thread_idx_x, thread_idx_y] += sD[thread_idx_x, thread_idx_y+s]
        C[x, y] = sD[x,y]

A = np.full((1, 16), 3, dtype=np.float32)
C = np.zeros((1, 16))

print(‘A:’, A, ‘C:’, C)
TPB = 32

dA = cuda.to_device(A)
dC= cuda.to_device©
fast_matmul[(1, 1), (32, 32)](dA, dC)
res= dC.copy_to_host()


when I run the code, I got a lot of errors, could any one direct me why. another thing to mention, that is not the final one, I am trying to calculate a fitness function so I simplified it first as a difference between 2 values.