[med-svn] [Git][med-team/plastimatch][upstream] New upstream version 1.10.0

Sat Feb 21 20:27:45 GMT 2026


Étienne Mollier pushed to branch upstream at Debian Med / plastimatch


Commits:
abd0f0da by Étienne Mollier at 2026-02-21T18:20:58+01:00
New upstream version 1.10.0
- - - - -


9 changed files:

- − src/register/cuda/viscous_compute.cu
- − src/register/cuda/viscous_convolution.cu
- − src/register/cuda/viscous_finalize.cu
- − src/register/cuda/viscous_funcHistogram.cu
- − src/register/cuda/viscous_funcImageDomain.cu
- − src/register/cuda/viscous_initialize.cu
- − src/register/cuda/viscous_main.cu
- − src/register/viscous_convolution.h
- − src/register/viscous_global.h


Changes:

=====================================
src/register/cuda/viscous_compute.cu deleted
=====================================
@@ -1,459 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration						   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   main function to register two images on the current scale     *
-c*	 including upsample and downsample                             *
-c******************************************************************/
-
-#include <thrust/binary_search.h>
-#include <thrust/device_vector.h>
-#include <thrust/functional.h>
-#include <thrust/sort.h>
-#include <cublas.h>
-#include <cutil.h>
-#include <cutil_inline.h>
-#include "viscous_convolution.h"
-#include "viscous_global.h"
-
-// hash a point in the unit square to the index of
-// the grid bucket that contains it
-struct point_to_bucket_index : public thrust::unary_function<float2,unsigned int>
-{
- 	__host__ __device__
-  	point_to_bucket_index(unsigned int width, unsigned int height)
-    		:w(width),h(height){}
-
-  	__host__ __device__
-  	unsigned int operator()(float2 p) const
-  	{
-// find the raster indices of p's bucket
-    		unsigned int x = static_cast<unsigned int>(p.x * (w-1));
-    		unsigned int y = static_cast<unsigned int>(p.y * (h-1));
-
-// return the bucket's linear index
-    		return y * w + x;
-  	}
-
-  	unsigned int w, h;
-};
-
-__global__ void downSample(float *src, float *dest, int NX, int NY, int NZ, int s)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-        if(tid < NX*NY*NZ)
-	{
-		int z = tid/(NX*NY);
-		int y = (tid%(NX*NY))/NX;
-		int x = tid%NX;
-
-		float sum =0.0f;
-		for(int xs = 0; xs<s; xs++)	
-			for(int ys =0; ys<s; ys++)
-				sum += src[s*x+xs + (s*y+ys)*NX0 + s*z*NX0*NY0];
-		dest[tid] = sum/s/s;
-	}
-
-
-}
-
-__global__ void upSample(float *src, float *dest, int NX, int NY, int NZ)
-//      upsampling
-{
-const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-        if(tid < NX*NY*NZ)
-	{
-		int z = tid/(NX*NY);
-		int y = (tid%(NX*NY))/NX;
-		int x = tid%NX;
-		
-		int xmin = x/2 - (x%2 == 0);
-		int xmax = x/2 + (x%2 == 1);
-		int ymin = y/2 - (y%2 == 0);
-		int ymax = y/2 + (y%2 == 1);
-		int zmin = z/2 - (z%2 == 0);
-		int zmax = z/2 + (z%2 == 1);
-
-		xmin = (xmin < 0) ? 0: xmin;
-		ymin = (ymin < 0) ? 0: ymin;
-		zmin = (zmin < 0) ? 0: zmin;
-
-		xmax = (xmax < NX)? xmax : NX-1;
-		ymax = (ymax < NY)? ymax : NY-1;
-		zmax = (zmax < NZ)? zmax : NZ-1; 
-		
-		float wx = 0.25 + 0.5*(x%2==0);
-                float wy = 0.25 + 0.5*(y%2==0);
-		float wz = 0.25 + 0.5*(z%2==0);
-		
-		dest[tid] = 	src[xmin + ymin*NX/2 + zmin*NX/2*NY/2] * (1.0 - wx) * (1.0-wy) * (1.0-wz) +
-			    	src[xmax + ymin*NX/2 + zmin*NX/2*NY/2] * wx * (1.0-wy) * (1.0-wz) + 
-				src[xmin + ymax*NX/2 + zmin*NX/2*NY/2] * (1.0 - wx) * wy * (1.0-wz) +
-				src[xmax + ymax*NX/2 + zmin*NX/2*NY/2] * wx * wy * (1.0-wz) +
-				src[xmin + ymin*NX/2 + zmax*NX/2*NY/2] * (1.0 - wx) * (1.0-wy) * wz +
-			    	src[xmax + ymin*NX/2 + zmax*NX/2*NY/2] * wx * (1.0-wy) * wz + 
-				src[xmin + ymax*NX/2 + zmax*NX/2*NY/2] * (1.0 - wx) * wy * wz +
-				src[xmax + ymax*NX/2 + zmax*NX/2*NY/2] * wx * wy * wz;
-		dest[tid] = 2*dest[tid];
-	
-	}
-
-}
-
-
-void compute(float *d_im_move, float *d_im_static, float *d_mv_x, float *d_mv_y, float *d_mv_z, int maxIter)
-//	d_mv_x, d_mv_y and d_im_move are updated
-{
-
-//	bind moving image to texture	
-	const cudaExtent volumeSize = make_cudaExtent(NX, NY, NZ);    	
-	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-	cutilSafeCall( cudaMalloc3DArray(&d_im_move_array, &channelDesc, volumeSize) );
-    	cudaMemcpy3DParms copyParams = {0};
-    	copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_im_move, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    	copyParams.dstArray = d_im_move_array;
-    	copyParams.extent   = volumeSize;
-    	copyParams.kind     = cudaMemcpyDeviceToDevice;
-    	cutilSafeCall( cudaMemcpy3D(&copyParams) );
-
-
-	d_im_move_tex.normalized = false;                
-    	d_im_move_tex.filterMode = cudaFilterModeLinear;  
-
-    
-    	cutilSafeCall(cudaBindTextureToArray(d_im_move_tex, d_im_move_array, channelDesc));
-
-
-// 	bind vector flows to texture
-	cutilSafeCall( cudaMalloc3DArray(&d_mv_x_array, &channelDesc, volumeSize) );
-	cudaMemcpy3DParms copyParams_x = {0};
-    	copyParams_x.srcPtr   = make_cudaPitchedPtr((void*)d_mv_x, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    	copyParams_x.dstArray = d_mv_x_array;
-    	copyParams_x.extent   = volumeSize;
-    	copyParams_x.kind     = cudaMemcpyDeviceToDevice;
-    	cutilSafeCall( cudaMemcpy3D(&copyParams_x) );
-	d_mv_x_tex.normalized = false;
-	d_mv_x_tex.filterMode = cudaFilterModeLinear;
-
-
-	cutilSafeCall( cudaMalloc3DArray(&d_mv_y_array, &channelDesc, volumeSize) );
-	cudaMemcpy3DParms copyParams_y = {0};
-    	copyParams_y.srcPtr   = make_cudaPitchedPtr((void*)d_mv_y, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    	copyParams_y.dstArray = d_mv_y_array;
-    	copyParams_y.extent   = volumeSize;
-    	copyParams_y.kind     = cudaMemcpyDeviceToDevice;
-    	cutilSafeCall( cudaMemcpy3D(&copyParams_y) );
-	d_mv_y_tex.normalized = false;
-	d_mv_y_tex.filterMode = cudaFilterModeLinear;
-
-	cutilSafeCall( cudaMalloc3DArray(&d_mv_z_array, &channelDesc, volumeSize) );
-	cudaMemcpy3DParms copyParams_z = {0};
-    	copyParams_z.srcPtr   = make_cudaPitchedPtr((void*)d_mv_z, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    	copyParams_z.dstArray = d_mv_z_array;
-    	copyParams_z.extent   = volumeSize;
-    	copyParams_z.kind     = cudaMemcpyDeviceToDevice;
-    	cutilSafeCall( cudaMemcpy3D(&copyParams_z) );
-	d_mv_z_tex.normalized = false;
-	d_mv_z_tex.filterMode = cudaFilterModeLinear;
-	
-	float *d_im_out;
-	cutilSafeCall( cudaMalloc((void **)&d_im_out, sDATA_SIZE) );
-
-
-
-//	velocity
-	float *d_v_x, *d_v_x_copy;
-	float *d_v_y, *d_v_y_copy;
-	float *d_v_z, *d_v_z_copy;
-	cutilSafeCall( cudaMalloc((void **)&d_v_x, sDATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_v_y, sDATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_v_z, sDATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_v_x_copy, sDATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_v_y_copy, sDATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_v_z_copy, sDATA_SIZE) );
-
-
-//	setup for computing joint histogram via thrust
-// 		the grid data structure keeps a range per grid bucket:
-// 		each bucket_begin[i] indexes the first element of bucket i's list of points
-// 		each bucket_end[i] indexes one past the last element of bucket i's list of points
-  	thrust::device_vector<unsigned int> bucket_begin(nBin*nBin);
-  	thrust::device_vector<unsigned int> bucket_end(nBin*nBin);
-
-// 		allocate storage for each point's bucket index
-  	thrust::device_vector<unsigned int> bucket_indices(NX*NY*NZ);
-
-// 		allocate space to hold per-bucket sizes
-        thrust::device_vector<unsigned int> bucket_sizes(nBin*nBin);
-
-//		allocate float2 vector
-	float2 *d_points;
-	cudaMalloc((void**) &d_points, sizeof(float2)*NX*NY*NZ);
-
-
-
-	int regrid = 0;
-	float MI[1000];
-
-	int3   Dims;
-	Dims.x = NX;
-	Dims.y = NY;
-	Dims.z = NZ;
-
-
-
-		for(int it=0; it<maxIter; it++)
-	{
-// 	upate image
-		ImageWarp<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z, d_im_out, NX, NY, NZ);	
-
-		
-//	joint histogram via thrust ----- begin
-//		convert to float2 vector
-		transToFloat2<<<nblocks, NTHREAD_PER_BLOCK>>>(d_im_out, d_im_static, d_points, NX*NY*NZ);
-
-//		use a thrust ptr to wrap the raw pointer
-		thrust::device_ptr<float2> points_t(d_points);
-
-// 		transform the points to their bucket indices
-        	thrust::transform(points_t, points_t+NX*NY*NZ, bucket_indices.begin(), point_to_bucket_index(nBin,nBin));
-
-//		sort the bucket index
-        	thrust::sort(bucket_indices.begin(), bucket_indices.end());
-
-// 		find the beginning of each bucket's list of points
-        	thrust::counting_iterator<unsigned int> search_begin(0);
-        	thrust::lower_bound(bucket_indices.begin(), bucket_indices.end(), search_begin,
-                      search_begin + nBin*nBin, bucket_begin.begin());
-
-// 		find the end of each bucket's list of points
-        	thrust::upper_bound(bucket_indices.begin(), bucket_indices.end(), search_begin,
-                      search_begin + nBin*nBin, bucket_end.begin());
-
-//		take the difference between bounds to find each bucket size
-       		thrust::transform(bucket_end.begin(), bucket_end.end(), bucket_begin.begin(),
-                bucket_sizes.begin(), thrust :: minus<unsigned int>());
-
-//		now hist contains the histogram
-		unsigned int *hist = thrust::raw_pointer_cast(&bucket_sizes[0]);
-
-		copyHist<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(hist, d_jointHistogram);
-// 	joint histogram via thrust ----- end
-
-
-			
-//	compute the convolution of joint histogram
-		myconv2dGPU<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(d_jointHistogram, d_jointHistogram_conv, GaussKernelH, nBin, nBin, 3*hValue);
-	
-		
-//	normalize joint histogram
-		float sum = cublasSasum (nBin*nBin, d_jointHistogram_conv , 1);	
-		cublasSscal (nBin*nBin, 1.0f/sum, d_jointHistogram_conv, 1);
-		
-
-//	compute mutual info by GPU
-		marginalDist<<<nBin, nBin>>>(d_jointHistogram_conv, d_probx, d_proby);
-
-		switch (METHOD)
-		{
-			case 1:
-				marginalBnorm_sum<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(d_jointHistogram_conv, d_probx, d_proby, d_jointHistogram); 
-				marginalDistAlongY<<<nBin, nBin>>>(d_jointHistogram, d_Bsum);
-				BnormGPU<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(d_jointHistogram_conv, d_probx, d_proby,d_Bsum, d_jointHistogram); 
-				break;
-			case 2: 
-				mutualInfoGPU<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(d_jointHistogram_conv, d_probx, d_proby, d_jointHistogram); 
-				break;
-		}
-		MI[it] = cublasSasum (nBin*nBin, d_jointHistogram_conv, 1);	
-		printf("mutual information (%d)= %f\n", it, MI[it]);
-
-
-//	NOTE: after this step, jointHistogram becomes the likelihood
-//	compute the first derivative w.r.t. x-dim of joint histogram	
-		myconv2dGPU<<<nblocks_hist, NTHREAD_PER_BLOCK>>>(d_jointHistogram, d_jointHistogram_conv, GaussKernelHx, nBin, nBin,3*hValue);
-
-// 	compute the force		
-		forceComp<<<nblocks, NTHREAD_PER_BLOCK>>>(d_im_out, d_im_static, d_jointHistogram_conv, d_v_x, d_v_y, d_v_z, NX, NY, NZ);
-
-		ImageSmooth(d_v_x, d_v_x_copy,Dims);
-		ImageSmooth(d_v_y, d_v_y_copy,Dims);
-		ImageSmooth(d_v_z, d_v_z_copy,Dims);
-		
-		flowComp<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z, d_v_x_copy, d_v_y_copy, d_v_z_copy, d_v_x, d_v_y, NX, NY, NZ);
-// 	NOTE: d_v_x is Jacobian, d_v_y is the max flow
-//		d_v_x_copy, d_v_y_copy, d_v_z_copy are the displacement
-
-		thrust :: device_ptr<float> data_ptr(d_v_y);
-		int maxInd = cublasIsamax(NX*NY*NZ, d_v_y, 1) -1;
-		float maxflow = data_ptr[maxInd];
-		float dt = (du/maxflow); // > 1) ? 1 : du/maxflow;
-		printf("dt = %f \n", dt);
-
-		flowUpdate<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z, d_v_x_copy, d_v_y_copy, d_v_z_copy,dt, NX, NY, NZ);
-
-// 	regridding if Jacobian < threshJaco		
-		sum = cublasSasum(NX*NY*NZ, d_v_x, 1);
-		if (sum>0.5)
-		{
-			regrid ++;
-			
-			printf("regrid = %d\n", regrid);
-//	save d_im_move to be d_im_out
-
-			cudaUnbindTexture(d_im_move_tex);
-			cudaMemcpy3DParms copyParams = {0};
-    			copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_im_out, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    			copyParams.dstArray = d_im_move_array;
-    			copyParams.extent   = volumeSize;
-    			copyParams.kind     = cudaMemcpyDeviceToDevice;
-    			cutilSafeCall( cudaMemcpy3D(&copyParams) );
-    			cutilSafeCall(cudaBindTextureToArray(d_im_move_tex, d_im_move_array));
-
-		
-//	update vector flow
-			ImageWarp_mv<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z, NX, NY, NZ);	
-			
-			cudaMemcpy3DParms copyParams_x = {0};
-    			copyParams_x.srcPtr   = make_cudaPitchedPtr((void*)d_mv_x, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    			copyParams_x.dstArray = d_mv_x_array;
-    			copyParams_x.extent   = volumeSize;
-    			copyParams_x.kind     = cudaMemcpyDeviceToDevice;
-    			cutilSafeCall( cudaMemcpy3D(&copyParams_x) );
-			cutilSafeCall(cudaBindTextureToArray(d_mv_x_tex, d_mv_x_array));
-
-			cudaMemcpy3DParms copyParams_y = {0};
-    			copyParams_y.srcPtr   = make_cudaPitchedPtr((void*)d_mv_y, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    			copyParams_y.dstArray = d_mv_y_array;
-    			copyParams_y.extent   = volumeSize;
-    			copyParams_y.kind     = cudaMemcpyDeviceToDevice;
-    			cutilSafeCall( cudaMemcpy3D(&copyParams_y) );
-			cutilSafeCall(cudaBindTextureToArray(d_mv_y_tex, d_mv_y_array));
-
-			cudaMemcpy3DParms copyParams_z = {0};
-    			copyParams_z.srcPtr   = make_cudaPitchedPtr((void*)d_mv_z, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    			copyParams_z.dstArray = d_mv_z_array;
-    			copyParams_z.extent   = volumeSize;
-    			copyParams_z.kind     = cudaMemcpyDeviceToDevice;
-    			cutilSafeCall( cudaMemcpy3D(&copyParams_z) );
-			cutilSafeCall(cudaBindTextureToArray(d_mv_z_tex, d_mv_z_array));
-	
-
-			cutilSafeCall( cudaMemset(d_mv_x, 0, sDATA_SIZE) );
-			cutilSafeCall( cudaMemset(d_mv_y, 0, sDATA_SIZE) );
-			cutilSafeCall( cudaMemset(d_mv_z, 0, sDATA_SIZE) );
-
-
-			
-			
-		} // end for regridding
-	
-
-		
-
-	} // for-loop iteration
-
-
-	if (!regrid)
-	{
-		ImageWarp<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z, d_im_move, NX, NY, NZ);			
-	}	
-	else
-	{
-		cudaMemcpy3DParms copyParams = {0};		
-		cudaUnbindTexture(d_im_move_tex);
-		copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_im_move, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);
-    		copyParams.dstArray = d_im_move_array;
-    		copyParams.extent   = volumeSize;
-    		copyParams.kind     = cudaMemcpyDeviceToDevice;
-    		cutilSafeCall( cudaMemcpy3D(&copyParams) );
-    		cutilSafeCall(cudaBindTextureToArray(d_im_move_tex, d_im_move_array));
-
-		ImageWarp_final<<<nblocks, NTHREAD_PER_BLOCK>>>(d_mv_x, d_mv_y, d_mv_z,d_im_move, NX, NY, NZ);
-		
-
-	}
-
-	
-
-	cudaFree(d_points);
-	
-	
-
-	cudaFree(d_v_x);
-	cudaFree(d_v_y);
-	cudaFree(d_v_z);
-	cudaFree(d_v_x_copy);
-	cudaFree(d_v_y_copy);
-	cudaFree(d_v_z_copy);	
-
-
-	cudaUnbindTexture(d_im_move_tex);
-	cudaFreeArray(d_im_move_array);
-
-	cudaUnbindTexture(d_mv_x_tex);
-	cudaFreeArray(d_mv_x_array);
-
-	cudaUnbindTexture(d_mv_y_tex);
-	cudaFreeArray(d_mv_y_array);
-
-	cudaUnbindTexture(d_mv_z_tex);
-	cudaFreeArray(d_mv_z_array);
-
-	cudaFree(d_im_out);
-
-
-}
-
-
-__global__ void transToFloat2(const float *input1, const float *input2, float2 *output, const int n)
-{
-	 const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-//      obtain current id on thread
-
-        if (tid < n)
-        {
-		output[tid] = make_float2(input1[tid], input2[tid]);
-	}
-	
-}


=====================================
src/register/cuda/viscous_convolution.cu deleted
=====================================
@@ -1,349 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration						   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   convolution related support functions                         *
-c*	 in SDK as well as developped by Xuejun Gu and Yifei Lou   *
-c******************************************************************/
-
-
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-#include <cutil_inline.h>
-#include "viscous_global.h"
-#include "viscous_convolution.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// Row convolution filter by Frame
-////////////////////////////////////////////////////////////////////////////////
-__global__ void convolutionRowGPU_byframe(
-    float *d_Result,
-    float *d_Data,
-    int dataW,
-	int dataH,
-	int nF
-){
-    //Data cache
-    __shared__ float data[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
-
-    //Current tile and apron limits, relative to row start
-    const int         tileStart = IMUL(blockIdx.x, ROW_TILE_W);
-    const int           tileEnd = tileStart + ROW_TILE_W - 1;
-    const int        apronStart = tileStart - KERNEL_RADIUS;
-    const int          apronEnd = tileEnd   + KERNEL_RADIUS;
-
-    //Clamp tile and apron limits by image borders
-    const int    tileEndClamped = min(tileEnd, dataW - 1);
-    const int apronStartClamped = max(apronStart, 0);
-    const int   apronEndClamped = min(apronEnd, dataW - 1);
-
-    //Row start index in d_Data[]
-    const int          rowStart = nF*dataW*dataH+IMUL(blockIdx.y, dataW);
-
-    //Aligned apron start. Assuming dataW and ROW_TILE_W are multiples 
-    //of half-warp size, rowStart + apronStartAligned is also a 
-    //multiple of half-warp size, thus having proper alignment 
-    //for coalesced d_Data[] read.
-    const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;
-
-    const int loadPos = apronStartAligned + threadIdx.x;
-    //Set the entire data cache contents
-    //Load global memory values, if indices are within the image borders,
-    //or initialize with zeroes otherwise
-    if(loadPos >= apronStart){
-        const int smemPos = loadPos - apronStart;
-
-        data[smemPos] = 
-            ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ?
-            d_Data[rowStart + loadPos] : 0;
-    }
-
-
-    //Ensure the completness of the loading stage
-    //because results, emitted by each thread depend on the data,
-    //loaded by another threads
-    __syncthreads();
-    const int writePos = tileStart + threadIdx.x;
-    //Assuming dataW and ROW_TILE_W are multiples of half-warp size,
-    //rowStart + tileStart is also a multiple of half-warp size,
-    //thus having proper alignment for coalesced d_Result[] write.
-    if(writePos <= tileEndClamped){
-        const int smemPos = writePos - apronStart;
-        float sum = 0;
-        sum = convolutionRow<2 * KERNEL_RADIUS>(data + smemPos);
-
-        d_Result[rowStart + writePos] = sum;
-    }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Column convolution filter
-////////////////////////////////////////////////////////////////////////////////
-__global__ void convolutionColumnGPU(
-    float *d_Result,
-    float *d_Data,
-    int dataW,
-    int dataH,
-    int smemStride,
-    int gmemStride,
-    int nF
-){
-    //Data cache
-    __shared__ float data[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
-
-    //Current tile and apron limits, in rows
-    const int         tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
-    const int           tileEnd = tileStart + COLUMN_TILE_H - 1;
-    const int        apronStart = tileStart - KERNEL_RADIUS;
-    const int          apronEnd = tileEnd   + KERNEL_RADIUS;
-
-    //Clamp tile and apron limits by image borders
-    const int    tileEndClamped = min(tileEnd, dataH - 1);
-    const int apronStartClamped = max(apronStart, 0);
-    const int   apronEndClamped = min(apronEnd, dataH - 1);
-
-    //Current column index
-    const int       columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
-
-    //Shared and global memory indices for current column
-    
-    int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
-    //int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
-    int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart + nF *dataH *dataW;  // added by xuejun
-    
-    //Cycle through the entire data cache
-    //Load global memory values, if indices are within the image borders,
-    //or initialize with zero otherwise
-    for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
-        data[smemPos] = 
-        ((y >= apronStartClamped) && (y <= apronEndClamped)) ? 
-        d_Data[gmemPos] : 0;
-        smemPos += smemStride;
-        gmemPos += gmemStride;
-    }
-
-    //Ensure the completness of the loading stage
-    //because results, emitted by each thread depend on the data, 
-    //loaded by another threads
-    __syncthreads();
-    
-    //Shared and global memory indices for current column
-    smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
-    //gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;
-    gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart + nF *dataH *dataW;  // added by xuejun
-    
-    //Cycle through the tile body, clamped by image borders
-    //Calculate and output the results
-    for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
-        float sum = 0;
-
-        sum = convolutionColumn<2 * KERNEL_RADIUS>(data + smemPos);
-
-        d_Result[gmemPos] = sum;
-        smemPos += smemStride;
-        gmemPos += gmemStride;
-    }
-}
-////////////////////////////////////////////////////////////////////////////////
-// Frame convolution filter
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// Frame convolution filter
-////////////////////////////////////////////////////////////////////////////////
-__global__ void convolutionFrameGPU(
-    float *d_Result,
-    float *d_Data,
-    int dataW,
-    int dataH,
-    int smemStride,
-    int gmemStride
-){
-    //Data cache
-    __shared__ float data[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
-
-    //Current tile and apron limits, in rows
-    const int         tileStart = IMUL(blockIdx.y, COLUMN_TILE_H);
-    const int           tileEnd = tileStart + COLUMN_TILE_H - 1;
-    const int        apronStart = tileStart - KERNEL_RADIUS;
-    const int          apronEnd = tileEnd   + KERNEL_RADIUS;
-
-    //Clamp tile and apron limits by image borders
-    const int    tileEndClamped = min(tileEnd, dataH - 1);
-    const int apronStartClamped = max(apronStart, 0);
-    const int   apronEndClamped = min(apronEnd, dataH - 1);
-
-    //Current column index
-    const int       columnStart = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
-
-    //Shared and global memory indices for current column
-    int smemPos = IMUL(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
-    int gmemPos = IMUL(apronStart + threadIdx.y, dataW) + columnStart;
-    //Cycle through the entire data cache
-    //Load global memory values, if indices are within the image borders,
-    //or initialize with zero otherwise
-    for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
-        data[smemPos] = 
-        ((y >= apronStartClamped) && (y <= apronEndClamped)) ? 
-        d_Data[gmemPos] : 0;
-        smemPos += smemStride;
-        gmemPos += gmemStride;
-    }
-
-    //Ensure the completness of the loading stage
-    //because results, emitted by each thread depend on the data, 
-    //loaded by another threads
-    __syncthreads();
-    //Shared and global memory indices for current column
-    smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
-    gmemPos = IMUL(tileStart + threadIdx.y , dataW) + columnStart;
-    //Cycle through the tile body, clamped by image borders
-    //Calculate and output the results
-    for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){
-        float sum = 0;
-
-        sum = convolutionColumn<2 * KERNEL_RADIUS>(data + smemPos);
-
-        d_Result[gmemPos] = sum;
-        smemPos += smemStride;
-        gmemPos += gmemStride;
-    }
-}
-
-
-  /**************************************************************************/
-  /**********    Doing low pass filter on an image function     ****************/
-  /**************************************************************************/
-
-
-void ImageSmooth(float *d_image, float *d_image_conv, int3 Dims)
-{
-  
-    
-   //Dims: size of image
-
-	int DATA_W = Dims.x, DATA_H = Dims.y, DATA_F = Dims.z;
-	float *d_temp;
-	int SDATA_SIZE = DATA_W*DATA_H*DATA_F*sizeof(float);
-	
-	cudaMalloc((void**)&d_temp, SDATA_SIZE);
-
-	// row convolution:
-	dim3 blockGridRows(iDivUp(DATA_W, ROW_TILE_W), DATA_H, 1);
-	dim3 threadBlockRows(KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1);
-
-	cudaMemset((void*)d_image_conv, 0, SDATA_SIZE);
-	for (int nF = 0; nF < DATA_F; nF++){
-		convolutionRowGPU_byframe<<<blockGridRows, threadBlockRows>>>(d_image_conv, d_image,DATA_W, DATA_H, nF);
-		cutilCheckMsg("convolutionRowGPU() execution failed\n");
-	}
-    
-	//column convolution
-	dim3 blockGridColumns(iDivUp(DATA_W, COLUMN_TILE_W),iDivUp(DATA_H, COLUMN_TILE_H), 1);
-	dim3 threadBlockColumns(COLUMN_TILE_W, 8);
-	cudaMemset((void*)d_temp, 0, SDATA_SIZE);
-    for (int nF = 0; nF < DATA_F; nF++){
-        convolutionColumnGPU<<<blockGridColumns, threadBlockColumns>>>
-	         (d_temp, d_image_conv, DATA_W, DATA_H, COLUMN_TILE_W * threadBlockColumns.y,DATA_W * threadBlockColumns.y,nF);
-		cutilCheckMsg("convolutionColumnGPU() execution failed\n"); 
-    } 
-    
-    // frame convolution
-	dim3 blockGridFrames(iDivUp(DATA_W *DATA_H , COLUMN_TILE_W),iDivUp(DATA_F, COLUMN_TILE_H),1) ;
-	dim3 threadBlockFrames(COLUMN_TILE_W, 8);
-	cudaMemset((void*)d_image_conv, 0, SDATA_SIZE);
-    convolutionFrameGPU<<<blockGridFrames, threadBlockFrames>>>
-       (d_image_conv, d_temp, DATA_W *DATA_H, DATA_F, COLUMN_TILE_W * threadBlockFrames.y,DATA_W*DATA_H * threadBlockFrames.y);
-    cutilCheckMsg("convolutionFrameGPU() execution failed\n");
-    
-	
-	cudaFree(d_temp);
-
-    
-}
-
-__global__ void myconv2dGPU(float *src, float *dest, float *kernel, int M, int N, int kn)
-// [M,N] = size(src); kernel size = 2*kn +1 
-// symmetric boundary condition
-{
-
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-	if (tid<M*N)
-	{
-		dest[tid] = src[tid];				
-		int x = tid % M, x0;
-		int y = tid / M, y0;
-	
-		float sum = 0.0;
-		
-
-		for(int i = -kn; i<=kn; i++)
-		{
-			x0=x;
-			if(x-i <0)
-				x0 = -1+2*i-x;
-			if(x-i >= M)
-				x0 = 2*M-1-x+2*i;
-			for(int j = -kn; j<=kn; j++)
-			{
-				y0 = y;				
-				if(y-j<0)
-					y0 = -1+2*j-y;
-				if(y-j>=N)
-					y0 = 2*N-1-y+2*j;
-				sum += kernel[(j+kn)*(2*kn+1)+(i+kn)]*src[(y0-j)*M+(x0-i)];	
-			}
-		}
-		dest[tid] = sum;
-
-	}
-
-}


=====================================
src/register/cuda/viscous_finalize.cu deleted
=====================================
@@ -1,128 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration						   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   Finalize the reconstruction on the current scale and for the  *
-c*   entire program, output results, release memory spaces for     *
-c*   global variables, etc.                                        *
-c******************************************************************/
-
-#include <iostream>
-#include <stdio.h>
-#include <cutil.h>
-#include <cutil_inline_runtime.h>
-#include "viscous_global.h"
-
-void fina()
-{
-//	map output image to its original scale
-	nblocks.x = NBLOCKX;
-	nblocks.y =  ((1 + (NX0*NY0*NZ0 - 1)/NTHREAD_PER_BLOCK) - 1) / NBLOCKX + 1; 
-
-	printf("moving image: max = %f, min = %f\n", max_im_move, min_im_move);
-	intensityRescale<<<nblocks, NTHREAD_PER_BLOCK>>>(d_im_move[0], max_im_move, min_im_move, -1);
-
-//	output results
-	outputData(d_im_move[0], DATA_SIZE, outputfilename);
-	outputData(d_mv_x[0], DATA_SIZE, output_mv_x);
-	outputData(d_mv_y[0], DATA_SIZE, output_mv_y);
-	outputData(d_mv_z[0], DATA_SIZE, output_mv_z);
-
-// 	free up the host and device 
-//	image pyramid
-	for(int scale =0; scale <NSCALE; scale++)
-	{
-		cudaFree(d_im_move[scale]);
-		cudaFree(d_im_static[scale]);
-		cudaFree(d_mv_x[scale]);
-		cudaFree(d_mv_y[scale]);
-		cudaFree(d_mv_z[scale]);
-	}
-
-	
-
-//	Gaussian kernel
-	cudaFree(GaussKernelH);
-	cudaFree(GaussKernelHx);
-	
-
-//	histogram related
-	cudaFree(d_jointHistogram);
-	cudaFree(d_jointHistogram_conv);
-	cudaFree(d_probx);
-	cudaFree(d_proby);
-	cudaFree(d_Bsum);
-	
-}
-
-
-void outputData(void *src, int size, const char *outputfilename)
-//      output data to file
-{
-      //  void *tempData_h = malloc( size );
-
-	float *tempData_h = (float*) malloc (sizeof(float)*size);
-  	if (tempData_h == NULL) 
-	{
-		fputs ("Memory error",stderr); 
-		exit (2);
-	}
-
-        cutilSafeCall( cudaMemcpy( tempData_h, src, size, cudaMemcpyDeviceToHost) );
-//      copy data from GPU to CPU
-
-        FILE *fp;
-        fp = fopen(outputfilename,"wb");
-        if( fp == NULL )
-        {
-            std::cout << "Can not open file to write results.";
-                exit(1);
-        }
-        fwrite (tempData_h, size, 1 , fp );
-
-        fclose(fp);
-//      write results to file
-	
-	//printf("denoised data =%f\n", tempData_h[53]);
-        free(tempData_h);
-//      free space
-
-}


=====================================
src/register/cuda/viscous_funcHistogram.cu deleted
=====================================
@@ -1,206 +0,0 @@
-
-/*******************************************************************
-c* Multimodal Deformable Image Registration			   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   histogram related functions                                   *
-c******************************************************************/
-
-#include "viscous_global.h"
-
-__global__ void marginalDist(float *jointHist, float *probx, float *proby)
-{
- 	__shared__ float xData[256];
-	__shared__ float yData[256];
-
-// each thread loads one element from global to shared memory
-       	unsigned int tid = threadIdx.x;
-       	unsigned int j =  blockIdx.x*blockDim.x + threadIdx.x;
-	unsigned int i = threadIdx.x*blockDim.x + blockIdx.x;
-
-       	xData[tid] = fabs(jointHist[i]);
-	yData[tid] = fabs(jointHist[j]);
-       	__syncthreads();
-
-// do reduction in shared memory
-       	if(tid < 128)
-       	{
-          	xData[tid] += xData[tid+128];
-		yData[tid] += yData[tid+128];
-	        __syncthreads();
-       	}
-       	if(tid < 64)
-       	{
-        	xData[tid] += xData[tid+64];
-		yData[tid] += yData[tid+64];
-                __syncthreads();
-       	}
-       	if(tid < 32)
-       	{
-               	xData[tid] += xData[tid+32];
-               	xData[tid] += xData[tid+16];
-               	xData[tid] += xData[tid+8];
-               	xData[tid] += xData[tid+4];
-               	xData[tid] += xData[tid+2];
-                xData[tid] += xData[tid+1];
-
-		yData[tid] += yData[tid+32];
-               	yData[tid] += yData[tid+16];
-               	yData[tid] += yData[tid+8];
-               	yData[tid] += yData[tid+4];
-               	yData[tid] += yData[tid+2];
-                yData[tid] += yData[tid+1];
-  	}
-
-// write result for this block to global memory
-       	if (tid == 0) 
-	{
-		probx[blockIdx.x] = xData[0];
-		proby[blockIdx.x] = yData[0];
-
-	}
-}
-
-
-
-__global__ void mutualInfoGPU(float *jointHist, float *probx, float *proby, float *likelihood)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-	if (tid<nBin*nBin)
-	{
-		int i = tid % nBin;
-		int j = tid / nBin;
-
-		likelihood[tid] = jointHist[tid];
-		if (likelihood[tid] >0)
-		{
-			if(probx[i]>0 && proby[j]>0)	
-				likelihood[tid] = log2f(likelihood[tid]/probx[i]/proby[j]);
-			else
-				likelihood[tid] = log2f(likelihood[tid]);	
-
-			jointHist[tid] = jointHist[tid]*likelihood[tid];
-		}
-		
-	}
-
-}
-
-__global__ void marginalBnorm_sum(float *jointHist, float *probx, float *proby, float *Bsum)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-	if (tid<nBin*nBin)
-	{
-		int i = tid % nBin;
-		int j = tid / nBin;
-
-		Bsum[tid] = sqrtf( proby[j]*fabs(jointHist[tid])/(probx[i]+EPS) );
-
-	}
-
-}
-
-
-__global__ void marginalDistAlongY(float *jointHist, float *dest)
-{
-       __shared__ float data[256];
-
-// each thread loads one element from global to shared memory
-       unsigned int tid = threadIdx.x;
-       unsigned int i = threadIdx.x*blockDim.x + blockIdx.x;
-
-       data[tid] = jointHist[i];
-       __syncthreads();
-
-// do reduction in shared memory
-       if(tid < 128)
-       {
-               data[tid] += data[tid+128];
-               __syncthreads();
-       }
-       if(tid < 64)
-       {
-               data[tid] += data[tid+64];
-               __syncthreads();
-       }
-       if(tid < 32)
-       {
-               data[tid] += data[tid+32];
-               data[tid] += data[tid+16];
-               data[tid] += data[tid+8];
-               data[tid] += data[tid+4];
-               data[tid] += data[tid+2];
-               data[tid] += data[tid+1];
-       }
-
-// write result for this block to global memory
-       if (tid == 0) dest[blockIdx.x] = data[0];
-}
-
-
-
-
-__global__ void BnormGPU(float *jointHist, float *probx, float *proby, float *Bsum, float *likelihood)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-	if (tid<nBin*nBin)
-	{
-		int i = tid % nBin;
-		int j = tid / nBin;
-
-		likelihood[tid] = -sqrtf(probx[i]*proby[j]/(jointHist[tid]+EPS))-Bsum[i];
-		jointHist[tid] = sqrtf(probx[i]*proby[j]*fabs(jointHist[tid]));
-		
-
-	}
-
-}
-
-__global__ void copyHist(unsigned int *hist, float *jointHist)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-	if (tid<nBin*nBin)
-	{
-		jointHist[tid] = (float) hist[tid];
-	}
-
-}


=====================================
src/register/cuda/viscous_funcImageDomain.cu deleted
=====================================
@@ -1,272 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration			   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   Supporting functions in the image domain                      *
-c*	 such as gradient, force computation, flow propagation, etc*
-c******************************************************************/
-
-#include "viscous_global.h"
-
-__global__ void forceComp(float *d_im_out, float *d_im_static, float *d_Likelihood, float *d_v_x, float *d_v_y, float *d_v_z, int NX, int NY, int NZ)
-{
-	
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-if (tid<NX*NY*NZ)
-{
-	float x = d_im_out[tid]*(nBin-1.0);
-	float y = d_im_static[tid]*(nBin-1.0);
-	
-	int xmin = x;
-	int xmax = (xmin == x)? xmin : xmin+1;
-	int ymin = y;
-	int ymax = (ymin == y)? ymin : ymin+1;
-	
-	float dLx = 	  d_Likelihood[ymin*nBin + xmin] * (1.0f-(x-xmin)) * (1.0f-(y-ymin))
-			+ d_Likelihood[ymax*nBin + xmin] * (1.0f-(x-xmin)) * (y-ymin)  
-			+ d_Likelihood[ymin*nBin + xmax] * (x-xmin)       * (1.0f-(y-ymin))
-			+ d_Likelihood[ymax*nBin + xmax] * (x-xmin)	  * (y-ymin);		
-
-	
-			
-// 	compute image gradient, save as x, y and z
-	float z;
-	int zmin = tid/(NX*NY);
-	ymin = (tid%(NX*NY))/NX;
-	xmin = tid%NX;
-
-	if(xmin+1 < NX && xmin-1>=0)
-		x = ImageGradient(d_im_out[zmin*NX*NY + ymin*NX + (xmin-1) ], d_im_out[zmin*NX*NY +ymin*NX + xmin], d_im_out[zmin*NX*NY +ymin*NX + (xmin+1)]);
-	else 	x = 0;
-
-	if(ymin+1 < NY && ymin-1>=0)
-		y = ImageGradient(d_im_out[zmin*NX*NY +(ymin-1)*NX + xmin], d_im_out[zmin*NX*NY +ymin*NX+ xmin], d_im_out[zmin*NX*NY +(ymin+1)*NX+xmin]);
-	else	y = 0;
-	if(zmin+1 <NZ && zmin-1>=0)
-		z = ImageGradient(d_im_out[(zmin-1)*NX*NY + ymin*NX + xmin], d_im_out[zmin*NX*NY + ymin*NX + xmin],d_im_out[(zmin+1)*NX*NY + ymin*NX + xmin]);
-	else 	z = 0;
-
-	d_v_x[tid] = 	-ALPHA*dLx*x;
-	d_v_y[tid] = 	-ALPHA*dLx*y;
-	d_v_z[tid] =	-ALPHA*dLx*z;
-
-		
-}
-}
-
-__global__ void flowComp(float *d_mv_x, float *d_mv_y, float *d_mv_z, float *d_v_x, float *d_v_y, float *d_v_z, float *jacobian, float *flow, int NX, int NY, int NZ)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-if (tid<NX*NY*NZ)
-{
-	float u1x, u1y, u1z;
-	float u2x, u2y, u2z;
-	float u3x, u3y, u3z;
-
-	int z = tid/(NX*NY);
-	int y = (tid%(NX*NY))/NX;
-	int x = tid%NX;
-
-	if(x+1<NX && x-1>=0)
-	{
-		u1x = ImageGradient(d_mv_x[z*NX*NY+y*NX+x-1], d_mv_x[z*NX*NY+y*NX+x], d_mv_x[z*NX*NY+y*NX+x+1]);
-		u2x = ImageGradient(d_mv_y[z*NX*NY+y*NX+x-1], d_mv_y[z*NX*NY+y*NX+x], d_mv_y[z*NX*NY+y*NX+x+1]);
-		u3x = ImageGradient(d_mv_z[z*NX*NY+y*NX+x-1], d_mv_z[z*NX*NY+y*NX+x], d_mv_y[z*NX*NY+y*NX+x+1]);
-	}
-	else 
-	{
-		u1x = 0;
-		u2x = 0;
-		u3x = 0;
-	}
-
-	if(y+1<NY && y-1>=0)
-	{
-		u1y = ImageGradient(d_mv_x[z*NX*NY+(y-1)*NX+x], d_mv_x[z*NX*NY+y*NX+x], d_mv_x[z*NX*NY+(y+1)*NX+x]);
-		u2y = ImageGradient(d_mv_y[z*NX*NY+(y-1)*NX+x], d_mv_y[z*NX*NY+y*NX+x], d_mv_y[z*NX*NY+(y+1)*NX+x]);
-		u3y = ImageGradient(d_mv_z[z*NX*NY+(y-1)*NX+x], d_mv_z[z*NX*NY+y*NX+x], d_mv_z[z*NX*NY+(y+1)*NX+x]);
-	}
-	else
-	{
-		u1y = 0; 
-		u2y = 0;
-		u3y = 0;
-	}
-	
-	if(z+1<NZ && z-1>=0)
-	{
-		u1z = ImageGradient(d_mv_x[(z-1)*NX*NY+y*NX+x], d_mv_x[z*NX*NY+y*NX+x], d_mv_x[(z+1)*NX*NY+y*NX+x]);
-		u2z = ImageGradient(d_mv_y[(z-1)*NX*NY+y*NX+x], d_mv_y[z*NX*NY+y*NX+x], d_mv_y[(z+1)*NX*NY+y*NX+x]);
-		u3z = ImageGradient(d_mv_z[(z-1)*NX*NY+y*NX+x], d_mv_z[z*NX*NY+y*NX+x], d_mv_z[(z+1)*NX*NY+y*NX+x]);
-	}
-	else
-	{
-		u1z = 0; 
-		u2z = 0;
-		u3z = 0;
-	}
-	
-	float R1 = d_v_x[tid] - d_v_x[tid]*u1x - d_v_y[tid]*u1y - d_v_z[tid]*u1z;
-	float R2 = d_v_y[tid] - d_v_x[tid]*u2x - d_v_y[tid]*u2y - d_v_z[tid]*u2z;
-	float R3 = d_v_z[tid] - d_v_x[tid]*u3x - d_v_y[tid]*u3y - d_v_z[tid]*u3z;
-
-	float jaco = (1.0f-u1x)*(1.0f-u2y)*(1.0f-u3z)-u3x*u1y*u2z - u2x*u3y*u1z 
-			-(1.0f-u1x)*u3y*u2z - u3x*(1.0f-u2y)*u1z - u2x*u1y*(1.0f-u3z);
-	jacobian[tid] = (fabs(jaco)<= threshJaco) ? 1.0f : 0.0f;
-	
-	flow[tid] = sqrtf(R1 * R1 + R2 * R2 + R3 * R3);
-	
-// 	d_v_x, d_v_y, d_v_z become displacement
-	d_v_x[tid] = R1; 
-	d_v_y[tid] = R2;
-	d_v_z[tid] = R3;
-	
-}	
-}
-
-__global__ void flowUpdate(float *d_mv_x, float *d_mv_y, float *d_mv_z, float *d_disp_x, float *d_disp_y, float *d_disp_z, float dt, int NX, int NY, int NZ)
-{
-	
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-if (tid<NX*NY*NZ)
-{
-	
-	d_mv_x[tid] += d_disp_x[tid]*dt;
-	d_mv_y[tid] += d_disp_y[tid]*dt;
-	d_mv_z[tid] += d_disp_z[tid]*dt;
-
-}
-}
-
-__device__ float ImageGradient(float Im, float I, float Ip)
-{
-	float xp = Ip-I;
-	float xm = I - Im;
-
-	return minmod(xp, xm);
-
-
-}
-
-
-__global__ void ImageWarp(float *mv_x, float *mv_y, float *mv_z, float *dest, int NX, int NY, int NZ)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-//      obtain current id on
-
-        if(tid < NX*NY*NZ)
-	{
-		int z = tid/(NX*NY);
-		int y = (tid%(NX*NY))/NX;
-		int x = tid%NX;
-
-		float v_x = mv_x[tid];
-		float v_y = mv_y[tid];
-		float v_z = mv_z[tid];
-
-		dest[tid] = tex3D(d_im_move_tex, x-v_x+0.5, y-v_y+0.5, z-v_z+0.5);	
-
-	}
-	
-}
-
-__global__ void ImageWarp_mv(float *mv_x, float *mv_y, float *mv_z, int NX, int NY, int NZ)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-//      obtain current id on
-
-        if(tid < NX*NY*NZ)
-	{
-		int z = tid/(NX*NY);
-		int y = (tid%(NX*NY))/NX;
-		int x = tid%NX;
-
-		float v_x = mv_x[tid];
-		float v_y = mv_y[tid];
-		float v_z = mv_z[tid];
-		
-		mv_x[tid] += tex3D(d_mv_x_tex, x-v_x+0.5, y-v_y+0.5,z-v_z+0.5) ;	
-		mv_y[tid] += tex3D(d_mv_y_tex, x-v_x+0.5, y-v_y+0.5,z-v_z+0.5) ;	
-		mv_z[tid] += tex3D(d_mv_z_tex, x-v_x+0.5, y-v_y+0.5,z-v_z+0.5) ;	
-	}
-	
-}
-
-__global__ void ImageWarp_final(float *mv_x, float *mv_y, float *mv_z, float *dest, int NX, int NY, int NZ)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-//      obtain current id on
-
-        if(tid < NX*NY*NZ)
-	{
-		int z = tid/(NX*NY);
-		int y = (tid%(NX*NY))/NX;
-		int x = tid%NX;
-				
-		mv_x[tid] = tex3D(d_mv_x_tex, x, y, z) ;	
-		mv_y[tid] = tex3D(d_mv_y_tex, x, y, z) ;
-		mv_z[tid] = tex3D(d_mv_z_tex, x, y, z) ;	
-		dest[tid] = tex3D(d_im_move_tex, x-mv_x[tid]+0.5, y-mv_y[tid]+0.5, z-mv_z[tid]+0.5);
-	}
-	
-}
-
-
-__host__ __device__ float minmod(float x, float y)
-{
-	if (x*y<=0)
-		return 0;
-
-	if (x>0)
-	{
-		if(x>y)
-			return y;
-		else 	return x;
-	}
-	else
-	{
-		if(x>y)	return x;
-		else 	return y;
-	}
-}


=====================================
src/register/cuda/viscous_initialize.cu deleted
=====================================
@@ -1,267 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration						   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   initialization of the entire program including				   *
-c*   data preprocessing, construction of image pyramid             *
-c*   and Gaussian smoothing kernels                                *
-c******************************************************************/
-
-#include <iostream>
-#include <thrust/device_ptr.h>
-#include <cublas.h>
-#include <cutil.h>
-#include <cutil_inline.h>
-#include "viscous_convolution.h"
-#include "viscous_global.h"
-
-void dataPreprocessing(float *image, float *maxValue, float *minValue)
-{
-	thrust :: device_ptr<float> data_ptr(image);
-	int maxInd = cublasIsamax(NX0*NY0*NZ0, image, 1) -1;
-	int minInd = cublasIsamin(NX0*NY0*NZ0, image, 1) -1;
-	*maxValue = data_ptr[maxInd];
-	*minValue = data_ptr[minInd];
-
-	nblocks.x = NBLOCKX;
-	nblocks.y =  ((1 + (NX0*NY0*NZ0 - 1)/NTHREAD_PER_BLOCK) - 1) / NBLOCKX + 1; 
-
-	intensityRescale<<<nblocks, NTHREAD_PER_BLOCK>>>(image, *maxValue, *minValue, 1);
-
-}
-
-
-__global__ void intensityRescale(float *image, float maxValue, float minValue, int type)
-//	type > 0: forward
-//	type < 0: backward
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-        if(tid < NX0*NY0*NZ0)
-	{	
-		if(type>0)		
-			image[tid] = (image[tid] - minValue)/(maxValue-minValue);
-		else
-			image[tid] = image[tid]*(maxValue-minValue) + minValue;
-	
-	}
-}
-
-__global__ void short2float(short *raw, float *image, int type)
-{
-	const int tid = (blockIdx.y*NBLOCKX + blockIdx.x)*blockDim.x + threadIdx.x;
-
-        if(tid < NX0*NY0*NZ0)
-	{	
-		if (type>0)		
-			image[tid] = (float) raw[tid];
-		else
-			raw[tid] = (short) image[tid];
-	}
-	
-}
-
-void initData()
-{
-
-// 	load static and moving images on host
-	float *h_im_static = (float*)malloc(DATA_SIZE);
-	loadData(h_im_static, DATA_SIZE, inputfilename_static);
-	float *h_im_move = (float *)malloc(DATA_SIZE);
-	loadData(h_im_move, DATA_SIZE, inputfilename_move);	
-
-
-//	create image pyramid on device
-//		finest level 0
-	cutilSafeCall( cudaMalloc((void**) &d_im_move[0], DATA_SIZE ) );
-	cutilSafeCall( cudaMalloc((void**) &d_im_static[0],DATA_SIZE) );
-
-	cutilSafeCall( cudaMemcpy(d_im_static[0], h_im_static, DATA_SIZE, cudaMemcpyHostToDevice) );
-	cutilSafeCall( cudaMemcpy(d_im_move[0], h_im_move, DATA_SIZE, cudaMemcpyHostToDevice) );
-
-	
-	free(h_im_static);
-	free(h_im_move);
-
-//	scale intensity to [0, 1]
-	dataPreprocessing(d_im_static[0],&max_im_move, &min_im_move);
-	dataPreprocessing(d_im_move[0], &max_im_move, &min_im_move);
-
-
-	
-//		vector flow
-	cutilSafeCall( cudaMalloc((void **)&d_mv_x[0], DATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_mv_y[0], DATA_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_mv_z[0], DATA_SIZE) );
-
-	cutilSafeCall( cudaMemset(d_mv_x[0], 0, DATA_SIZE) );
-	cutilSafeCall( cudaMemset(d_mv_y[0], 0, DATA_SIZE) );
-	cutilSafeCall( cudaMemset(d_mv_z[0], 0, DATA_SIZE) );
-
-
-//		coarse levels
-	for(int scale = 1; scale < NSCALE; scale ++)
-	{
-		NX = NX0/pow(2, scale);
-		NY = NY0/pow(2, scale);
-		NZ = (NZ0-1)/pow(2, scale) +1;
-
-		sDATA_SIZE = sizeof(float)*NX*NY*NZ;
-
-		cutilSafeCall( cudaMalloc((void**) &d_im_move[scale],   sDATA_SIZE ) );
-		cutilSafeCall( cudaMalloc((void**) &d_im_static[scale], sDATA_SIZE ) );
-
-		nblocks.x = NBLOCKX;
-        	nblocks.y =  ((1 + (NX*NY*NZ - 1)/NTHREAD_PER_BLOCK) - 1) / NBLOCKX + 1; 
-
-		int s = pow(2, scale);
-	
-		downSample<<<nblocks, NTHREAD_PER_BLOCK>>>(d_im_move[0], d_im_move[scale], NX, NY, NZ, s);
-		downSample<<<nblocks, NTHREAD_PER_BLOCK>>>(d_im_static[0], d_im_static[scale],NX, NY, NZ, s);
-
-		cutilSafeCall( cudaMalloc((void **)&d_mv_x[scale], sDATA_SIZE ) );
-		cutilSafeCall( cudaMalloc((void **)&d_mv_y[scale], sDATA_SIZE ) );
-		cutilSafeCall( cudaMalloc((void **)&d_mv_z[scale], sDATA_SIZE ) );
-
-		cutilSafeCall( cudaMemset(d_mv_x[scale], 0, sDATA_SIZE ) );
-		cutilSafeCall( cudaMemset(d_mv_y[scale], 0, sDATA_SIZE ) );
-		cutilSafeCall( cudaMemset(d_mv_z[scale], 0, sDATA_SIZE ) );
-		
-
-	}
-	
-	
-
-        std::cout << "Load data successfully.\n\n";
-
-
-//	allocate space for histogram related variables
-	cutilSafeCall( cudaMalloc(&d_jointHistogram, HIST_SIZE) );
-	cutilSafeCall( cudaMalloc(&d_jointHistogram_conv, HIST_SIZE) );
-	cutilSafeCall( cudaMalloc((void **)&d_probx, sizeof(float)*nBin) );
-	cutilSafeCall( cudaMalloc((void **)&d_proby, sizeof(float)*nBin) );
-	cutilSafeCall( cudaMalloc((void **)&d_Bsum,sizeof(float)*nBin ) );
-	
-}
-
-
-
-
-
-void initGaussKernel()
-{
-    std::cout <<	"Initializing Gaussian kernel..." << std::endl;
-	float *h_GaussKernelH  = (float *)malloc(sizeof(float)*(6*hValue+1)*(6*hValue+1));
-	float *h_GaussKernelHx = (float *)malloc(sizeof(float)*(6*hValue+1)*(6*hValue+1));
-	cutilSafeCall( cudaMalloc(&GaussKernelH, sizeof(float)*(6*hValue+1)*(6*hValue+1) ) );
-	cutilSafeCall( cudaMalloc(&GaussKernelHx, sizeof(float)*(6*hValue+1)*(6*hValue+1) ) );
-	
-
-	float sum = 0.0;
-	for(int i = -3*hValue; i <= 3*hValue; i++)
-	{
-		for(int j = -3*hValue; j <= 3*hValue; j++)
-		{
-			
-			float temp = expf(-(i*i+j*j)/2.0/hValue/hValue);
-			h_GaussKernelH[(i+3*hValue)+(j+3*hValue)*(6*hValue+1)] = temp;
-			sum += temp;
-
-			h_GaussKernelHx[(i+3*hValue)+(j+3*hValue)*(6*hValue+1)] = -i*temp/hValue/hValue;
-			
-		}
-	}
-	for(int i = -3*hValue; i <= 3*hValue; i++)
-	{
-		for(int j = -3*hValue; j <= 3*hValue; j++)
-		{
-                	h_GaussKernelH[(i+3*hValue)+(j+3*hValue)*(6*hValue+1)] /= sum;
-			h_GaussKernelHx[(i+3*hValue)+(j+3*hValue)*(6*hValue+1)] /= sum;		
-                }
-        }	
-
-	cutilSafeCall( cudaMemcpy(GaussKernelH,  h_GaussKernelH,  sizeof(float)*(6*hValue+1)*(6*hValue+1), cudaMemcpyHostToDevice) );
-	cutilSafeCall( cudaMemcpy(GaussKernelHx, h_GaussKernelHx, sizeof(float)*(6*hValue+1)*(6*hValue+1), cudaMemcpyHostToDevice) );
-	
-	
-
-	
-	
-	free(h_GaussKernelH);
-	free(h_GaussKernelHx);	
-
-	float *h_GaussKernel1D = (float *)malloc(sizeof(float)*KERNEL_W);
-	float sumK = 0.0;
-        for(int i = 0; i < KERNEL_W; i++)
-	{
-            	h_GaussKernel1D[i] = expf(-(i-KERNEL_RADIUS)*(i-KERNEL_RADIUS)/2.0/sValue/sValue);
-		sumK += h_GaussKernel1D[i];
-	}
-	
-	
-	for(int i = 0; i < KERNEL_W; i++)
-		h_GaussKernel1D[i] /= sumK;
-
-	cudaMemcpyToSymbol(d_Kernel, h_GaussKernel1D, KERNEL_W * sizeof(float));
-
-	
-	free(h_GaussKernel1D);
-
-}
-
-void loadData(float *dest, int sizeInByte, const char *filename)
-//	load data
-{
-	FILE *fp = fopen(filename,"rb");
-        if( fp == NULL )
-        {
-                printf("   File \'%s\' could not be opened!\n", filename);
-                exit(1);
-        }
-
-        else{
-                printf("Reading File \'%s\' ... \n", filename);
-
-                size_t temp = fread(dest, 1, sizeInByte, fp);
-                fclose(fp);
-        }
-        
-}


=====================================
src/register/cuda/viscous_main.cu deleted
=====================================
@@ -1,296 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration			   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-/*******************************************************************
-c* Short discription                                               *
-c*   main code of the multi-modal deformable registration          *
-c*    it calls all the other components                            *
-c******************************************************************/
-
-
-
-// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <iostream>
-
-
-#define GCS_REPRESS_EXTERNS
-// includes, gloable variables
-#include "viscous_global.h"
-//#include "convolution.cu"
-#include "viscous_cuda.h"
- 
-// includes, project
-#include <cutil_inline.h>
-#include <cublas.h>
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include <thrust/generate.h>
-#include <thrust/sort.h>
-#include <thrust/binary_search.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include <thrust/functional.h>
-
-#include <cuda.h>   // for float2
-
-using namespace std;
-using namespace thrust;
-
-//	include files
-//#include "initialize.cu"
-//#include "funcHistogram.cu"
-//#include "funcImageDomain.cu"
-//#include "compute.cu"
-//#include "finalize.cu"
-
-
-/* Global variables */
-/***************************************
-*	global variables declaration
-***************************************/
-char inputfilename_move[100]   = "./inputDTI.raw"; 
-//	image move
-char inputfilename_static[100] = "./inputT2.raw";
-// 	image static
-char outputfilename[100] = "./outputDTI.raw"; 
-//      image out
-char output_mv_x[100] = "./output_xFlow.dat";
-char output_mv_y[100] = "./output_yFlow.dat";
-char output_mv_z[100] = "./output_zFlow.dat";
-
-float *h_im_static, *h_im_move;
-//	image pyramid
-float *d_im_static[NSCALE], *d_im_move[NSCALE];
-// 	vector flow
-float *d_mv_x[NSCALE], *d_mv_y[NSCALE], *d_mv_z[NSCALE];
-
-
-//	gaussian kernel
-float *GaussKernelH, *GaussKernelHx;
-
-//	 histogram related
-float *d_jointHistogram;
-float *d_jointHistogram_conv;
-float *d_probx, *d_proby;
-float *d_Bsum;
-
-dim3 nblocks;
-dim3 nblocks_hist;
-
-int NX, NY, NZ, sDATA_SIZE;
-//	dimension at current pyramid level
-float max_im_move, min_im_move;
-//	max and min intensity of the moving image
-
-
-cudaArray *d_im_move_array;
-texture<float, 3, cudaReadModeElementType> d_im_move_tex;
-
-
-cudaArray *d_mv_x_array, *d_mv_y_array, *d_mv_z_array;
-texture<float, 3, cudaReadModeElementType> d_mv_x_tex;
-texture<float, 3, cudaReadModeElementType> d_mv_y_tex;
-texture<float, 3, cudaReadModeElementType> d_mv_z_tex;
-
-int deviceCount;
-cudaDeviceProp dP;
-
-/****************************************************
-	main program
-****************************************************/
-int 
-CUDA_viscous_main (
-    int argc, 
-    char** argv
-)
-{
-    cout << endl << "****************************************" << endl;
-    cout << "Computation parameters..." << endl;
-    cout << "****************************************" << endl ;
-
-    int device = DEVICENUMBER;
-//      device number
-
-    cudaSetDevice(device);
-    cout << "Using device # " << device << endl;
-//      choose which device to use
-
-    cudaGetDeviceCount(&deviceCount);
-    cudaGetDeviceProperties(&dP,device);
-    cout<<"Max threads per block: "<<dP.maxThreadsPerBlock<<endl;
-    cout<<"Max Threads DIM: "<<dP.maxThreadsDim[0]<<" x "<<dP.maxThreadsDim[1]<<" x "<<dP.maxThreadsDim[2]<<endl;
-    cout<<"Max Grid Size: "<<dP.maxGridSize[0]<<" x "<<dP.maxGridSize[1]<<" x "<<dP.maxGridSize[2]<<endl;
-    printf("Device %d: \"%s\" with Compute %d.%d capability\n", 
-        device, dP.name, dP.major, dP.minor);
-//      obtain computing resource
-
-
-    nblocks_hist.x = NBLOCKX;
-    nblocks_hist.y =  ((1 + (nBin*nBin - 1)/NTHREAD_PER_BLOCK) - 1) / NBLOCKX + 1; 
-
-    cout << endl << "****************************************" << endl;
-    cout << "Computing starts..." << endl;
-    cout << "****************************************" << endl << endl;
-
-#if defined (commentout)
-//	mark the start total time timer 
-    unsigned int totalTimer = 0;
-    cutilCheckError( cutCreateTimer( &totalTimer));
-    cutilCheckError( cutStartTimer( totalTimer));
-#endif
-
-/******************************************************
-	initialize
-******************************************************/
-    cout << "\n\n";
-    cout << "Initializing MI 3Dreg program...\n\n";
-	
-//////  CBLAS initialization ////////////////////////////
-
-    cout << "Initializing CUBLAS..." << endl;
-
-    cublasStatus status = cublasInit();
-    if (status != CUBLAS_STATUS_SUCCESS)
-    {
-        fprintf (stderr, "!!!! CUBLAS initialization error\n");
-        getchar();
-        exit(0);
-    }
-//      initialize CUBLAS
-	
-    initData();
-    initGaussKernel();
-	
-/******************************************************
-	start iterations
-******************************************************/
-#if defined (commentout)
-    // mark the start time
-    unsigned int timer = 0;
-    cutilCheckError( cutCreateTimer( &timer));
-    cutilCheckError( cutStartTimer( timer));
-#endif
-
-    cout << "\n\n";
-    cout << "Performing registration...\n\n";
-
-    for(int scale = NSCALE-1; scale >=0; scale--)
-    {
-        NX = NX0/pow(2, scale);
-        NY = NY0/pow(2, scale);
-        NZ = (NZ0-1)/pow(2, scale) + 1;
-	
-        sDATA_SIZE = (NX*NY*NZ)* sizeof(float);
-
-        nblocks.x = NBLOCKX;
-        nblocks.y =  ((1 + (NX*NY*NZ - 1)/NTHREAD_PER_BLOCK) - 1) 
-            / NBLOCKX + 1;
-        printf ("current scale = %d, size of image = %d x %d x %d ... \n", 
-            scale, NX, NY, NZ);
-        if(scale<NSCALE-1)
-        {
-            upSample<<<nblocks, NTHREAD_PER_BLOCK>>>(
-                d_mv_x[scale+1], d_mv_x[scale], NX, NY, NZ);
-            upSample<<<nblocks, NTHREAD_PER_BLOCK>>>(
-                d_mv_y[scale+1], d_mv_y[scale], NX, NY, NZ);
-            upSample<<<nblocks, NTHREAD_PER_BLOCK>>>(
-                d_mv_z[scale+1], d_mv_z[scale], NX, NY, NZ);
-        }
-        compute (
-            d_im_move[scale], d_im_static[scale], 
-            d_mv_x[scale], d_mv_y[scale], d_mv_z[scale], MAX_ITER);
-        printf("\n\n");
-    }
-
-    cudaThreadSynchronize();
-#if defined (commentout)
-    cutilCheckError( cutStopTimer( timer));
-    printf("\n\n****************************************\n");
-    printf( "Computing time: %f (ms)\n", cutGetTimerValue( timer));
-    printf("****************************************\n\n\n");
-    cutilCheckError( cutDeleteTimer( timer));
-#endif
-//      mark the end timer and print
-
-/******************************************************
-	finalize
-******************************************************/
-
-    printf("Finalizing program...\n\n");
-	
-    fina();
-
-/****   shut down CBLAS ********/
-
-    status = cublasShutdown();
-    if (status != CUBLAS_STATUS_SUCCESS)
-    {
-        fprintf (stderr, "!!!! shutdown error (A)\n");
-        getchar();
-        exit(0);
-    }
-//      Shut down CUBLAS
-
-    cudaThreadSynchronize();
-
-//	mark the end total timer
-#if defined (commentout)
-    cutilCheckError( cutStopTimer( totalTimer));
-    printf("\n\n****************************************\n");
-    printf( "Entire program time: %f (ms)\n", cutGetTimerValue( totalTimer));
-    printf("****************************************\n\n\n");
-    cutilCheckError( cutDeleteTimer( totalTimer));
-#endif
-
-    printf("Have a nice day!\n");
-	
-    cudaThreadExit();	
-
-#if defined (commentout)
-    cutilExit(argc, argv);
-#endif
-    return 0;
-}


=====================================
src/register/viscous_convolution.h deleted
=====================================
@@ -1,125 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration						   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-
-
-
-#ifndef _3DImageSmooth_H_
-#define _3DImageSmooth_H_
-
-#define    KERNEL_RADIUS   8
-const int  KERNEL_W = (2 * KERNEL_RADIUS + 1);
-
-
-// Assuming ROW_TILE_W, KERNEL_RADIUS_ALIGNED and dataW 
-// are multiples of coalescing granularity size,
-// all global memory operations are coalesced in convolutionRowGPU()
-#define            ROW_TILE_W              128
-#define            KERNEL_RADIUS_ALIGNED   16
-
-
-// Assuming COLUMN_TILE_W and dataW are multiples
-// of coalescing granularity size, all global memory operations 
-// are coalesced in convolutionColumnGPU()
-#define           COLUMN_TILE_W        16
-#define           COLUMN_TILE_H        48
-
-
-//#define           UNROLL_INNER  // for fast convolution
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Common host and device functions
-////////////////////////////////////////////////////////////////////////////////
-//Round a / b to nearest higher integer value
-inline int iDivUp(int a, int b){
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
-
-//24-bit multiplication is faster on G80,
-//but we must be sure to multiply integers
-//only within [-8M, 8M - 1] range
-#define IMUL(a, b) __mul24(a, b)
-
-
-__device__ __constant__ float             d_Kernel[KERNEL_W];
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Loop unrolling templates, needed for best performance
-////////////////////////////////////////////////////////////////////////////////
-
-template<int i> __device__ float convolutionRow(float *data){
-    return
-        data[KERNEL_RADIUS - i] * d_Kernel[i]
-        + convolutionRow<i - 1>(data);
-}
-
-template<> __device__ float convolutionRow<-1>(float *data){
-    return 0;
-}
-
-template<int i> __device__ float convolutionColumn(float *data){
-    return 
-        data[(KERNEL_RADIUS - i) * COLUMN_TILE_W] * d_Kernel[i]
-        + convolutionColumn<i - 1>(data);
-}
-
-template<> __device__ float convolutionColumn<-1>(float *data){
-    return 0;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// GPU convolution
-////////////////////////////////////////////////////////////////////////////////
-
-//subroutines
-
-void   ImageSmooth(float *d_image, float *d_image_conv, int3 Dims);
-__global__ void myconv2dGPU(float *src, float *dest, float *kernel, int M, int N, int kn);
-
-#endif
-
-
-
-
-
-
-


=====================================
src/register/viscous_global.h deleted
=====================================
@@ -1,211 +0,0 @@
-/*******************************************************************
-c* Multimodal Deformable Image Registration			   *
-c* via Mutual Information or Bhattacharyya Distantce               *
-c* Version: 1.0                                                    *
-c* Language: C, CUDA                                               *
-c*                                                                 *
-c* Developer: Yifei Lou                                            *
-c* Email: yifei.lou at ece.gatech.edu                                 *
-c*                                                                 *
-c* School of Electrical and Computer Engineering                   *   
-c* Georgia Institute of Technology                                 *
-c* Atlanta, GA, 30318                                              *
-c* Website: http://groups.bme.gatech.edu/groups/bil/               *
-c*                                                                 *
-c* Copyright (c) 2011                                              *
-c* All rights reserved.                                            *
-c*                                                                 *
-c* Permission to use, copy, or modify this code and its            *
-c* documentation for scientific purpose is hereby granted          *
-c* without fee, provided that this copyright notice appear in      *
-c* all copies and that both that copyright notice and this         *
-c* permission notice appear in supporting documentation. The use   *
-c* for commercial purposes is prohibited without permission.       *
-c*                                                                 *
-c* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          *
-c* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     *
-c* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
-c* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        *
-c* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            *
-c* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    *
-c* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT *
-c* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF*
-c* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED *
-c* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT     *
-c* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING  *
-c* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF  *
-c* THE POSSIBILITY OF SUCH DAMAGE.                                 *
-c*                                                                 *
-c******************************************************************/
-#ifndef _viscous_global_h_
-#define _viscous_global_h_
-
-/**************************************
-*	CUDA parameters
-**************************************/
-#define NTHREAD_PER_BLOCK 256
-//      number of threads per block along each direction
-
-#define NBLOCKX 1024
-//      the leading dimension of the 2d thread grid
-
-#define DEVICENUMBER 0
-//      device number to be used
-
-
-
-/**************************************
-*	parameters
-**************************************/
-#define NX0 256
-#define NY0 256
-#define NZ0 64
-#define DATA_SIZE ((NX0*NY0*NZ0)*sizeof(float))
-#define NSCALE 2
-//	number of scales in the image pyramid: 0 is the finest, NSCALE-1 is the coarest
-
-
-#define MAX_ITER 20
-//	max # of iterations in the finest grid
-//	# of iterations at each level = 2^scale*MAX_ITER
-
-
-
-//	histogram related parameters
-#define nBin 256
-// 	number of histogram bins
-
-#define hValue 4
-//	int, parameter in the Gaussian for convolution histogram
-#define HIST_SIZE (nBin*nBin*sizeof(float))
-
-
-#define sValue 3
-// 	int, parameter in the Gaussian for updating velocity
-#define sLength (6*sValue+1)
-
-
-//	image domain parameters
-#define ALPHA 500
-//	parameter in the force calculation
-
-#define du 0.6
-//	parameter to dynamically define dt 
-
-#define METHOD 1
-//	1 for Bnorm, 2 for MI 
-
-#define threshJaco 0.5
-//	threshold for Jacobian to regridding
-
-
-#define EPS 0.000001
-
-
-/***************************************
-*	global variables declaration
-***************************************/
-#ifndef GCS_REPRESS_EXTERNS
-extern char inputfilename_move[100];
-//	image move
-extern char inputfilename_static[100];
-// 	image static
-extern char outputfilename[100];
-//      image out
-extern char output_mv_x[100];
-extern char output_mv_y[100];
-extern char output_mv_z[100];
-
-
-extern float *h_im_static, *h_im_move;
-//	image pyramid
-extern float *d_im_static[NSCALE], *d_im_move[NSCALE];
-// 	vector flow
-extern float *d_mv_x[NSCALE], *d_mv_y[NSCALE], *d_mv_z[NSCALE];
-
-
-//	gaussian kernel
-extern float *GaussKernelH, *GaussKernelHx;
-
-//	 histogram related
-extern float *d_jointHistogram;
-extern float *d_jointHistogram_conv;
-extern float *d_probx, *d_proby;
-extern float *d_Bsum;
-
-extern dim3 nblocks;
-extern dim3 nblocks_hist;
-
-extern int NX, NY, NZ, sDATA_SIZE;
-//	dimension at current pyramid level
-extern float max_im_move, min_im_move;
-//	max and min intensity of the moving image
-
-
-extern cudaArray *d_im_move_array;
-extern texture<float, 3, cudaReadModeElementType> d_im_move_tex;
-
-
-extern cudaArray *d_mv_x_array, *d_mv_y_array, *d_mv_z_array;
-extern texture<float, 3, cudaReadModeElementType> d_mv_x_tex;
-extern texture<float, 3, cudaReadModeElementType> d_mv_y_tex;
-extern texture<float, 3, cudaReadModeElementType> d_mv_z_tex;
-
-extern int deviceCount;
-extern cudaDeviceProp dP;
-//      device properties
-
-#endif /* GCS_REPRESS_EXTERNS */
-
-/*************************************
-
-* 	simple math functions
-
-***************************************/
-__host__ __device__ float minmod(float x, float y);
-__device__ float ImageGradient(float Im, float I, float Ip);
-
-
-/****************************************
-*	Data processing
-****************************************/
-void dataPreprocessing(float *image, float *maxValue, float *minValue);
-__global__ void intensityRescale(float *image, float maxValue, float minValue, int type);
-// type >0 forward calculation: rescale image intensity to [0,1]
-// type <0 backward: map intensity to its original scale
-
-void loadData(float *dest, int sizeInByte, const char *filename);
-void outputData(void *src, int size, const char *outputfilename);
-
-/***************************************
-*	function declaration
-***************************************/
-void initData();
-void initGaussKernel();
-void fina();
-void compute(float *d_im_move, float *d_im_static, float *d_mv_x, float *d_mv_y, float *d_mv_z, int maxIter);
-
-
-/****************************************
-*	kernel declaration
-****************************************/
-__global__ void upSample(float *src, float *dest, int NX, int NY, int NZ);
-__global__ void downSample(float *src, float *dest, int NX, int NY, int NZ, int s);
-
-__global__ void ImageWarp(float *mv_x, float *mv_y, float *mv_z, float *dest, int NX, int NY, int NZ);
-__global__ void ImageWarp_mv(float *mv_x, float *mv_y, float *mv_z, int NX, int NY, int NZ);
-__global__ void ImageWarp_final(float *mv_x, float *mv_y, float *mv_z, float *dest, int NX, int NY, int NZ);
-
-__global__ void forceComp(float *d_im_out, float *d_im_static, float *d_Likelihood, float *d_v_x, float *d_v_y, float *d_v_z, int NX, int NY, int NZ);
-__global__ void flowComp(float *d_mv_x, float *d_mv_y, float *d_mv_z, float *d_v_x, float *d_v_y, float *d_v_z, float *jacobian, float *flow, int NX, int NY, int NZ);
-__global__ void flowUpdate(float *d_mv_x, float *d_mv_y, float *d_mv_z, float *d_disp_x, float *d_disp_y, float *d_disp_z, float dt, int NX, int NY, int NZ);
-
-__global__ void marginalDist(float *jointHist, float *probx, float *proby);
-__global__ void mutualInfoGPU(float *jointHist, float *probx, float *proby, float *likelihood);
-__global__ void copyHist(unsigned int *hist, float *jointHist);
-__global__ void marginalBnorm_sum(float *jointHist, float *probx, float *proby, float *Bsum);
-__global__ void marginalDistAlongY(float *jointHist, float *dest);
-__global__ void BnormGPU(float *jointHist, float *probx, float *proby, float *Bsum, float *likelihood);
-__global__ void transToFloat2(const float *input1, const float *input2, float2 *output, const int n);
-
-#endif



View it on GitLab: https://salsa.debian.org/med-team/plastimatch/-/commit/abd0f0da7e06defe744759e180868f27ba7ef786

-- 
View it on GitLab: https://salsa.debian.org/med-team/plastimatch/-/commit/abd0f0da7e06defe744759e180868f27ba7ef786
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20260221/fef45033/attachment-0001.htm>