Changeset 3780 in /cluster/svnroot


Ignore:
Timestamp:
Aug 7, 2012 10:45:10 PM (8 years ago)
Author:
charliep
Message:

fix for ticket 621, CUDA examples

Location:
bccd-ng/branches/charliep/packages/etc/skel/CUDA
Files:
2 deleted
5 edited

Legend:

Unmodified
Added
Removed
  • bccd-ng/branches/charliep/packages/etc/skel/CUDA/Makefile

    r3751 r3780  
    2424
    2525ifeq ($(wildcard /etc/bccd-revision),)      # not the BCCD
    26   TARGETS = device-query device-query-mpi hello-cuda
    27   LDFLAGS =
     26  TARGETS = device-query hello-cuda
    2827  CFLAGS += --machine 64
    2928else                                        # the BCCD
    30   # The combination of MPI and CUDA and icc needs to be reconciled before
    31   # device-query-mpi can be built, separate functions into CUDA and not
    3229  TARGETS = device-query hello-cuda
    33   LDFLAGS = -L/bccd/software/cuda/3.2.16/Linux/BCCD/x86_64/lib64
    34   CFLAGS += --machine 64
    3530endif
    3631
    3732all: $(TARGETS)
    3833
    39 device-query : device-query-local.c device-query-functions.h
    40         $(CC) $(CFLAGS) -o $@ device-query-local.c $(LDFLAGS)
    41 
    42 device-query-mpi : device-query-mpi.c device-query-functions.h
    43         $(CC) $(CFLAGS) --compiler-bindir='mpicc' -o $@ device-query-mpi.c $(LDFLAGS)
     34device-query : device-query.c device-query-functions.h
     35        $(CC) $(CFLAGS) -o $@ device-query.c
    4436
    4537hello-cuda : hello-cuda.cu
    46         $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
     38        $(CC) $(CFLAGS) -o $@ $^
    4739
    4840clean :
    49         rm -f device-query hello-cuda device-query-mpi
     41        rm -f device-query hello-cuda
  • bccd-ng/branches/charliep/packages/etc/skel/CUDA/Readme

    r3751 r3780  
    2222This directory contains a very simple example of CUDA code, and a program
    2323that will query your system to discover if there are CUDA supported card(s)
    24 present and if so what their capabilities are.
     24present and if so what their capabilities are. 
     25
     26If you are running on a cluster you can use device-query with mpirun
     27to see the CUDA capabilities of each node, e.g.:
     28
     29$ mpirun -np 6 -bynode -machinefile ~/machines ./device-query
     30
     31Note the use of -bynode scheduling, which ensures that each node runs
     32exactly one instance of device-query rather than multiple instances when,
     33for example, the nodes have more than one core.
  • bccd-ng/branches/charliep/packages/etc/skel/CUDA/device-query-functions.h

    r3751 r3780  
    3030  * charliep    13-April-2011   First pass, based on deviceQuery from NVIDIA.
    3131  * charliep    01-July-2011    Improved error handling, additional characteristics.
     32  * charliep    03-August-2012  Added CUDA core count based on documentation and compute version
    3233*/
    3334
    3435#include <stdlib.h>
    3536#include <stdio.h>
     37#include <sys/unistd.h>
    3638#include <cuda_runtime_api.h>
    3739
     
    4143void printDeviceProperties(int deviceID);
    4244
    43 
    44 
    4545//function definitions
    4646
     
    4848        cudaError_t status = (cudaError_t)0;
    4949        int driverVersion = 0, runtimeVersion = 0;
    50 
     50 
    5151        if ((status = cudaDriverGetVersion(&driverVersion)) != cudaSuccess) {
    5252                fprintf(stderr, "cudaDriverGetVersion() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
     
    8787        cudaError_t status = (cudaError_t)0;
    8888        struct cudaDeviceProp deviceProperties;
    89        
     89        char hostname[128];
     90
     91        gethostname(hostname, sizeof(hostname));
     92        fprintf(stderr, "hostname: %s\n", hostname);
     93
    9094        if ((status = cudaGetDeviceProperties(&deviceProperties, deviceID)) != cudaSuccess) {
    9195                fprintf(stderr, "cudaGetDeviceProperties() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
     
    97101        printf("\tCUDA capability major.minor version = %d.%d\n", deviceProperties.major, deviceProperties.minor);
    98102
    99         //              xxx find library with Convert... call, or another way to count the blocks for a total core count
    100         //     #if CUDART_VERSION >= 2000
    101         //         printf("\tMultiprocessors x Cores/MP = Cores: %d (MP) x %d (Cores/MP) = %d (Cores)\n",
    102         //                deviceProperties.multiProcessorCount,
    103         //                ConvertSMVer2Cores(deviceProperties.major, deviceProperties.minor),
    104         //                ConvertSMVer2Cores(deviceProperties.major, deviceProperties.minor) * deviceProperties.multiProcessorCount);
    105         //     #endif
     103        // Compute Capability <= 1.3 --> 8 CUDA Cores per SM
     104        // CC == 2.0 --> 32 CUDA cores per SM
     105        // CC == 2.1 --> 48 CUDA cores per SM
     106
     107        printf("\tmultiProcessorCount = %d\n", deviceProperties.multiProcessorCount);
    106108       
    107         printf("\tmultiProcessorCount = %d\n", deviceProperties.multiProcessorCount);
     109        if ((deviceProperties.major <= 1) && (deviceProperties.minor <= 3))
     110                printf("\tMultiprocessors x Cores/MP = Total Cores : \n\t\t%d (MP) x %d (Cores/MP) = %d (Total Cores)\n", deviceProperties.multiProcessorCount, 8, 8 * deviceProperties.multiProcessorCount);
     111       
     112        else if ((deviceProperties.major == 2) && (deviceProperties.minor == 0))
     113                printf("\tMultiprocessors x Cores/MP = Total Cores : %d (MP) x %d (Cores/MP) = %d (Cores)\n", deviceProperties.multiProcessorCount, 32, 32 * deviceProperties.multiProcessorCount);
     114
     115        else if ((deviceProperties.major == 2) && (deviceProperties.minor == 1))
     116                printf("\tMultiprocessors x Cores/MP = Total Cores : %d (MP) x %d (Cores/MP) = %d (Cores)\n", deviceProperties.multiProcessorCount, 48, 48 * deviceProperties.multiProcessorCount);     
     117               
     118        else
     119                printf("\tUnknown CUDA capability\n");
    108120
    109121        printf("\ttotalGlobalMem = %ld bytes\n", (long)deviceProperties.totalGlobalMem);
  • bccd-ng/branches/charliep/packages/etc/skel/CUDA/device-query.c

    r3751 r3780  
    11/*
    2   * $Id: device-query.c,v 1.5 2012/07/07 15:45:59 charliep Exp $
     2  * $Id: device-query-local.c,v 1.3 2012/05/01 13:53:22 charliep Exp $
    33  *
    44  * This file is part of BCCD, an open-source live CD for computational science
     
    3131  * charliep    01-July-2011    Improved error handling, additional characteristics.
    3232*/
    33 
    34 #include <stdlib.h>
    35 #include <stdio.h>
    36 #include <cuda_runtime_api.h>
    37 
    38 //prototypes
    39 void printCudaVersion();
    40 int printDeviceCount();
    41 void printDeviceProperties(int deviceID);
    42 
     33#include "device-query-functions.h"
    4334
    4435int main(int argc, const char** argv) {
    4536        int deviceCount = 0, device;
    4637        cudaError_t status = (cudaError_t)0;
     38        int driverVersion = 0;
     39
     40        printCudaVersion(&driverVersion, NULL);
     41       
     42        if (driverVersion == 0) {
     43                printf("No CUDA drivers detected--assuming no local CUDA cards.\n");
     44                return 0;
     45        }
    4746
    4847        deviceCount = printDeviceCount();
    49 
    50         printCudaVersion();
    5148
    5249        for (device = 0; device < deviceCount; ++device) {
     
    5754}
    5855
    59 
    60 void printCudaVersion() {
    61         cudaError_t status = (cudaError_t)0;
    62         int driverVersion = 0, runtimeVersion = 0;     
    63 
    64         if ((status = cudaDriverGetVersion(&driverVersion)) != cudaSuccess) {
    65                 fprintf(stderr, "cudaDriverGetVersion() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
    66                 exit(1);
    67         } else {
    68                 printf("CUDA driver version: %d.%d\n", driverVersion / 1000, driverVersion % 100);
    69         }
    70 
    71         if ((status = cudaRuntimeGetVersion(&runtimeVersion)) != cudaSuccess) {
    72                 fprintf(stderr, "cudaRuntimeGetVersion() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
    73                 exit(1);
    74         } else {
    75                 printf("CUDA runtime version: %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100);
    76         }
    77 }
    78 
    79 int printDeviceCount() {
    80         cudaError_t status = (cudaError_t)0;
    81         int deviceCount;
    82        
    83         if ((status = cudaGetDeviceCount(&deviceCount)) != cudaSuccess) {
    84                 fprintf(stderr, "cudaGetDeviceCount() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
    85                 exit(1);
    86         }
    87        
    88     if (deviceCount == 0) {
    89                 printf("There are no hardware devices which support CUDA\n");
    90         } else {
    91                 printf("There %s %d CUDA capable hardware device%s\n", deviceCount == 1 ? "is" : "are",
    92                   deviceCount, deviceCount > 1 ? "s" : "");
    93         }
    94         return deviceCount;
    95 }
    96 
    97 void printDeviceProperties(int deviceID) {
    98         struct cudaDeviceProp deviceProperties;
    99        
    100         if ((status = cudaGetDeviceProperties(&deviceProperties, deviceID)) != cudaSuccess) {
    101                 fprintf(stderr, "cudaGetDeviceProperties() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status));
    102                 exit(1);
    103         }
    104 
    105         printf("Device %d:\n", deviceID);
    106         printf("\tname = %s\n", deviceProperties.name);
    107         printf("\tCUDA capability major.minor version = %d.%d\n", deviceProperties.major, deviceProperties.minor);
    108 
    109         //              xxx find library with Convert... call, or another way to count the blocks for a total core count
    110         //     #if CUDART_VERSION >= 2000
    111         //         printf("\tMultiprocessors x Cores/MP = Cores: %d (MP) x %d (Cores/MP) = %d (Cores)\n",
    112         //                deviceProperties.multiProcessorCount,
    113         //                ConvertSMVer2Cores(deviceProperties.major, deviceProperties.minor),
    114         //                ConvertSMVer2Cores(deviceProperties.major, deviceProperties.minor) * deviceProperties.multiProcessorCount);
    115         //     #endif
    116 // Compute Capability <= 1.3 --> 8 CUDA Cores / SM
    117 // CC == 2.0 --> 32 CUDA cores / SM
    118 // CC == 2.1 --> 48 CUDA cores / SM
    119 
    120 Where SM = single multiprocessor.       
    121        
    122         printf("\tmultiProcessorCount = %d\n", deviceProperties.multiProcessorCount);
    123 
    124         printf("\ttotalGlobalMem = %ld bytes\n", (long)deviceProperties.totalGlobalMem);
    125         printf("\tsharedMemPerBlock = %d bytes\n", (int)deviceProperties.sharedMemPerBlock);
    126         printf("\tregsPerBlock = %d\n", deviceProperties.regsPerBlock);
    127         printf("\twarpSize = %d\n", deviceProperties.warpSize);
    128         printf("\tmemPitch = %d bytes\n", (int)deviceProperties.memPitch);
    129         printf("\tmaxThreadsPerBlock = %d\n", deviceProperties.maxThreadsPerBlock);
    130         printf("\tmaxThreadsDim = %d x %d x %d\n", deviceProperties.maxThreadsDim[0],
    131           deviceProperties.maxThreadsDim[1], deviceProperties.maxThreadsDim[2]);
    132         printf("\tmaxGridSize = %d x %d x %d\n", deviceProperties.maxGridSize[0],
    133           deviceProperties.maxGridSize[1], deviceProperties.maxGridSize[2]);
    134         printf("\n");   
    135         printf("\tmemPitch = %ld bytes\n", (long)deviceProperties.memPitch);
    136         printf("\ttextureAlignment = %ld bytes\n", (long)deviceProperties.textureAlignment);
    137         printf("\tclockRate = %.2f GHz\n", deviceProperties.clockRate * 1e-6f);
    138 
    139 #if CUDART_VERSION >= 2000
    140         printf("\tdeviceOverlap = %s\n", deviceProperties.deviceOverlap ? "Yes" : "No");
    141 #endif
    142 
    143 #if CUDART_VERSION >= 2020
    144         printf("\tkernelExecTimeoutEnabled = %s\n", deviceProperties.kernelExecTimeoutEnabled ? "Yes" : "No");
    145         printf("\tintegrated = %s\n", deviceProperties.integrated ? "Yes" : "No");
    146         printf("\tcanMapHostMemory = %s\n", deviceProperties.canMapHostMemory ? "Yes" : "No");
    147         printf("\tcomputeMode = %s\n", deviceProperties.computeMode == cudaComputeModeDefault ?
    148           "Default (multiple host threads can use this device simultaneously)" :
    149           deviceProperties.computeMode == cudaComputeModeExclusive ?
    150           "Exclusive (only one host thread at a time can use this device)" :
    151           deviceProperties.computeMode == cudaComputeModeProhibited ?
    152           "Prohibited (no host thread can use this device)" :
    153           "Unknown");
    154 #endif
    155 
    156 #if CUDART_VERSION >= 3000
    157         printf("\tconcurrentKernels = %s\n", deviceProperties.concurrentKernels ? "Yes" : "No");
    158 #endif
    159 
    160 #if CUDART_VERSION >= 3010
    161         printf("\tECCEnabled = %s\n", deviceProperties.ECCEnabled ? "Yes" : "No");
    162 #endif
    163 
    164 #if CUDART_VERSION >= 3020
    165         printf("\ttccDriver = %s\n", deviceProperties.tccDriver ? "Yes" : "No");
    166 #endif 
    167 
    168         printf("\n");
    169 }
  • bccd-ng/branches/charliep/packages/etc/skel/CUDA/hello-cuda.cu

    r3751 r3780  
    11/*
    2   * $Id: hello-cuda.cu,v 1.3 2012/05/01 13:53:22 charliep Exp $
    3   *
    4   * This file is part of BCCD, an open-source live CD for computational science
    5   * education.
    6   *
    7   * Copyright (C) 2010 Andrew Fitz Gibbon, Paul Gray, Kevin Hunter, Dave
    8   *   Joiner, Sam Leeman-Munk, Tom Murphy, Charlie Peck, Skylar Thompson,
    9   *   & Aaron Weeden
    10   *
    11   * This program is free software: you can redistribute it and/or modify
    12   * it under the terms of the GNU General Public License as published by
    13   * the Free Software Foundation, either version 3 of the License, or
    14   * (at your option) any later version.
    15   *
    16   * This program is distributed in the hope that it will be useful,
    17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    19   * GNU General Public License for more details.
    20   *
    21   * You should have received a copy of the GNU General Public License
    22   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
     2 * CUDA Hello World Program
     3 *
     4 * Usage: hello-cuda <number thread blocks> <number threads per block>
     5 *
     6 * charliep     09-April-2011   First pass, based on the example by Alan Kaminsky
     7 * charliep     01-July-2011    Improved error handling.
    238*/
    24 
    25 /*
    26   * CUDA Hello World Program
    27   *
    28   * Usage: hello-cuda <number thread blocks> <number threads per block>
    29   *
    30   * charliep    09-April-2011   First pass, based on the example by Alan Kaminsky
    31   * charliep    01-July-2011    Improved error handling.
    32 */
    33 
    349#include <stdlib.h>
    3510#include <stdio.h>
     
    4116  * variables blockDim, blockIdx and threadIdx.
    4217*/
    43 __global__ void hello(int* barray, int* tarray) {
     18__global__ void hello(int* barray, int* tarray, int* gtarray) {
    4419        int i;
    4520       
     
    4722        barray[i] = blockIdx.x;
    4823        tarray[i] = threadIdx.x;
     24        gtarray[i] = blockDim.x * blockIdx.x + threadIdx.x;
    4925}
    5026
     
    5632int main(int argc, char** argv) {
    5733        int numThreadBlocks, numThreadsPerBlock, totalNumThreads, size, i;
    58         int *cpuBlockArray, *cpuThreadArray, *gpuBlockArray, *gpuThreadArray;
     34        int *cpuBlockArray, *cpuThreadArray, *cpuGThreadArray;
     35        int *gpuBlockArray, *gpuThreadArray, *gpuGThreadArray;
    5936        cudaError_t status = (cudaError_t)0;
    6037
     
    7249       
    7350        if (!(cpuBlockArray = (int*) malloc(size))) {
    74                 fprintf(stderr, "malloc() FAILED (block)\n");
     51                fprintf(stderr, "malloc() FAILED (Block)\n");
    7552                exit(0);
    7653        }
    7754       
    7855        if (!(cpuThreadArray = (int*) malloc(size))) {
    79                 fprintf(stderr, "malloc() FAILED (thread)\n");
     56                fprintf(stderr, "malloc() FAILED (Thread)\n");
     57                exit(0);
     58        }
     59       
     60        if (!(cpuGThreadArray = (int*) malloc(size))) {
     61                fprintf(stderr, "malloc() FAILED (GThread)\n");
    8062                exit(0);
    8163        }
     
    8365        /* Allocate GPGPU memory. */
    8466        if ((status = cudaMalloc ((void**) &gpuBlockArray, size)) != cudaSuccess) {
    85                 printf("cudaMalloc() FAILED (block), status = %d (%s)\n", status, cudaGetErrorString(status));
     67                printf("cudaMalloc() FAILED (Block), status = %d (%s)\n", status,     
     68                  cudaGetErrorString(status));
    8669                exit(1);
    8770        }
    8871
    8972        if ((status = cudaMalloc ((void**) &gpuThreadArray, size)) != cudaSuccess) {
    90                 printf("cudaMalloc() FAILED (thread), status = %d (%s)\n", status, cudaGetErrorString(status));
     73                printf("cudaMalloc() FAILED (Thread), status = %d (%s)\n", status, cudaGetErrorString(status));
     74                exit(1);
     75        }
     76       
     77        if ((status = cudaMalloc ((void**) &gpuGThreadArray, size)) != cudaSuccess) {
     78                printf("cudaMalloc() FAILED (GThread), status = %d (%s)\n", status, cudaGetErrorString(status));
    9179                exit(1);
    9280        }
     
    9482        /* Call the kernel function to run on the GPGPU chip. */
    9583        hello <<<numThreadBlocks, numThreadsPerBlock>>>
    96           (gpuBlockArray, gpuThreadArray);
     84          (gpuBlockArray, gpuThreadArray, gpuGThreadArray);
    9785       
    9886        /* Copy the result arrays from the GPU's memory to the CPU's memory. */
    9987        cudaMemcpy(cpuBlockArray, gpuBlockArray, size, cudaMemcpyDeviceToHost);
    10088        cudaMemcpy(cpuThreadArray, gpuThreadArray, size, cudaMemcpyDeviceToHost);
     89        cudaMemcpy(cpuGThreadArray, gpuGThreadArray, size, cudaMemcpyDeviceToHost);
    10190       
    10291        /* Display the results. */
     92        printf("block\tthread\tglobal thread\n");
     93       
    10394        for (i = 0; i < totalNumThreads; ++i) {
    104                 printf("%d\t%d\n", cpuBlockArray[i], cpuThreadArray[i]);
     95                printf("%d\t%d\t%d\n", cpuBlockArray[i], cpuThreadArray[i], cpuGThreadArray[i]);
    10596        }
    10697       
     
    110101        free(cpuBlockArray);
    111102        free(cpuThreadArray);
     103        free(cpuGThreadArray);
    112104        cudaFree(gpuBlockArray);
    113105        cudaFree(gpuThreadArray);
     106        cudaFree(gpuGThreadArray);
    114107       
    115108        exit(0);
Note: See TracChangeset for help on using the changeset viewer.