[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Octave-bug-tracker] [bug #43826] Deadlock with CUBLAS and dlopen inside
From: |
anonymous |
Subject: |
[Octave-bug-tracker] [bug #43826] Deadlock with CUBLAS and dlopen inside an accelerated BLAS replacement. |
Date: |
Mon, 15 Dec 2014 16:09:38 +0000 |
User-agent: |
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0 |
URL:
<http://savannah.gnu.org/bugs/?43826>
Summary: Deadlock with CUBLAS and dlopen inside an accelerated
BLAS replacement.
Project: GNU Octave
Submitted by: None
Submitted on: Mo 15 Dez 2014 16:09:38 UTC
Category: Interpreter
Severity: 3 - Normal
Priority: 5 - Normal
Item Group: Crash
Status: None
Assigned to: None
Originator Name:
Originator Email:
Open/Closed: Open
Discussion Lock: Any
Release: 3.8.1
Operating System: GNU/Linux
_______________________________________________________
Details:
I created a small wrapper experiment around CUBLAS which I want to use with
Octave. Therefore I picked up the NETLIB BLAS implementation and replaced the
dgemm.f file ( Which implements the double precision matrix-matrix multiply)
by the following C code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <dlfcn.h>
#ifndef PREFIX
#define PREFIX "/usr/lib/blastest"
#endif
static void *dl_handle;
static void *gemm_handle;
static int initialized = 0;
typedef void (*gemm_call)(char * TA, char *TB, int *m , int *n , int *k,
double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double
*c, int *ldc);
/* Shared-object constructor: runs automatically when this library is loaded.
 * Opens the real GPU BLAS backend via dlopen and resolves its gpu_dgemm
 * entry point into gemm_handle.  Aborts on failure, because a later BLAS
 * call has no error channel back to its (Fortran) caller.
 */
__attribute__((constructor)) void __dgemm_init () {
    dlerror();  /* clear any stale dlerror() state before dlopen */
    /* RTLD_DEEPBIND keeps the backend's own symbol lookups inside itself,
     * so the host program's BLAS symbols do not shadow the GPU ones. */
    dl_handle = dlopen (PREFIX"/libblastest_dgemm.so",
                        RTLD_NOW|RTLD_GLOBAL|RTLD_DEEPBIND);
    if (dl_handle == NULL) {
        /* Diagnostics go to stderr (the original wrote to stdout). */
        fprintf(stderr, "DLOPEN: %s\n", dlerror());
        abort();
    }
    gemm_handle = dlsym(dl_handle, "gpu_dgemm");
    if (gemm_handle == NULL) {
        fprintf(stderr, "DLSYM: %s\n", dlerror());
        abort();
    }
    /* Mark success only after both handles are valid (the original set the
     * flag before doing any work). */
    initialized = 1;
    return;
}
/* Shared-object destructor: runs at unload.  Releases the dlopen handle
 * acquired in __dgemm_init, but only if that constructor completed. */
__attribute__((destructor)) void __dgemm_exit(){
    if (initialized != 0)
        dlclose(dl_handle);
}
void dgemm_(char * TA, char *TB, int *m , int *n , int *k, double *alpha,
double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc){
gemm_call call;
if ( initialized == 0 || gemm_handle == NULL) {
printf("Not initialized. \n");
abort();
}
call = (gemm_call) gemm_handle;
call(TA, TB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
Then I wrote the real wrapper as a second shared object, in order to be able to
replace it easily without relinking or LD_LIBRARY_PATH tricks. It looks like this:
#ifndef Int
#define Int int
#endif
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef GPU_ALIGN
#define GPU_ALIGN 32
#endif
int gemm_initialized = 0;
cublasHandle_t gemm_handle;
/**
 * Convert a BLAS transpose flag ("N"/"T"/"C", case-insensitive) to the
 * corresponding cuBLAS operation enum, e.g. "N" -> CUBLAS_OP_N.
 * An unknown flag is reported on stderr and treated as 'N' (no transpose).
 */
cublasOperation_t acc_blas_to_cublas_operation(char *trans) {
    /* Cast to unsigned char: tolower() is undefined for negative char
     * values other than EOF.  (Also requires <ctype.h>, which the file
     * did not include.) */
    switch (tolower((unsigned char)*trans)) {
    case 'n':
        return CUBLAS_OP_N;
    case 't':
        return CUBLAS_OP_T;
    case 'c':
        return CUBLAS_OP_C;
    default:
        fprintf(stderr, "Unknown operation '%c'. Using 'N' instead.\n",
                *trans);
        return CUBLAS_OP_N;
    }
}
/* atexit() hook registered on first use in gpu_dgemm: releases the global
 * cuBLAS handle at process exit. */
void dgemm_exit() {
cublasDestroy(gemm_handle);
}
/** \brief Matrix-matrix multiply on the GPU: C = alpha*op(A)*op(B) + beta*C.
 *
 * Drop-in backend for the Fortran BLAS dgemm convention: every argument is
 * a pointer and all matrices are column-major.  Device buffers are
 * allocated per call, the operands are uploaded, CUBLASGEMM (expected to
 * expand to cublasDgemm) is invoked, and the result is copied back into C.
 *
 * The cuBLAS handle is created lazily on the first call and released via
 * an atexit() hook.  Any CUDA/cuBLAS failure is fatal: a message goes to
 * stderr and the process aborts, because a BLAS call has no error channel
 * back to its caller.
 *
 * NOTE(review): the lazy initialisation is not thread-safe; concurrent
 * first calls could race on gemm_initialized -- confirm single-threaded
 * use by the host application.
 */
void gpu_dgemm(char *transA, char *transB, int * M , int *N , int * K ,
               double *alpha, double *A, int * LDA, double *B, int *LDB,
               double *beta, double *C , int *LDC){
    const char *errmsg = NULL;
    /* Device buffers start NULL so cudaFree() in the error path is a no-op. */
    double *dA = NULL;
    double *dB = NULL;
    double *dC = NULL;
    int _M = *M;
    int _N = *N;
    int _K = *K;
    int _LDA = *LDA;
    int _LDB = *LDB;
    int _LDC = *LDC;
    /* Convert transposition indicators from Fortran to cuBLAS style. */
    cublasOperation_t cublTransA = acc_blas_to_cublas_operation(transA);
    cublasOperation_t cublTransB = acc_blas_to_cublas_operation(transB);
    /* Host storage extents depend on the transpose flags: op(A) is M x K,
     * so A as stored is M x K for 'N' but K x M for 'T'/'C'; likewise for
     * B.  (The previous code always copied M x K and K x N, which reads
     * the wrong elements for transposed operands.)  C is always M x N. */
    int rowsA = (cublTransA == CUBLAS_OP_N) ? _M : _K;
    int colsA = (cublTransA == CUBLAS_OP_N) ? _K : _M;
    int rowsB = (cublTransB == CUBLAS_OP_N) ? _K : _N;
    int colsB = (cublTransB == CUBLAS_OP_N) ? _N : _K;
    /* Device leading dimensions, rounded up to a multiple of GPU_ALIGN. */
    int ldda = ((rowsA + (GPU_ALIGN - 1)) / GPU_ALIGN) * GPU_ALIGN;
    int lddb = ((rowsB + (GPU_ALIGN - 1)) / GPU_ALIGN) * GPU_ALIGN;
    int lddc = ((_M    + (GPU_ALIGN - 1)) / GPU_ALIGN) * GPU_ALIGN;
    cublasStatus_t cublasError = CUBLAS_STATUS_SUCCESS;
    cudaError_t cudaError = cudaSuccess;

    /* Lazy one-time setup of the device and the cuBLAS handle. */
    if ( gemm_initialized == 0 ) {
        if (cudaSetDevice(0) != cudaSuccess) {  /* was unchecked */
            fprintf(stderr, "ACC_BLAS: Failed to select CUDA device 0.\n");
            abort();
        }
        if ( cublasCreate(&gemm_handle) != CUBLAS_STATUS_SUCCESS ) {
            fprintf(stderr, "ACC_BLAS: Failed to create CUBLAS Handle.\n");
            abort();
        }
        atexit(dgemm_exit);  /* release the handle at process exit */
        gemm_initialized = 1;
    }

    /* Allocate device memory for op(A), op(B) and C. */
    cudaError = cudaMalloc((void **)&dA, (size_t)ldda * colsA * sizeof(double));
    if (cudaError != cudaSuccess) {
        errmsg = "GPU memory allocation failed";
        goto fail;
    }
    cudaError = cudaMalloc((void **)&dB, (size_t)lddb * colsB * sizeof(double));
    if (cudaError != cudaSuccess) {
        errmsg = "GPU memory allocation failed";
        goto fail;
    }
    cudaError = cudaMalloc((void **)&dC, (size_t)lddc * _N * sizeof(double));
    if (cudaError != cudaSuccess) {
        errmsg = "GPU memory allocation failed";  /* branch was silent before */
        goto fail;
    }

    /* Upload operands: host leading dimension -> padded device one. */
    cublasError = cublasSetMatrix(rowsA, colsA, sizeof(double), A, _LDA, dA, ldda);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        errmsg = "Copying to GPU memory failed";
        goto fail;
    }
    cublasError = cublasSetMatrix(rowsB, colsB, sizeof(double), B, _LDB, dB, lddb);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        errmsg = "Copying to GPU memory failed";
        goto fail;
    }
    /* C must be uploaded too, since GEMM accumulates beta*C. */
    cublasError = cublasSetMatrix(_M, _N, sizeof(double), C, _LDC, dC, lddc);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        errmsg = "Copying to GPU memory failed";
        goto fail;
    }

    /* Matrix multiplication on the device. */
    cublasError = CUBLASGEMM(gemm_handle, cublTransA, cublTransB, _M, _N, _K,
                             alpha, dA, ldda, dB, lddb, beta, dC, lddc);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        errmsg = "Matrix Multiplication failed";
        goto fail;
    }

    /* Copy the resulting matrix from device to host. */
    cublasError = cublasGetMatrix(_M, _N, sizeof(double), dC, lddc, C, _LDC);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        errmsg = "Copying to CPU memory failed";
        goto fail;
    }

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return;

fail:
    fprintf(stderr, "ACC_BLAS: %s.\n", errmsg);
    /* cudaFree(NULL) is a documented no-op, so unconditional frees are safe. */
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    abort();
}
It is compiled using "gcc -shared -O2 -o libblastest_dgemm.so -lcublas
-lcudart". I integrated the whole thing using update-alternatives into my
Ubuntu system. If I then run programs that use the dgemm call, everything works
fine and faster than on the CPU for large data sets. Only one application does
not work and runs into a deadlock. If I run GNU Octave, it hangs after
displaying its copyright information. Interrupting Octave with GDB, I got the
following backtrace:
Program received signal SIGINT, Interrupt.
0x00007ffff31ff4d4 in __GI___pthread_mutex_lock (mutex=0x7ffff7ffd968
<_rtld_global+2312>) at ../nptl/pthread_mutex_lock.c:114
114 ../nptl/pthread_mutex_lock.c: No such file or directory.
(gdb) bt
#0 0x00007ffff31ff4d4 in __GI___pthread_mutex_lock (mutex=0x7ffff7ffd968
<_rtld_global+2312>) at ../nptl/pthread_mutex_lock.c:114
#1 0x00007ffff7ddb150 in tls_get_addr_tail (ti=0x7ffff6dc66f8,
dtv=0x7ffff7fab0d0, the_map=0x7ffff7fca000) at dl-tls.c:722
#2 0x00007ffff6b37aec in __cxa_get_globals () from
/usr/lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007ffff6b37a19 in std::uncaught_exception() () from
/usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4 0x00007ffff6b6ec92 in std::basic_ostream<char, std::char_traits<char>
>& std::__ostream_insert<char, std::char_traits<char>
>(std::basic_ostream<char, std::char_traits<char> >&, char const*, long) ()
from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#5 0x00007ffff770dccb in operator<< <char, std::char_traits<char>,
std::allocator<char> > (__str=..., __os=...) at
/usr/include/c++/4.8/bits/basic_string.h:2758
#6 octave_base_reader::octave_gets (this=0xcf8250, address@hidden:
false) at corefcn/input.cc:252
#7 0x00007ffff770e07d in octave_terminal_reader::get_input
(this=<optimized out>, eof=<optimized out>) at corefcn/input.cc:623
#8 0x00007ffff74a6d0b in get_input (address@hidden: false,
this=0xcf7a78) at corefcn/input.h:262
#9 octave_lexer::fill_flex_buffer (this=0xcf77f0, buf=0xcf8620 "",
max_size=8192) at ../../libinterp/parse-tree/lex.ll:3268
#10 0x00007ffff74a6732 in yy_get_next_buffer
(address@hidden) at parse-tree/lex.cc:3315
#11 0x00007ffff74abb6c in octave_lex
(address@hidden, yyscanner=0xcf81b0) at
parse-tree/lex.cc:3156
#12 0x00007ffff74b9ff3 in octave_pull_parse (address@hidden,
parser=...) at parse-tree/oct-parse.cc:3536
#13 0x00007ffff74ba05a in octave_parse (parser=...) at
parse-tree/oct-parse.cc:3506
#14 0x00007ffff74ba065 in octave_parser::run
(address@hidden) at parse-tree/oct-parse.yy:3409
#15 0x00007ffff7865d1d in main_loop () at corefcn/toplev.cc:563
#16 0x00007ffff7088184 in octave_execute_interpreter () at octave.cc:889
#17 0x00007ffff6735ec5 in __libc_start_main (main=0x400870 <main(int,
char**)>, argc=2, argv=0x7fffffffdf28, init=<optimized out>, fini=<optimized
out>, rtld_fini=<optimized out>, stack_end=0x7fffffffdf18) at
libc-start.c:287
#18 0x00000000004008ff in _start ()
I think the problem may be the fact that the `libcuda.so` from the driver is
also loaded via dlopen by the libcudart library. But I do not know how to
fix this. Any suggestions on how I can fix this problem while keeping my dlopen
approach?
The whole source of the BLAS library is available at:
http://www-e.uni-magdeburg.de/makoehle/blas_test.tar.gz (Also attached) The
`make install` target also includes the setup of update-alternatives on
Debian-like systems.
Hardware and software used:
- Ubuntu 14.04.1 (amd64) with gcc 4.8.2 and glibc 2.19
- CUDA 6.5, Driver 340.29
- GNU Octave 3.8.1
- GeForce GTX 580
_______________________________________________________
File Attachments:
-------------------------------------------------------
Date: Mo 15 Dez 2014 16:09:38 UTC Name: blas_test.tar.gz Size: 130kB By:
None
Blas Wrapper mentioned in the Bug
<http://savannah.gnu.org/bugs/download.php?file_id=32673>
_______________________________________________________
Reply to this item at:
<http://savannah.gnu.org/bugs/?43826>
_______________________________________________
Nachricht gesendet von/durch Savannah
http://savannah.gnu.org/
- [Octave-bug-tracker] [bug #43826] Deadlock with CUBLAS and dlopen inside an accelerated BLAS replacement.,
anonymous <=