This program gives an error when it reaches the "@vectorize(['float32(float32)'], target='cuda')" decorator:
File "/home/idf/anaconda3/envs/gpu/lib/python3.9/site-packages/numba/cuda/vectorizers.py", line 206, in _compile_core
return cudevfn, cudevfn.overloads[sig.args].signature.return_type
AttributeError: 'FakeOverload' object has no attribute 'signature'
Notice that I am not even calling the function; the error seems to occur at import time, when the decorator is applied:
import math
import numpy as np
from numba import vectorize, guvectorize, cuda
import timeit

def numpy_sqrt_x(x):
    return np.sqrt(x)

def math_sqrt_x(x):
    return [math.sqrt(xx) for xx in x]

@vectorize
def cpu_sqrt(x):
    return math.sqrt(x)

@vectorize(['float32(float32)'], target='cuda')
def gpu_sqrt(x):
    return math.sqrt(x)

if __name__ == '__main__':
    x = np.arange(10)
    print(x**2)
    #print(np.exp(x))
    #x = np.arange(int(1e2))
    #print(timeit.timeit("numpy_sqrt_x(x)", globals=globals()))
    #print(timeit.timeit("math_sqrt_x(x)", globals=globals()))
    #print(gpu_sqrt(x))
I am not sure what I am doing wrong.
I read data slices (tiles) from a large file. Reading a 400 MB image takes 4 seconds, while the disk can read the whole file in about 1 second. The program does very little computation. How can I improve the speed?
from opentile import OpenTile
from multiprocessing.pool import ThreadPool
import time
import traceback
import os

os.environ.setdefault('TURBOJPEG', 'C:/lib/')

try:
    tiler = OpenTile.open('svs800.svs')
except Exception:
    traceback.print_exc()

s = tiler.get_level(0)
tile_size = str(s.tiled_size).split("x")

time1 = time.time()

def get_data(s):
    # This function reads a piece of binary data from a certain position of the image
    # and then adds the header data
    return tiler.get_tile(0, 0, 0, (s[0], s[1]))

pool = ThreadPool(5)
y = pool.map(get_data, [(i, j) for i in range(int(tile_size[0])) for j in range(int(tile_size[1]))])
print("tiles", len(y))

time2 = time.time()
print(time2 - time1)  # elapsed time for reading all tiles
Simple sequential approach:
from opentile import OpenTile
import os
import time

def timer(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f'{func.__name__} {end-start:.4f}s')
        return result
    return wrapper

os.environ.setdefault('TURBOJPEG', '/opt/libjpeg-turbo')

@timer
def open_svs(filename):
    return OpenTile.open(filename)

@timer
def get_data(tiler, x, y):
    return [tiler.get_tile(0, 0, 0, (x_, y_)) for x_ in range(x) for y_ in range(y)]

tiler = open_svs('18959.svs')
x, y = map(int, str(tiler.get_level(0).tiled_size).split('x'))
data = get_data(tiler, x, y)
assert len(data) == x * y
Output:
open_svs 0.0082s
get_data 0.5843s
Note:
x, y values for this file are 183 and 114 respectively. The file size is 563,271,749 bytes.
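For a side-by-side comparison with the threaded version from the question, the same timer decorator can also wrap a ThreadPool-based reader. This is only a sketch under the assumptions above (same tiler and tile grid); the worker count of 5 is arbitrary:

from multiprocessing.pool import ThreadPool

@timer
def get_data_threaded(tiler, x, y, workers=5):
    # hand each (x_, y_) tile coordinate to the thread pool;
    # pool.map preserves the input order of the coordinates
    coords = [(x_, y_) for x_ in range(x) for y_ in range(y)]
    with ThreadPool(workers) as pool:
        return pool.map(lambda c: tiler.get_tile(0, 0, 0, c), coords)

data_threaded = get_data_threaded(tiler, x, y)
assert len(data_threaded) == x * y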
I want multiple processes to each read from a different row of a numpy array in parallel to speed things up. However, when I run the following code, the first process to reach func throws an error as if var were no longer in scope. Why is this happening?
import numpy as np
import multiprocessing as mp

num_procs = 16
num_points = 2500000

def init_worker(X):
    global var
    var = X

def func(proc):
    X_np = np.frombuffer(var).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4, initializer=init_worker, initargs=(X,))
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
Traceback (most recent call last):
File "parallel_test.py", line 26, in <module>
pool.apply_async(func(proc))
File "parallel_test.py", line 13, in func
X_np = np.frombuffer(var).reshape((num_procs, num_points))
NameError: global name 'var' is not defined
Update:
For some reason, if I use Pool.map instead of the for loop with Pool.apply_async, it seems to work. I don’t understand why though.
Any reason to not declare X as global in the top-level scope? This eliminates the NameError.
import numpy as np
import multiprocessing as mp

num_procs = 16
num_points = 25000000

def func(proc):
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4)
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
When I run a reduced instance of this problem, with n = 20 values (num_procs = 4, num_points = 5):
import numpy as np
import multiprocessing as mp

num_procs = 4
num_points = 5

def func(proc):
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4)
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
    print("\n".join(map(str, X)))
I get the following output:
-0.6346037804619162
1.1005724710066107
0.33458763357165255
0.6409345714971889
0.7124888766851982
0.36760459213332963
0.23593304931386933
-0.8668969562941349
-0.8842756219923469
0.005979036105620422
1.386422154089567
-0.8770988782214508
0.25187448339771057
-0.2473967968471952
-0.4909708883978521
0.5423521489750244
0.018749603867333802
0.035304792504378055
1.3263872668956616
1.0199839603892742
You haven't provided a sample of the expected output. Does this look similar to what you expect?
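One more detail that may explain both the original NameError and why Pool.map behaves differently: pool.apply_async(func(proc)) calls func right away in the parent process (where init_worker never ran, so var does not exist there) and only passes its return value to apply_async, whereas Pool.map hands the function itself to the worker processes. A minimal sketch of submitting the work asynchronously, with the function and its argument passed separately and still using the initializer from the first listing:

results = [pool.apply_async(func, (proc,)) for proc in range(num_procs)]
pool.close()
pool.join()
# get() re-raises any exception that occurred inside a worker
for r in results:
    r.get()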
I want to use the scipy.optimize.minimize function. The function contains commands from a DLL which require a ctypes array. The goal is to vary the inputs in the ctypes array to optimize a specific output which is also a ctypes array (see code below).
import os
import ctypes
import tkinter as tk
from PIL import ImageTk
from tkinter import filedialog
import numpy as np
from scipy.optimize import minimize

dll = ctypes.cdll.LoadLibrary(library)
LoadModelDef = dll.addModelDef(model)
nrExperiments = 1
nrin = dll.getNumInputs(LoadModelDef)
PDBL2ARR = ctypes.c_double * nrin * nrExperiments

inputs = PDBL2ARR()
inputs_init = PDBL2ARR()

def evaluaterel(library, Model, InputArray):
    nrExp = len(InputArray)
    DBL2ARR = ctypes.c_double * nrExp
    outputs = DBL2ARR()
    for i in range(2, 13):
        Name = outputName(Model, i)
        library.evalVBA(Model, InputArray, nrExp, i, outputs)
    for i in range(nrExp):
        Value = str(outputs[i])
        # text = label.cget("text") + '\n' + str(Name)+ ' ' + str(Value)
        # label.configure(text=text)
    return outputs

data = np.array([line.split()[-1] for line in open("DATA.txt")], dtype=np.float64)

for i in range(nrExperiments):
    for j in range(nrin):
        inputs_init[i][j] = 0

for i in range(nrExperiments):
    for j in range(nrin):
        inputs[i][j] = data[j]

solution = minimize(evaluaterel(dll, LoadModelDef, inputs), inputs_init, method='SLSQP')
print(solution)
File "c:\app\python27\lib\site-packages\scipy\optimize\optimize.py", line 292, in function_wrapper
return function(*(wrapper_args + args))
TypeError: 'c_double_Array_1' object is not callable
According to [SciPy.Docs]: scipy.optimize.minimize(fun, x0, args=(), method=None, jac=None, hess=None, hessp=None, bounds=None, constraints=(), tol=None, callback=None, options=None), the 1st argument should be a callable (a function, in your case). But you're calling the function yourself when passing it, and therefore you're passing its return value instead.
Modify your code (faulty line) to:
solution = minimize(evaluaterel, inputs_init, args=(dll, LoadModelDef, inputs), method="SLSQP")
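Keep in mind that minimize calls the objective as fun(x, *args), where x is a 1-D numpy array of the parameters being varied, and it expects a single scalar back. A rough sketch of what evaluaterel might need to look like is below; the choice of output index 2 and the way x is repacked into the ctypes array are assumptions for illustration, not part of the original code:

def evaluaterel(x, library, Model, nrin):
    # repack the optimizer's 1-D parameter vector into the ctypes layout
    # the DLL expects (one experiment with nrin inputs)
    PDBL2ARR = ctypes.c_double * nrin * 1
    InputArray = PDBL2ARR()
    for j in range(nrin):
        InputArray[0][j] = x[j]
    outputs = (ctypes.c_double * 1)()
    # evaluate a single model output (index 2, purely as an example)
    library.evalVBA(Model, InputArray, 1, 2, outputs)
    return outputs[0]  # minimize needs one scalar objective value

x0 = np.zeros(nrin)
solution = minimize(evaluaterel, x0, args=(dll, LoadModelDef, nrin), method='SLSQP')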
Below is an example of pycuda code with the "kernel" code embedded in the script itself (via SourceModule):
import pycuda
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import threading
import numpy

class GPUThread(threading.Thread):
    def __init__(self, number, some_array):
        threading.Thread.__init__(self)
        self.number = number
        self.some_array = some_array

    def run(self):
        self.dev = cuda.Device(self.number)
        self.ctx = self.dev.make_context()
        self.array_gpu = cuda.mem_alloc(some_array.nbytes)
        cuda.memcpy_htod(self.array_gpu, some_array)
        test_kernel(self.array_gpu)
        print "successful exit from thread %d" % self.number
        self.ctx.pop()
        del self.array_gpu
        del self.ctx

def test_kernel(input_array_gpu):
    mod = SourceModule("""
    __global__ void f(float * out, float * in)
    {
        int idx = threadIdx.x;
        out[idx] = in[idx] + 6;
    }
    """)
    func = mod.get_function("f")
    output_array = numpy.zeros((1, 512))
    output_array_gpu = cuda.mem_alloc(output_array.nbytes)
    func(output_array_gpu,
         input_array_gpu,
         block=(512, 1, 1))
    cuda.memcpy_dtoh(output_array, output_array_gpu)
    return output_array

cuda.init()
some_array = numpy.ones((1, 512), dtype=numpy.float32)
num = cuda.Device.count()
gpu_thread_list = []
for i in range(num):
    gpu_thread = GPUThread(i, some_array)
    gpu_thread.start()
    gpu_thread_list.append(gpu_thread)
I would like to use the same method, but instead of a "kernel code" I would like to make multiple calls to an external function, i.e. an ordinary Python function defined in my main program that takes as arguments various parameters shared by the whole program. Is that possible?
People who have used Matlab may know the function arrayfun, where B = arrayfun(func, A) is a vector of results obtained by applying the function func to each element of the vector A.
It is essentially a version of what is commonly called the map function: I would like to do the same thing, but with a GPU/pycuda version.
Update 1
Sorry, I forgot to say at the beginning of my post what I mean by an external, ordinary function. Below is an example of such a function, used in the main section:
def integ(I1):
    function_A = aux_fun_LU(way, ecs, I1[0], I1[1])
    integrale_A = 0.25*delta_x*delta_y*np.sum(function_A[0:-1, 0:-1] + function_A[1:, 0:-1] + function_A[0:-1, 1:] + function_A[1:, 1:])

def g():
    for j in range(6*i, 6*i+6):
        for l in range(j, 6*i+6):
            yield j, l

## apply the integ function to the g() generator.
## Here I am using the plain map function (no parallelization)
if __name__ == '__main__':
    map(integ, g())
Update 2
Maybe a solution would be to call the external function from kernel code, so as to also benefit from the GPU's power across many kernel calls. But how do I handle the value returned by this external function so that I can get it back into the main program?
Update 3
Below is what I have tried:
# Class GPUThread
class GPUThread(threading.Thread):
    def __init__(self, number, some_array):
        threading.Thread.__init__(self)
        self.number = number
        self.some_array = some_array

    def run(self):
        self.dev = cuda.Device(self.number)
        self.ctx = self.dev.make_context()
        self.array_gpu = cuda.mem_alloc(some_array.nbytes)
        cuda.memcpy_htod(self.array_gpu, some_array)
        test_kernel(self.array_gpu)
        print "successful exit from thread %d" % self.number
        self.ctx.pop()
        del self.array_gpu
        del self.ctx

def test_kernel(input_array_gpu):
    mod1 = SourceModule("""
    __device__ void integ1(int *I1)
    {
        function_A = aux_fun_LU(way, ecs, I1[0], I1[1]);
        integrale_A = 0.25*delta_x*delta_y*np.sum(function_A[0:-1, 0:-1] + function_A[1:, 0:-1] + function_A[0:-1, 1:] + function_A[1:, 1:]);
    }""")
    func1 = mod1.get_function("integ1")
    # Calling function
    func1(input_array_gpu)

# Define couples (i,j) to build Fisher matrix
def g1():
    for j in range(6*i, 6*i+6):
        for l in range(j, 6*i+6):
            yield j, l

# Cuda init
if __name__ == '__main__':
    cuda.init()
    # Input gTotal lists
    some_array1 = np.array(list(g1()))
    print 'some_array1 = ', some_array1
    # Parameters for cuda
    num = cuda.Device.count()
    gpu_thread_list = []
    for i in range(num):
        gpu_thread = GPUThread(i, some_array1)
        #gpu_thread = GPUThread(i, eval("some_array"+str(j)))
        gpu_thread.start()
        gpu_thread_list.append(gpu_thread)
I get the following error at execution:
Traceback (most recent call last):
  File "/Users/mike/anaconda2/envs/py2cuda/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "Example_GPU.py", line 1232, in run
    self.array_gpu = cuda.mem_alloc(some_array.nbytes)
NameError: global name 'some_array' is not defined
I can't see what's wrong with the variable 'some_array' and the line
self.array_gpu = cuda.mem_alloc(some_array.nbytes)
What can I try next?
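One thing that stands out from the traceback, offered as a guess rather than a tested fix: run() reads the module-level name some_array, but the main section only defines some_array1, so the worker thread finds nothing under that name. The data is already stored on the instance as self.some_array in __init__, so a sketch of run() using the instance attribute would be:

def run(self):
    self.dev = cuda.Device(self.number)
    self.ctx = self.dev.make_context()
    # use the array stored on this thread instance rather than a
    # module-level global that may not exist
    self.array_gpu = cuda.mem_alloc(self.some_array.nbytes)
    cuda.memcpy_htod(self.array_gpu, self.some_array)
    test_kernel(self.array_gpu)
    self.ctx.pop()
    del self.array_gpu
    del self.ctx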
I'm using the R DTW package with rpy2. I would like to be able to specify a window type and size for running the DTW analysis.
I have run the following code:
import numpy as np
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
r = robjects.r
r('library("dtw")')
query = np.array([0.0,1.0,2.0,3.0])
reference = np.array([0.0,1.9,2.4,3.0])
# Attempt 1:
kwargs = {'step':r("asymmetric"),'window_type':r("sakoeChibaWindow"),'window_size':r("as.integer(\"3\")")}
alig = r.dtw(query, reference, **kwargs)
# Attempt 2:
alig = r.dtw(query, reference, keep=r('TRUE'), step=r('asymmetric'),window_type=r('sakoeChibaWindow'),window_size="as.integer(\"3\")")
# Attempt 3:
alig = r.dtw(query, reference, keep=r('TRUE'), step=r('asymmetric'),window_type=r('sakoeChibaWindow'),window_size=3)
# Note: The line of code below works correctly.
# alig = r.dtw(query, reference, keep=r('TRUE'), step=r('asymmetric'))
robjects.globalenv["alignment"] = alig
print r('alignment$distance')
I get the following error message:
Error in abs(jw - iw) <= window.size : 'window.size' is missing
Traceback (most recent call last):
File "testrdtw.py", line 19, in <module>
alig = r.dtw(query, reference, **kwargs)
File "/Users/jsmith/Dropbox/IW/env/lib/python2.7/site-packages/rpy2/robjects/functions.py", line 86, in __call__
return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
File "/Users/jsmith/Dropbox/IW/env/lib/python2.7/site-packages/rpy2/robjects/functions.py", line 35, in __call__
res = super(Function, self).__call__(*new_args, **new_kwargs)
rpy2.rinterface.RRuntimeError: Error in abs(jw - iw) <= window.size : 'window.size' is missing
How do I properly specify the window.size argument such that it is passed correctly?
I'm quite new to R and rpy so I could very well be using these libraries incorrectly.
Any suggestions, hints, or help greatly appreciated.
-- js
Use importr():
from rpy2.robjects.packages import importr

dtw = importr('dtw')
alig = dtw.dtw(query, reference, keep=True,
               step='asymmetric',
               window_type='sakoeChibaWindow',
               window_size=3)
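If the numeric distance is needed back on the Python side, the returned object can be indexed the same way as in the next answer, e.g.:

dist = alig.rx('distance')[0][0]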
This works for me:
import numpy as np
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr
rpy2.robjects.numpy2ri.activate()
R = rpy2.robjects.r
DTW = importr('dtw')
x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([0.0, 1.9, 2.4, 3.0])
alignment1 = R.dtw(x, y, keep=True, dist_method="Euclidean",step_pattern=DTW.asymmetric,type="sakoechiba")
alignment2 = R.dtw(x, y, keep=True, dist_method="Euclidean",step_pattern=DTW.symmetric1,type="itakura")
alignment3 = R.dtw(x, y, keep=True, dist_method="Euclidean", step_pattern=DTW.symmetric2, type=DTW.sakoeChibaWindow, window_size=2)
dist1 = alignment1.rx('distance')[0][0]
dist2 = alignment2.rx('distance')[0][0]
dist3= alignment3.rx('distance')[0][0]
print(dist1)
#1.0
print(dist2)
#1.3
print(dist3)
#1.3
The documentation states: "window.type can also be a user-defined windowing function. See dtwWindowingFunctions for all available windowing functions."
There you can fix the window.size.
Hope it helps.