Is it that hard to translate python code into c code?

Is it that hard to translate python code into c code? - python

once I wrote a piece of python code to do the calculation of inversions.
And just for fun plus want to see if c is really way faster than runtime language,
I wrote the similar python code in c.
the python code took less than 3 secs to run,
but my c code took about 8 secs.
Can anyone tell me where I did wrong and how I should change/improve my C code?
Thank you!!
the input file is just 100000 lines of numbers, not repeated, from 1 to 100000 in random order.
python code:
import time
def lookup(arr, num):
lower=0
upper=len(arr)-1
while 1:
if arr[upper]==num:
return upper
mid=(upper+lower)/2
if arr[mid]==num:
return mid
elif arr[mid]<num:
lower=mid+1
else:
upper=mid
def main():
time.clock()
f=open("IntegerArray.txt", "r").read().strip().split('\n')
array=[int(i) for i in f]
array.reverse()
s=sorted(array)
v=0
while(array):
i=lookup(s, array.pop())
v+=i
s.pop(i)
print v
print time.clock()
raw_input()
main()
C code:
#include <stdio.h>
#include <stdlib.h>
#define limit 100000
int comp(const void *, const void *);
void inversion(int *, int *, int);
int lookup(int *, int, int);
int main(void){
FILE *f = fopen("IntegerArray.txt", "r");
int n=0, array[limit], copy[limit];
while(n<limit){
fscanf(f, "%d", array+n);
copy[n]=array[n];
n++;
}
qsort(copy, limit, sizeof(int), comp);
// int i;
// for(i=limit-100;i<limit;i++)printf("%d %d\n", array[i], copy[i]);
inversion(array, copy, limit);
// getchar();
return 0;
}
int comp(const void *a, const void *b){
return (*(int*)a>*(int*)b)?1:-1;
}
void inversion(int *a, int *b, int n){
int c=0, index;
unsigned int v=0;
while(c<n){
// if(c%1000==0)printf("%d %d\n", c, b[n-c-1]);
index=lookup(b, *a, n-c);
v+=index;
if((n-c)/2<index)
for(;index<n-c-1;index++)
b[index]=b[index+1];
else{
for(;index>0;index--)
b[index]=b[index-1];
b++;
}
a++;
c++;
}
printf("%u\n", v);
}
int lookup(int *arr, int num, int n){
int lower=0, upper=n-1, mid;
// if(arr[upper]==num)return upper;
// if(arr[lower]==num)return lower;
while(1){
// printf("%d %d %d\n", lower, upper, num);
mid=(lower+upper)/2;
// if(lower==upper && arr[mid]!=num){
// printf("%d %d %d\n", lower, upper, num);
// exit(1);
// }
if(arr[mid]==num)return mid;
else if(arr[mid]<num)
lower=mid+1;
else
upper=mid;
}
}

In Python inversion, you have:
s.pop(i)
While in C, you have:
if((n-c)/2<index)
for(;index<n-c-1;index++)
b[index]=b[index+1];
else{
for(;index>0;index--)
b[index]=b[index-1];
b++;
}
Looks like a big difference to me.
I don't think Python implements pop by copying all elements.

Related

unordered_map giving Segmentation fault (core dumped) when calling C++ library from python

i am trying to use https://github.com/Spyros-DC/words-in-some-editdistance/blob/master/my_distance.cpp C++ implementation in python, however i kept receiving Segmentation fault (core dumped). Below is the entire code in python and C++, i have edited small parts of the original library to suit my use case. I have managed to find that when trying to use the unordered_map children.count(remaining_w.at(0)) it is throwing the segmentation fault error. I was wondering if anyone knows what is causing the error. Thank you so much in advance.
#include <iostream>
#include <unordered_map>
#include <string>
#include <fstream>
#include <unordered_set>
#include <vector>
#include <sstream>
#include <typeinfo>
using namespace std;
// using namespace std::chrono;
class trie{
public:
string word;
unordered_map<char, trie*> children;
// this function works
// with the argument flag set to zero
int insert(string w, int flag, string remaining_w = ""){
//the first time we call insert
if(flag == 0)
remaining_w = w;
int the_size = remaining_w.size();
if(children.count(remaining_w.at(0)) == 0){
children[remaining_w.at(0)] = new trie();
}
if(the_size == 0){
word = w;
return 0;
}else{
//the recursive calls with flag one
children[remaining_w.at(0)]->insert(w, 1, remaining_w.erase(0, 1));
return 0;
}
}
};
class AutoCorrect{
public:
// The tree
trie tree;
//the dictionary with the words
const int max_cost = 2;
const int too_big_distance = 10;
void insert(char* word){
ifstream ifp(word);
while(ifp >> word){
cout << word <<endl;
tree.insert(word, 0);
// }
}
}
void test(char* test){
cout << test << endl;
}
void search_recursive(trie* p_tree, char ch, const string& word, vector<int>& previous_row, int max_cost, unordered_map <string, int>& results)
{
int sz = previous_row.size();
int min_row = 12;
vector<int> current_row(sz, too_big_distance);
current_row[0] = previous_row[0] + 1;
// Calculate the min cost of insertion, deletion, match or substution
int insert_or_del, replace;
for (int i = 1; i < sz; i++) {
insert_or_del = min(current_row[i-1] + 1, previous_row[i] + 1);
replace = (word[i-1] == ch) ? previous_row[i-1] : (previous_row[i-1] + 1);
current_row[i] = min(insert_or_del, replace);
}
if ((current_row[sz-1] <= max_cost) && (p_tree->word != "")) {
results[p_tree->word] = current_row[sz-1];
}
for(auto& it: current_row){
if (it < min_row)
min_row = it;
}
if(min_row <= max_cost){
for(auto& it: p_tree->children){
search_recursive(it.second, it.first, word, current_row, max_cost, results);
}
}
}
int search(string word)
{
unordered_map <string, int> results;
int sz = word.size();
vector<int> current_row(sz + 1);
for (int i = 0; i <= sz; ++i){
current_row[i] = i;
}
for(auto& it: tree.children){
search_recursive(it.second, it.first, word, current_row, max_cost, results);
}
for(auto& p:results)
cout << p.first << ", " << p.second << endl;
return 0;
}
};
// The cost and a distance for vector initialization
extern "C" {
AutoCorrect* AutoCorrect_new(){ return new AutoCorrect(); }
void AutoCorrect_insert(AutoCorrect* autocorrect, char* word){ autocorrect->insert(word); }
void AutoCorrect_search(AutoCorrect* autocorrect, string input_word){ autocorrect->search(input_word); }
void AutoCorrect_test(AutoCorrect* autocorrect, char* name){ autocorrect-> test(name); }
}
Python main.py:
from ctypes import cdll
lib = cdll.LoadLibrary('autocorrect.so')
class AutoCorrect(object):
def __init__(self):
self.obj = lib.AutoCorrect_new()
def insert(self, word):
lib.AutoCorrect_insert(self.obj,word)
def search(self,input_word):
lib.AutoCorrect_search(self.obj,input_word)
def test(self,test):
lib.AutoCorrect_test(self.obj,test)
if __name__ == "__main__":
import json
WordCount = 0
autocorrect = AutoCorrect()
data_dir = "some_txt_file.txt"
autocorrect.insert(bytes(str(data_dir), encoding='utf8'))

Looking at the line you specified, i think there is an instance you are trying to add a value to a non existent key:
if(children.count(remaining_w.at(0)) == 0){
children[remaining_w.at(0)] = new trie();
if "children.count" returns 0 then that character is not present, then trying to add a value on the second line there means....
what i think you want to do on that line is:
if(children.count(remaining_w.at(0)) == 1){
children[remaining_w.at(0)] = new trie();
meaning you add the value only if key is present.

Callback functions in ROS ? How to properly update them?

So I make robot, which should get data from laser sensor and then move until some distance and stops.
But I find problem in callback functions. Is there somewhere better explanation how to update variables with callback properly ? I had same problem with python and there I found out that time.sleep(0.2) let the class to update properly. Even this is little bit magic for me. Because I was thinking that in python this works automatically because separated threading.
In c++ I know that the basic is using spinOnce() and spin(). This works how it should in normal non-object-oriented case. But in the class again I found out that the class is not updated properly. Why is this a case ?
I can not find why the callback function is not working properly. I could see if it was the case by print full range from reading, but it never happens. I have ros::spinOnce() and I think I have correctly member functions. Can someone please help me ? And help me to understand ?
robot.h
#include <ros/ros.h>
#include "sensor_msgs/LaserScan.h"
#include "geometry_msgs/Twist.h"
#include <math.h>
#include <iostream>
class Robot{
geometry_msgs::Twist velocity;
sensor_msgs::LaserScan ls_scan;
ros::Subscriber sub;
ros::Publisher pub;
ros::Rate rate{10};
double speed_linear;
double speed_angular;
public:
Robot(ros::NodeHandle *handle_IN, double rate_IN, double speed_linear_IN,double speed_angular_IN){
rate = ros::Rate{rate_IN};
speed_linear=speed_linear_IN;
speed_angular=speed_angular_IN;
sub = handle_IN->subscribe("/scan",1000,&Robot::scan_callback,this);
pub = handle_IN->advertise<geometry_msgs::Twist>("/cmd_vel",10);
usleep(2000000);
}
void scan_callback(const sensor_msgs::LaserScan::ConstPtr& msg);
void move();
void rotate(bool clock_wise);
void stop();
bool obstacle_in_front(double distance);
bool can_drive(double distance);
std::vector<float> front_read();
std::vector<float> back_read();
};
robot.cpp
#include "robot.h"
void Robot::scan_callback(const sensor_msgs::LaserScan::ConstPtr& msg){
ls_scan = *msg;
for(float x: ls_scan.ranges)
std::cout << x;
std::cout << std::endl;
}
void Robot::move(){
velocity = geometry_msgs::Twist();
velocity.linear.x = speed_linear;
ros::spinOnce();
while(!obstacle_in_front(0.56)){
ros::spinOnce();
std::cout << "moving\n";
pub.publish(velocity);
rate.sleep();
}
stop();
}
void Robot::rotate(bool clock_wise){
velocity = geometry_msgs::Twist();
velocity.angular.z = speed_angular;
ros::spinOnce();
while(can_drive(2)){
ros::spinOnce();
std::cout << "rotating\n";
pub.publish(velocity);
rate.sleep();
}
stop();
}
void Robot::stop(){
std::cout << "stop\n";
velocity = geometry_msgs::Twist();
pub.publish(velocity);
}
float min(const std::vector<float>& v){
if(v.size() == 0)
return -1;
float min = v[0];
for(float x: v){
if(x < min)
min = x;
}
return min;
}
std::vector<float> Robot::front_read(){
ros::spinOnce();
std::vector<float> front(20);
if(ls_scan.ranges.size()<350)
return front;
for(int i = 0; i < 10; ++i)
front.push_back(ls_scan.ranges[i]);
for(int i = 350; i < 360; ++i)
front.push_back(ls_scan.ranges[i]);
return front;
}
std::vector<float> Robot::back_read(){
ros::spinOnce();
std::vector<float> front(20);
if(ls_scan.ranges.size()<350)
return front;
for(int i = 169; i < 190; ++i)
front.push_back(ls_scan.ranges[i]);
return front;
}
bool Robot::obstacle_in_front(double distance){
float minN = min(front_read());
if(minN > distance)
return false;
else
return true;
std::cout << minN << "\n";
}
bool Robot::can_drive(double distance){
float minN = min(front_read());
if(minN > distance)
return true;
else
return false;
}
robot_control_node.cpp
#include "robot.h"
int main(int argc, char **argv){
ros::init(argc,argv, "robot_node_node");
ros::NodeHandle n;
Robot robot(&n,10,0.5,0.3);
robot.move();
}

So the final answer is this.
Wait for valid data -> I did this with simple wait loop which runs after initialization of publisher/subscriber in constructor.
Then you can check for new valid data when you need it. Example is calling spinOnce() in move member function. This is done with do-while, because you want to check condition after new data.
void Robot::wait_for_valid_data(){
while(ls_scan.ranges.size() < 359)
{
std::cout << "waiting for valid data\n";
ros::spinOnce();
rate.sleep();
}
std::cout << "valid data";
}
void Robot::move(){
velocity = geometry_msgs::Twist();
velocity.linear.x = speed_linear;
do{
if(!ros::ok())
break;
std::cout << "moving\n";
pub.publish(velocity);
ros::spinOnce();
rate.sleep();
}while(!obstacle_in_front(0.56));
stop();
}

I found the problem.
Basically with callbacks. You have to be sure, that the publisher catches up. So before you call the spinOnce() which checks if it is something there. You have to call some sort of wait function. ros::rate/ros::Duration and wait. Then when you call spinOnce(). You will have new incoming data, which the callback function can read.
In this sence:
std::vector<float> Robot::front_read(){
ros::Duration(0.1).sleep(); //wait for new callback
ros::spinOnce(); //read the callback
std::vector<float> front;
if(ls_scan.ranges.size()<350)
return front;
for(int i = 0; i < 10; ++i)
front.push_back(ls_scan.ranges[i]);
for(int i = 350; i < 360; ++i)
front.push_back(ls_scan.ranges[i]);
return front;
}

CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate]

I am currently trying to get a simple multi-GPU program running with CUDA.
What it basically does is it copies a large array with some dummy data in chunks to the GPUs, which do some math, and then copy the resulting array back.
I dont get any errors in the output of VS2017, but some error messages I have set up show me that while trying to copy either H2D or D2H.
It tells me that a cudaErrorInvalidValue is occuring.
Also, when using the cudaFree(); function, i get a cudaErrorInvalidDevicePointer error.
The output of the program, the result, is completely wrong. The kernel is, for testing purposes, only setting every value of the output array to a value of 50. The result is a relatively large negative number, always the same no matter what the kernel does.
I have already tried to use a pointer that is not part of a struct, but is defined right before the cudaMalloc, where it is used first. That did not change anything.
This is the function that runs the Kernel:
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
Then here, how the struct is defined in the "kernel.h":
#ifndef KERNEL_H
#define KERNEL_H
#include "cuda_runtime.h"
//GPU plan
typedef struct
{
unsigned int Computations; //computations on this GPU
unsigned int Repetitions; // amount of kernel repetitions
unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working
float* d_data_ptr;
float* d_out_ptr;
cudaStream_t stream;
} GPUplan;
typedef struct
{
unsigned int Computations;
unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working
} KernelPlan;
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);
void memFree(int device, GPUplan gpuPlan);
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);
#endif
here the part of code that calls runKernel:
int GPU_N;
cudaGetDeviceCount(&GPU_N);
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
I dont think that the kernel itself would be of any importance here since the memcpy error already appears before it is even executed.
The expected output is, that every element of the output array is 50. Instead, every element is -431602080.0
The array is a float array.
EDIT: here is the full code used to reproduce the problem (in addition to kernel.h from above):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition
if (threadID > d_AdditionalComputationThreadCount) {
computations++; //check if thread has to do an additional computation
}
for (int i = 0; i < computations; i++) {
d_out[i * blockDim.x * gridDim.x + threadID] = 50;
}
}
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
GPUplan plan;
size_t free, total;
//computations on GPU #device
plan.Computations = DATA_N / GPU_N;
//take into account odd data size for this GPU
if (DATA_N % GPU_N > device) {
plan.Computations++;
}
plan.DataStartingPoint = dataCounter;
//get memory information
cudaSetDevice(device);
cudaMemGetInfo(&free, &total);
//calculate Repetitions on this GPU #device
plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
printf("Repetitions: %i\n", plan.Repetitions);
if (plan.Repetitions > MAX_REP_COUNT) {
printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
}
//calculate Computations per Repetition
plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;
//calculate how many Repetitions have to do an additional Computation
plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;
return plan;
}
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
KernelPlan plan;
//calculate total Calculations in this Repetition
plan.Computations = GPUComputationsPerRepetition;
if (GPUAdditionalComputationRepetitionsCount > Repetition) {
plan.Computations++;
}
plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation
plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;
return plan;
}
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 1 on GPU %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 2 on GPU %i: Error %i\n", device, x);
}
}
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
void memFree(int device, GPUplan gpuPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaFree(gpuPlan.d_data_ptr);
cudaFree(gpuPlan.d_out_ptr);
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memfree on GPU %i: Error %i\n", device, x);
}
else {
printf("memory freed.\n");
}
//17 = cudaErrorInvalidDevicePointer
}
int main()
{
//get device count
int GPU_N;
cudaGetDeviceCount(&GPU_N);
//adjust for device count larger than MAX_GPU_COUNT
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("GPU count: %i\n", GPU_N);
//definitions for running the program
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
///////////////////////////////////////////////////////////
//Subdividing input data across GPUs
//////////////////////////////////////////////
//GPUplan
GPUplan plan[MAX_GPU_COUNT];
int dataCounter = 0;
for (int i = 0; i < GPU_N; i++)
{
plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
dataCounter += plan[i].Computations;
}
//KernelPlan
KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];
for (int i = 0; i < GPU_N; i++)
{
int GPURepetitions = plan[i].Repetitions;
dataCounter = plan[i].DataStartingPoint;
for (int j = 0; j < GPURepetitions; j++)
{
kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);
dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
}
}
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
//generate some input data
for (int i = 0; i < DATA_N; i++) {
h_data[i] = 2 * i;
}
//get highest repetition count
int maxRepetitionCount = 0;
for (int i = 0; i < GPU_N; i++) {
if (plan[i].Repetitions > maxRepetitionCount) {
maxRepetitionCount = plan[i].Repetitions;
}
}
printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
//printing expected results and results
for (int i = 0; i < 50; i++)
{
printf("%f\t", h_data[i]);
printf("%f\n", h_out[i]);
}
free(h_data);
free(h_out);
getchar();
return 0;
}

The first problem has nothing to do with CUDA, actually. When you pass a struct by-value to a function in C or C++, a copy of that struct is made for use by the function. Modifications to that struct in the function have no effect on the original struct in the calling environment. This is affecting you in your memAllocation function:
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
^^^^^^^
passed by value
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
^^^^^^^^^^^^^^^^^^
modifying the copy, not the original
This is fairly easily fixable by passing the gpuPlan struct by reference rather than by value. Modify both the prototype in the kernel.h header file, as well as the definition:
void memAllocation(int device, int MemoryPerComputation, GPUplan &gpuPlan, KernelPlan kernelPlan)
^
with that change, the struct is passed by reference, and modifications (such as the setting of the allocated pointers) will show up in the calling environment. This is the proximal reason for the invalid argument report on the cudaMemcpy operations. The pointers you were passing were unallocated, because your allocations were done on the pointer copies, not the originals.
After that change your code may appear to be running correctly. At least when I run it no errors are displayed and the outputs appear to be all set to 50.
However there are still problems with this code. If you run your code with cuda-memcheck (or turn on the memory checker functionality in nsight VSE) you should see errors associated with this line of code, which is indexing out of bounds:
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
...
d_out[i * blockDim.x * gridDim.x + threadID] = 50; //indexing out of bounds
I'm not going to try to sort that out for you. It seems evident to me that your for-loop, coupled with the way you are calculating the index, is going beyond the end of the array. You can follow the methodology discussed here if needed.

CTYPES and PYTHON3: Can't return long string

I have a C++ code for which I created a python.ctypes wrapper. It works quite well, except when the returning string is long. Example of C code:
extern "C" {
const char * hallo(char * nome)
{
string str(nome);
str = "hallo " + str;
for(int i=0; i<100; i++)
str += " new content";
return str.c_str();
}
My python code is:
self.lib = ctypes.CDLL(LIB_PATH)
self.lib.hallo.restype = ctypes.c_char_p
self.lib.hallo.argtypes =[ctypes.c_char_p]
j = self.lib.hallo('my name'.encode())
print('INIT: ' + j.decode())
The string size is dynamic (in fact, it will be a json string). What is the best way to deal with this situation?
Thanks so much.

The issue here is that when you return str.c_str() you're returning a pointer to stack allocated memory which gets overwritten on return from the C++ code.
Potential workarounds are to use static string str such as:
#include <string>
#include <sstream>
extern "C" {
const char * hallo(char * nome)
{
static std::string str;
std::stringstream stream;
stream << "hallo " << nome;
for(int i=0; i<100; i++)
stream << " new content";
str = stream.str();
return str.c_str();
}
}
Although this will prevent you from calling the routine from multiple threads.
If you want to be able to call it from multiple places, you should probably take a parameter pointer to some memory, and pass in a pointer buffer created from ctypes.create_string_buffer (hoping that it has the right size in this case).
For example:
#include <string>
#include <sstream>
extern "C" {
const char * hallo(char * nome, char *writebuffer, unsigned int buffersize)
{
std::string str(nome);
str = "hallo " + str;
for(int i=0; i<100; i++)
str += " new content";
if (str.size() < buffersize) {
str.copy(writebuffer, buffersize);
return writebuffer;
} else {
return 0;
}
}
}
Then some sample python code that uses this library; passing in a 128k buffer (python 3 updated):
import ctypes
lib = ctypes.CDLL(LIB_PATH)
lib.hallo.restype = ctypes.c_char_p
lib.hallo.argtypes =[ctypes.c_char_p, ctypes.c_char_p, ctypes.c_uint]
allocation = ctypes.create_string_buffer(128 * 1024)
j = lib.hallo('my name'.encode(), allocation, 128 * 1024)
if j is not None:
print('INIT: ' + j.decode("UTF-8"))
else:
print("Buffer was too small")

AttributeError: getattr

I'm new to both Python and SWIG, so maybe this is a common mistake. I have the following simple C++ class:
Header File:
class myMath
{
public:
myMath(void);
~myMath(void);
int add(int x, int y);
int minusOne(int x);
void sub(int *x, int *y, int *result);
int divide(int n, int d, int *r);
double avg_array(double *array, int len);
int sum_array(int *array, int len);
};
C++ File:
/* File : example.cpp */
#include "Example.h"
myMath::myMath()
{
}
myMath::~myMath()
{
}
int myMath::add(int x, int y)
{
return x + y;
}
int myMath::minusOne(int x)
{
return x - 1;
}
void myMath::sub(int *x, int *y, int *result)
{
*result = *x - *y;
}
int myMath::divide(int n, int d, int *r)
{
int q;
q = n/d;
*r = n - q*d;
return q;
}
double myMath::avg_array(double *array, int len)
{
double sum = 0.0;
double avg;
for (int i = 0; i < len; i++)
{
sum += array[i];
}
avg = sum / (double) len;
return avg;
}
int myMath::sum_array(int *array, int len)
{
int sum = 0;
for (int i = 0; i < len; i++)
{
sum += array[i];
}
return sum;
}
Interface File:
/* File : example.i */
%module example
%{
#include "Example.h"
%}
%include "Example.h"
And here is the Python code:
# file: swigExample.py
import sys
# Include the local Python modules
sys.path.append('C:/Temp/PTest_3')
import example
a = 37
b = 42
c = 0
print " a =",a
print " b =",b
print " c =",c
ex = example.myMath()
d = ex.myMath.minusOne( 10 )
print " d =",d
This is the error I get when I run the Python code:
Traceback (most recent call last):
File "C:\Temp\PTest_3\swigExample.py", line 20
d = ex.myMath.minusOne( 10 )
File "C:\Temp\PTest_3\example.py", line 94, in <lambda>
__getattr__ = lambda self, name: _swig_getattr(self, myMath, name)
File "C:\Temp\PTest_3\example.py", line 71, in _swig_getattr
return _swig_getattr_nondynamic(self, class_type, name, 0)
File "C:\Temp\PTest_3\example.py", line 66, in _swig_getattr_nondynamic
return object.__getattr__(self, name)
AttributeError: type object 'object' has no attribute '__getattr__'
I'm sure this must be something really basic, but I can't find a similar problem on the site. Thanks in advance.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Is it that hard to translate python code into c code? - python

In Python inversion, you have: s.pop(i) While in C, you have: if((n-c)/2<index) for(;index<n-c-1;index++) b[index]=b[index+1]; else{ for(;index>0;index--) b[index]=b[index-1]; b++; } Looks like a big difference to me. I don't think Python implements pop by copying all elements.

Related

unordered_map giving Segmentation fault (core dumped) when calling C++ library from python

Callback functions in ROS ? How to properly update them?

CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate]

CTYPES and PYTHON3: Can't return long string

AttributeError: getattr

Categories

Resources

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Is it that hard to translate python code into c code? - python

In Python inversion, you have: s.pop(i) While in C, you have: if((n-c)/2<index) for(;index<n-c-1;index++) b[index]=b[index+1]; else{ for(;index>0;index--) b[index]=b[index-1]; b++; } Looks like a big difference to me. I don't think Python implements pop by copying all elements.

Related

unordered_map giving Segmentation fault (core dumped) when calling C++ library from python

Callback functions in ROS ? How to properly update them?

CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate]

CTYPES and PYTHON3: Can't return long string

AttributeError: __getattr__

Categories

Resources

AttributeError: getattr