CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate]

CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate] - python

I am currently trying to get a simple multi-GPU program running with CUDA.
What it basically does is it copies a large array with some dummy data in chunks to the GPUs, which do some math, and then copy the resulting array back.
I dont get any errors in the output of VS2017, but some error messages I have set up show me that while trying to copy either H2D or D2H.
It tells me that a cudaErrorInvalidValue is occuring.
Also, when using the cudaFree(); function, i get a cudaErrorInvalidDevicePointer error.
The output of the program, the result, is completely wrong. The kernel is, for testing purposes, only setting every value of the output array to a value of 50. The result is a relatively large negative number, always the same no matter what the kernel does.
I have already tried to use a pointer that is not part of a struct, but is defined right before the cudaMalloc, where it is used first. That did not change anything.
This is the function that runs the Kernel:
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
Then here, how the struct is defined in the "kernel.h":
#ifndef KERNEL_H
#define KERNEL_H
#include "cuda_runtime.h"
//GPU plan
typedef struct
{
unsigned int Computations; //computations on this GPU
unsigned int Repetitions; // amount of kernel repetitions
unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working
float* d_data_ptr;
float* d_out_ptr;
cudaStream_t stream;
} GPUplan;
typedef struct
{
unsigned int Computations;
unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working
} KernelPlan;
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);
void memFree(int device, GPUplan gpuPlan);
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);
#endif
here the part of code that calls runKernel:
int GPU_N;
cudaGetDeviceCount(&GPU_N);
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
I dont think that the kernel itself would be of any importance here since the memcpy error already appears before it is even executed.
The expected output is, that every element of the output array is 50. Instead, every element is -431602080.0
The array is a float array.
EDIT: here is the full code used to reproduce the problem (in addition to kernel.h from above):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition
if (threadID > d_AdditionalComputationThreadCount) {
computations++; //check if thread has to do an additional computation
}
for (int i = 0; i < computations; i++) {
d_out[i * blockDim.x * gridDim.x + threadID] = 50;
}
}
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
GPUplan plan;
size_t free, total;
//computations on GPU #device
plan.Computations = DATA_N / GPU_N;
//take into account odd data size for this GPU
if (DATA_N % GPU_N > device) {
plan.Computations++;
}
plan.DataStartingPoint = dataCounter;
//get memory information
cudaSetDevice(device);
cudaMemGetInfo(&free, &total);
//calculate Repetitions on this GPU #device
plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
printf("Repetitions: %i\n", plan.Repetitions);
if (plan.Repetitions > MAX_REP_COUNT) {
printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
}
//calculate Computations per Repetition
plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;
//calculate how many Repetitions have to do an additional Computation
plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;
return plan;
}
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
KernelPlan plan;
//calculate total Calculations in this Repetition
plan.Computations = GPUComputationsPerRepetition;
if (GPUAdditionalComputationRepetitionsCount > Repetition) {
plan.Computations++;
}
plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation
plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;
return plan;
}
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 1 on GPU %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 2 on GPU %i: Error %i\n", device, x);
}
}
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
void memFree(int device, GPUplan gpuPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaFree(gpuPlan.d_data_ptr);
cudaFree(gpuPlan.d_out_ptr);
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memfree on GPU %i: Error %i\n", device, x);
}
else {
printf("memory freed.\n");
}
//17 = cudaErrorInvalidDevicePointer
}
int main()
{
//get device count
int GPU_N;
cudaGetDeviceCount(&GPU_N);
//adjust for device count larger than MAX_GPU_COUNT
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("GPU count: %i\n", GPU_N);
//definitions for running the program
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
///////////////////////////////////////////////////////////
//Subdividing input data across GPUs
//////////////////////////////////////////////
//GPUplan
GPUplan plan[MAX_GPU_COUNT];
int dataCounter = 0;
for (int i = 0; i < GPU_N; i++)
{
plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
dataCounter += plan[i].Computations;
}
//KernelPlan
KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];
for (int i = 0; i < GPU_N; i++)
{
int GPURepetitions = plan[i].Repetitions;
dataCounter = plan[i].DataStartingPoint;
for (int j = 0; j < GPURepetitions; j++)
{
kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);
dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
}
}
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
//generate some input data
for (int i = 0; i < DATA_N; i++) {
h_data[i] = 2 * i;
}
//get highest repetition count
int maxRepetitionCount = 0;
for (int i = 0; i < GPU_N; i++) {
if (plan[i].Repetitions > maxRepetitionCount) {
maxRepetitionCount = plan[i].Repetitions;
}
}
printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
//printing expected results and results
for (int i = 0; i < 50; i++)
{
printf("%f\t", h_data[i]);
printf("%f\n", h_out[i]);
}
free(h_data);
free(h_out);
getchar();
return 0;
}

The first problem has nothing to do with CUDA, actually. When you pass a struct by-value to a function in C or C++, a copy of that struct is made for use by the function. Modifications to that struct in the function have no effect on the original struct in the calling environment. This is affecting you in your memAllocation function:
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
^^^^^^^
passed by value
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
^^^^^^^^^^^^^^^^^^
modifying the copy, not the original
This is fairly easily fixable by passing the gpuPlan struct by reference rather than by value. Modify both the prototype in the kernel.h header file, as well as the definition:
void memAllocation(int device, int MemoryPerComputation, GPUplan &gpuPlan, KernelPlan kernelPlan)
^
with that change, the struct is passed by reference, and modifications (such as the setting of the allocated pointers) will show up in the calling environment. This is the proximal reason for the invalid argument report on the cudaMemcpy operations. The pointers you were passing were unallocated, because your allocations were done on the pointer copies, not the originals.
After that change your code may appear to be running correctly. At least when I run it no errors are displayed and the outputs appear to be all set to 50.
However there are still problems with this code. If you run your code with cuda-memcheck (or turn on the memory checker functionality in nsight VSE) you should see errors associated with this line of code, which is indexing out of bounds:
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
...
d_out[i * blockDim.x * gridDim.x + threadID] = 50; //indexing out of bounds
I'm not going to try to sort that out for you. It seems evident to me that your for-loop, coupled with the way you are calculating the index, is going beyond the end of the array. You can follow the methodology discussed here if needed.

Related

Trouble using Python ctypes to run C dll function (array output with unknown size)

I'm trying to execute a C function using Python ctypes, but I'm doing something wrong.
The C function was originally converted from MATLAB to C code using MATLAB coder. The function gives as output an array of undefined length, which depends on the input.
This is the MATLAB code:
function a = array_output(n)
%$codegen
if n > 2*pi
a = 1:n;
else
a = [1,2,3];
end
end
And this is the obtained C code:
void array_output(double n, emxArray_real_T *a)
{
int i;
int loop_ub;
if (n > 6.2831853071795862) {
i = a->size[0] * a->size[1];
a->size[0] = 1;
loop_ub = (int)floor(n - 1.0);
a->size[1] = loop_ub + 1;
emxEnsureCapacity_real_T(a, i);
for (i = 0; i <= loop_ub; i++) {
a->data[i] = (double)i + 1.0;
}
} else {
i = a->size[0] * a->size[1];
a->size[0] = 1;
a->size[1] = 3;
emxEnsureCapacity_real_T(a, i);
a->data[0] = 1.0;
a->data[1] = 2.0;
a->data[2] = 3.0;
}
}
struct emxArray_real_T
{
double *data;
int *size;
int allocatedSize;
int numDimensions;
boolean_T canFreeData;
};
Please, find at the end of my question an example implementation they provide.
I am trying to execute it using Python's ctype using the following code:
dll = ctypes.cdll.LoadLibrary(dll_path)
class DataStruct(ctypes.Structure):
_fields_ = [
('data', ctypes.POINTER(ctypes.c_double)),
('size', ctypes.POINTER(ctypes.c_int)),
('allocatedSize', ctypes.c_int),
('numDimensions', ctypes.c_int),
('canFreeData', ctypes.c_bool)
]
array = ctypes.POINTER(ctypes.c_double)()
size = numpy.array([0, 0]).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
allocatedSize = 0
numDimensions = 2
canFreeData = True
data_struct = DataStruct(array, size, allocatedSize, numDimensions, canFreeData)
dll.array_output.argtypes = [ctypes.c_double, DataStruct]
dll.array_output.restype = None
dll.array_output(50, data_struct)
The output data_struct contains the right size field (data_struct.size[1] is 50), but when I try to access data_struct.data[0], I get the following error:
ValueError: NULL pointer access
Can anyone help me understanding what I'm doing wrong here?
--
Example implementation (code snippets):
void main(){
// pseudo-code here
emxArray_real_T *a;
emxInitArray_real_T(&a, 2);
/* Initialize function 'array_output' input arguments. */
/* Call the entry-point 'array_output'. */
array_output(5.0, a);
}
void emxInitArray_real_T(emxArray_real_T **pEmxArray, int numDimensions)
{
emxInit_real_T(pEmxArray, numDimensions);
}
void emxInit_real_T(emxArray_real_T **pEmxArray, int numDimensions)
{
emxArray_real_T *emxArray;
int i;
*pEmxArray = (emxArray_real_T *)malloc(sizeof(emxArray_real_T));
emxArray = *pEmxArray;
emxArray->data = (double *)NULL;
emxArray->numDimensions = numDimensions;
emxArray->size = (int *)malloc(sizeof(int) * numDimensions);
emxArray->allocatedSize = 0;
emxArray->canFreeData = true;
for (i = 0; i < numDimensions; i++) {
emxArray->size[i] = 0;
}
}
void emxEnsureCapacity_real_T(emxArray_real_T *emxArray, int oldNumel)
{
int newNumel;
int i;
void *newData;
if (oldNumel < 0) {
oldNumel = 0;
}
newNumel = 1;
for (i = 0; i < emxArray->numDimensions; i++) {
newNumel *= emxArray->size[i];
}
if (newNumel > emxArray->allocatedSize) {
i = emxArray->allocatedSize;
if (i < 16) {
i = 16;
}
while (i < newNumel) {
if (i > 1073741823) {
i = MAX_int32_T;
} else {
i *= 2;
}
}
newData = calloc((unsigned int)i, sizeof(double));
if (emxArray->data != NULL) {
memcpy(newData, emxArray->data, sizeof(double) * oldNumel);
if (emxArray->canFreeData) {
free(emxArray->data);
}
}
emxArray->data = (double *)newData;
emxArray->allocatedSize = i;
emxArray->canFreeData = true;
}
}

The second argument of array_output is a emxArray_real_T *, but you're trying to pass the structure by value as if it were just a emxArray_real_T. To fix that, change dll.array_output.argtypes = [ctypes.c_double, DataStruct] to dll.array_output.argtypes = [ctypes.c_double, ctypes.POINTER(DataStruct)] and dll.array_output(50, data_struct) to dll.array_output(50, ctypes.byref(data_struct)).

Simple Multilateration c++ math realisation?

I'm trying to realize a multi trilateration in c++. I have three ranges with which I want to determine a certain point in 3d space. I've already looked up for solutions and found a usable python code snipped. Then I have rewritten the whole snipped in c++ but the results are kind of odd and of the expected results. Their must be some common mistakes which I just don't see (maybe it's too obvious lol).
The test data is from here, and I know that this linear way of solving the gps trillat problem is not even remotely accurate but that's not important for my purpose.
The resulting vector is: -3.39803e+29, 3.39803e+29, -3.39803e+29 which cannot be true.
/* Recursive function for finding determinant of matrix.
n(rows) is current dimension of A[][]. */
double clacDeterminant(double **A, int rows)
{
double D = 0; // Initialize result
// Base case : if matrix contains single element
if (rows == 1)
return A[0][0];
int sign = 1; // To store sign multiplier
// Iterate for each element of first row
for (int f = 0; f < rows; f++)
{
// Getting Cofactor of A[0][f]
double **temp = getCofactor(A, 0, f, rows);
D += sign * A[0][f] * clacDeterminant(A, rows - 1);
// terms are to be added with alternate sign
sign = -sign;
}
return D;
}
// Function to get adjoint of A[N][N] in adj[N][N].
double** calcAdjoint(double **A, int matrixArows)
{
double** adj = allocate2Ddouble(matrixArows, posMTrillatASize);
int sign = 0;
for (int i=0; i<matrixArows; i++)
{
for (int j=0; j<posMTrillatASize; j++)
{
// Get cofactor of A[i][j]
double **temp = getCofactor(A, i, j, posMTrillatASize);
// sign of adj[j][i] positive if sum of row
// and column indexes is even.
sign = ((i+j)%2==0)? 1: -1;
// Interchanging rows and columns to get the
// transpose of the cofactor matrix
adj[i][j] = (sign)*(clacDeterminant(A, posMTrillatASize-1));
}
}
return adj;
}
// by https://www.programiz.com/cpp-programming/examples/matrix-multiplication-function modified by L.K.
// method assumes that matrices have same dim sizes
double** multiplyMatrices(double **matrixA, double **matrixB, int matrixArows)
{
double** outputMatrix = allocate2Ddouble(matrixArows, posMTrillatASize);
int i, j, k;
// Initializing elements of matrix mult to 0.
for(i = 0; i < matrixArows; ++i)
{
for(j = 0; j < posMTrillatASize; ++j)
{
outputMatrix[i][j] = 0;
}
}
// Multiplying matrix matrixA and matrixB and storing in array mult.
for(i = 0; i < matrixArows; ++i)
{
for(j = 0; j < posMTrillatASize; ++j)
{
for(k=0; k<posMTrillatASize; ++k)
{
outputMatrix[i][j] += matrixA[i][k] * matrixB[k][j];
}
}
}
return outputMatrix;
}
double* multiplyMatrixByVector(double **matrixA, double vectorB[])
{
double vectorRes[posMTrillatASize];
memset(vectorRes, 0, posMTrillatASize*sizeof(double));
for (int i=0;i<posMTrillatASize;i++)
{
for (int j=0;j<posMTrillatASize;j++)
{
vectorRes[i]+= (matrixA[i][j]*vectorB[j]);
}
}
return vectorRes;
}
double** transpose2DimMatrix(double **inputArr, int matrixArows, int transpose2DimMatrix)
{
double **outputArr = allocate2Ddouble(transpose2DimMatrix, matrixArows);
for (int i = 0; i < matrixArows; ++i)
{
for (int j = 0; j < transpose2DimMatrix; ++j)
{
outputArr[j][i] = inputArr[i][j];
}
}
return outputArr;
}
// Function to calculate and store inverse, returns 0 if false
// matrix is singular by https://www.geeksforgeeks.org/adjoint-inverse-matrix/
double** calcInverse(double **A, int matrixArows)
{
double** inverse = allocate2Ddouble(matrixArows, posMTrillatASize);
// Find determinant of A[][]
int det = clacDeterminant(A, posMTrillatASize);
if (det == 0)
{
cout << "Singular matrix, can't find its inverse";
return 0;
}
// Find adjoint
double **adj = calcAdjoint(A, matrixArows);
// Find Inverse using formula "inverse(A) = adj(A)/det(A)"
for (int i=0; i<matrixArows; i++)
for (int j=0; j<posMTrillatASize; j++)
inverse[i][j] = adj[i][j]/double(det);
return inverse;
}
double* leastSquareReg(double **matrixA, double vectorB[], int matrixArows, int matrixAcol)
{
double **matrixATransposed = transpose2DimMatrix(matrixA, matrixArows, matrixAcol);
double **matrixATransposedA = multiplyMatrices(matrixATransposed, matrixA, matrixAcol);
double **matrixATransposedAInverse = calcInverse(matrixATransposedA, matrixArows);
double **matrixATransposedAAdd = multiplyMatrices(matrixATransposedAInverse, matrixATransposed, matrixArows);
double *finalX = multiplyMatrixByVector(matrixATransposedAAdd, vectorB);
return finalX;
}
ecefPos trillatPosFromRange(satLocation finalSatPos, satRanges finalSatRanges)
{
std::map<int, ecefPos>::iterator it_;
std::map<int, double>::iterator finalSatRangesMap;
int matrixArows, matrixAcol = 0;
double x, y, z;
double Am, Bm, Cm, Dm;
double range;
int nSat = finalSatPos.locations.size();
ecefPos finalPos = { 0.0, 0.0, 0.0 };
matrixAcol = posMTrillatASize;
matrixArows = nSat;
double **matrixA = allocate2Ddouble(nSat, 3);
double vectorB[posMTrillatASize] = {};
int i = 0;
for (it_ = finalSatPos.locations.begin(); it_ != finalSatPos.locations.end(); it_++)
{
// look up for pseudo range with same sat id
finalSatRangesMap = finalSatRanges.ranges.find(it_->first);
range = finalSatRangesMap->second;
if (it_ != finalSatPos.locations.end())
{
x = it_->second.x;
y = it_->second.y;
z = it_->second.z;
Am = -2*x;
Bm = -2*y;
Cm = -2*z;
Dm = EARTH_RADIUS_KM*EARTH_RADIUS_KM + (pow(x,2)+pow(y,2)+pow(z,2)) - pow(range,2);
matrixA[i][0] = Am;
matrixA[i][1] = Bm;
matrixA[i][2] = Cm;
vectorB[i] = Dm;
i++;
} else
{
std::cout << "could not find sat pos for user pos trilateration" << std::endl;
}
}
// least square regression
double *finalECEF = leastSquareReg(matrixA, vectorB, matrixArows, matrixAcol);
finalPos.x = finalECEF[0];
finalPos.y = finalECEF[1];
finalPos.z = finalECEF[2];
for (int i = 0; i < 5; ++i)
{
std::cout << finalECEF[i] << std::endl;
}
return finalPos;
}

Pass a 2d numpy array to c using ctypes

What is the correct way to pass a numpy 2d - array to a c function using ctypes ?
My current approach so far (leads to a segfault):
C code :
void test(double **in_array, int N) {
int i, j;
for(i = 0; i<N; i++) {
for(j = 0; j<N; j++) {
printf("%e \t", in_array[i][j]);
}
printf("\n");
}
}
Python code:
from ctypes import *
import numpy.ctypeslib as npct
array_2d_double = npct.ndpointer(dtype=np.double,ndim=2, flags='CONTIGUOUS')
liblr = npct.load_library('libtest.so', './src')
liblr.test.restype = None
liblr.test.argtypes = [array_2d_double, c_int]
x = np.arange(100).reshape((10,10)).astype(np.double)
liblr.test(x, 10)

This is probably a late answer, but I finally got it working. All credit goes to Sturla Molden at this link.
The key is, note that double** is an array of type np.uintp. Therefore, we have
xpp = (x.ctypes.data + np.arange(x.shape[0]) * x.strides[0]).astype(np.uintp)
doublepp = np.ctypeslib.ndpointer(dtype=np.uintp)
And then use doublepp as the type, pass xpp in. See full code attached.
The C code:
// dummy.c
#include <stdlib.h>
__declspec(dllexport) void foobar(const int m, const int n, const
double **x, double **y)
{
size_t i, j;
for(i=0; i<m; i++)
for(j=0; j<n; j++)
y[i][j] = x[i][j];
}
The Python code:
# test.py
import numpy as np
from numpy.ctypeslib import ndpointer
import ctypes
_doublepp = ndpointer(dtype=np.uintp, ndim=1, flags='C')
_dll = ctypes.CDLL('dummy.dll')
_foobar = _dll.foobar
_foobar.argtypes = [ctypes.c_int, ctypes.c_int, _doublepp, _doublepp]
_foobar.restype = None
def foobar(x):
y = np.zeros_like(x)
xpp = (x.__array_interface__['data'][0]
+ np.arange(x.shape[0])*x.strides[0]).astype(np.uintp)
ypp = (y.__array_interface__['data'][0]
+ np.arange(y.shape[0])*y.strides[0]).astype(np.uintp)
m = ctypes.c_int(x.shape[0])
n = ctypes.c_int(x.shape[1])
_foobar(m, n, xpp, ypp)
return y
if __name__ == '__main__':
x = np.arange(9.).reshape((3, 3))
y = foobar(x)
Hope it helps,
Shawn

#include <stdio.h>
void test(double (*in_array)[3], int N){
int i, j;
for(i = 0; i < N; i++){
for(j = 0; j < N; j++){
printf("%e \t", in_array[i][j]);
}
printf("\n");
}
}
int main(void)
{
double a[][3] = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.},
};
test(a, 3);
return 0;
}
if you want to use a double ** in your function, you must pass an array of pointer to double (not a 2d array):
#include <stdio.h>
void test(double **in_array, int N){
int i, j;
for(i = 0; i < N; i++){
for(j = 0; j< N; j++){
printf("%e \t", in_array[i][j]);
}
printf("\n");
}
}
int main(void)
{
double a[][3] = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.},
};
double *p[] = {a[0], a[1], a[2]};
test(p, 3);
return 0;
}
Another (as suggested by #eryksun): pass a single pointer and do some arithmetic to get the index:
#include <stdio.h>
void test(double *in_array, int N){
int i, j;
for(i = 0; i < N; i++){
for(j = 0; j< N; j++){
printf("%e \t", in_array[i * N + j]);
}
printf("\n");
}
}
int main(void)
{
double a[][3] = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.},
};
test(a[0], 3);
return 0;
}

While the reply might be rather late, I hope it could help other people with the same problem.
As numpy arrays are internally saved as 1d arrays, one can simply rebuild 2d shape in C. Here is a small MWE:
// libtest2d.c
#include <stdlib.h> // for malloc and free
#include <stdio.h> // for printf
// create a 2d array from the 1d one
double ** convert2d(unsigned long len1, unsigned long len2, double * arr) {
double ** ret_arr;
// allocate the additional memory for the additional pointers
ret_arr = (double **)malloc(sizeof(double*)*len1);
// set the pointers to the correct address within the array
for (int i = 0; i < len1; i++) {
ret_arr[i] = &arr[i*len2];
}
// return the 2d-array
return ret_arr;
}
// print the 2d array
void print_2d_list(unsigned long len1,
unsigned long len2,
double * list) {
// call the 1d-to-2d-conversion function
double ** list2d = convert2d(len1, len2, list);
// print the array just to show it works
for (unsigned long index1 = 0; index1 < len1; index1++) {
for (unsigned long index2 = 0; index2 < len2; index2++) {
printf("%1.1f ", list2d[index1][index2]);
}
printf("\n");
}
// free the pointers (only)
free(list2d);
}
and
# test2d.py
import ctypes as ct
import numpy as np
libtest2d = ct.cdll.LoadLibrary("./libtest2d.so")
libtest2d.print_2d_list.argtypes = (ct.c_ulong, ct.c_ulong,
np.ctypeslib.ndpointer(dtype=np.float64,
ndim=2,
flags='C_CONTIGUOUS'
)
)
libtest2d.print_2d_list.restype = None
arr2d = np.meshgrid(np.linspace(0, 1, 6), np.linspace(0, 1, 11))[0]
libtest2d.print_2d_list(arr2d.shape[0], arr2d.shape[1], arr2d)
If you compile the code with gcc -shared -fPIC libtest2d.c -o libtest2d.so and then run python test2d.py it should print the array.
I hope the example is more or less self-explaining. The idea is, that the shape is also given to the C-Code which then creates a double ** pointer for which the space for the additional pointers is reserved. And these then are then set to point to the correct part of the original array.
PS: I am rather a beginner in C so please comment if there are reasons not to do this.

Here i hava pass two 2d numpy array and print value of one array for the reference
you can use and write your own logic in cpp
cpp_function.cpp
compile it using : g++ -shared -fPIC cpp_function.cpp -o cpp_function.so
#include <iostream>
extern "C" {
void mult_matrix(double *a1, double *a2, size_t a1_h, size_t a1_w,
size_t a2_h, size_t a2_w, int size)
{
//std::cout << "a1_h & a1_w" << a1_h << a1_w << std::endl;
//std::cout << "a2_h & a2_w" << a2_h << a2_w << std::endl;
for (size_t i = 0; i < a1_h; i++) {
for (size_t j = 0; j < a1_w; j++) {
printf("%f ", a1[i * a1_h + j]);
}
printf("\n");
}
printf("\n");
}
}
Python File
main.py
import ctypes
import numpy
from time import time
libmatmult = ctypes.CDLL("./cpp_function.so")
ND_POINTER_1 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
ndim=2,
flags="C")
ND_POINTER_2 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
ndim=2,
flags="C")
libmatmult.mult_matrix.argtypes = [ND_POINTER_1, ND_POINTER_2, ctypes.c_size_t, ctypes.c_size_t]
# print("-->", ctypes.c_size_t)
def mult_matrix_cpp(a,b):
shape = a.shape[0] * a.shape[1]
libmatmult.mult_matrix.restype = None
libmatmult.mult_matrix(a, b, *a.shape, *b.shape , a.shape[0] * a.shape[1])
size_a = (300,300)
size_b = size_a
a = numpy.random.uniform(low=1, high=255, size=size_a)
b = numpy.random.uniform(low=1, high=255, size=size_b)
t2 = time()
out_cpp = mult_matrix_cpp(a,b)
print("cpp time taken:{:.2f} ms".format((time() - t2) * 1000))
out_cpp = numpy.array(out_cpp).reshape(size_a[0], size_a[1])

NO Idea why this python script will not run [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 10 years ago.
so I have this python script here, and I am trying to run it but I get a syntax error. I do not see anything wrong with the syntax. Here is the code:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define KSYM_NAME_LEN 127
struct sym_entry {
unsigned long long addr;
unsigned int len;
unsigned char *sym;
};
static struct sym_entry *table;
static unsigned int table_size, table_cnt;
static unsigned long long _text, _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext;
static int all_symbols = 0;
static char symbol_prefix_char = '\0';
int token_profit[0x10000];
/* the table that holds the result of the compression */
unsigned char best_table[256][2];
unsigned char best_table_len[256];
static void usage(void)
{
fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n");
exit(1);
}
/*
* This ignores the intensely annoying "mapping symbols" found
* in ARM ELF files: $a, $t and $d.
*/
static inline int is_arm_mapping_symbol(const char *str)
{
return str[0] == '$' && strchr("atd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
static int read_symbol(FILE *in, struct sym_entry *s)
{
char str[500];
char *sym, stype;
int rc;
rc = fscanf(in, "%llx %c %499s\n", &s->addr, &stype, str);
if (rc != 3) {
if (rc != EOF) {
/* skip line */
fgets(str, 500, in);
}
return -1;
}
sym = str;
/* skip prefix char */
if (symbol_prefix_char && str[0] == symbol_prefix_char)
sym++;
/* Ignore most absolute/undefined (?) symbols. */
if (strcmp(sym, "_text") == 0)
_text = s->addr;
else if (strcmp(sym, "_stext") == 0)
_stext = s->addr;
else if (strcmp(sym, "_etext") == 0)
_etext = s->addr;
else if (strcmp(sym, "_sinittext") == 0)
_sinittext = s->addr;
else if (strcmp(sym, "_einittext") == 0)
_einittext = s->addr;
else if (strcmp(sym, "_sextratext") == 0)
_sextratext = s->addr;
else if (strcmp(sym, "_eextratext") == 0)
_eextratext = s->addr;
else if (toupper(stype) == 'A')
{
/* Keep these useful absolute symbols */
if (strcmp(sym, "__kernel_syscall_via_break") &&
strcmp(sym, "__kernel_syscall_via_epc") &&
strcmp(sym, "__kernel_sigtramp") &&
strcmp(sym, "__gp"))
return -1;
}
else if (toupper(stype) == 'U' ||
is_arm_mapping_symbol(sym))
return -1;
/* exclude also MIPS ELF local symbols ($L123 instead of .L123) */
else if (str[0] == '$')
return -1;
/* include the type field in the symbol name, so that it gets
* compressed together */
s->len = strlen(str) + 1;
s->sym = malloc(s->len + 1);
if (!s->sym) {
fprintf(stderr, "kallsyms failure: "
"unable to allocate required amount of memory\n");
exit(EXIT_FAILURE);
}
strcpy((char *)s->sym + 1, str);
s->sym[0] = stype;
return 0;
}
static int symbol_valid(struct sym_entry *s)
{
/* Symbols which vary between passes. Passes 1 and 2 must have
* identical symbol lists. The kallsyms_* symbols below are only added
* after pass 1, they would be included in pass 2 when --all-symbols is
* specified so exclude them to get a stable symbol list.
*/
static char *special_symbols[] = {
"kallsyms_addresses",
"kallsyms_num_syms",
"kallsyms_names",
"kallsyms_markers",
"kallsyms_token_table",
"kallsyms_token_index",
/* Exclude linker generated symbols which vary between passes */
"_SDA_BASE_", /* ppc */
"_SDA2_BASE_", /* ppc */
NULL };
int i;
int offset = 1;
/* skip prefix char */
if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char)
offset++;
/* if --all-symbols is not specified, then symbols outside the text
* and inittext sections are discarded */
if (!all_symbols) {
if ((s->addr < _stext || s->addr > _etext)
&& (s->addr < _sinittext || s->addr > _einittext)
&& (s->addr < _sextratext || s->addr > _eextratext))
return 0;
/* Corner case. Discard any symbols with the same value as
* _etext _einittext or _eextratext; they can move between pass
* 1 and 2 when the kallsyms data are added. If these symbols
* move then they may get dropped in pass 2, which breaks the
* kallsyms rules.
*/
if ((s->addr == _etext && strcmp((char*)s->sym + offset, "_etext")) ||
(s->addr == _einittext && strcmp((char*)s->sym + offset, "_einittext")) ||
(s->addr == _eextratext && strcmp((char*)s->sym + offset, "_eextratext")))
return 0;
}
/* Exclude symbols which vary between passes. */
if (strstr((char *)s->sym + offset, "_compiled."))
return 0;
for (i = 0; special_symbols[i]; i++)
if( strcmp((char *)s->sym + offset, special_symbols[i]) == 0 )
return 0;
return 1;
}
static void read_map(FILE *in)
{
while (!feof(in)) {
if (table_cnt >= table_size) {
table_size += 10000;
table = realloc(table, sizeof(*table) * table_size);
if (!table) {
fprintf(stderr, "out of memory\n");
exit (1);
}
}
if (read_symbol(in, &table[table_cnt]) == 0)
table_cnt++;
}
}
static void output_label(char *label)
{
if (symbol_prefix_char)
printf(".globl %c%s\n", symbol_prefix_char, label);
else
printf(".globl %s\n", label);
printf("\tALGN\n");
if (symbol_prefix_char)
printf("%c%s:\n", symbol_prefix_char, label);
else
printf("%s:\n", label);
}
/* uncompress a compressed symbol. When this function is called, the best table
* might still be compressed itself, so the function needs to be recursive */
static int expand_symbol(unsigned char *data, int len, char *result)
{
int c, rlen, total=0;
while (len) {
c = *data;
/* if the table holds a single char that is the same as the one
* we are looking for, then end the search */
if (best_table[c][0]==c && best_table_len[c]==1) {
*result++ = c;
total++;
} else {
/* if not, recurse and expand */
rlen = expand_symbol(best_table[c], best_table_len[c], result);
total += rlen;
result += rlen;
}
data++;
len--;
}
*result=0;
return total;
}
static void write_src(void)
{
unsigned int i, k, off;
unsigned int best_idx[256];
unsigned int *markers;
char buf[KSYM_NAME_LEN+1];
printf("#include <asm/types.h>\n");
printf("#if BITS_PER_LONG == 64\n");
printf("#define PTR .quad\n");
printf("#define ALGN .align 8\n");
printf("#else\n");
printf("#define PTR .long\n");
printf("#define ALGN .align 4\n");
printf("#endif\n");
printf(".data\n");
/* Provide proper symbols relocatability by their '_text'
* relativeness. The symbol names cannot be used to construct
* normal symbol references as the list of symbols contains
* symbols that are declared static and are private to their
* .o files. This prevents .tmp_kallsyms.o or any other
* object from referencing them.
*/
output_label("kallsyms_addresses");
for (i = 0; i < table_cnt; i++) {
if (toupper(table[i].sym[0]) != 'A') {
printf("\tPTR\t_text + %#llx\n",
table[i].addr - _text);
} else {
printf("\tPTR\t%#llx\n", table[i].addr);
}
}
printf("\n");
output_label("kallsyms_num_syms");
printf("\tPTR\t%d\n", table_cnt);
printf("\n");
/* table of offset markers, that give the offset in the compressed stream
* every 256 symbols */
markers = malloc(sizeof(unsigned int) * ((table_cnt + 255) / 256));
if (!markers) {
fprintf(stderr, "kallsyms failure: "
"unable to allocate required memory\n");
exit(EXIT_FAILURE);
}
output_label("kallsyms_names");
off = 0;
for (i = 0; i < table_cnt; i++) {
if ((i & 0xFF) == 0)
markers[i >> 8] = off;
printf("\t.byte 0x%02x", table[i].len);
for (k = 0; k < table[i].len; k++)
printf(", 0x%02x", table[i].sym[k]);
printf("\n");
off += table[i].len + 1;
}
printf("\n");
output_label("kallsyms_markers");
for (i = 0; i < ((table_cnt + 255) >> 8); i++)
printf("\tPTR\t%d\n", markers[i]);
printf("\n");
free(markers);
output_label("kallsyms_token_table");
off = 0;
for (i = 0; i < 256; i++) {
best_idx[i] = off;
expand_symbol(best_table[i], best_table_len[i], buf);
printf("\t.asciz\t\"%s\"\n", buf);
off += strlen(buf) + 1;
}
printf("\n");
output_label("kallsyms_token_index");
for (i = 0; i < 256; i++)
printf("\t.short\t%d\n", best_idx[i]);
printf("\n");
}
/* table lookup compression functions */
/* count all the possible tokens in a symbol */
static void learn_symbol(unsigned char *symbol, int len)
{
int i;
for (i = 0; i < len - 1; i++)
token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++;
}
/* decrease the count for all the possible tokens in a symbol */
static void forget_symbol(unsigned char *symbol, int len)
{
int i;
for (i = 0; i < len - 1; i++)
token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--;
}
/* remove all the invalid symbols from the table and do the initial token count */
static void build_initial_tok_table(void)
{
unsigned int i, pos;
pos = 0;
for (i = 0; i < table_cnt; i++) {
if ( symbol_valid(&table[i]) ) {
if (pos != i)
table[pos] = table[i];
learn_symbol(table[pos].sym, table[pos].len);
pos++;
}
}
table_cnt = pos;
}
/* replace a given token in all the valid symbols. Use the sampled symbols
* to update the counts */
static void compress_symbols(unsigned char *str, int idx)
{
unsigned int i, len, size;
unsigned char *p1, *p2;
for (i = 0; i < table_cnt; i++) {
len = table[i].len;
p1 = table[i].sym;
/* find the token on the symbol */
p2 = memmem(p1, len, str, 2);
if (!p2) continue;
/* decrease the counts for this symbol's tokens */
forget_symbol(table[i].sym, len);
size = len;
do {
*p2 = idx;
p2++;
size -= (p2 - p1);
memmove(p2, p2 + 1, size);
p1 = p2;
len--;
if (size < 2) break;
/* find the token on the symbol */
p2 = memmem(p1, size, str, 2);
} while (p2);
table[i].len = len;
/* increase the counts for this symbol's new tokens */
learn_symbol(table[i].sym, len);
}
}
/* search the token with the maximum profit */
static int find_best_token(void)
{
int i, best, bestprofit;
bestprofit=-10000;
best = 0;
for (i = 0; i < 0x10000; i++) {
if (token_profit[i] > bestprofit) {
best = i;
bestprofit = token_profit[i];
}
}
return best;
}
/* this is the core of the algorithm: calculate the "best" table */
static void optimize_result(void)
{
int i, best;
/* using the '\0' symbol last allows compress_symbols to use standard
* fast string functions */
for (i = 255; i >= 0; i--) {
/* if this table slot is empty (it is not used by an actual
* original char code */
if (!best_table_len[i]) {
/* find the token with the breates profit value */
best = find_best_token();
/* place it in the "best" table */
best_table_len[i] = 2;
best_table[i][0] = best & 0xFF;
best_table[i][1] = (best >> 8) & 0xFF;
/* replace this token in all the valid symbols */
compress_symbols(best_table[i], i);
}
}
}
/* start by placing the symbols that are actually used on the table */
static void insert_real_symbols_in_table(void)
{
unsigned int i, j, c;
memset(best_table, 0, sizeof(best_table));
memset(best_table_len, 0, sizeof(best_table_len));
for (i = 0; i < table_cnt; i++) {
for (j = 0; j < table[i].len; j++) {
c = table[i].sym[j];
best_table[c][0]=c;
best_table_len[c]=1;
}
}
}
static void optimize_token_table(void)
{
build_initial_tok_table();
insert_real_symbols_in_table();
/* When valid symbol is not registered, exit to error */
if (!table_cnt) {
fprintf(stderr, "No valid symbol.\n");
exit(1);
}
optimize_result();
}
int main(int argc, char **argv)
{
if (argc >= 2) {
int i;
for (i = 1; i < argc; i++) {
if(strcmp(argv[i], "--all-symbols") == 0)
all_symbols = 1;
else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) {
char *p = &argv[i][16];
/* skip quote */
if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\''))
p++;
symbol_prefix_char = *p;
} else
usage();
}
} else if (argc != 1)
usage();
read_map(stdin);
optimize_token_table();
write_src();
return 0;
}
Here is the error that I am getting:
File "joe.py", line 11
struct sym_entry {
^
SyntaxError: invalid syntax

Primary problem may be that the python code is actually C

Is it that hard to translate python code into c code?

once I wrote a piece of python code to do the calculation of inversions.
And just for fun plus want to see if c is really way faster than runtime language,
I wrote the similar python code in c.
the python code took less than 3 secs to run,
but my c code took about 8 secs.
Can anyone tell me where I did wrong and how I should change/improve my C code?
Thank you!!
the input file is just 100000 lines of numbers, not repeated, from 1 to 100000 in random order.
python code:
import time
def lookup(arr, num):
lower=0
upper=len(arr)-1
while 1:
if arr[upper]==num:
return upper
mid=(upper+lower)/2
if arr[mid]==num:
return mid
elif arr[mid]<num:
lower=mid+1
else:
upper=mid
def main():
time.clock()
f=open("IntegerArray.txt", "r").read().strip().split('\n')
array=[int(i) for i in f]
array.reverse()
s=sorted(array)
v=0
while(array):
i=lookup(s, array.pop())
v+=i
s.pop(i)
print v
print time.clock()
raw_input()
main()
C code:
#include <stdio.h>
#include <stdlib.h>
#define limit 100000
int comp(const void *, const void *);
void inversion(int *, int *, int);
int lookup(int *, int, int);
int main(void){
FILE *f = fopen("IntegerArray.txt", "r");
int n=0, array[limit], copy[limit];
while(n<limit){
fscanf(f, "%d", array+n);
copy[n]=array[n];
n++;
}
qsort(copy, limit, sizeof(int), comp);
// int i;
// for(i=limit-100;i<limit;i++)printf("%d %d\n", array[i], copy[i]);
inversion(array, copy, limit);
// getchar();
return 0;
}
int comp(const void *a, const void *b){
return (*(int*)a>*(int*)b)?1:-1;
}
void inversion(int *a, int *b, int n){
int c=0, index;
unsigned int v=0;
while(c<n){
// if(c%1000==0)printf("%d %d\n", c, b[n-c-1]);
index=lookup(b, *a, n-c);
v+=index;
if((n-c)/2<index)
for(;index<n-c-1;index++)
b[index]=b[index+1];
else{
for(;index>0;index--)
b[index]=b[index-1];
b++;
}
a++;
c++;
}
printf("%u\n", v);
}
int lookup(int *arr, int num, int n){
int lower=0, upper=n-1, mid;
// if(arr[upper]==num)return upper;
// if(arr[lower]==num)return lower;
while(1){
// printf("%d %d %d\n", lower, upper, num);
mid=(lower+upper)/2;
// if(lower==upper && arr[mid]!=num){
// printf("%d %d %d\n", lower, upper, num);
// exit(1);
// }
if(arr[mid]==num)return mid;
else if(arr[mid]<num)
lower=mid+1;
else
upper=mid;
}
}

In Python inversion, you have:
s.pop(i)
While in C, you have:
if((n-c)/2<index)
for(;index<n-c-1;index++)
b[index]=b[index+1];
else{
for(;index>0;index--)
b[index]=b[index-1];
b++;
}
Looks like a big difference to me.
I don't think Python implements pop by copying all elements.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

CUDA Runtime API error 1: invalid argument (cudaMemcpy) [duplicate] - python

Related

Trouble using Python ctypes to run C dll function (array output with unknown size)

Simple Multilateration c++ math realisation?

Pass a 2d numpy array to c using ctypes

NO Idea why this python script will not run [closed]

Is it that hard to translate python code into c code?

Categories

Resources