How, when and what to vectorize in python?

How, when and what to vectorize in python? - python

Right, so this is basically a follow up of an earlier question of mine. I have some binary data that are in floating point binary format. Using C, the process is fast, but I lose some precision with atof(). I tried looking through the forum, and also elsewhere, but my problem was not solved. As such, I moved to python. Ah joy! the program worked perfectly well, but is so very slow compared to C. I looked up optimizations on python, which pointed me to Cython and Weave, but I have some doubts. If you will follow my code, I am confused where to apply the optimizing C code, since I am reading from the numpy object. My question, is it possible to read data using numpy functions within the Cython, and if so, please provide a small example.
The C Code uses PolSARpro's header files, and libbmp for creating the .bmp file
As a note, I am posting both my codes. God knows I had to go through a lot just to get the formulas working. This way, others in need can give their thoughts and input too :)
C Code (Working, but atof() loses precision, thus output lat long are slightly off)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <polSARpro/bmpfile.c>
#include <polSARpro/graphics.c>
#include <polSARpro/matrix.c>
#include <polSARpro/processing.c>
#include <polSARpro/util.c>
#define METAL_THRESHOLD 5.000000
#define POLARIZATION_FRACTION_THRESHOLD 0.900000
#define PI 3.14159265
#define FOURTHPI PI/4
#define deg2rad PI/180
#define rad2deg 180./PI
/*double PI = 3.14159265;
double FOURTHPI = PI / 4;
double deg2rad = PI / 180;
double rad2deg = 180.0 / PI;*/
FILE *L1,*PF,*SPF;
FILE *txt;
FILE *finalLocations;
long i=0,loop_end;
int lig,col;
float l1,pf,spf;
long pos;
int Nlig,Ncol;
float *bufferout;
float *bufferin_L1,*bufferin_L2;
float valueL1,valuePF,xx;
float sizeGridX, sizeGridY, startX, startY;
float posX,posY;
int ZONE;
char Heading[10];
char setZone[15];
int p[4][2];
int degree, minute, second;
void UTM2LL(int ReferenceEllipsoid, double UTMNorthing, double UTMEasting, char* UTMZone, double *Lat, double *Long)
{
//converts UTM coords to lat/long. Equations from USGS Bulletin 1532
//East Longitudes are positive, West longitudes are negative.
//North latitudes are positive, South latitudes are negative
//Lat and Long are in decimal degrees.
//Written by Chuck Gantz- chuck.gantz#globalstar.com
double k0 = 0.9996;
double a = 6378137;
double eccSquared = 0.00669438;
double eccPrimeSquared;
double e1 = (1-sqrt(1-eccSquared))/(1+sqrt(1-eccSquared));
double N1, T1, C1, R1, D, M;
double LongOrigin;
double mu, phi1, phi1Rad;
double x, y;
int ZoneNumber;
char* ZoneLetter;
int NorthernHemisphere; //1 for northern hemispher, 0 for southern
x = UTMEasting - 500000.0; //remove 500,000 meter offset for longitude
y = UTMNorthing;
ZoneNumber = strtoul(UTMZone, &ZoneLetter, 10);
if((*ZoneLetter - 'N') >= 0)
NorthernHemisphere = 1;//point is in northern hemisphere
else
{
NorthernHemisphere = 0;//point is in southern hemisphere
y -= 10000000.0;//remove 10,000,000 meter offset used for southern hemisphere
}
LongOrigin = (ZoneNumber - 1)*6 - 180 + 3; //+3 puts origin in middle of zone
eccPrimeSquared = (eccSquared)/(1-eccSquared);
M = y / k0;
mu = M/(a*(1-eccSquared/4-3*eccSquared*eccSquared/64-5*eccSquared*eccSquared*eccSquared/256));
phi1Rad = mu + (3*e1/2-27*e1*e1*e1/32)*sin(2*mu)
+ (21*e1*e1/16-55*e1*e1*e1*e1/32)*sin(4*mu)
+(151*e1*e1*e1/96)*sin(6*mu);
phi1 = phi1Rad*rad2deg;
N1 = a/sqrt(1-eccSquared*sin(phi1Rad)*sin(phi1Rad));
T1 = tan(phi1Rad)*tan(phi1Rad);
C1 = eccPrimeSquared*cos(phi1Rad)*cos(phi1Rad);
R1 = a*(1-eccSquared)/pow(1-eccSquared*sin(phi1Rad)*sin(phi1Rad), 1.5);
D = x/(N1*k0);
*Lat = phi1Rad - (N1*tan(phi1Rad)/R1)*(D*D/2-(5+3*T1+10*C1-4*C1*C1-9*eccPrimeSquared)*D*D*D*D/24
+(61+90*T1+298*C1+45*T1*T1-252*eccPrimeSquared-3*C1*C1)*D*D*D*D*D*D/720);
*Lat = *Lat * rad2deg;
*Long = (D-(1+2*T1+C1)*D*D*D/6+(5-2*C1+28*T1-3*C1*C1+8*eccPrimeSquared+24*T1*T1)
*D*D*D*D*D/120)/cos(phi1Rad);
*Long = LongOrigin + *Long * rad2deg;
}
void convertToDegree(float decimal)
{
int negative = decimal < 0;
decimal = abs(decimal);
minute = (decimal * 3600/ 60);
second = fmodf((decimal * 3600),60);
degree = minute / 60;
minute = minute % 60;
if (negative)
{
if (degree > 0)
degree = -degree;
else if (minute > 0)
minute = -minute;
else
second = -second;
}
}
void readConfig(int *Row, int *Col)
{
char tmp[70];
int i=0;
FILE *fp = fopen("config.txt","r");
if(fp == NULL)
{
perror("Config.txt");
exit(1);
}
while(!feof(fp))
{
fgets(tmp,70,fp);
if (i==1)
*Row = atoi(tmp);
if(i==4)
*Col = atoi(tmp);
i++;
}
fclose(fp);
}
void readHDR(float *gridX,float *gridY,float *startXPos,float *startYPos)
{
FILE *fp = fopen("PF.bin.hdr","r");
int i=0;
char tmp[255];
char junk[255];
memset(tmp,0X00,sizeof(tmp));
memset(junk,0X00,sizeof(junk));
if(fp==NULL)
{
perror("Please locate or create PF.bin.hdr");
exit(0);
}
while(!feof(fp))
{
if(i==13)
break;
fgets(tmp,255,fp);
i++;
}
fclose(fp);
strcpy(junk,strtok(tmp,","));
strtok(NULL,",");
strtok(NULL,",");
strcpy(tmp,strtok(NULL,","));
//puts(tmp);
*startXPos = atof(tmp);
strcpy(tmp,strtok(NULL,","));
//puts(tmp);
*startYPos = atof(tmp);
strcpy(tmp,strtok(NULL,","));
//puts(tmp);
*gridX = atof(tmp);
strcpy(tmp,strtok(NULL,","));
//puts(tmp);
*gridY = atof(tmp);
strcpy(tmp,strtok(NULL,","));
ZONE = atoi(tmp);
strcpy(tmp,strtok(NULL,","));
strcpy(Heading,tmp);
}
int main()
{
bmpfile_t *bmp;
double Lat;
double Long;
int i;
rgb_pixel_t pixelMetal = {128, 64, 0, 0};
rgb_pixel_t pixelOthers = {128, 64, 0, 0};
readConfig(&Nlig,&Ncol);
readHDR(&sizeGridX,&sizeGridY,&startX,&startY);
//startX = startX - (double) 0.012000;
//startY = startY + (double)0.111000;
printf("Enter the rectangle's top-left and bottom-right region of interest points as: x y\n");
for(i=0;i<2;i++)
{
printf("Enter point %d::\t",i+1);
scanf("%d %d",&p[i][0], &p[i][1]);
}
printf("Grid Size(X,Y)::( %f,%f ), Start Positions(X,Y)::( %f, %f ), ZONE::%d, Heading:: %s\n\n",sizeGridX,sizeGridY,startX,startY,ZONE,Heading);
pixelMetal.red = 255;
pixelMetal.blue = 010;
pixelMetal.green = 010;
pixelOthers.red = 8;
pixelOthers.blue = 8;
pixelOthers.green = 8;
L1 = fopen("l1.bin","rb");
PF =fopen("PF.bin","rb");
SPF = fopen("SPF_L1.bin","wb");
//txt = fopen("locations(UTM).txt","w");
finalLocations = fopen("locationsROI.txt","w");
if(L1==NULL || PF==NULL || SPF==NULL || finalLocations == NULL)
{
perror("Error in opening files!");
return -1;
}
fseek(L1,0,SEEK_END);
pos = ftell(L1);
loop_end = pos;
printf("L1.bin contains::\t%ld elements\n",pos);
fseek(PF,0,SEEK_END);
pos = ftell(PF);
printf("PF.bin contains::\t%ld elements\n",pos);
fseek(L1,0,SEEK_SET);
fseek(PF,0,SEEK_SET);
bmp = bmp_create(Ncol,Nlig,8); //width * height
bufferin_L1 = vector_float(Ncol);
bufferin_L2 = vector_float(Ncol);
bufferout = vector_float(Ncol);
printf("Resources Allocated. Beginning...\n");
for (lig = 0; lig < Nlig; lig++) /* rows */
{
if (lig%(int)(Nlig/20) == 0)
{
printf("%f\r", 100. * lig / (Nlig - 1));
fflush(stdout);
}
fread(&bufferin_L1[0], sizeof(float), Ncol, L1);
fread(&bufferin_L2[0], sizeof(float), Ncol, PF);
for (col = 0; col < Ncol; col++) /* columns */
{
valueL1 = bufferin_L1[col];
valuePF = bufferin_L2[col];
if(valueL1 >= METAL_THRESHOLD && valuePF >= POLARIZATION_FRACTION_THRESHOLD)
{
if(col >= p[0][0] && col <= p[1][0] && lig >= p[0][1] && lig <= p[1][1])
{
xx = fabs(valueL1 + valuePF);
bmp_set_pixel(bmp,col,lig,pixelMetal);
posX = startX + (sizeGridX * col);
posY = startY - (sizeGridY * lig);
//fprintf(txt,"%f %f %d %s\n",posX,posY,ZONE,Heading);
sprintf(setZone,"%d",ZONE);
if(strstr(Heading,"Nor")!=NULL)
strcat(setZone,"N");
else
strcat(setZone,"S");
UTM2LL(23, posY, posX, setZone, &Lat, &Long); // 23 for WGS-84
convertToDegree(Lat);
//fprintf(finalLocations,"UTM:: %.2fE %.2fN , Decimal: %f %f , Degree: %d %d %d, ",posX,posY,Lat,Long,degree,minute,second);
//fprintf(finalLocations,"%.2fE,%.2fN,%f,%f ,%d,%d,%d,",posX,posY,Lat,Long,degree,minute,second);
fprintf(finalLocations,"%.2f,%.2f,%f,%f ,%d,%d,%d,",posX,posY,Lat,Long,degree,minute,second);
convertToDegree(Long);
fprintf(finalLocations,"%d,%d,%d\n",degree,minute,second);
}
else
{
xx = fabs(valueL1) ;
bmp_set_pixel(bmp,col,lig,pixelOthers);
}
}
else
{
xx = fabs(valueL1) ;
bmp_set_pixel(bmp,col,lig,pixelOthers);
}
bufferout[col] = xx;
}
fwrite(&bufferout[0], sizeof(float), Ncol, SPF);
}
free_vector_float(bufferout);
fclose(L1);
fclose(PF);
fclose(SPF);
//fclose(txt);
fclose(finalLocations);
printf("\n----------Writing BMP File!----------\n");
bmp_save(bmp,"SPF_L1(ROI).bmp");
bmp_destroy(bmp);
printf("\nDone!\n");
}
As well as the Python code::
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 10 10:29:18 2013
#author: Binayaka
"""
import numpy as Num;
import math;
import array;
class readConfiguration(object):
def __init__(self,x):
self.readConfig(x);
def readConfig(self,x):
try:
crs = open(x,'r');
srs = open('config.txt','r');
except IOError:
print "Files missing!";
else:
rows = crs.readlines();
values = rows[12].split(',');
rows = srs.readlines();
self.startX = float(values[3]);
self.startY = float(values[4]);
self.gridSizeX = float(values[5]);
self.gridSizeY = float(values[6]);
self.Zone = int(values[7]);
self.Hemisphere = values[8];
self.NRows = int(rows[1].strip());
self.NCols = int(rows[4].strip());
self.MetalThreshold = 5.000000;
self.PFThreshold = 0.900000;
self.rad2deg = 180/math.pi;
self.deg2rad = math.pi/180;
self.FOURTHPI = math.pi/4;
crs.close();
srs.close();
def decdeg2dms(dd):
negative = dd < 0;
dd = abs(dd);
minutes,seconds = divmod(dd*3600,60);
degrees,minutes = divmod(minutes,60);
if negative:
if degrees > 0:
degrees = -degrees;
elif minutes > 0:
minutes = -minutes;
else:
seconds = -seconds;
return (degrees,minutes,seconds);
def UTM2LL(self,UTMEasting, UTMNorthing):
k0 = 0.9996;
a = 6378137;
eccSquared = 0.00669438;
e1 = (1-math.sqrt(1-eccSquared))/(1+math.sqrt(1-eccSquared));
x = UTMEasting - 500000.0;#remove 500,000 meter offset for longitude
y = UTMNorthing;
if self.Hemisphere == "North":
self.Hemi = 1;
else:
self.Hemi = -1;
y -= 10000000.0;
LongOrigin = (self.Zone - 1)*6 - 180 + 3;
eccPrimeSquared = (eccSquared)/(1-eccSquared);
M = y / k0;
mu = M/(a*(1-eccSquared/4-3*eccSquared*eccSquared/64-5*eccSquared*eccSquared*eccSquared/256));
phi1Rad = mu + (3*e1/2-27*e1*e1*e1/32)*math.sin(2*mu) + (21*e1*e1/16-55*e1*e1*e1*e1/32)*math.sin(4*mu) +(151*e1*e1*e1/96)*math.sin(6*mu);
#phi1 = phi1Rad*self.rad2deg;
N1 = a/math.sqrt(1-eccSquared*math.sin(phi1Rad)*math.sin(phi1Rad));
T1 = math.tan(phi1Rad)*math.tan(phi1Rad);
C1 = eccPrimeSquared*math.cos(phi1Rad)*math.cos(phi1Rad);
R1 = a*(1-eccSquared)/pow(1-eccSquared*math.sin(phi1Rad)*math.sin(phi1Rad), 1.5);
D = x/(N1*k0);
self.Lat = phi1Rad - (N1*math.tan(phi1Rad)/R1)*(D*D/2-(5+3*T1+10*C1-4*C1*C1-9*eccPrimeSquared)*D*D*D*D/24 +(61+90*T1+298*C1+45*T1*T1-252*eccPrimeSquared-3*C1*C1)*D*D*D*D*D*D/720);
self.Lat = self.Lat * self.rad2deg;
self.Long = (D-(1+2*T1+C1)*D*D*D/6+(5-2*C1+28*T1-3*C1*C1+8*eccPrimeSquared+24*T1*T1)*D*D*D*D*D/120)/math.cos(phi1Rad);
self.Long = LongOrigin + self.Long * self.rad2deg;
def printConfiguration(self):
""" Just to check whether our reading was correct """
print "Metal Threshold:\t" + str(self.MetalThreshold);
print "PF Threshold:\t" + str(self.PFThreshold);
print "Start X:\t" + str(self.startX);
print "Start Y:\t" + str(self.startY);
print "Grid size(X) :\t" + str(self.gridSizeX);
print "Grid size(Y) :\t" + str(self.gridSizeY);
def createROIfile(self,ROIFilename):
firstPoint = raw_input('Enter topLeft point coord\t').split();
secondPoint = raw_input('Enter bottomRight point coord\t').split();
try:
L1 = open('l1.bin','rb');
PF = open('PF.bin','rb');
SPF = open('pySPF_L1.bin','wb');
targetFilename = open(ROIFilename,'w');
except IOError:
print "Files Missing!";
else:
L1.seek(0,2);
elementsL1 = L1.tell();
L1.seek(0,0);
PF.seek(0,2);
elementsPF = PF.tell();
PF.seek(0,0);
print "L1.bin contains\t" + str(elementsL1) + " elements";
print "PF.bin contains\t" + str(elementsPF) + " elements";
binvaluesL1 = array.array('f');
binvaluesPF = array.array('f');
binvaluesSPF = array.array('f');
for row in range(0,self.NRows):
binvaluesL1.read(L1,self.NCols);
binvaluesPF.read(PF,self.NCols);
dataL1 = Num.array(binvaluesL1, dtype=Num.float);
dataPF = Num.array(binvaluesPF, dtype=Num.float);
dataSPF = dataL1 + dataPF;
binvaluesSPF.fromlist(Num.array(dataSPF).tolist());
for col in range(0,self.NCols):
if(dataL1[col] >= self.MetalThreshold and dataPF[col] >= self.PFThreshold):
if(col >= int(firstPoint[0]) and col <= int(secondPoint[0]) and row >= int(firstPoint[1]) and row <= int(secondPoint[1])):
posX = self.startX + (self.gridSizeX * col);
posY = self.startY - (self.gridSizeY * row);
self.UTM2LL(posY,posX);
tmp1 = self.decdeg2dms(posY);
tmp2 = self.decdeg2dms(posX);
strTarget = "Decimal Degree:: " + str(posX) + "E " + str(posY) + "N \t Lat long:: " + str(tmp1) + " " + str(tmp2) + "\n";
targetFilename.write(strTarget);
binvaluesSPF.tofile(SPF);
L1.close();
PF.close();
SPF.close();
targetFilename.close();
print "Done!";
dimensions = readConfiguration('PF.bin.hdr');
dimensions.printConfiguration();
dimensions.createROIfile('testPythonROI.txt');
Its the Python code that needs Optimization, as the values of NRows and NCols can and do reach the order of thousands.

A few general comments:
With python, it's really best to stick to PEP8 for a multitude of reasons. Python programmers are particularly picky about readability and essentially universally adhere to the community coding guidelines (PEP8). Avoid camelCase, keep lines below 80 columns, leave the semicolons out, and feel free to occasionally ignore these guidelines where they'd make things less readable.
There's no need for the builtin array type here if you're using numpy. I'm confused why you're constantly converting back and forth...
Use a projection library. Specify what datum and ellipsoid you're using, otherwise the coordinates (easting/northing or lat/long) have absolutely no meaning.
Don't use one big class as a hold-all for unrelated things. There's nothing wrong with just having a few functions. You don't need to make it into a class unless it makes sense to do so.
Use vectorized operations with numpy arrays.
Here's what would appear to be your performance bottleneck:
for row in range(0,self.NRows):
binvaluesL1.read(L1,self.NCols);
binvaluesPF.read(PF,self.NCols);
dataL1 = Num.array(binvaluesL1, dtype=Num.float);
dataPF = Num.array(binvaluesPF, dtype=Num.float);
dataSPF = dataL1 + dataPF;
binvaluesSPF.fromlist(Num.array(dataSPF).tolist());
for col in range(0,self.NCols):
if(dataL1[col] >= self.MetalThreshold and dataPF[col] >= self.PFThreshold):
if(col >= int(firstPoint[0]) and col <= int(secondPoint[0]) and row >= int(firstPoint[1]) and row <= int(secondPoint[1])):
posX = self.startX + (self.gridSizeX * col);
posY = self.startY - (self.gridSizeY * row);
self.UTM2LL(posY,posX);
tmp1 = self.decdeg2dms(posY);
tmp2 = self.decdeg2dms(posX);
strTarget = "Decimal Degree:: " + str(posX) + "E " + str(posY) + "N \t Lat long:: " + str(tmp1) + " " + str(tmp2) + "\n";
targetFilename.write(strTarget);
binvaluesSPF.tofile(SPF);
One of your biggest problems is the way you're reading in your data. You're constantly reading things in as one thing, then converting that to a list, then converting that to a numpy array. There's absolutely no need to jump through all those hoops. Numpy will unpack your binary floats for you just like array will.
Just do grid = np.fromfile(yourfile, dtype=np.float32).reshape(ncols, nrows). (Outside the loop.)
After that, your nested loops can be easily vectorized and expressed with just a few lines of code.
Here's how I would write your code. This probably won't run as-is, as I can't test it with your data. However, it should give you some general ideas.
import numpy as np
import pyproj
def main():
config = Config('PF.bin.hdr')
grid1, grid2 = load_data('l1.bin', 'PF.bin', config.nrows, config.ncols)
spf = grid1 + grid2
spf.tofile('pySPF_L1.bin')
easting_aoi, northing_aoi = subset_data(grid1, grid2, config)
save_selected_region(easting_aoi, northing_aoi, config.zone,
'testPythonROI.txt')
def load_data(filename1, filename2, nrows, ncols):
"""It would really be good to use more descriptive variable names than "L1"
and "PF". I have no idea what L1 and PF are, so I'm just calling them
grid1 and grid2."""
grid1 = np.fromfile(filename1, dtype=np.float32).reshape(nrows, ncols)
grid2 = np.fromfile(filename2, dtype=np.float32).reshape(nrows, ncols)
return grid1, grid2
def subset_data(grid1, grid2, config):
"""Select points that satisfy some threshold criteria (explain??) and are
within a user-specified rectangular AOI."""
northing, easting = np.mgrid[:config.nrows, :config.ncols]
easting = config.xstart + config.xgridsize * easting
northing = config.ystart + config.ygridsize * northing
grids = grid1, grid2, easting, northing
grid1, grid2, easting, northing = [item[config.user_aoi] for item in grids]
mask = (grid1 >= config.metal_threshold) & (grid2 >= config.pf_threshold)
return easting[mask], northing[mask]
def save_selected_region(easting, northing, zone, filename):
"""Convert the given eastings and northings (in UTM zone "zone") to
lat/long and save to a tab-delimited-text file."""
lat, lon = utm2geographic(easting, northing, zone)
data = np.vstack([easting, northing, lat, lon]).T
with open(filename, 'w') as outfile:
outfile.write('Easting\tNorthing\tLatitude\tLongitude\n')
np.savetxt(outfile, data, delimiter='\t')
def utm2geographic(easting, northing, zone):
"""We need to know which datum/ellipsoid the UTM coords are in as well!!!!
I'm assuming it's a Clark 1866 ellipsoid, based on the numbers in your
code..."""
utm = pyproj.Proj(proj='utm', zone=zone, ellip='clrk66')
geographic = pyproj.Proj(proj='latlong', ellip='clrk66')
return pyproj.transform(utm, geographic, easting, northing)
class Config(object):
"""Read and store configuration values for (something?)."""
config_file = 'config.txt'
def __init__(self, filename):
"""You should add docstrings to clarify what you're expecting
"filename" to contain."""
with open(filename, 'r') as infile:
crs_values = list(infile)[12].split(',')
crs_values = [float(item) for item in crs_values]
self.xstart, self.ystart = crs_values[3:5]
self.xgridsize, self.ygridsize = crs_values[5:7]
self.zone = int(crs_values[7])
with open(self.config_file, 'r') as infile:
srs_values = list(infile)
self.nrows, self.ncols = srs_values[1], srs_values[4]
# It would be good to explain a bit about these (say, units, etc)
self.metal_threshold = 5.0
self.pf_threshold = 0.9
self.user_aoi = self.read_user_aoi()
def read_user_aoi(self):
"""Get an area of interest of the grids in pixel coordinates."""
top_left = raw_input('Enter top left index\t')
bottom_right = raw_input('Enter bottom right index\t')
min_i, min_j = [int(item) for item in top_left.split()]
max_i, max_j = [int(item) for item in bottom_right.split()]
return slice(min_i, max_i), slice(min_j, max_j)
if __name__ == '__main__':
main()

Related

Convert Eigen to numpy matrix

I have the following code in C++ that uses the Eigen library, need help to translate to python (numpy)
Initialization
double b = 20.0;
Eigen::Vector3d C(1.0/10.2, 1.0/10.2, 1/30);
Eigen::MatrixXd U(5200, 3);
int i = 0;
for (double x = 10.2/2.0; x < 200; x += 10) {
for (double y = 10.2/2.0; y < 200; y += 10) {
for (double t = 0; t <= 360; t += 30) {
U(i, 0) = x;
U(i, 1) = y;
U(i, 2) = psi;
i += 1;
}
}
}
Function:
Eigen::VectorXd operator()(const Eigen::VectorXd& s) {
Eigen::VectorXd p(length());
p(0) = s[0];
p(1) = s[1];
p(2) = s[2];
p(3) = s[3];
for (int i = 0; i < U.rows(); i++) {
p(i + 4) = b*exp(-0.5*(s.tail(U.cols()) - U.row(i).transpose()).dot(C*(s.tail(U.cols())
- U.row(i).transpose())));
if (p(i + 4) < 0.1) {
p(i + 4) = 0;
}
}
return p;
}
Python version
Initialization:
my_x = 10.2/2.0
my_y = 10.2/2.0
my_p = 0
xx = []
while my_x < 200:
xx.append(my_x)
my_x += 10
yy = []
while my_y < 200:
yy.append(my_y)
my_y += 10
pps = []
while my_psi <= 360:
pps.append(my_p)
my_p+=30
U =[]
for x in xx:
for y in yy:
for p in pps:
U.append([x,y,p])
U = numpy.matrix(U)
C = numpy.array([1.0/10.2, 1.0/10.2, 1.0/30.0])
b = 20.0
The Function
Instead of operator() I will call the function doSomething()
def doSomething(s): # Where s is a numpy array (1-d vector)
p[0:4] = s[0:4]
for i in range (U.shape[0]):
s_dash = -0.5*(s - U[i].T)
s_ddash = C*s
s_dddash = s_dash.dot(s_ddash) - U[i].T
p[i+4] = b * numpy.exp(s_dddash)
if p[i+4] < 0.1: p[i+4] = 0
What I am confused:
In the C++ implementation, I think p[i+4] is supposed to be a single value
In my python version, I get a p[i+4] as square matrix
Each p[i+4] is a zero matrix.
I am unable to decipher my mistake. Please help!

Deriving an ECDSA uncompressed public key from a compressed one

I am currently trying to derive a Bitcoin uncompressed ECDSA public key from a compressed one.
According to this link on the Bitcoin wiki, it is possible to do so... But how?
To give you more details: as of now I have compressed keys (33-bytes-long) gathered on the bitcoin network.
They are of the following format: <1-byte-long prefix><32-bytes-long X>.
From there, I would like to obtain an uncompressed key (65-bytes-long) whose format is:
<1-byte-long prefix><32-bytes-long X><32-bytes-long Y>
According to this other link on the Bitcoin wiki, it should be as easy as solving the equation:
Y^2 = X^3 + 7
However, I cannot seem to get there. My value for Y is simply far-off. Here is my code (the value for the public key come from the Bitcoin wiki example):
import binascii
from decimal import *
expected_uncompressed_key_hex = '0450863AD64A87AE8A2FE83C1AF1A8403CB53F53E486D8511DAD8A04887E5B23522CD470243453A299FA9E77237716103ABC11A1DF38855ED6F2EE187E9C582BA6'
expected_y_hex = expected_uncompressed_key_hex[-64:]
expected_y_dec = int(expected_y_hex, 16)
x_hex = expected_uncompressed_key_hex[2:66]
if expected_y_dec % 2 == 0:
prefix = "02"
else:
prefix = "03"
artificial_compressed_key = prefix + x_hex
getcontext().prec = 500
test_dec = Decimal(int(x_hex, 16))
y_square_dec = test_dec**3 + 7
if prefix == "02":
y_dec = - Decimal(y_square_dec).sqrt()
else:
y_dec = Decimal(y_square_dec).sqrt()
computed_y_hex = hex(int(y_dec))
computed_uncompressed_key = "04" + x + computed_y_hex
For information, my outputs are:
computed_y_hex = '0X2D29684BD207BF6D809F7D0EB78E4FD61C3C6700E88AB100D1075EFA8F8FD893080F35E6C7AC2E2214F8F4D088342951'
expected_y_hex = '2CD470243453A299FA9E77237716103ABC11A1DF38855ED6F2EE187E9C582BA6'
Thank you for your help!

You need to calculate in the field , which mostly means that you have to reduce your number to the remainder after dividing with p after each calculation. Calculating this is called taking the modulo and is written as % p in python.
Exponentiating in this field can be done more effectively than the naive way of just multiplying and reducing many times. This is called modular exponentiation. Python's built-in exponentation function pow(n,e,p) can take care of this.
The remaining problem is to find the square root. Luckily secp256k1 is chosen in a special way (), so that taking square roots is easy: A square root of x is .
So a simplified version of your code becomes:
import binascii
p_hex = 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F'
p = int(p_hex, 16)
compressed_key_hex = '0250863AD64A87AE8A2FE83C1AF1A8403CB53F53E486D8511DAD8A04887E5B2352'
x_hex = compressed_key_hex[2:66]
x = int(x_hex, 16)
prefix = compressed_key_hex[0:2]
y_square = (pow(x, 3, p) + 7) % p
y_square_square_root = pow(y_square, (p+1)/4, p)
if (prefix == "02" and y_square_square_root & 1) or (prefix == "03" and not y_square_square_root & 1):
y = (-y_square_square_root) % p
else:
y = y_square_square_root
computed_y_hex = format(y, '064x')
computed_uncompressed_key = "04" + x_hex + computed_y_hex
print computed_uncompressed_key

Here a sample code without any 3rd party python libs:
def pow_mod(x, y, z):
"Calculate (x ** y) % z efficiently."
number = 1
while y:
if y & 1:
number = number * x % z
y >>= 1
x = x * x % z
return number
# prime p = 2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 1
p = 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f
# bitcoin's compressed public key of private key 55255657523dd1c65a77d3cb53fcd050bf7fc2c11bb0bb6edabdbd41ea51f641
compressed_key = '0314fc03b8df87cd7b872996810db8458d61da8448e531569c8517b469a119d267'
y_parity = int(compressed_key[:2]) - 2
x = int(compressed_key[2:], 16)
a = (pow_mod(x, 3, p) + 7) % p
y = pow_mod(a, (p+1)//4, p)
if y % 2 != y_parity:
y = -y % p
uncompressed_key = '04{:x}{:x}'.format(x, y)
print(uncompressed_key)
# should get 0414fc03b8df87cd7b872996810db8458d61da8448e531569c8517b469a119d267be5645686309c6e6736dbd93940707cc9143d3cf29f1b877ff340e2cb2d259cf
refer to bitcoin talk: https://bitcointalk.org/index.php?topic=644919.0

The field of the elliptic curve is not over the field of real numbers. It's over a finite field modulo some prime.
For Secp256k1 the prime p = 2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 1.
Thus: y^2= (x^3) + 7 (mod p)
There's no direct way to solve the equation, you would need to use Cipolla's algorithm: https://en.wikipedia.org/wiki/Cipolla%27s_algorithm

I know that this question has been answered and I actually benefited from this answer, so thank you. The problem is that I found these answers 3 times while looking for the same solution in C# and I don't really code in python :). So for anybody trying to solve this here is a C# solution, have fun! :) (It uses BouncyCastle Library).
using System;
using System.Collections.Generic;
using System.Linq;
using MoreLinq;
using NBitcoin;
using Org.BouncyCastle.Asn1.X9;
using Org.BouncyCastle.Crypto;
using Org.BouncyCastle.Crypto.Parameters;
using Org.BouncyCastle.Math;
using Org.BouncyCastle.Math.EC;
namespace BitcoinPublicKeyDecompression
{
public class Program
{
public static void Main()
{
const string cPubKey = "0250863ad64a87ae8a2fe83c1af1a8403cb53f53e486d8511dad8a04887e5b2352";
var uPubKey = cPubKey.ToHexByteArray().BitcoinDecompressPublicKey().ToHexString();
var expectedUPubKey = new PubKey(cPubKey).Decompress().ToString();
Console.WriteLine($"Public Key:\n\n{cPubKey}\n\nhas been {(uPubKey == expectedUPubKey ? "correctly" : "incorrectly")} decompressed to:\n\n{uPubKey}");
Console.WriteLine("\nPress any key to quit...");
Console.ReadKey();
}
}
public static class Extensions
{
public static readonly byte[] EmptyByteArray = new byte[0];
public static byte[] BitcoinDecompressPublicKey(this byte[] bPubC)
{
var ecPubKey = bPubC.BitcoinCompressedPublicKeyToECPublicKey();
return ecPubKey.ToBitcoinUncompressedPublicKey();
}
public static ECPublicKeyParameters BitcoinCompressedPublicKeyToECPublicKey(this byte[] bPubC)
{
var pubKey = bPubC.Skip(1).ToArray();
var curve = ECNamedCurveTable.GetByName("secp256k1");
var domainParams = new ECDomainParameters(curve.Curve, curve.G, curve.N, curve.H, curve.GetSeed());
var yParity = new BigInteger(bPubC.Take(1).ToArray()).Subtract(BigInteger.Two);
var x = new BigInteger(1, pubKey);
var p = ((FpCurve)curve.Curve).Q;
var a = x.ModPow(new BigInteger("3"), p).Add(new BigInteger("7")).Mod(p);
var y = a.ModPow(p.Add(BigInteger.One).FloorDivide(new BigInteger("4")), p);
if (!y.Mod(BigInteger.Two).Equals(yParity))
y = y.Negate().Mod(p);
var q = curve.Curve.CreatePoint(x, y);
return new ECPublicKeyParameters(q, domainParams);
}
public static byte[] ToBitcoinUncompressedPublicKey(this AsymmetricKeyParameter ecPublicKey)
{
var publicKey = ((ECPublicKeyParameters)ecPublicKey).Q;
var xs = publicKey.AffineXCoord.ToBigInteger().ToByteArrayUnsigned().PadStart(32);
var ys = publicKey.AffineYCoord.ToBigInteger().ToByteArrayUnsigned().PadStart(32);
return new byte[] { 0x04 }.ConcatMany(xs, ys).ToArray();
}
public static BigInteger FloorDivide(this BigInteger a, BigInteger b)
{
if (a.CompareTo(BigInteger.Zero) > 0 ^ b.CompareTo(BigInteger.Zero) < 0 && !a.Mod(b).Equals(BigInteger.Zero))
return a.Divide(b).Subtract(BigInteger.One);
return a.Divide(b);
}
public static byte[] ToHexByteArray(this string str)
{
byte[] bytes;
if (string.IsNullOrEmpty(str))
bytes = EmptyByteArray;
else
{
var string_length = str.Length;
var character_index = str.StartsWith("0x", StringComparison.Ordinal) ? 2 : 0;
var number_of_characters = string_length - character_index;
var add_leading_zero = false;
if (0 != number_of_characters % 2)
{
add_leading_zero = true;
number_of_characters += 1;
}
bytes = new byte[number_of_characters / 2];
var write_index = 0;
if (add_leading_zero)
{
bytes[write_index++] = CharacterToByte(str[character_index], character_index);
character_index += 1;
}
for (var read_index = character_index; read_index < str.Length; read_index += 2)
{
var upper = CharacterToByte(str[read_index], read_index, 4);
var lower = CharacterToByte(str[read_index + 1], read_index + 1);
bytes[write_index++] = (byte)(upper | lower);
}
}
return bytes;
}
public static byte CharacterToByte(char character, int index, int shift = 0)
{
var value = (byte)character;
if (0x40 < value && 0x47 > value || 0x60 < value && 0x67 > value)
{
if (0x40 != (0x40 & value))
return value;
if (0x20 == (0x20 & value))
value = (byte)((value + 0xA - 0x61) << shift);
else
value = (byte)((value + 0xA - 0x41) << shift);
}
else if (0x29 < value && 0x40 > value)
value = (byte)((value - 0x30) << shift);
else
throw new InvalidOperationException($"Character '{character}' at index '{index}' is not valid alphanumeric character.");
return value;
}
public static string ToHexString(this byte[] value, bool prefix = false)
{
var strPrex = prefix ? "0x" : "";
return strPrex + string.Concat(value.Select(b => b.ToString("x2")).ToArray());
}
public static IEnumerable<T> ConcatMany<T>(this IEnumerable<T> enumerable, params IEnumerable<T>[] enums)
{
return enumerable.Concat(enums.SelectMany(x => x));
}
}
}
Result:

optical flow .flo files

I have a few questions for doing optical flow projects. I use Python 2 (planning to use lasagne to use deep learning to learn optical flow), and don't know how to convert the c++ functions to that of python in visualization of the flows.
I downloaded (from http://vision.middlebury.edu/flow/data/comp/zip/other-gt-flow.zip) some image pairs where I have to estimate their optical flow, and their ground truth flow (.flo file). The problem is, when I read the .flo file into the program, it is a vectorized code. How do I view them like how they show in the webpage (http://vision.middlebury.edu/flow/data/)? I read from various sources and tried the following, but doesn't work.
In evaluating EPE (end point error) in what form should I have my prediction to be compared with the .flo file?
The code:
################################ Reading flow file ################################
f = open('flow10.flo', 'rb')
x = np.fromfile(f, np.int32, count=1) # not sure what this gives
w = np.fromfile(f, np.int32, count=1) # width
h = np.fromfile(f, np.int32, count=1) # height
print 'x %d, w %d, h %d flo file' % (x, w, h)
data = np.fromfile(f, np.float32) # vector
data_2D = np.reshape(data, newshape=(388,584,2)); # convert to x,y - flow
x = data_2D[...,0]; y = data_2D[...,1];
################################ visualising flow file ################################
mag, ang = cv2.cartToPolar(x,y)
hsv = np.zeros_like(x)
hsv = np.array([ hsv,hsv,hsv ])
hsv = np.reshape(hsv, (388,584,3)); # having rgb channel
hsv[...,1] = 255; # full green channel
hsv[...,0] = ang*180/np.pi/2 # angle in pi
hsv[...,2] = cv2.normalize(mag,None,0,255,cv2.NORM_MINMAX) # magnitude [0,255]
bgr = cv2.cvtColor(hsv,cv2.COLOR_HSV2BGR)
bgr = draw_hsv(data_2D)
cv2.imwrite('opticalhsv.png',bgr)

On Middlebury's page there is a zip file called flow-code (http://vision.middlebury.edu/flow/code/flow-code.zip), which provides a tool called color_flow to convert those .flo files to color images.
On the other hand, if you want to implement your own code to do the transformation, i have this piece of code (i cannot provide the original author, it has been some time) that helps you to first compute the color:
static Vec3b computeColor(float fx, float fy)
{
static bool first = true;
// relative lengths of color transitions:
// these are chosen based on perceptual similarity
// (e.g. one can distinguish more shades between red and yellow
// than between yellow and green)
const int RY = 15;
const int YG = 6;
const int GC = 4;
const int CB = 11;
const int BM = 13;
const int MR = 6;
const int NCOLS = RY + YG + GC + CB + BM + MR;
static Vec3i colorWheel[NCOLS];
if (first)
{
int k = 0;
for (int i = 0; i < RY; ++i, ++k)
colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
for (int i = 0; i < YG; ++i, ++k)
colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
for (int i = 0; i < GC; ++i, ++k)
colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
for (int i = 0; i < CB; ++i, ++k)
colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
for (int i = 0; i < BM; ++i, ++k)
colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
for (int i = 0; i < MR; ++i, ++k)
colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
first = false;
}
const float rad = sqrt(fx * fx + fy * fy);
const float a = atan2(-fy, -fx) / (float)CV_PI;
const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
const int k0 = static_cast<int>(fk);
const int k1 = (k0 + 1) % NCOLS;
const float f = fk - k0;
Vec3b pix;
for (int b = 0; b < 3; b++)
{
const float col0 = colorWheel[k0][b] / 255.f;
const float col1 = colorWheel[k1][b] / 255.f;
float col = (1 - f) * col0 + f * col1;
if (rad <= 1)
col = 1 - rad * (1 - col); // increase saturation with radius
else
col *= .75; // out of range
pix[2 - b] = static_cast<uchar>(255.f * col);
}
return pix;
}
Then it calls the above function for all the pixels:
static void drawOpticalFlow(const Mat_<Point2f>& flow, Mat& dst, float maxmotion = -1)
{
dst.create(flow.size(), CV_8UC3);
dst.setTo(Scalar::all(0));
// determine motion range:
float maxrad = maxmotion;
if (maxmotion <= 0)
{
maxrad = 1;
for (int y = 0; y < flow.rows; ++y)
{
for (int x = 0; x < flow.cols; ++x)
{
Point2f u = flow(y, x);
if (!isFlowCorrect(u))
continue;
maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
}
}
}
for (int y = 0; y < flow.rows; ++y)
{
for (int x = 0; x < flow.cols; ++x)
{
Point2f u = flow(y, x);
if (isFlowCorrect(u))
dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
}
}
}
This is for my use in OpenCV, but the code help should anyone who wants achieve something similar.

OpenCL: Access neighbors error

I'm working on a solver for a differential equation for a particle smulation using Pyopencl.
To solve this equation each particle must access it's neighbors information.
The arrays I'm using are numpy complex64 arrays each with 7 elements.
When accessing the neighbors, the program returns the error:
clWaitForEvents failed: out of resources
My OpenCl code is the following. I guess most of it isn't related to this error but i'll post it anyway because it might help somehow:
#define complex_ctr(x, y) (float2)(x, y)
#define complex_add(a, b) complex_ctr((a).x + (b).x, (a).y + (b).y)
#define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
#define complex_mul_scalar(a, b) complex_ctr((a).x * (b), (a).y * (b))
#define complex_div_scalar(a, b) complex_ctr((a).x / (b), (a).y / (b))
#define conj(a) complex_ctr((a).x, -(a).y)
#define conj_transp(a) complex_ctr(-(a).y, (a).x)
#define conj_transp_and_mul(a, b) complex_ctr(-(a).y * (b), (a).x * (b))
#define complex_real(a) a.x
#define complex_imag(a) a.y
#define complex_unit (float2)(0, 1)
constant int M=10;
constant float L=1e-09;
constant float p0=1.0;
constant float delta=1.0;
constant float gama=1.0;
constant float omc=1.0;
constant float k_p=1.0;
constant float om_p=1.0;
constant float v = 0.001;
constant float b = 1.0;
constant float dt=0.01;
void f(__global float2 *X,
__global float2 *K,
int id,
uint W,
float t){
float exp_arg;
float2 p11, p22, p33, p21, p31, p32, op, aux, ar, al, p;
p11 = X[id*W];
p22 = X[id*W+1];
p33 = X[id*W+2];
p21 = X[id*W+3];
p31 = X[id*W+4];
p32 = X[id*W+5];
al = X[(id-1)*W+6];
ar = X[(id+1)*W+6];
op = p0 * complex_mul(X[id*W+6], complex_unit);
aux = p22 * gama/2 + complex_mul(op, p22) + conj(p22) * gama/2 + complex_mul(op, conj(p22));
K[id*W] = aux;
aux = (-p22*gama - complex_mul(op, p21) + complex_mul(p32, complex_unit)*omc
- conj(p22)*gama - complex_mul(op, conj(p21)) + complex_mul(conj(p32), complex_unit)*omc);
K[id*W+1] = aux;
aux = p22*gama/2 - complex_mul(p32, complex_unit)*omc + conj(p22)*gama/2 - complex_mul(conj(p32), complex_unit)*omc;
K[id*W+2] = aux;
aux = complex_mul(op, p11) - complex_mul(op, p22) - p21*gama + complex_mul(p21, complex_unit)*delta + complex_mul(p31, complex_unit)*omc;
K[id*W+3] = aux;
aux = complex_mul(p21, complex_unit)*omc + complex_mul(p31, complex_unit)*delta - complex_mul(op, p32);
K[id*W+4] = aux;
aux = (complex_mul(p22, complex_unit)*omc - complex_mul(p33, complex_unit)*omc - complex_mul(op, p31) - p32*gama);
K[id*W+5] = aux;
exp_arg = k_p * L * id - om_p * t;
p = complex_mul(b*p0*p21*complex_ctr(cos(exp_arg), sin(exp_arg)), complex_unit);
aux = (X[(id-1)*W+6] + X[(id+1)*W+6]);
aux = aux + p;
K[id*W+6] = aux;
}
__kernel void RK4Step(__global float2 *X,
__global float2 *K,
__global float2 *Xs,
__global float2 *Xm,
uint W,
float t){
const int gid_x = get_global_id(0);
int idx = 0;
//computation of k1
f(X, K, gid_x, W, t);
for(int i=0; i<W; i++)
{
idx = gid_x*W+i;
Xs[idx] = X[idx] + dt*K[idx]/6;
Xm[idx] = X[idx] + dt*K[idx]/2;
}
//computation of k2
f(Xm, K, gid_x, W, t);
for(int i=0; i<W; i++)
{
idx = gid_x*W+i;
Xs[idx] = Xs[idx] + dt*K[idx]/3;
Xm[idx] = X[idx] + dt*K[idx]/2;
}
//computation of k3
f(Xm, K, gid_x, W, t);
for(int i=0; i<W; i++)
{
idx = gid_x*W+i;
Xs[idx] = Xs[idx] + dt*K[idx]/3;
Xm[idx] = X[idx] + dt*K[idx];
}
//computation of k4
f(Xm, K, gid_x, W, t);
for(int i=0; i<W; i++)
{
idx = gid_x*W+i;
Xs[idx] = Xs[idx] + dt*K[idx]/6;
}
//update photon
for(int i=0; i<W; i++)
{
idx = gid_x*W+i;
X[idx] = Xs[idx];
}
}
If i comment this line:
aux = (X[(id-1)*W+6] + X[(id+1)*W+6]);
The code runs through with no errors, but if I uncomment it, i get the error i described.
The python code that calls this kernel is the following:
import pyopencl as cl
import numpy as np
from pylab import *
import matplotlib.pyplot as plt
import time
"""
Solve the problem
Xi' = M1*Xi + M2*Xi~
M1 and M2 are 6*6 Matrixes which elements are complex numbers
Xi in the form [P11i, P22i, P33i, P21i, P31i, P32i, Ai] where Pxyi is a complex number
"""
########################################################
# #
#'_h' buffers are host buffers. '_d' are device buffers#
# #
########################################################
#Initialization of the device and workspace
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
# Constants
M = 2000 # Number of atoms
L = np.float32(0.000000001) # Atom Spacing
N = 1000 # Number of time intervals
dt = np.float32(0.1) # Time interval
Timeline = np.arange(0.0, N, dt).astype(np.float32)
p0 = np.float32(1.0) # constant P0 [OMP = P0*Ai]
delta = np.float32(1.0) # constant DELTA
gama = np.float32(1.0) # constant GAMA
omc = np.float32(1.0) # constant OMC
# Writing the source code with the constants declared by the user
text = ""
##text = "__constant int M=" + str(M) + "; \n"
##text += "__constant float L=" + str(L) + "; \n"
##text += "__constant float dt=" + str(dt) + "; \n"
##text += "__constant float p0=" + str(p0) + "; \n"
##text += "__constant float delta=" + str(delta) + "; \n"
##text += "__constant float gama=" + str(gama) + "; \n"
##text += "__constant float omc=" + str(omc) + "; \n"
f1 = open("precode.cl", "r")
f2 = open("kernel.cl", "r")
f3 = open("source.cl",'w+')
precode = f1.read()
kernel = f2.read()
f3.write(precode + text + kernel)
f1.close()
f2.close()
f3.close()
#Initial Conditions
A_h = (np.arange(M) + 1j*np.zeros(M)).astype(np.complex64)
A_h = np.exp(-((A_h-M/2.0)/(0.05 * M))**2)*np.exp(1j * 200.0 * A_h /M)
P11_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
P22_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
P33_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
P21_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
P31_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
P32_h = (np.random.randn(M) + 1j*np.random.randn(M)).astype(np.complex64)
W = np.uint32(7) # The row width to compute the index inside the kernel
X_h = []
for i in range(M):
X_h.append( np.array([P11_h[i], P22_h[i], P33_h[i], P21_h[i], P31_h[i], P32_h[i], A_h[i]]).astype(np.complex64) )
X_h = np.array(X_h).astype(np.complex64)
K_h = np.empty_like(X_h)
Xs_h = np.empty_like(X_h)
Xm_h = np.empty_like(X_h)
A_h = X_h[:,6]
figure(1)
plt.plot(np.real(A_h))
plt.plot(np.abs(A_h))
# Allocation of required buffers on the device
X_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=X_h)
K_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=K_h)
Xs_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=Xs_h)
Xm_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=Xm_h)
f = open("source.cl", "r")
source = f.read()
f.close()
prg = cl.Program(ctx, source).build()
print "Begin Calculation"
start_time = time.time()
for t in Timeline:
completeevent = prg.RK4Step(queue, (M,), None, X_d, K_d, Xs_d, Xm_d, W, t)
completeevent.wait()
cl.enqueue_copy(queue, X_h, X_d)
end_time = time.time()
print "All done"
print "Calculation took " + str(end_time - start_time) + " seconds"
A_h = X_h[:,6]
figure(2)
plt.plot(np.real(A_h))
plt.plot(np.abs(A_h))
##plt.show()
Some code is commented because I'm still working on it but it's minor stuff just to get things a bit cleaner.
I can't understand why this happens. I've tried something similiar but with a much simpler code and the neighbors access goes just fine as I intended.
For examples, when I run this module:
import pyopencl as cl
import numpy as np
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
M = 3
zero = np.complex64(0.0)
X1_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
X2_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
X3_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
Y1_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
Y2_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
Y3_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
aux_h = np.complex64(1 + 1j*1)
RES_h = np.empty_like(X1_h)
dados_h = []
for i in range(3):
dados_h.append(np.array([X1_h[i], X2_h[i], X3_h[i], Y1_h[i], Y2_h[i], Y3_h[i]]).astype(np.complex64))
dados_h = np.array(dados_h).astype(np.complex64)
print dados_h
aux_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=aux_h)
dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf = RES_h)
Source = """
__kernel void soma( __global float2 *dados, __global float2 *res, int rowWidth){
const int gid_x = get_global_id(0);
res[gid_x] = dados[(gid_x-1)*rowWidth] + dados[(gid_x+1)*rowWidth];
}
"""
prg = cl.Program(ctx, Source).build()
completeEvent = prg.soma(queue, (M,), None, dados_d, RES_d, np.int32(6))
completeEvent.wait()
cl.enqueue_copy(queue, RES_h, RES_d)
print "GPU RES"
print RES_h
the result i get is:
[[ 1.+2.j 1.+2.j 1.+2.j 4.+5.j 4.+5.j 4.+5.j]
[ 2.+3.j 2.+3.j 2.+3.j 5.+6.j 5.+6.j 5.+6.j]
[ 3.+4.j 3.+4.j 3.+4.j 6.+7.j 6.+7.j 6.+7.j]]
GPU RES
[ 2.+3.j 4.+6.j 2.+3.j]
which is exactly what i expected.
Can anyone give me some help on what is happening here? It's probably something simple but I can't find what's wrong.
One additional info: This happens only when i run the code on my GTX970. I have a laptop with an old ATI card that handles the code above just fine with no errors, which got me even more confused on this whole thing.
PS: Sorry for the long post

If id=0 and W=7 then you use negative index value to access element of 'X' array:
aux = (X[(id-1)*W+6] + X[(id+1)*W+6]);
which for id=0 and W=7 is:
aux = (X[-1] + X[13]);
From my experience a code with error like this one may produce still good results on some GPUs therefore I always test my opencl code using different GPU vendors. Especially opencl validation on CPU seems to be very sensitive to such errors.

Does matplotlib only capture the center frequencies of each bin

I'm wanting to know if matplotlib function spectrogram only takes into account the centre frequencies of a signal?
For example, plotting a spectrogram (in none decibels) here is the result:
Plotting a spectrogram normally:
Where are those points going from 0-50) and between (80-140) are they being removed? If so.. Why is this exactly?
EDIT: Source code :-
Here is "matplotlib" spectral_helper
def _spectral_helper2(x, y=None, NFFT=None, Fs=None, detrend_func=None,
window=None, noverlap=None, pad_to=None,
sides=None, scale_by_freq=None, mode=None):
'''
This is a helper function that implements the commonality between the
psd, csd, spectrogram and complex, magnitude, angle, and phase spectrums.
It is *NOT* meant to be used outside of mlab and may change at any time.
'''
if y is None:
# if y is None use x for y
same_data = True
else:
#The checks for if y is x are so that we can use the same function to
#implement the core of psd(), csd(), and spectrogram() without doing
#extra calculations. We return the unaveraged Pxy, freqs, and t.
same_data = y is x
if Fs is None:
Fs = 2
if noverlap is None:
noverlap = 0
if detrend_func is None:
detrend_func = detrend_none
if window is None:
window = window_hanning
# if NFFT is set to None use the whole signal
if NFFT is None:
NFFT = 256
if mode is None or mode == 'default':
mode = 'psd'
elif mode not in ['psd', 'complex', 'magnitude', 'angle', 'phase']:
raise ValueError("Unknown value for mode %s, must be one of: "
"'default', 'psd', 'complex', "
"'magnitude', 'angle', 'phase'" % mode)
if not same_data and mode != 'psd':
raise ValueError("x and y must be equal if mode is not 'psd'")
#Make sure we're dealing with a numpy array. If y and x were the same
#object to start with, keep them that way
x = np.asarray(x)
if not same_data:
y = np.asarray(y)
if sides is None or sides == 'default':
if np.iscomplexobj(x):
sides = 'twosided'
else:
sides = 'onesided'
elif sides not in ['onesided', 'twosided']:
raise ValueError("Unknown value for sides %s, must be one of: "
"'default', 'onesided', or 'twosided'" % sides)
# zero pad x and y up to NFFT if they are shorter than NFFT
if len(x) < NFFT:
n = len(x)
x = np.resize(x, (NFFT,))
x[n:] = 0
if not same_data and len(y) < NFFT:
n = len(y)
y = np.resize(y, (NFFT,))
y[n:] = 0
if pad_to is None:
pad_to = NFFT
if mode != 'psd':
scale_by_freq = False
elif scale_by_freq is None:
scale_by_freq = True
# For real x, ignore the negative frequencies unless told otherwise
if sides == 'twosided':
numFreqs = pad_to
if pad_to % 2:
freqcenter = (pad_to - 1)//2 + 1
else:
freqcenter = pad_to//2
scaling_factor = 1.
elif sides == 'onesided':
if pad_to % 2:
numFreqs = (pad_to + 1)//2
else:
numFreqs = pad_to//2 + 1
scaling_factor = 2.
result = stride_windows(x, NFFT, noverlap, axis=0)
result = detrend(result, detrend_func, axis=0)
result, windowVals = apply_window(result, window, axis=0,
return_window=True)
result = np.fft.fft(result, n=pad_to, axis=0)[:numFreqs, :]
freqs = np.fft.fftfreq(pad_to, 1/Fs)[:numFreqs]
if not same_data:
# if same_data is False, mode must be 'psd'
#resultY = stride_windows(y, NFFT, noverlap)
resultY = apply_window(resultY, window, axis=0)
resultY = detrend(resultY, detrend_func, axis=0)
resultY = np.fft.fft(resultY, n=pad_to, axis=0)[:numFreqs, :]
result = np.conjugate(result) * resultY
elif mode == 'psd':
result = np.conjugate(result) * result
elif mode == 'magnitude':
result = np.absolute(result)
elif mode == 'angle' or mode == 'phase':
# we unwrap the phase later to handle the onesided vs. twosided case
result = np.angle(result)
elif mode == 'complex':
pass
if mode == 'psd':
# Scale the spectrum by the norm of the window to compensate for
# windowing loss; see Bendat & Piersol Sec 11.5.2.
#result /= (np.abs(windowVals)**2).sum()
# Also include scaling factors for one-sided densities and dividing by
# the sampling frequency, if desired. Scale everything, except the DC
# component and the NFFT/2 component:
result[1:-1] *= 1554848
# MATLAB divides by the sampling frequency so that density function
# has units of dB/Hz and can be integrated by the plotted frequency
# values. Perform the same scaling here.
if scale_by_freq:
result /= Fs
t = np.arange(NFFT/2, len(x) - NFFT/2 + 1, NFFT - noverlap)/Fs
if sides == 'twosided':
# center the frequency range at zero
freqs = np.concatenate((freqs[freqcenter:], freqs[:freqcenter]))
# result = np.concatenate((result[freqcenter:, :],
#result[:freqcenter, :]), 0)
elif not pad_to % 2:
# get the last value correctly, it is negative otherwise
freqs[-1] *= -1
# we unwrap the phase here to handle the onesided vs. twosided case
if mode == 'phase':
pass
#result = np.unwrap(result, axis=0)
return result, freqs, t
Here is my attempt in C++
std::vector<std::vector<Complex::complex> > ComputeSTFT(std::vector<double> &vals,
std::size_t NFFT, std::size_t overlap)
{
std::vector<double> hanning = getHanningWindow(NFFT);
double NENBW = 0.0;
double ENBW = 0.0;
double fRes = 0.0;
double avg = 0.0;
for(unsigned i=0; (i < vals.size()); i++)
{
avg+= vals[i];
}
avg = avg / vals.size();
for(unsigned i=0; (i < vals.size()); i++)
{
vals[i] = vals[i] - avg;
}
std::vector<std::vector<double> > temp_vars = frame(vals, NFFT, overlap);
std::vector<std::vector<Complex::complex> > STFT(temp_vars.size());
for(unsigned i=0; (i < temp_vars.size()-1); i++)
{
for(unsigned j=0; (j < temp_vars[i].size()); j++) {
double value = 0.5 * (1 - cos(2 * PI * j / (NFFT - 1)));
S1 += value;
S2 += value * value;
double calculation = temp_vars[i][j] * value;
temp_vars[i][j] = calculation;
}
}
NENBW = NFFT * (S2 / S1*S1);
// This assume that the frequency is KNOWN.
fRes = 12000 / NFFT;
ENBW = NENBW * fRes;
std::vector<std::vector<Complex::complex> > fft_vars(temp_vars.size());
for(unsigned i=0; (i < temp_vars.size()); i++)
{
fft_vars.resize(temp_vars[i].size());
FFT f(temp_vars[i].begin(), temp_vars[i].end(), temp_vars[i].size());
std::vector<Complex::complex> temp_fft = f.transformed();
fft_vars[i] = temp_fft;
temp_fft.empty();
}
std::vector<std::vector<double> > RESULT(temp_vars.size());
for(unsigned i=0; (i < temp_vars.size()); i++)
{
STFT[i].resize(temp_vars[i].size()/2+1);
for(unsigned j=0; (j < temp_vars[i].size()/2 + 1); j++)
{
STFT[i][j].re = fft_vars[i][j].re;
STFT[i][j].im = fft_vars[i][j].im;
}
}
return STFT;
}
Where am I going wrong to produce such different results?

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How, when and what to vectorize in python? - python

Related

Convert Eigen to numpy matrix

Deriving an ECDSA uncompressed public key from a compressed one

optical flow .flo files

OpenCL: Access neighbors error

Does matplotlib only capture the center frequencies of each bin

Categories

Resources