I have a string of data from an accelerometer (x, y, z) (looks like this in text file "XXX XXX XXX" and I am attempting to read it and convert to a line graph with three subplots of data. I'm adapting some code from a friend to do this but I'm not sure where some of these errors are coming from. Obviously beginner programmer. Help much appreciated.
Error: invalid literal for int() with base 10
import os
import numpy as npy
import matplotlib.pyplot as plt
global y0,y1,y2
increment_size = 8000
datasample_size = 16000
from os.path import join
filepath = "C:\\Users\\Riley\\Documents\\Programming\\"
infile = join(filepath, 'data.txt')
infile = open(infile,"r")
singleline = infile.readline()
asciidata = singleline.split()
asciidata[0]=asciidata[0][3:]
y0=[int(asciidata[0])]
y1=[int(asciidata[1])]
y2=[int(asciidata[2])]
count = 0
for singleline in infile:
count += 1
if (count % 10000) == 0:
print(count)
asciidata = singleline.split()
y0.append(int(asciidata[0]))
y1.append(int(asciidata[1]))
y2.append(int(asciidata[2]))
infile.close()
totaldata=count-1
print(totaldata)
low = 0
high = datasample_size
while low < totaldata:
t = npy.arange(low,high)
plt.subplot(311)
plt.ylim(-2000,2000)
plt.plot(t,y0[low:high])
plt.subplot(312)
plt.ylim(-2000,2000)
plt.plot(t,y1[low:high])
plt.subplot(313)
plt.ylim(-2000,2000)
plt.plot(t,y2[low:high])
outfilename = filepath + 'Plots/' + shortfilename + '_' + str(low) + '.png'
plt.savefig(outfilename)
outfilename2 = filepath + 'Datasegments/' + shortfilename + '_' + str(low) + '.txt'
outfile = open(outfilename2,"w")
for j in range(low,high):
outfile.write(str(y0[j])+'\t'+str(y1[j])+'\t'+str(y2[j])+'\n')
# print(low),
plt.show()
low = low + increment_size
high = high + increment_size
if high > totaldata:
high = totaldata
# if low > 10000:
# break
# plt.close()
It is possible that you may be trying to parse a float() with int().
If you are needing to take care of empty values, try int(s or 0)
There's a numpy function that does almost all of this for you. It's hard for me to test it without knowing the format of your data file (it would help if you pasted in the first few lines of 'data.txt')
from os import path
import numpy as npy
import matplotlib.pyplot as plt
increment_size = 8000
datasample_size = 16000
filepath = "C:\\Users\\Riley\\Documents\\Programming\\"
infile = path.join(filepath, 'data.txt')
# This line replaces all the file reading lines:
y0, y1, y2 = npy.genfromtxt(infile, unpack=True)
totaldata = len(y0)
print(totaldata)
low = 0
high = datasample_size
while low < totaldata:
...
Possibly the plotting could be done more simply too, but I'm not sure I understand why you are plotting it section by section.
Related
I think the problem is the following steps, but just in case,I will also write the whole body of my code down blow. The most strange thing is, this code can read over 6000 csv files and a graphic can also successfully show, but when I want more files to be read, then occours an error. The screenshot shows the graphic and the content of the csv files. as you can see, the path = r'C:\Users\AK6PRAKT\Desktop\6daten' includes all datas and path = r'C:\Users\AK6PRAKT\Desktop\daten' includes only parts of them.enter image description here
import os
from matplotlib import pyplot as pyplot
from collections import defaultdict
import csv
import numpy as np
path = r'C:\Users\AK6PRAKT\Desktop\6daten'
dirs = os.listdir(path)
s = []
x = []
y = []
names = []
...(ignore some steps for reading the datas from csv files)
print(list_temp1,list_temp2) #list_temp1 is the datas of xaxise, and list_temp2 of yaxise.
y.append(float(list_temp2))
names.append(list_temp1)
x = range(len(names))
pyplot.ylim((0, 40))
my_y_ticks = np.arange(0, 40, 10)
pyplot.plot(x,y, linewidth=2)
pyplot.xticks(x,names,rotation = 90)
fig = pyplot.figure(figsize=(10,10))
pyplot.show()
and then...the whole body, and i must say something to declare: I had no background about computer science before, it's really a little bit hard for me to deal with such many datas at the very beginning. Actually i am now doing Internship in a german company and i started to learn python one week ago. I got an assignment from my mentor, I tried to devide the whole assignment into several steps, and I searched the commands of each of the steps and then combine them together with some revising. So, it may seem that I did a lot of useless work. Please be kind in commends(If you have suggestions about that, always glad to hear that of course)
import os
from matplotlib import pyplot as pyplot
from collections import defaultdict
import csv
import numpy as np
path = r'C:\Users\AK6PRAKT\Desktop\6daten'
dirs = os.listdir(path)
s = []
x = []
y = []
names = []
fig = pyplot.figure()
for i in dirs:
if os.path.splitext(i)[1] == ".csv":
f = open(path+"/"+i)
iter_f = iter(f);
str = ""
for line in iter_f:
str = str + line
s.append(str)
with open(path+"/"+i,'r') as r:
lines=r.readlines()
with open(path+"/"+i,'w') as w:
for row in lines:
if 'Date' not in row:
w.write(row)
columns = defaultdict(list)
with open(path+"/"+i) as f:
reader = csv.reader(f)
for row in reader:
for (i,v) in enumerate(row):
columns[i].append(v)
list_temp1 = columns[0]
list_temp1 = np.array(list_temp1)
list_temp2 = columns[1]
list_temp2 = np.array(list_temp2)
print(list_temp1,list_temp2)
y.append(float(list_temp2))
names.append(list_temp1)
x = range(len(names))
pyplot.ylim((0, 40))
my_y_ticks = np.arange(0, 40, 10)
pyplot.plot(x,y, linewidth=2)
pyplot.xticks(x,names,rotation = 90)
pyplot.yticks(my_y_ticks)
fig = pyplot.figure(figsize=(10,10))
pyplot.show()
the graphic from parts of datas
the graphic can not show while reading all datas
Hello I am using Python to try to read the digit data provided by MNIST into a data structure I can use to train a neural network. I am testing to ensure the data was read properly by creating an image using PIL. The image that is being created is horribly wrong, and I am not sure if it is because I am using PIL incorrectly or my data structures and methods are not right.
The format of the two data files is described here:
http://yann.lecun.com/exdb/mnist/
Here are the applicable functions:
read_image_data reads the pixel data organizing it into a list of 2D array numpy arrays
def read_image_data():
fd = open("train-images.idx3-ubyte", "rb")
images_bin_string = fd.read()
num_images = struct.unpack(">i", images_bin_string[4:8])[0]
image_data_bank = []
uint32_num_bytes = 4
current_index = 8
num_rows = struct.unpack(">I", \
images_bin_string[current_index: current_index + uint32_num_bytes])[0]
num_cols = struct.unpack(">I", \
images_bin_string[current_index + uint32_num_bytes: \
current_index + uint32_num_bytes * 2])[0]
current_index += 8
i = 0
while i < num_images:
image_data = np.zeros([num_rows, num_cols])
for j in range(num_rows - 1):
for k in range(num_cols - 1):
image_data[j][k] = images_bin_string[current_index + j * k]
current_index += num_rows * num_cols
i += 1
image_data_bank.append(image_data)
return image_data_bank
read_label_data reads the corresponding labels into a list
def read_label_data():
fd = open("train-labels.idx1-ubyte", "rb")
images_bin_string = fd.read()
num_images = struct.unpack(">i", images_bin_string[4:8])[0]
image_data_bank = []
current_index = 8
i = 0
while i < num_images:
image_data_bank.append(images_bin_string[current_index])
current_index += 1
i += 1
return image_data_bank
collect_data zips the structures together
def collect_data():
print("Reading image data...")
image_data = read_image_data()
print("Reading label data...")
label_data = read_label_data()
print("Zipping data sets...")
all_data = np.array(list(zip(image_data, label_data)))
return all_data
lastly run_test uses PIL to print the pixels from the first 28x28 np structure created by read_image_data
def run_test(data):
example = data[0]
pixel_data = example[0]
number = example[1]
print(number)
im = Image.fromarray(pixel_data)
im.show()
When I run the script:
Collecting data... Reading image data... Reading label data... Zipping
data sets... 5
I must be messing something up with the PIL library, but I do not know what.
That is a really weird looking 5. I am guessing that I went wrong somewhere in my organization of the data. The directions did say "Pixels are organized row-wise.", but I think I covered that by having my outer loop as the row loop then the inner as the column loop
UPDATE
I reversed the order of the row and column index in the np.arrays in read_image_data and it is making no difference.
image_data[k][j] = images_bin_string[current_index + j * k]
UPDATE
Ran quick test with matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
imgplot = plt.imshow(pixel_data)
plt.show()
Here is what I got from matplotlib
That means it is definitely a problem with my code and not the library. The question is if it is the way I am passing the pixels to the imaging libraries or how I structured the data. If anyone can find the mistake, I would greatly appreciate.
I have a question \ problem. I need to plot the graph by the numbers that I got from the file (which I did) and then I need to draw a line connecting start and end, and calculate the area that between these two lines. I try to make a lot of variations, but i have no idea how I can make it..
I'm trying do it via matplotlib.pyplot library
Here the 'figure' whitch I should to get after add 'connection line between beginning and and' and now I need calcutale square between black line and blue.
PS the black one is kind of straight :)
Here is soure of code, and my data file...
http://pastebin.com/g40bAzPR
#!/path/to/python -tt
# numerical data
# python GraphicalPart.py ../dataFile.txt
import sys
import matplotlib.pyplot as plt
import numpy as np
def startDivide(fileName):
for i in range(1,2):
inputFile = open(fileName)
outputFile = open(fileName + "_" + str(i) + "_out.csv", "w")
floatList = []
for line in inputFile.readlines():
data = line.split(" ")
string = data[i]
if string.startswith('-'): #remove '-'
string = string[1:]
floatList.append(float(string))
floatList.sort() #sorting the list of data
for item in floatList:
outputFile.write("%s\n" % item)
outputFile.close()
inputFile.close()
data1=np.genfromtxt(fileName + "_" + str(i) + '_out.csv', skip_header=1)
plt.plot(data1)
plt.savefig(fileName + "_" + str(i) + "_.png")
plt.clf()
def main():
if len(sys.argv) != 2:
print "Not enough arguments. *_data.txt file only!"
else:
startDivide(sys.argv[1])
if __name__ == "__main__":
main()
for i in range(1,2) is a loop which only iterates once. Maybe you plan on increasing the number of iterations? If so, bear in mind that it's quicker to load the data once, rather than multiple times in a for-loop. You can do that using np.genfromtxt with the usecols parameter to specify the desired columns.
To find the area under the curve, you could use
np.trapz.
To find the area between two curves, you subtract area under the upper curve from the area under the lower curve. Assuming the diagonal line is always above the data curve:
import sys
import matplotlib.pyplot as plt
import numpy as np
def startDivide(filename):
data = np.genfromtxt(filename, dtype=None, usecols=[1])
data = np.abs(data)
data.sort()
np.savetxt("{}_1_out.csv".format(filename), data)
plt.plot(data)
plt.plot([0,len(data)-1], [data[0], data[-1]])
plt.savefig("{}_1_.png".format(filename))
area = np.trapz([data[0], data[-1]], dx=len(data)-1) - np.trapz(data)
print(area)
if __name__ == "__main__":
startDivide(sys.argv[1])
We prepare a following python scripts (python 2.7) to make histograms.
histogram.py
#!/usr/bin/env python
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.mlab as mlab
mpl.use('Agg')
import matplotlib.pyplot as plt
sys.argv[1] # Define input name
sys.argv[2] # Define output name
sys.argv[3] # Define title
# Open the file name called "input_file"
input_file=sys.argv[1]
inp = open (input_file,"r")
lines = inp.readlines()
if len(lines) >= 20:
x = []
#numpoints = []
for line in lines:
# if int(line) > -10000: # Activate this line if you would like to filter any date (filter out values smaller than -10000 here)
x.append(float(line))
# the histogram of the data
n, bins, patches = plt.hist(x, 50, normed=False, facecolor='gray')
plt.xlabel('Differences')
numpoints = len(lines)
plt.ylabel('Frequency ( n =' + str(numpoints) + ' ) ' )
title=sys.argv[3]
plt.title(title)
plt.grid(True)
save_file=sys.argv[2]
plt.savefig(save_file+".png")
plt.clf()
inp.close()
example: input
1
2
3
The script will do the following
python histogram.py input ${output_file_name}.png ${title_name}
We add a line "if len(lines) >= 20:" so if the data points are less than 20, we don't make a plot.
However, if the file is empty, this python script will be freeze.
We add a bash line to remove any empty files before running "python histogram.py input ${output_file_name}.png ${title_name}"
find . -size 0 -delete
For some reasons, this line always works in small scale testings but not in real production runs under several loops. So we would love to make the "histogram.py" ignore any empty files if possible.
The search only finds this link which doesn't seem to be quite helpful : (
Ignoring empty files from coverage report
Could anyone kindly offer some comments? Thanks!
Check if the input_file file is empty os.path.getsize(input_file) > 0
os.path.getsize
You will need the full path which I presume you will have and it will raise an error if the file does not exist or is inaccessible so you may want to handle those cases.
This code works, ignoring empty files:
#!/usr/bin/env python
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.mlab as mlab
import os
mpl.use('Agg')
import matplotlib.pyplot as plt
sys.argv[1] # Define input name
sys.argv[2] # Define output name
sys.argv[3] # Define title
input_file=sys.argv[1]
# Open the file name called "input_file"
if os.path.getsize(input_file) > 0:
inp = open (input_file,"r")
lines = inp.readlines()
if len(lines) >= 20:
x = []
#numpoints = []
for line in lines:
# if int(line) > -10000: # Activate this line if you would like to filter any date (filter out values smaller than -10000 here)
x.append(float(line))
# the histogram of the data
n, bins, patches = plt.hist(x, 50, normed=False, facecolor='gray')
plt.xlabel('Differences')
numpoints = len(lines)
plt.ylabel('Frequency ( n =' + str(numpoints) + ' ) ' )
title=sys.argv[3]
plt.title(title)
plt.grid(True)
save_file=sys.argv[2]
plt.savefig(save_file+".png")
plt.clf()
inp.close()
else:
print "Empty file"
~$ python test.py empty.txt foo bar
Empty file
Check if the file exists + is not empty before hand.
import os
def emptyfile(filepath):
return ((os.path.isfile(filepath) > 0) and (os.path.getsize(filepath) > 0))
I would like to write a program that creates 100 masked plots from a spread of 100 text files. i.e. for fnum in range(1,100,1):
The text files are numbered xydata1.txt, xydata2.txt ... until xydata100.txt.
How is this best done in Python?
Below is my plotting program, where (file number fnum) = 1,2,3...100.
fn = 'xydata'+fnum+'.txt'
y = loadtxt(fn,unpack=True,usecols=[0])
x = loadtxt(fn,unpack=True,usecols=[1])
n = ma.masked_where(gradient(y) < 0, y)
p = ma.masked_where(gradient(y) > 0, y)
pylab.plot(x,n,'r',x,p,'g')
pylab.savefig('data'+fnum+'.png')
pylab.show()
Assuming Python 2.7
from glob import glob
from pylab import *
for fname in glob("xydata*.txt"):
x, y = loadtxt(fname, unpack=True, usecols=[1, 0])
mask_inf = gradient(y) < 0
mask_sup = gradient(y) >= 0
plot(x[mask_inf], y[mask_inf], 'r')
plot(x[mask_sup], y[mask_sup], 'g')
legend(("grad(y) < 0", "grad(y) >= 0"))
title(fname)
savefig(fname.replace("xydata", "data").replace(".txt", ".svg"))
clf()
You can also use masked arrays. But the only advantage of them is to avoid allocating new memory. If your plots are small enough, you don't need them.
By the way there is no "best answer".