Unable to pull data from a file and place into two arrays - python

The code below uses the matrix and arrpow functions to calculate the Fibonacci numbers for the elements in my list num. Oddly, right after a.append(float(row[0])) completes, the error I get is
IndexError: list index out of range
which is obviously coming from b.append.
Here's the full code, which writes the file I want to pull from and then reads it back:
import time
import math
import csv
import matplotlib.pyplot as plt

def arrpow(arr, n):
    yarr = arr
    if n < 1:
        pass
    if n == 1:
        return arr
    yarr = arrpow(arr, n//2)
    yarr = [[yarr[0][0]*yarr[0][0]+yarr[0][1]*yarr[1][0], yarr[0][0]*yarr[0][1]+yarr[0][1]*yarr[1][1]],
            [yarr[1][0]*yarr[0][0]+yarr[1][1]*yarr[1][0], yarr[1][0]*yarr[0][1]+yarr[1][1]*yarr[1][1]]]
    if n % 2:
        yarr = [[yarr[0][0]*arr[0][0]+yarr[0][1]*arr[1][0], yarr[0][0]*arr[0][1]+yarr[0][1]*arr[1][1]],
                [yarr[1][0]*arr[0][0]+yarr[1][1]*arr[1][0], yarr[1][0]*arr[0][1]+yarr[1][1]*arr[1][1]]]
    return yarr

def matrix(n):
    arr = [[1, 1], [1, 0]]
    f = arrpow(arr, n-1)[0][0]
    return f
num = [10, 100, 1000, 10000, 100000, 1000000]
with open('matrix.dat', 'w') as h:
    for i in num:
        start_time = time.time()
        run = matrix(i)
        h.write(str(math.log10(i)))
        h.write('\n')
        h.write(str(math.log10(time.time() - start_time)))
        h.write('\n')

a = []
b = []
with open('matrix.dat', 'r+') as csvfile:
    plots = csv.reader(csvfile, delimiter=',')
    for row in plots:
        a.append(float(row[0]))
        b.append(float(row[1]))
plt.plot(a, b, label=" ")

row = ['1.0']
So row is a list with one value; row[1] tries to access the second element of a list that has only one. That is why you are getting the error.
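You can reproduce the failure in isolation (the '1.0' value here is just illustrative):
row = ['1.0']
print(row[0])   # '1.0' -- the only element
print(row[1])   # IndexError: list index out of range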

When you construct matrix.dat, you never write a comma for the CSV reader to split on. So when it reads the file back, each whole line becomes a one-element list, and attempting to access the second element throws an error because it doesn't exist.
Solution: replace the first h.write('\n') (the one right after writing log10(i)) with h.write(',').
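A minimal sketch of the corrected writer loop (same names as in the question; only the separator changes):
with open('matrix.dat', 'w') as h:
    for i in num:
        start_time = time.time()
        run = matrix(i)
        h.write(str(math.log10(i)))
        h.write(',')  # comma so csv.reader splits each line into two fields
        h.write(str(math.log10(time.time() - start_time)))
        h.write('\n')
Each line of matrix.dat then holds both values, so row[0] and row[1] both exist when reading.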

Related

Processing data in text files

I have multiple text files in a directory. Each of these files contains 7 columns and 20 rows. The last column has 0 values in all rows at the beginning.
What I want to do is use the first three columns of each txt file (line by line) to make some calculations and store the results in the 7th column, line by line.
To clarify the structure of one txt file:
642.29 710.87 154.24 -0.50384 -0.17085 0.067804 0
641.57 711.98 154.42 -0.50681 -0.16978 0.06784 0
640.82 713.14 154.58 -0.50944 -0.1711 0.068266 0
639.72 714.53 154.59 -0.50496 -0.19229 0.057764 0
638.99 715.79 154.75 -0.50728 -0.18873 0.057795 0
638.18 717.13 154.96 -0.51024 -0.18653 0.057893 0
After the calculations are done, the last column is filled with the new values as follows, and the txt file should be saved with them:
642.29 710.87 154.24 -0.50384 -0.17085 0.067804 0
641.57 711.98 154.42 -0.50681 -0.16978 0.06784 1.3352527850560352
640.82 713.14 154.58 -0.50944 -0.1711 0.068266 2.725828205520504
639.72 714.53 154.59 -0.50496 -0.19229 0.057764 3.1632005923493804
638.99 715.79 154.75 -0.50728 -0.18873 0.057795 3.237582509147674
638.18 717.13 154.96 -0.51024 -0.18653 0.057893 3.044767452434894
I did the process for one file. But how can I do it for multiple files: open each file automatically, do the calculations on it, and store the result?
Thanks
My code for one file:
import numpy as np
import os

Capture_period = 10
Marker_frames = 2000
Sampling_time = Capture_period/Marker_frames
coords = []
vel_list = [0]
ins_vel_list = [0]

# Define a function to calculate the euclidean distance
def Euclidean_Distance(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.linalg.norm(a-b)

def process(contents):
    contents = first_source_data.tolist()
    # Extract the xyz coordinates
    for i, item in enumerate(contents):
        coords.append([[float(x) for x in item[0:3]], i+1])
    print(coords)
    rang = range(len(coords))
    for i in rang:
        if i != rang[-1]:
            Eucl_distance = Euclidean_Distance(coords[i][0], coords[i+1][0])
            vel = Eucl_distance / (Sampling_time*100)  # + " cm/sec"
            vel_list.append(vel)
            ins_vel = (vel_list[i]+vel_list[i+1])/2
            ins_vel_list.append(ins_vel)
            continue
    #del ins_vel_list[:]
    #print(ins_vel_list)

from glob import glob
filepaths = glob("/home/experiment/*.txt")
for path in filepaths:
    print(path)
    process(path)
Problems:
The first 4 lines in each file are not read!
The lists being appended to must be reset before each new file.
You can create three text files with the 7 columns and however many rows to test it.
Each file consists of coordinates of motion (xyz) and (theta_x, theta_y, theta_z), and the last column is the instantaneous velocity, which is the average of the average velocities.
The first component of the last column should equal zero in all files (because at the starting time the velocity is zero).
Any help or solution is appreciated!
Put your code in a function, make the function accept the path as an argument, then call the function in a for loop iterating over the list of files.
E.g.:
from glob import glob
import numpy as np

CAPTURE_PERIOD = 10
MARKER_FRAMES = 2000
SAMPLING_TIME = CAPTURE_PERIOD / MARKER_FRAMES

def get_euclidean_distance(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.linalg.norm(a - b)

def make_chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i+n]

def write_in_chunks(f, lst, n):
    for chunk in make_chunks(lst, n):
        f.write(" ".join(str(val) for val in chunk) + "\n")

def process_file(filepath):
    """Process a single file.

    :param filepath: path to the file to process
    """
    # Load data from file
    with open(filepath) as datafile:
        contents = [line.split() for line in datafile]
    # Define an empty list for coordinates
    coords = []
    # Set the first component in the velocity vector to 0
    vel_list = [0]
    inst_vel_list = [0]
    # Extract the xyz coordinates
    for i, item in enumerate(contents):
        coords.append([[float(x) for x in item[0:3]], i+1])
    # Calculate the euclidean distance and the speed between consecutive coordinates
    rang = range(len(coords))
    for i in rang:
        if i != rang[-1]:
            eucl_distance = get_euclidean_distance(coords[i][0], coords[i+1][0])
            vel = eucl_distance / (SAMPLING_TIME*100)  # cm/sec
            vel_list.append(vel)
            inst_vel = (vel_list[i] + vel_list[i+1]) / 2
            inst_vel_list.append(inst_vel)
    # Write the instantaneous velocities into the last column
    for i, item in enumerate(contents):
        item[-1] = inst_vel_list[i]
    contents = np.ravel(contents)
    with open(filepath, "w") as f:
        write_in_chunks(f, contents, 7)

if __name__ == "__main__":
    filepaths = glob("/home/experiment/*.txt")
    for path in filepaths:
        process_file(path)

Store data in an array from a loop

I have two sets of data that I would like to multiply element by element, storing each result in an array.
For now I have this:
import csv
from mpdaf.obj import Spectrum, WaveCoord
import matplotlib.pyplot as plt
import pandas as pd
from csv import reader

file_path = input("Enter full transmission curve path : ")
with open(file_path, 'r') as f:
    data = list(reader(f, delimiter=","))

wavelength = [i[0] for i in data]
percentage = [float(str(i[1]).replace(',', '.')) for i in data]

spectrum = input("Full spectrum path : ")
spe = Spectrum(filename=spectrum, ext=0)
data_flux = spe.data

flux_array = []
for i in percentage:
    for j in data_flux:
        flux = i*j
        flux_array.append(flux)
print(flux_array)
As written, it takes the first i, multiplies it by every j, then takes the next i, and so on.
I would like to multiply the first i by the first j, store the value in the array, then multiply the 2nd i by the 2nd j and store that value, etc.
Your loop variables i and j are the float values themselves, not integer indices: when you write for i in percentage:, i takes on every value in the percentage list. If you want positions instead, iterate through a range. Here's an example to illustrate the difference:
percentage = [50.0, 60.0, 70.0]

for i in percentage:
    print(i)
# 50.0
# 60.0
# 70.0

for i in range(len(percentage)):
    print(i)
# 0
# 1
# 2
To multiply the lists element by element, iterate through a single range of indices and pair the items up:
for i in range(len(percentage)):
    flux = percentage[i]*data_flux[i]
    flux_array.append(flux)
This walks the integers from 0 up to the last index, multiplying the i-th element of each list and appending the result.
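Equivalently, zip pairs the two sequences directly, with no index bookkeeping (a sketch assuming percentage and data_flux have the same length; data_flux from mpdaf is array-like and iterable):
flux_array = []
for p, d in zip(percentage, data_flux):
    flux_array.append(p * d)  # multiply paired elements
# or, as a list comprehension:
flux_array = [p * d for p, d in zip(percentage, data_flux)]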

Matching multiple array value to row in csv file slow

I have a numpy array consisting of about 1200 arrays containing 10 values each (np.shape is (1200, 10)). Each element has a value between 0 and 5.7 million.
Next I have a .csv file with 3800 lines. Every line contains 2 values: the first value marks the end of a range, the second value is an identifier. The first and last 5 rows of the .csv file:
509,47222
1425,47220
2404,47219
4033,47218
6897,47202
...,...
...,...
...,...
5793850,211
5794901,186
5795820,181
5796176,43
5796467,33
The first column goes up until it reaches 5.7 million. For each value in the numpy array I want to look up the identifier via the first column of the .csv file. For example, for the value 3333 the identifier is 47218: each row covers the range from the first column of the previous row up to the first column of that row, e.g. 2404 - 4033 belongs to identifier 47218.
Now I want to get the identifier for each value in the numpy array, then save each identifier together with the frequency with which it occurs in the numpy array. That means scanning the 3800-line csv file once for each of the 12000 values and incrementing a counter, which takes about 30 seconds; way too long.
This is the code I am currently using:
numpy_file = np.fromfile(filename, dtype=np.int32)
#some code to format numpy_file correctly

with open('/identifer_file.csv') as read_file:
    csv_reader = csv.reader(read_file, delimiter=',')
    csv_reader = list(csv_reader)

identifier_dict = {}
for numpy_array in numpy_file:
    for numpy_value in numpy_array:
        #there are 12000 numpy_value in numpy_file
        for row in csv_reader:
            last_identifier = 0
            if numpy_value <= int(row[0]):
                last_identifier = int(row[1])
                #adding the frequency of the identifier in numpy_file to a dict
                if last_identifier in identifier_dict:
                    identifier_dict[last_identifier] += 1
                else:
                    identifier_dict[last_identifier] = 1
            else:
                continue
            break

for x, y in identifier_dict.items():
    if y > 40:
        print("identifier: {} amount of times found: {}".format(x, y))
What algorithm should I implement to speed up this process?
Edit
I have tried flattening the numpy array to a 1D array, so it has 12000 values. This has no real effect on the speed; the latest test was 33 seconds.
Setup:
import io
import csv
import collections
import numpy as np

np.random.seed(100)
numpy_file = np.random.randint(0, 5700000, (1200, 10))

#'''range, identifier'''
read_file = io.StringIO('''509,47222
1425,47220
2404,47219
4033,47218
6897,47202
5793850,211
5794901,186
5795820,181
5796176,43
5796467,33''')
csv_reader = csv.reader(read_file, delimiter=',')
csv_reader = list(csv_reader)
# your example code put in a function and adapted for the setup above
def original(numpy_file, csv_reader):
    identifier_dict = {}
    for numpy_array in numpy_file:
        for numpy_value in numpy_array:
            #there are 12000 numpy_value in numpy_file
            for row in csv_reader:
                last_identifier = 0
                if numpy_value <= int(row[0]):
                    last_identifier = int(row[1])
                    #adding the frequency of the identifier in numpy_file to a dict
                    if last_identifier in identifier_dict:
                        identifier_dict[last_identifier] += 1
                    else:
                        identifier_dict[last_identifier] = 1
                else:
                    continue
                break
    # for x, y in identifier_dict.items():
    #     if y > 40:
    #         print("identifier: {} amount of times found: {}".format(x, y))
    return identifier_dict
Three solutions, each vectorizing some of the operations. The first function consumes the least memory; the last consumes the most.
def first(numpy_file, r):
    '''compare each value in the array to the entire first column of the csv'''
    alternate = collections.defaultdict(int)
    for value in np.nditer(numpy_file):
        comparison = value < r[:, 0]
        identifier = r[:, 1][comparison.argmax()]
        alternate[identifier] += 1
    return alternate

def second(numpy_file, r):
    '''compare each row of the array to the first column of the csv'''
    alternate = collections.defaultdict(int)
    for row in numpy_file:
        comparison = row[..., None] < r[:, 0]
        indices = comparison.argmax(-1)
        id_s = r[:, 1][indices]
        for thing in id_s:
            #adding the frequency of the identifier in numpy_file to a dict
            alternate[thing] += 1
    return alternate

def third(numpy_file, r):
    '''compare the whole array to the first column of the csv'''
    comparison = numpy_file[..., None] < r[:, 0]
    indices = comparison.argmax(-1)
    id_s = r[:, 1][indices]
    other = collections.Counter(map(int, np.nditer(id_s)))
    return other
The functions require the csv file to be read into a numpy array (and zero, from the original code, for comparison):

read_file.seek(0)  #io.StringIO object from setup
csv_reader = csv.reader(read_file, delimiter=',')
r = np.array([list(map(int, thing)) for thing in csv_reader])

zero = original(numpy_file, csv_reader)
one = first(numpy_file, r)
two = second(numpy_file, r)
three = third(numpy_file, r)

assert zero == one
assert zero == two
assert zero == three
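For reference, the same lookup can be vectorized end to end with np.searchsorted, which binary-searches every array value against the sorted boundary column in a single call. A sketch under the same setup (side='right' reproduces the strict < comparison used above; values beyond the last boundary would need separate handling, since they would index past the end of r):

def fourth(numpy_file, r):
    '''binary-search all values against the csv boundaries at once'''
    # for each value, the first index where value < r[:, 0]
    indices = np.searchsorted(r[:, 0], numpy_file.ravel(), side='right')
    id_s = r[:, 1][indices]
    return collections.Counter(map(int, id_s))

four = fourth(numpy_file, r)
assert three == four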

How to separate different input formats from the same text file with Python

I'm new to programming and Python and I'm looking for a way to distinguish between two input formats in the same text file. For example, let's say I have an input file like so, where values are comma-separated:
5
Washington,A,10
New York,B,20
Seattle,C,30
Boston,B,20
Atlanta,D,50
2
New York,5
Boston,10
Where the format is N followed by N lines of Data1, and M followed by M lines of Data2. I tried opening the file, reading it line by line and storing it into one single list, but I'm not sure how to go about to produce 2 lists for Data1 and Data2, such that I would get:
Data1 = ["Washington,A,10", "New York,B,20", "Seattle,C,30", "Boston,B,20", "Atlanta,D,50"]
Data2 = ["New York,5", "Boston,10"]
My initial idea was to iterate through the list until I found an integer i, remove the integer from the list, and store the next i values in a separate list, repeating whenever I found the next integer. However, this would destroy my initial list. Is there a better way to separate the two data formats into different lists?
You could use itertools.islice and a list comprehension:
from itertools import islice
string = """
5
Washington,A,10
New York,B,20
Seattle,C,30
Boston,B,20
Atlanta,D,50
2
New York,5
Boston,10
"""
result = [[x for x in islice(parts, idx + 1, idx + 1 + int(line))]
          for parts in [string.split("\n")]
          for idx, line in enumerate(parts)
          if line.isdigit()]
print(result)
This yields
[['Washington,A,10', 'New York,B,20', 'Seattle,C,30', 'Boston,B,20', 'Atlanta,D,50'], ['New York,5', 'Boston,10']]
For a file, you need to change it to:
with open("testfile.txt", "r") as f:
result = [[x for x in islice(parts, idx + 1, idx + 1 + int(line))]
for parts in [f.read().split("\n")]
for idx, line in enumerate(parts)
if line.isdigit()]
print(result)
You're definitely on the right track.
If you want to preserve the original list here, you don't actually have to remove integer i; you can just go on to the next item.
Code:
originalData = []
formattedData = []

with open("data.txt", "r") as f:
    f = list(f)

originalData = f
i = 0
while i < len(f):  # Iterate through every line
    try:
        n = int(f[i])  # See if line can be cast to an integer
        originalData[i] = n  # Change string to int in original
        formattedData.append([])
        for j in range(n):
            i += 1
            item = f[i].replace('\n', '')
            originalData[i] = item  # Remove newline char in original
            formattedData[-1].append(item)
    except ValueError:
        print("File has incorrect format")
    i += 1

print(originalData)
print(formattedData)
The following code will produce a list results which is equal to [Data1, Data2].
The code assumes that the number of entries specified exactly matches the number of lines that follow. That means that for a file like this, it will not work:
2
New York,5
Boston,10
Seattle,30
The code:
# get the data from the text file
with open('filename.txt', 'r') as file:
    lines = file.read().splitlines()

results = []
index = 0
while index < len(lines):
    # Find the start and end values.
    start = index + 1
    end = start + int(lines[index])
    # Everything from the start up to and excluding the end index gets added
    results.append(lines[start:end])
    # Update the index
    index = end
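As a quick sanity check, printing results for the sample input from the question (assuming it is saved as filename.txt) should show both lists:

for data in results:
    print(data)
# ['Washington,A,10', 'New York,B,20', 'Seattle,C,30', 'Boston,B,20', 'Atlanta,D,50']
# ['New York,5', 'Boston,10']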

Removing quotes from 2D array python

I am currently trying to execute code that evaluates powers with big exponents without calculating them directly, using their logs instead. I have a file containing 1000 lines; each line contains two integers separated by a comma. I got stuck at the point where I tried to remove quotes from the array. I tried many ways, none of which worked. Here is my code:
The function split() from myLib takes two arguments: a list, and how many elements each smaller list should hold. It splits the original list accordingly and appends the smaller lists to a new one.
import math
import myLib

i = 0
record = 0
cmpr = 0
with open("base_exp.txt", "r") as f:
    fArr = f.readlines()
    fArr = myLib.split(fArr, 1)
    #place to get rid of quotes
    print(fArr)
    while i < len(fArr):
        cmpr = int(fArr[i][1]) * math.log(int(fArr[i][0]))
        if cmpr > record:
            record = cmpr
            print(record)
        i = i + 1
This is what my array looks like:
[['519432,525806\n'], ['632382,518061\n'], ... ['172115,573985\n'], ['13846,725685\n']]
I tried to find a way around the 2d array and tried:
i = 0
record = 0
cmpr = 0
with open("base_exp.txt", "r") as f:
    fArr = f.readlines()
    #fArr = myLib.split(fArr, 1)
    fArr = [x.replace("'", '') for x in fArr]
    print(fArr)
    while i < len(fArr):
        cmpr = int(fArr[i][1]) * math.log(int(fArr[i][0]))
        if cmpr > record:
            record = cmpr
            print(i)
        i = i + 1
But the output looked like this:
['519432,525806\n', '632382,518061\n', '78864,613712\n', ...
And the numbers in their current state cannot be treated as integers or floats, so this isn't working either:
[int(i) for i in lst]
Expected output for the array itself would look like this, so I can pick one of the numbers and work with it:
[[519432,525806], [632382,518061], [78864,613712]...
I would really appreciate your help since I'm still very new to Python and programming in general.
Thank you for your time.
You can avoid all of your problems by simply using numpy's convenient loadtxt function:
import numpy as np
arr = np.loadtxt('p099_base_exp.txt', delimiter=',')
arr
array([[519432., 525806.],
[632382., 518061.],
[ 78864., 613712.],
...,
[325361., 545187.],
[172115., 573985.],
[ 13846., 725685.]])
If you need a one-dimensional array:
arr.flatten()
# array([519432., 525806., 632382., ..., 573985., 13846., 725685.])
This is your missing piece:
fArr = [[int(num) for num in line.rstrip("\n").split(",")] for line in fArr]
Here, rstrip("\n") removes the trailing \n character from the line, and the string is then split on ",", so each line becomes a list whose elements are that line's integers, still as strings. Calling int() on each element converts them to the int data type.
The code below should do the job if you don't want to import an additional library:
import math

i = 0
record = 0
cmpr = 0
with open("base_exp.txt", "r") as f:
    fArr = f.readlines()
fArr = [[int(num) for num in line.rstrip("\n").split(",")] for line in fArr]
print(fArr)
while i < len(fArr):
    cmpr = fArr[i][1] * math.log(fArr[i][0])
    if cmpr > record:
        record = cmpr
        print(i)
    i = i + 1
This snippet will transform your array into a 1D array of integers:
from itertools import chain
arr = [['519432,525806\n'], ['632382,518061\n']]
new_arr = [int(i.strip()) for i in chain.from_iterable(i[0].split(',') for i in arr)]
print(new_arr)
Prints:
[519432, 525806, 632382, 518061]
For 2D output you can use this:
arr = [['519432,525806\n'], ['632382,518061\n']]
new_arr = [[int(i) for i in v] for v in (i[0].split(',') for i in arr)]
print(new_arr)
This prints:
[[519432, 525806], [632382, 518061]]
new_list = []
a = ['519432,525806\n', '632382,518061\n', '78864,613712\n']
for i in a:
    new_list.append(list(map(int, i.split(","))))
print(new_list)
Output:
[[519432, 525806], [632382, 518061], [78864, 613712]]
In order to flatten new_list:
from functools import reduce
new_list = reduce(lambda x, y: x + y, new_list)
print(new_list)
Output:
[519432, 525806, 632382, 518061, 78864, 613712]
