Processing data in text files - python

I have multiple text files in a directory. Each of these files contains 7 columns and 20 rows. The last column has 0 values in all rows at the beginning.
What I want to do is: I want to use the first three columns of each txt file (line by line) to make some calculations and store the result in the 7th column, respectively line by line.
To clarify the structure of one txt file:
642.29 710.87 154.24 -0.50384 -0.17085 0.067804 0
641.57 711.98 154.42 -0.50681 -0.16978 0.06784 0
640.82 713.14 154.58 -0.50944 -0.1711 0.068266 0
639.72 714.53 154.59 -0.50496 -0.19229 0.057764 0
638.99 715.79 154.75 -0.50728 -0.18873 0.057795 0
638.18 717.13 154.96 -0.51024 -0.18653 0.057893 0
After the calculations are done the last column becomes with the new values as following and the txt file should be stored with the new values:
642.29 710.87 154.24 -0.50384 -0.17085 0.067804 0
641.57 711.98 154.42 -0.50681 -0.16978 0.06784 1.3352527850560352
640.82 713.14 154.58 -0.50944 -0.1711 0.068266 2.725828205520504
639.72 714.53 154.59 -0.50496 -0.19229 0.057764 3.1632005923493804
638.99 715.79 154.75 -0.50728 -0.18873 0.057795 3.237582509147674
638.18 717.13 154.96 -0.51024 -0.18653 0.057893 3.044767452434894
I did the process for one file. But how can I do it for multiple files? Open each file automatically, do some calculations on that file, and store it.
Thanks
My code for one file:
import numpy as np
import os

# Acquisition parameters: 2000 marker frames captured over 10 seconds.
Capture_period= 10
Marker_frames= 2000
Sampling_time = Capture_period/Marker_frames  # seconds per frame

# NOTE(review): these module-level lists are shared by process() and are
# never reset between files, so results accumulate across calls — this is
# the "append list must be reset" problem described below.
coords = []
vel_list = [0]      # average velocity per frame; first entry 0 (at rest)
ins_vel_list=[0]    # instantaneous velocity; first entry 0 (at rest)
# Straight-line (Euclidean) distance between two coordinate vectors.
def Euclidean_Distance(a, b):
    difference = np.array(a) - np.array(b)
    return np.linalg.norm(difference)
def process(contents):
    """Process one capture file in place.

    :param contents: path to a whitespace-separated text file whose rows are
        ``x y z theta_x theta_y theta_z v``; the 7th column is overwritten
        with the instantaneous velocity and the file is rewritten.

    The original body referenced an undefined ``first_source_data`` (NameError),
    never opened the path it was given, relied on module-level lists that were
    never reset between files, and never stored the result back.
    """
    # Read all rows of the file (the argument is a file path).
    with open(contents) as datafile:
        rows = [line.split() for line in datafile]
    # Extract the xyz coordinates.
    coords = [[float(x) for x in item[0:3]] for item in rows]
    # Local state so every file starts fresh; first entries are 0 because
    # the marker is at rest at the starting time.
    vel_list = [0]
    ins_vel_list = [0]
    # range(len-1) replaces the old ``if i != rang[-1]`` guard and is also
    # safe for an empty file.
    for i in range(len(coords) - 1):
        Eucl_distance = Euclidean_Distance(coords[i], coords[i + 1])
        vel = Eucl_distance / (Sampling_time * 100)  # cm/sec
        vel_list.append(vel)
        # Instantaneous velocity = mean of two consecutive average velocities.
        ins_vel_list.append((vel_list[i] + vel_list[i + 1]) / 2)
    # Store the result in the 7th column and rewrite the file.
    for i, item in enumerate(rows):
        item[6] = str(ins_vel_list[i])
    with open(contents, "w") as datafile:
        for item in rows:
            datafile.write(" ".join(item) + "\n")
from glob import glob

# Apply the per-file processing to every capture file in the directory.
for path in glob("/home/experiment/*.txt"):
    print(path)
    process(path)
Problems:
The first 4 lines in each file are not read!
The appended lists must be reset before processing a new file
You can create three text files with the 7 columns and whatever rows to test it.
Each file consists of coordinates of motion (x, y, z) and (theta_x, theta_y, theta_z), and the last column is the instantaneous velocity, which is the average of the average velocities.
The first component of the last column should equal zero in all files (because at the starting time the velocity is zero).
Any helps or solutions is appreciated!

Put your code in a function and make the function accept the path as argument, then call the function in a for loop iterating over the list of files.
E.g.:
from glob import glob
import numpy as np
# NOTE(review): scipy.spatial.distance is imported but never used below.
from scipy.spatial import distance

# Acquisition parameters: 2000 marker frames captured over 10 seconds.
CAPTURE_PERIOD = 10
MARKER_FRAMES = 2000
SAMPLING_TIME = CAPTURE_PERIOD/MARKER_FRAMES  # seconds per frame
def get_euclidean_distance(a, b):
    """Return the Euclidean distance between coordinate vectors *a* and *b*."""
    return np.linalg.norm(np.array(a) - np.array(b))
def make_chunks(lst, n):
    """Yield successive *n*-sized slices of *lst* (the last may be shorter)."""
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n
def write_in_chunks(f, lst, n):
    """Write *lst* to file object *f*, *n* space-separated values per line."""
    for piece in make_chunks(lst, n):
        line = " ".join(str(val) for val in piece)
        f.write(line + "\n")
def process_file(filepath):
    """Process a single file in place.

    Reads whitespace-separated rows of ``x y z theta_x theta_y theta_z v``,
    computes the instantaneous velocity from the first three columns, and
    rewrites the file with the result stored in the 7th column.

    :param filepath: path to the file to process
    """
    # Load data from file
    with open(filepath) as datafile:
        contents = [line.split() for line in datafile]
    # Extract the xyz coordinates, keeping the 1-based row number
    coords = [[[float(x) for x in item[0:3]], i + 1]
              for i, item in enumerate(contents)]
    # First entries are 0: the marker is at rest at the starting time
    vel_list = [0]
    inst_vel_list = [0]
    # range(len - 1) also handles an empty file; the original indexed
    # ``rang[-1]`` which raises IndexError when the range is empty
    for i in range(len(coords) - 1):
        eucl_distance = get_euclidean_distance(coords[i][0], coords[i + 1][0])
        vel = eucl_distance / (SAMPLING_TIME * 100)  # cm/sec
        vel_list.append(vel)
        # instantaneous velocity = mean of two consecutive average velocities
        inst_vel_list.append((vel_list[i] + vel_list[i + 1]) / 2)
    # Store the computed velocity in the last column
    for i, item in enumerate(contents):
        item[-1] = inst_vel_list[i]
    # Flatten and rewrite the file, 7 values per line
    contents = np.ravel(contents)
    with open(filepath, "w") as f:
        write_in_chunks(f, contents, 7)
if __name__ == "__main__":
    # Process every capture file in the directory. The original called
    # ``process(path)``, which is undefined here — the function defined
    # above is ``process_file``.
    filepaths = glob("/home/experiment/*.txt")
    for path in filepaths:
        process_file(path)

Related

How to create a 2d nested list from a text file using python?

I'm a beginner programmer, and I'm trying to figure out how to create a 2d nested list (grid) from a particular text file. For example, the text file would look like this:
3
3
150
109
80
892
123
982
0
98
23
The first two lines in the text file would be used to create the grid, meaning that it is 3x3. The next 9 lines would be used to populate the grid, with the first 3 making up the first row, the next 3 making up the middle row, and the final 3 making up the last row. So the nested list would look like this:
[[150, 109, 80] [892, 123, 982] [0, 98, 23]]
How do I go about doing this? I was able to make a list of all of the contents, but I can't figure out how to use the first 2 lines to define the size of the inner lists within the outer list:
lineContent = []
innerList = ?
for lines in open('document.txt','r'):
value = int(lines)
lineContent.append(value)
From here, where do I go to turn it into a nested list using the given values on the first 2 lines?
Thanks in advance.
You can make this quite neat using list comprehension.
def txt_grid(your_txt):
    """Read a grid file into a nested list of strings.

    The first two lines give the dimensions; every following line holds one
    cell value. Newlines are stripped from each stored value.
    """
    with open(your_txt, 'r') as f:
        # Find columns and rows
        columns = int(f.readline())
        rows = int(f.readline())
        grid = []
        for _ in range(columns):
            grid.append([f.readline().strip() for _ in range(rows)])
    return grid
print(txt_grid('document.txt'))
strip() just clears the newline characters (\n) from each line before storing them in the list.
Edit: A modified version with logic for if your txt file didn't have enough rows for the defined dimensions.
def txt_grid(your_txt):
    """Read a grid file, validating that it has enough data lines.

    The first two lines give the dimensions; the remaining lines hold one
    cell value each. Returns the nested list, or ``None`` when the file has
    fewer data lines than columns * rows.
    """
    with open(your_txt, 'r') as f:
        # Find columns and rows
        columns = int(f.readline())
        rows = int(f.readline())
        dimensions = columns * rows
        # Buffer the remaining lines once. The original counted them by
        # iterating the file object, which left the handle at EOF, so every
        # later f.readline() returned '' and the grid came out empty.
        data = [line.strip() for line in f]
    if len(data) < dimensions:
        # Either raise an error
        # raise ValueError("Insufficient non-empty rows in text file for given dimensions")
        # Or return something that's not a list
        return None
    # Creating grid from the buffered lines
    it = iter(data)
    return [[next(it) for i in range(rows)] for j in range(columns)]
print(txt_grid('document.txt'))
def parse_txt(filepath):
    """Parse a grid file into a nested list of ints.

    The first line gives the row count, the second the column count; each
    following line holds a single integer cell value.
    """
    grid = []
    with open(filepath, 'r') as txt:  # with-statement closes the file for us
        nrows = int(txt.readline())
        ncols = int(txt.readline())
        for _ in range(nrows):
            grid.append([int(txt.readline()) for _ in range(ncols)])
    return grid
grid_2d = parse_txt('document.txt')
# Read every line of the file as one integer.
lineContent = []
innerList = []  # NOTE(review): never used after this point
for lines in open('testQuestion.txt', 'r'):
    value = int(lines)
    lineContent.append(value)
rowSz = lineContent[0] # row size
colSz = lineContent[1] # column size
del lineContent[0], lineContent[0] # makes line contents just the values in the matrix, could also just start currentLine at 2, notice 0 index is repeated because 1st element was deleted
assert rowSz * colSz == len(lineContent), 'not enough values for array' # to ensure there are enough entries to complete array of rowSz * colSz elements
# Build the rowSz x colSz nested list by consuming the values in order.
arr = []
currentLine = 0
for x in range(rowSz):
    arr.append([])
    for y in range(colSz):
        arr[x].append(lineContent[currentLine])
        currentLine += 1
print(arr)

Using Returned Value in Later Function

Let's say I wanted to call a function to do some calculation, but I also wanted to use that calculated value in a later function. When I return the value of the first function can I not just send it to my next function? Here is an example of what I am talking about:
def add(x,y):
    """Return the sum of *x* and *y*."""
    return x + y
def square(a):
    """Return *a* multiplied by itself."""
    return a * a
# NOTE(review): ``sum`` shadows the built-in; use a different name.
sum = add(1,4)
# NOTE(review): NameError — ``addition`` is local to add(); the returned
# value was bound to ``sum`` above, so this should be square(sum). This is
# the bug the question is asking about.
product = square(addition)
If I call the add function, it'll return 5 as the addition result. But I want to use that number 5 in the next function, can I just send it to the next function as shown? In the main program I am working on it does not work like this.
Edit: This is a sample of the code I am actually working on which will give a better idea of what the problem is. The problem is when I send the mean to the calculateStdDev function.
#import libraries to be used
import time
import StatisticsCalculations

#global variables
mean = 0
stdDev = 0

#get file from user
fileChoice = input("Enter the .csv file name: ")
inputFile = open(fileChoice)
headers = inputFile.readline().strip('\n').split(',') #create headers for columns and strips unnecessary characters

#create a list with header-number of lists in it
dataColumns = []
for i in headers:
    dataColumns.append([]) #fills inital list with as many empty lists as there are columns

#counts how many rows there are and adds a column of data into each empty list
rowCount = 0
for row in inputFile:
    rowCount = rowCount + 1
    comps = row.strip().split(',') #components of data
    for j in range(len(comps)):
        dataColumns[j].append(float(comps[j])) #appends the jth entry into the jth column, separating data into categories

k = 0
for entry in dataColumns:
    # NOTE(review): the trailing "{:>10.2f}".format() has no argument and
    # raises IndexError at runtime — presumably the std-dev value belongs
    # there, but the mean is never captured to pass to calculateStdDev.
    print("{:>11}".format(headers[k]),"|", "{:>10.2f}".format(StatisticsCalculations.findMax(dataColumns[k])),"|",
          "{:>10.2f}".format(StatisticsCalculations.findMin(dataColumns[k])),"|","{:>10.2f}".format(StatisticsCalculations.calculateMean(dataColumns[k], rowCount)),"|","{:>10.2f}".format()) #format each data entry to be right aligned and be correctly spaced in its column
    #printing break line for each row
    k = k + 1 #counting until dataColumns is exhausted
inputFile.close()
And the StatisticsCalculations module:
import math
def calculateMean(data, rowCount):
    """Return the mean of *data*, using *rowCount* as the divisor."""
    total = 0
    for value in data:
        total += value
    return total / rowCount
def calculateStdDev(data, mean, rowCount, entry=None):
    """Return the population standard deviation of *data* about *mean*.

    :param data: iterable of numeric values
    :param mean: precomputed mean of the data
    :param rowCount: number of values (divisor)
    :param entry: unused; kept (now optional) for backward compatibility
        with existing 4-argument calls
    """
    stdDevSum = 0
    for x in data:
        # The original computed ``float(entry[x]) - mean`` — indexing the
        # unused ``entry`` argument with a data value. The deviation of the
        # data value itself is what the formula needs.
        stdDevSum += (float(x) - mean) ** 2  # sum of squared differences
    stdDev = math.sqrt(stdDevSum / rowCount)  # remaining part of the formula
    return stdDev
def findMin(data):
    """Return the smallest value in *data*.

    The original seeded the search with the sentinel 1000, which returns a
    wrong answer whenever every value is greater than 1000; the built-in
    min() has no such limitation.

    :raises ValueError: if *data* is empty
    """
    return min(data)
def findMax(data):
    """Return the largest value in *data*.

    The original seeded the search with the sentinel -1, which returns a
    wrong answer whenever every value is negative; the built-in max() has
    no such limitation.

    :raises ValueError: if *data* is empty
    """
    return max(data)
First of all, sum is a reserved word, you shouldn't use it as a variable.
You can do it this way:
def add(x,y):
    """Return the sum of the two operands."""
    total = x + y
    return total

def square(a):
    """Return the operand multiplied by itself."""
    squared = a * a
    return squared

# Bind the returned value to a name (not ``sum``, which is a builtin),
# then pass that name to the next function.
s = add(1, 4)
product = square(s)
Or directly:
product = square(add(1, 4))

Matching multiple array value to row in csv file slow

I have a numpy array consisting of about 1200 arrays containing 10 values each. np.shape = 1200, 10. Each element has a value between 0 and 5.7 million.
Next I have a .csv file with 3800 lines. Every line contains 2 values. The first value indicates a range the second value is an identifier. The first and last 5 rows of the .csv file:
509,47222
1425,47220
2404,47219
4033,47218
6897,47202
...,...
...,...
...,...
5793850,211
5794901,186
5795820,181
5796176,43
5796467,33
The first column goes up until it reaches 5.7 million. For each value in the numpy array I want to check the first column of the .csv file. I have, for example, the value 3333; this means the identifier belonging to 3333 is 47218. Each row indicates that from the first column of the previous row up to the first column of this row (e.g. 2404 to 4033) the identifier is 47218.
Now I want to get the identifier for each value in the numpy array, then I want to save the identifier and the frequency with which this identifier is found in the numpy array. This means I need to loop 12,000 times over a csv file of 3,800 lines and subsequently increment an integer. This process takes about 30 seconds, which is way too long.
This is the code I am currently using:
numpy_file = np.fromfile(filename, dtype=np.int32)
#some code to format numpy_file correctly

# Load the whole range/identifier table into memory.
with open('/identifer_file.csv') as read_file:
    csv_reader = csv.reader(read_file, delimiter=',')
    csv_reader = list(csv_reader)

identifier_dict = {}
for numpy_array in numpy_file:
    for numpy_value in numpy_array:
        #there are 12000 numpy_value in numpy_file
        # Linear scan: find the first row whose range bound contains the
        # value and take its identifier. This is O(csv rows) per value and
        # re-parses int(row[0]) every time — the source of the slowness.
        for row in csv_reader:
            last_identifier = 0
            if numpy_value <= int(row[0]):
                last_identifier = int(row[1])
                #adding the frequency of the identifier in numpy_file to a dict
                if last_identifier in identifier_dict:
                    identifier_dict[last_identifier] += 1
                else:
                    identifier_dict[last_identifier] = 1
            else:
                continue
            break

# Report only identifiers seen more than 40 times.
for x, y in identifier_dict.items():
    if(y > 40):
        print("identifier: {} amount of times found: {}".format(x, y))
What algorithm should I implement to speed up this process?
Edit
I have tried flattening the numpy array to a 1D array, so it has 12,000 values. This has no real effect on the speed. The latest test took 33 seconds.
Setup:
import numpy as np
import collections
import csv  # used below but missing from the original setup (NameError)
import io   # used below but missing from the original setup (NameError)

# Deterministic sample data: 1200 rows of 10 values in [0, 5.7 million).
np.random.seed(100)
numpy_file = np.random.randint(0, 5700000, (1200,10))

#'''range, identifier'''
read_file = io.StringIO('''509,47222
1425,47220
2404,47219
4033,47218
6897,47202
5793850,211
5794901,186
5795820,181
5796176,43
5796467,33''')
csv_reader = csv.reader(read_file, delimiter=',')
csv_reader = list(csv_reader)
# your example code put in a function and adapted for the setup above
def original(numpy_file,csv_reader):
    """Baseline frequency count from the question: for every value, linearly
    scan the csv rows for the first range bound that contains it and tally
    that row's identifier. Values beyond the last bound are not counted."""
    identifier_dict = {}
    for numpy_array in numpy_file:
        for numpy_value in numpy_array:
            #there are 12000 numpy_value in numpy_file
            for row in csv_reader:
                if numpy_value > int(row[0]):
                    continue  # bound too small — try the next row
                #adding the frequency of the identifier in numpy_file to a dict
                identifier = int(row[1])
                identifier_dict[identifier] = identifier_dict.get(identifier, 0) + 1
                break  # first matching range wins
    # for x, y in identifier_dict.items():
    #     if(y > 40):
    #         print("identifier: {} amount of times found: {}".format(x, y))
    return identifier_dict
Three solutions each vectorizing some of the operations. The first function consumes the least memory, the last consumes the most memory.
def first(numpy_file,r):
    '''compare each value in the array to the entire first column of the csv'''
    counts = collections.defaultdict(int)
    bounds = r[:,0]
    ids = r[:,1]
    for value in np.nditer(numpy_file):
        # argmax picks the index of the first bound exceeding the value
        counts[ids[(value < bounds).argmax()]] += 1
    return counts
def second(numpy_file,r):
    '''compare each row of the array to the first column of csv'''
    counts = collections.defaultdict(int)
    bounds = r[:,0]
    ids = r[:,1]
    for row in numpy_file:
        # broadcast the row against every bound; argmax(-1) gives, per
        # element, the index of the first bound that exceeds it
        matched = ids[(row[...,None] < bounds).argmax(-1)]
        #adding the frequency of the identifier in numpy_file to a dict
        for identifier in matched:
            counts[identifier] += 1
    return counts
def third(numpy_file,r):
    '''compare the whole array to the first column of csv'''
    # Fully vectorized: compare every value to every bound at once; argmax
    # along the last axis picks the first matching range per value, then a
    # Counter tallies the identifiers. (The original also created an unused
    # defaultdict and an unused empty Counter — removed as dead code.)
    comparison = numpy_file[...,None] < r[:,0]
    indices = comparison.argmax(-1)
    id_s = r[:,1][indices]
    return collections.Counter(map(int,np.nditer(id_s)))
The functions require the csv file be read into a numpy array:
read_file.seek(0) #io.StringIO object from setup
csv_reader = csv.reader(read_file, delimiter=',')
# Parse the csv rows into an integer array of shape (rows, 2).
r = np.array([list(map(int,thing)) for thing in csv_reader])

one = first(numpy_file, r)
two = second(numpy_file,r)
three = third(numpy_file,r)
# NOTE(review): ``zero`` is never defined in this snippet — presumably
# zero = original(numpy_file, csv_reader); confirm before running.
assert zero == one
assert zero == two
assert zero == three

How to separate different input formats from the same text file with Python

I'm new to programming and python and I'm looking for a way to distinguish between two input formats in the same input file text file. For example, let's say I have an input file like so where values are comma-separated:
5
Washington,A,10
New York,B,20
Seattle,C,30
Boston,B,20
Atlanta,D,50
2
New York,5
Boston,10
Where the format is N followed by N lines of Data1, and M followed by M lines of Data2. I tried opening the file, reading it line by line and storing it into one single list, but I'm not sure how to go about to produce 2 lists for Data1 and Data2, such that I would get:
Data1 = ["Washington,A,10", "New York,B,20", "Seattle,C,30", "Boston,B,20", "Atlanta,D,50"]
Data2 = ["New York,5", "Boston,10"]
My initial idea was to iterate through the list until I found an integer i, remove the integer from the list and continue for the next i iterations all while storing the subsequent values in a separate list, until I found the next integer and then repeat. However, this would destroy my initial list. Is there a better way to separate the two data formats in different lists?
You could use itertools.islice and a list comprehension:
from itertools import islice

string = """
5
Washington,A,10
New York,B,20
Seattle,C,30
Boston,B,20
Atlanta,D,50
2
New York,5
Boston,10
"""

# Every line that is a bare integer N starts a section; collect the N
# lines that immediately follow it.
parts = string.split("\n")
result = []
for idx, line in enumerate(parts):
    if line.isdigit():
        result.append(list(islice(parts, idx + 1, idx + 1 + int(line))))
print(result)
This yields
[['Washington,A,10', 'New York,B,20', 'Seattle,C,30', 'Boston,B,20', 'Atlanta,D,50'], ['New York,5', 'Boston,10']]
For a file, you need to change it to:
# Same section-splitting logic, reading the lines from a file instead of
# an inline string.
with open("testfile.txt", "r") as f:
    parts = f.read().split("\n")

result = []
for idx, line in enumerate(parts):
    if line.isdigit():
        result.append(list(islice(parts, idx + 1, idx + 1 + int(line))))
print(result)
You're definitely on the right track.
If you want to preserve the original list here, you don't actually have to remove integer i; you can just go on to the next item.
Code:
originalData = []
formattedData = []

with open("data.txt", "r") as f :
    f = list(f)
    originalData = f
    i = 0
    while i < len(f): # Iterate through every line
        try:
            n = int(f[i]) # See if line can be cast to an integer
            originalData[i] = n # Change string to int in original
            # Start a new section and pull in the next n lines.
            formattedData.append([])
            for j in range(n):
                i += 1
                item = f[i].replace('\n', '')
                originalData[i] = item # Remove newline char in original
                formattedData[-1].append(item)
        except ValueError:
            # Line where a section count was expected was not an integer.
            print("File has incorrect format")
        i += 1

print(originalData)
print(formattedData)
The following code will produce a list results which is equal to [Data1, Data2].
The code assumes that the number of entries specified is exactly the amount that there is. That means that for a file like this, it will not work.
2
New York,5
Boston,10
Seattle,30
The code:
# get the data from the text file
with open('filename.txt', 'r') as fh:
    lines = fh.read().splitlines()

# Walk the lines: each count line says how many following lines belong to
# the current section; jump straight past each section.
results = []
index = 0
while index < len(lines):
    section_len = int(lines[index])
    start = index + 1
    end = start + section_len
    # Everything from start up to (excluding) end is one section
    results.append(lines[start:end])
    index = end

Unable to pull data from a file and place into two arrays

The code uses the matrix and arrpow functions to calculate the fibonacci numbers for the elements in my list, num. Oddly, right after a.append(float(row[0])) is completed, the error I get is
IndexError: list index out of range
Which is obviously coming from b.append.
Here's the file I want to pull from
import time
import math
import csv
import matplotlib.pyplot as plt
def arrpow(arr, n):
    """Return the 2x2 matrix *arr* raised to the power *n* (n >= 1) using
    recursive exponentiation-by-squaring (O(log n) matrix multiplications).

    :param arr: 2x2 matrix as a nested list
    :param n: positive integer exponent
    :raises ValueError: if n < 1 — the original had ``if n<1: pass``, a
        no-op that let n <= 0 recurse forever (RecursionError)
    """
    if n < 1:
        raise ValueError("n must be >= 1")
    if n == 1:
        return arr
    # Square the half power...
    half = arrpow(arr, n // 2)
    yarr = [[half[0][0]*half[0][0] + half[0][1]*half[1][0],
             half[0][0]*half[0][1] + half[0][1]*half[1][1]],
            [half[1][0]*half[0][0] + half[1][1]*half[1][0],
             half[1][0]*half[0][1] + half[1][1]*half[1][1]]]
    # ...and multiply in one extra factor of arr when n is odd.
    if n % 2:
        yarr = [[yarr[0][0]*arr[0][0] + yarr[0][1]*arr[1][0],
                 yarr[0][0]*arr[0][1] + yarr[0][1]*arr[1][1]],
                [yarr[1][0]*arr[0][0] + yarr[1][1]*arr[1][0],
                 yarr[1][0]*arr[0][1] + yarr[1][1]*arr[1][1]]]
    return yarr
def matrix(n):
    """Return the n-th Fibonacci number via the 2x2 matrix-power method."""
    base = [[1, 1], [1, 0]]
    return arrpow(base, n - 1)[0][0]
num = [10,100,1000,10000,100000,1000000]

# Time matrix(i) for each problem size and log (log10 size, log10 seconds)
# to matrix.dat.
with open('matrix.dat', 'w') as h:
    for i in num:
        start_time = 0
        start_time = time.time()
        run = matrix(i)
        h.write(str(math.log10(i)))
        h.write('\n')
        # NOTE(review): the two values are separated by a newline, not a
        # comma — so the csv reader below sees only one field per row.
        h.write((str(math.log10(time.time()-start_time))))
        h.write('\n')
a = []
b = []

# Read the timings back for plotting. NOTE(review): matrix.dat was written
# with newlines between the two values (see above), so each csv row has a
# single field and ``row[1]`` raises IndexError — the bug discussed below.
with open('matrix.dat','r+') as csvfile:
    plots = csv.reader(csvfile, delimiter=',')
    for row in plots:
        a.append(float(row[0]))
        b.append(float(row[1]))
plt.plot(a,b,label = " ")
row = ['1.0']
So row is a list with 1 value. row[1] is trying to access the second index of a list with 1 value. That is why you are getting an error.
When you are constructing matrix.dat, you do not add a comma for the CSV reader to separate the data. So when it tries to read the file, the whole thing is converted into a 1-element array. Attempting to access the second element throws an error because it doesn't exist.
Solution: Replace \n on line 34 with a comma (,).

Categories

Resources