Create an array from data in a table using Python

I want to get data from a table in a text file into a python array. The text file that I am using as an input has 7 columns and 31 rows. Here is an example of the first two rows:
10672 34.332875 5.360831 0.00004035881220 0.00000515052523 4.52E-07 6.5E-07
12709 40.837833 19.429158 0.00012010938453 -0.00000506426720 7.76E-06 2.9E-07
The code that I have tried to write isn't working, as it is not reading one line at a time when it goes through the for loop.
data = []
f = open('hyadeserr.txt', 'r')
while True:
    eof = "no"
    array = []
    for i in range(7):
        line = f.readline()
        word = line.split()
        if len(word) == 0:
            eof = "yes"
        else:
            array.append(float(word[0]))
    print array
    if eof == "yes": break
    data.append(array)
Any help would be greatly appreciated.

A file with space-separated values is just a dialect of the classic comma-separated values (CSV) file where the delimiter is a space (' '), followed by more spaces, which can be ignored.
Happily, Python comes with a csv.reader class that understands dialects.
You should use this:
Example:
#!/usr/bin/env python

import csv

csv.register_dialect('ssv', delimiter=' ', skipinitialspace=True)

data = []
with open('hyadeserr.txt', 'r') as f:
    reader = csv.reader(f, 'ssv')
    for row in reader:
        floats = [float(column) for column in row]
        data.append(floats)

print data

If you don't want to use csv here, since you don't really need it:
data = []
with open("hyadeserr.txt") as file:
    for line in file:
        data.append([float(f) for f in line.strip().split()])
Or, if you know for sure that the only extra chars are spaces and line ending \n, you can turn the last line into:
        data.append([float(f) for f in line[:-1].split()])

Related

How to convert this text file to csv?

I am trying to analyze a text file containing tabular data: columns and records.
My file:
Name Surname Age Sex Grade
Chris M. 14 M 4
Adam A. 17 M
Jack O. M 8
The text file has some empty fields, as shown above.
The user wants to show Name and Grade:
import csv

with open('launchlog.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split() for line in stripped if line)
    with open('log.txt', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('Name', 'Surname', 'Age', 'Sex', 'Grade'))
        writer.writerows(lines)
log.txt :
Chris,M.,14,M,4
Adam,A.,17,M
Jack,O.,M,8
How can I insert a "None" string for the empty data?
For example:
Chris,M.,14,M,4
Adam,A.,17,M,None
Jack,O.,None,M,8
What would be the best way to do this in Python?
Use pandas:
import pandas

data = pandas.read_fwf("file.txt")
To get your dictionary:
data.set_index("Name")["Grade"].to_dict()
Here's something in Pure Python™ that seems to do what you want, at least on the sample data file in your question.
In a nutshell, it first determines where each of the field names in the column-header line starts and ends. Then, for each remaining line of the file, it computes the same positions to determine which column each data item falls under, and puts the item in its proper position in the row that will be written to the output file.
import csv

def find_words(line):
    """ Return a list of (start, stop) tuples with the indices of the
        first and last characters of each "word" in the given string.
        Any sequence of consecutive non-space characters is considered
        as comprising a word.
    """
    line_len = len(line)
    indices = []
    i = 0
    while i < line_len:
        start, count = i, 0
        while line[i] != ' ':
            count += 1
            i += 1
            if i >= line_len:
                break
        indices.append((start, start+count-1))
        while i < line_len and line[i] == ' ':  # advance to start of next word
            i += 1
    return indices

# convert text file with missing fields to csv
with open('name_grades.txt', 'rt') as in_file, open('log.csv', 'wt', newline='') as out_file:
    writer = csv.writer(out_file)
    header = next(in_file)  # read first line
    fields = header.split()
    writer.writerow(fields)
    # determine the indices of where each field starts and stops based on header line
    field_positions = find_words(header)
    for line in in_file:
        line = line.rstrip('\r\n')  # remove trailing newline
        row = ['None' for _ in range(len(fields))]
        value_positions = find_words(line)
        for (vstart, vstop) in value_positions:
            # determine what field the value is underneath
            for i, (hstart, hstop) in enumerate(field_positions):
                if vstart <= hstop and hstart <= vstop:  # overlap?
                    row[i] = line[vstart:vstop+1]
                    break  # stop looking
        writer.writerow(row)
Here's the contents of the log.csv file it created:
Name,Surname,Age,Sex,Grade
Chris,M.,14,M,4
Adam,A.,17,M,None
Jack,O.,None,M,8
I would use baloo's answer over mine -- but if you just want to get a feel for where your code went wrong, the solution below mostly works (there is a formatting issue with the Grade field, but I'm sure you can get through that.) Add some print statements to your code and to mine and you should be able to pick up the differences.
import csv
<Old Code removed in favor of new code below>
EDIT: I see your difficulty now. Please try the below code; I'm out of time today so you will have to fill in the writer parts where the print statement is, but this will fulfill your request to replace empty fields with None.
import csv

with open('Test.txt', 'r') as in_file:
    with open('log.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        lines = [line for line in in_file]
        name_and_grade = dict()
        for line in lines[1:]:
            parts = line[0:10], line[11:19], line[20:24], line[25:31], line[32:]
            new_line = list()
            for part in parts:
                val = part.replace('\n', '')  # was '/n', which never matches anything
                val = val.strip()
                val = val if val != '' else 'None'
                new_line.append(val)
            print(new_line)
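The writer step is left as an exercise in the original. One plausible completion (an assumption on my part, not the author's code) is to replace the print(new_line) call with:
            writer.writerow(new_line)  # write the cleaned row to log.csv instead of printing it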
Without using pandas:
Edited based on your comment: I hard-coded this solution based on your data, so it will not work for rows that don't have a Surname column.
I'm writing out only Name and Grade, since you need just those two columns.
o = open("out.txt", 'w')
with open("inFIle.txt") as f:
    for lines in f:
        lines = lines.strip("\n").split(",")
        try:
            grade = int(lines[-1])
            if (lines[-2][-1]) != '.':
                o.write(lines[0] + "," + str(grade) + "\n")
        except ValueError:
            print(lines)
o.close()

coordinates (str) to list

I have a very long txt file containing geographic coordinates..the format for each row looks like this:
501418.209 5314160.484 512.216
501418.215 5314160.471 512.186
501418.188 5314160.513 512.216
so separated by a blank (" ") and at the end a line break (\n)
I need to import that file into a list... so far I have only managed to import it as a string and then tried to convert it into a list. Unfortunately, I have no idea how I can keep the formatting of the txt file, as I need to perform calculations on each row.
My solution so far to import the txt file to a string variable:
fileobj = file(source, 'r')
data = ""
for line in fileobj.readlines():
    linevals = line.strip().split(" ")
    data += "%s %s %s\n" % (linevals[0], linevals[1], linevals[2])
print type(data)
And my solution for importing as list that didn't work:
fileobj = file(source, 'r')
data = []
for line in fileobj.readlines():
    linevals = line.strip().split(" ")
    data.append(linevals)
On Stack Overflow I found lots of solutions that suggested the eval function, but that didn't work, as I need the whole row as one list element. I hope that was clear. Any solutions for this problem? I'm pretty new to Python, but this has been bothering me for quite some time now. Thank you!
You don't need eval or anything other than simply splitting each row and casting to float:
with open(source) as f:
    for row in f:
        print(map(float, row.split()))
[501418.209, 5314160.484, 512.216]
[501418.215, 5314160.471, 512.186]
[501418.188, 5314160.513, 512.216]
If you want all rows in a single list:
with open(source) as f:
    data = [map(float, row.split()) for row in f]  # python3 -> list(map(float, row.split()))
print(data)
[[501418.209, 5314160.484, 512.216], [501418.215, 5314160.471, 512.186], [501418.188, 5314160.513, 512.216]]
Or using the csv module:
import csv

with open(source) as f:
    data = [map(float, row) for row in csv.reader(f, delimiter=" ")]
print(data)
If you want a flat list of all data:
with open(source) as f:
    data = []
    for row in f:
        data.extend(map(float, row.split()))
If you are doing a lot of work on the data you may find numpy useful:
import numpy as np

data = np.genfromtxt(source, delimiter=" ").flatten()
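If you want to keep the row structure rather than a flat list, simply drop the flatten() call; genfromtxt then returns a 2-D array with one row per coordinate triple:
import numpy as np

data = np.genfromtxt(source, delimiter=" ")  # shape (n_rows, 3)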

Pythonic way to extract first line of a tabular file to a list

This is a function that takes the tabular file as input and return the first row as list:
def firstline_to_list(fvar):
    """
    Input tab separated file.
    Output first row as list.
    """
    import csv
    lineno = 0
    with open(fvar, 'r') as tsvfile:
        tabreader = csv.reader(tsvfile, delimiter='\t')
        for row in tabreader:
            lineno += 1
            if lineno == 1:
                return row
                break
Is there a better way to do it than this clunky code of mine?
Just replace your for loop with a single call of next on the iterator tabreader. In Python 2.7 this can be tabreader.next(); in Python 3 it is next(tabreader) (the next() built-in also works in Python 2.6+). You might also want to wrap the call in a try/except block for the StopIteration exception, in case the file is empty.
So putting everything together, here's version that's compatible with python 2 and 3:
def firstline_to_list(fvar):
    """
    Input tab separated file.
    Output first row as list.
    """
    import csv, sys
    with open(fvar, 'r') as tsvfile:
        tabreader = csv.reader(tsvfile, delimiter='\t')
        try:
            if sys.version > '3':
                result = next(tabreader)
            else:
                result = tabreader.next()
        except StopIteration:
            result = None
    return result
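Usage is then a single call (data.tsv here is just a hypothetical file name):
header = firstline_to_list('data.tsv')  # first row as a list, or None if the file is empty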
The absolute minimum modification to your code would be this:
def firstline_to_list(fvar):
    """
    Input tab separated files.
    Output first row as list.
    """
    import csv
    with open(fvar, 'r') as tsvfile:
        tabreader = csv.reader(tsvfile, delimiter='\t')
        for row in tabreader:
            return row
A better way would be to use Reader.next() as documented here: https://docs.python.org/2/library/csv.html
def firstline_to_list(fvar):
    """
    Input tab separated files.
    Output first row as list.
    """
    import csv
    with open(fvar, 'r') as tsvfile:
        tabreader = csv.reader(tsvfile, delimiter='\t')
        return tabreader.next()
How about:
import pandas

def firstline_to_list(fvar):
    return list(pandas.read_csv(fvar, sep='\t', nrows=1))

Python - get number of columns from csv file

I'm trying to determine the number of columns that are present in a CSV file in Python v2.6. This has to work in general: for any input file I pass, I should be able to obtain the number of columns.
Sample input file: love hurt hit
Other input files: car speed beforeTune afterTune repair
So far, what I have tried to do is read the file (which has lots of rows), get the first row, and then count the number of words in it. The delimiter is ,. I ran into a problem when I tried to split the headings based on the sample input: len(headings) gives me 14, which is wrong, as it should give me 3. Any ideas? I am a beginner.
with open(filename1, 'r') as f1:
    csvlines = csv.reader(f1, delimiter=',')
    for lineNum, line in enumerate(csvlines):
        if lineNum == 0:
            #colCount = getColCount(line)
            headings = ','.join(line)  # gives me `love, hurt, hit`
            print len(headings)  # gives me 14; I need 3
        else:
            a.append(line[0])
            b.append(line[1])
            c.append(line[2])
len("love, hurt, hit") is 14 because it's a string.
The len you want is of line, which is a list:
print len(line)
This outputs the number of columns, rather than the number of characters
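Putting it together, here's a minimal sketch (mine, not from the question) that reads only the first row and counts its fields:
import csv

with open(filename1, 'r') as f1:
    first_row = next(csv.reader(f1, delimiter=','))
print len(first_row)  # 3 for a header row of love,hurt,hit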
# old school
import csv

c = 0
field = {}
with open('csvmsdos.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        field[c] = row
        print(field[c])
        c = c + 1

column_count = len(field[0])  # length of the first row = number of columns
row_count = len(field)        # number of rows read
# no explicit csvFile.close() needed: the with block already closed the file
A simple solution:
with open(filename1) as file:
    # for each row in a given file
    for row in file:
        # split that row into list elements
        # using comma (",") as a separator,
        # count the elements and print
        print(len(row.split(",")))
        # break out of the loop after
        # first iteration
        break

Read file from line 2 or skip header row

How can I skip the header row and start reading a file from line2?
with open(fname) as f:
    next(f)
    for line in f:
        #do something
f = open(fname,'r')
lines = f.readlines()[1:]
f.close()
If you want the first line and then want to perform some operation on the file, this code will be helpful.
with open(filename, 'r') as f:
    first_line = f.readline()
    for line in f:
        # Perform some operations
If slicing could work on iterators...
from itertools import islice

with open(fname) as f:
    for line in islice(f, 1, None):
        pass
f = open(fname).readlines()
firstLine = f.pop(0) #removes the first line
for line in f:
    ...
To generalize the task of reading multiple header lines and to improve readability I'd use method extraction. Suppose you wanted to tokenize the first three lines of coordinates.txt to use as header information.
Example
coordinates.txt
---------------
Name,Longitude,Latitude,Elevation, Comments
String, Decimal Deg., Decimal Deg., Meters, String
Euler's Town,7.58857,47.559537,0, "Blah"
Faneuil Hall,-71.054773,42.360217,0
Yellowstone National Park,-110.588455,44.427963,0
Then method extraction allows you to specify what you want to do with the header information (in this example we simply tokenize the header lines based on the comma and return it as a list but there's room to do much more).
def __readheader(filehandle, numberheaderlines=1):
    """Reads the specified number of lines and returns the comma-delimited
    strings on each line as a list"""
    for _ in range(numberheaderlines):
        yield map(str.strip, filehandle.readline().strip().split(','))

with open('coordinates.txt', 'r') as rh:
    # Single header line
    #print next(__readheader(rh))
    # Multiple header lines
    for headerline in __readheader(rh, numberheaderlines=2):
        print headerline  # Or do other stuff with headerline tokens
Output
['Name', 'Longitude', 'Latitude', 'Elevation', 'Comments']
['String', 'Decimal Deg.', 'Decimal Deg.', 'Meters', 'String']
If coordinates.txt contains another header line, simply change numberheaderlines. Best of all, it's clear what __readheader(rh, numberheaderlines=2) is doing, and we avoid the ambiguity of having to figure out or comment on why the author of the accepted answer uses next() in his code.
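For instance, a sketch that consumes three header lines (assuming the file actually contains a third header line) would just pass a different count:
with open('coordinates.txt', 'r') as rh:
    for headerline in __readheader(rh, numberheaderlines=3):
        print headerline
    # every line still left in rh is a data record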
If you want to read multiple CSV files starting from line 2, this works like a charm
import csv

for files in csv_file_list:
    with open(files, 'r') as r:
        next(r)  # skip headers
        rr = csv.reader(r)
        for row in rr:
            #do something
(this is part of Parfait's answer to a different question)
# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Skip the column names
    file.readline()
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}
    # Process only the first 1000 rows
    for j in range(0, 1000):
        # Split the current line into a list: line
        line = file.readline().split(',')
        # Get the value for the first column: first_col
        first_col = line[0]
        # If the column value is in the dict, increment its value
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)
