Read file from line 2 or skip header row - python

How can I skip the header row and start reading a file from line2?

with open(fname) as f:
next(f)
for line in f:
#do something

f = open(fname,'r')
lines = f.readlines()[1:]
f.close()

If you want the first line and then you want to perform some operation on file this code will helpful.
with open(filename , 'r') as f:
first_line = f.readline()
for line in f:
# Perform some operations

If slicing could work on iterators...
from itertools import islice
with open(fname) as f:
for line in islice(f, 1, None):
pass

f = open(fname).readlines()
firstLine = f.pop(0) #removes the first line
for line in f:
...

To generalize the task of reading multiple header lines and to improve readability I'd use method extraction. Suppose you wanted to tokenize the first three lines of coordinates.txt to use as header information.
Example
coordinates.txt
---------------
Name,Longitude,Latitude,Elevation, Comments
String, Decimal Deg., Decimal Deg., Meters, String
Euler's Town,7.58857,47.559537,0, "Blah"
Faneuil Hall,-71.054773,42.360217,0
Yellowstone National Park,-110.588455,44.427963,0
Then method extraction allows you to specify what you want to do with the header information (in this example we simply tokenize the header lines based on the comma and return it as a list but there's room to do much more).
def __readheader(filehandle, numberheaderlines=1):
"""Reads the specified number of lines and returns the comma-delimited
strings on each line as a list"""
for _ in range(numberheaderlines):
yield map(str.strip, filehandle.readline().strip().split(','))
with open('coordinates.txt', 'r') as rh:
# Single header line
#print next(__readheader(rh))
# Multiple header lines
for headerline in __readheader(rh, numberheaderlines=2):
print headerline # Or do other stuff with headerline tokens
Output
['Name', 'Longitude', 'Latitude', 'Elevation', 'Comments']
['String', 'Decimal Deg.', 'Decimal Deg.', 'Meters', 'String']
If coordinates.txt contains another headerline, simply change numberheaderlines. Best of all, it's clear what __readheader(rh, numberheaderlines=2) is doing and we avoid the ambiguity of having to figure out or comment on why author of the the accepted answer uses next() in his code.

If you want to read multiple CSV files starting from line 2, this works like a charm
for files in csv_file_list:
with open(files, 'r') as r:
next(r) #skip headers
rr = csv.reader(r)
for row in rr:
#do something
(this is part of Parfait's answer to a different question)

# Open a connection to the file
with open('world_dev_ind.csv') as file:
# Skip the column names
file.readline()
# Initialize an empty dictionary: counts_dict
counts_dict = {}
# Process only the first 1000 rows
for j in range(0, 1000):
# Split the current line into a list: line
line = file.readline().split(',')
# Get the value for the first column: first_col
first_col = line[0]
# If the column value is in the dict, increment its value
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
# Else, add to the dict and set value to 1
else:
counts_dict[first_col] = 1
# Print the resulting dictionary
print(counts_dict)

Related

How to convert this text file to csv?

I try analyze text file with data - columns, and records.
My file:
Name Surname Age Sex Grade
Chris M. 14 M 4
Adam A. 17 M
Jack O. M 8
The text file has some empty data. As above.
User want to show Name and Grade:
import csv
with open('launchlog.txt', 'r') as in_file:
stripped = (line.strip() for line in in_file)
lines = (line.split() for line in stripped if line)
with open('log.txt', 'w') as out_file:
writer = csv.writer(out_file)
writer.writerow(('Name', 'Surname', 'Age', 'Sex', 'Grade'))
writer.writerows(lines)
log.txt :
Chris,M.,14,M,4
Adam,A.,17,M
Jack,O.,M,8
How to empty data insert a "None" string?
For example:
Chris,M.,14,M,4
Adam,A.,17,M,None
Jack,O.,None,M,8
What would be the best way to do this in Python?
Use pandas:
import pandas
data=pandas.read_fwf("file.txt")
To get your dictionnary:
data.set_index("Name")["Grade"].to_dict()
Here's something in Pure Pythonâ„¢ that seems to do what you want, at least on the sample data file in your question.
In a nutshell what it does is first determine where each of the field names in column header line start and end, and then for each of the remaining lines of the file, does the same thing getting a second list which is used to determine what column each data item in the line is underneath (which it then puts in its proper position in the row that will be written to the output file).
import csv
def find_words(line):
""" Return a list of (start, stop) tuples with the indices of the
first and last characters of each "word" in the given string.
Any sequence of consecutive non-space characters is considered
as comprising a word.
"""
line_len = len(line)
indices = []
i = 0
while i < line_len:
start, count = i, 0
while line[i] != ' ':
count += 1
i += 1
if i >= line_len:
break
indices.append((start, start+count-1))
while i < line_len and line[i] == ' ': # advance to start of next word
i += 1
return indices
# convert text file with missing fields to csv
with open('name_grades.txt', 'rt') as in_file, open('log.csv', 'wt', newline='') as out_file:
writer = csv.writer(out_file)
header = next(in_file) # read first line
fields = header.split()
writer.writerow(fields)
# determine the indices of where each field starts and stops based on header line
field_positions = find_words(header)
for line in in_file:
line = line.rstrip('\r\n') # remove trailing newline
row = ['None' for _ in range(len(fields))]
value_positions = find_words(line)
for (vstart, vstop) in value_positions:
# determine what field the value is underneath
for i, (hstart, hstop) in enumerate(field_positions):
if vstart <= hstop and hstart <= vstop: # overlap?
row[i] = line[vstart:vstop+1]
break # stop looking
writer.writerow(row)
Here's the contents of the log.csv file it created:
Name,Surname,Age,Sex,Grade
Chris,C.,14,M,4
Adam,A.,17,M,None
Jack,O.,None,M,8
I would use baloo's answer over mine -- but if you just want to get a feel for where your code went wrong, the solution below mostly works (there is a formatting issue with the Grade field, but I'm sure you can get through that.) Add some print statements to your code and to mine and you should be able to pick up the differences.
import csv
<Old Code removed in favor of new code below>
EDIT: I see your difficulty now. Please try the below code; I'm out of time today so you will have to fill in the writer parts where the print statement is, but this will fulfill your request to replace empty fields with None.
import csv
with open('Test.txt', 'r') as in_file:
with open('log.csv', 'w') as out_file:
writer = csv.writer(out_file)
lines = [line for line in in_file]
name_and_grade = dict()
for line in lines[1:]:
parts = line[0:10], line[11:19], line[20:24], line[25:31], line[32:]
new_line = list()
for part in parts:
val = part.replace('/n','')
val = val.strip()
val = val if val != '' else 'None'
new_line.append(val)
print(new_line)
Without using pandas:
Edited based on your comment, I hard coded this solution based on your data. This will not work for the rows doesn't have Surname column.
I'm writing out Name and Grade since you only need those two columns.
o = open("out.txt", 'w')
with open("inFIle.txt") as f:
for lines in f:
lines = lines.strip("\n").split(",")
try:
grade = int(lines[-1])
if (lines[-2][-1]) != '.':
o.write(lines[0]+","+ str(grade)+"\n")
except ValueError:
print(lines)
o.close()

python array indexing through function

I'm writing a function that will read a file given the number of header lines to skip and the number of footer lines to skip.
def LoadText(file, HeaderLinesToSkip, FooterLinesToSkip):
fin = open(file)
text = []
for line in fin.readlines()[HeaderLinesToSkip, -FooterLinesToSkip]
text.append(line.strip())
return text
My problem is that this function will work properly only of FooterLinesToSkip is at least equal to 1. If FooterLinesToSkip = 0, then the function will return []. I can solve this problem with an if statement, but is there a much simpler form?
Edit : I actually simplified my problem; the lines read from the file contains columns separated by a semi-column. The real function includes .split(delimiter_character) and should store only column 1.
def LoadText(file, HeaderLinesToSkip, FooterLinesToSkip):
fin = open(file)
text = []
for line in fin.readlines()[HeaderLinesToSkip, -FooterLinesToSkip]
text.append(line.strip().split(';')[1])
return text
Set FooterLinesToSkip to None instead, so the slice defaults to the list length:
def LoadText(file, HeaderLinesToSkip, FooterLinesToSkip):
with open(file) as fin:
FooterLinesToSkip = -FooterLinesToSkip if FooterLinesToSkip else None
text = []
for line in fin.readlines()[HeaderLinesToSkip:FooterLinesToSkip]):
text.append(line.strip().split(';')[1])
Let me offer you an improvement, which does not require you to read the whole list into memory:
from collections import deque
from itertools import islice
def skip_headers_and_footers(fh, header_skip, footer_skip):
buffer = deque(islice(fh, header_skip, header_skip + footer_skip), footer_skip)
for line in fh:
yield buffer.popleft()
buffer.append(line)
This reads lines one by one, after skipping header_skip lines, and keeping footer_skip lines in a buffer. By the time we looped over all lines in the file, footer_skip lines remain in the buffer and are ignored.
This is a generator function, so it'll yield lines in a loop:
with open(filename) as open_file:
for line in skip_headers_and_footers(open_file, 2, 2):
# do something with this line.
line = line.strip()
I moved the file opening out of the function so that it can be used for other iterables too, not just files.
Now you can use the csv module to handle the column splitting and stripping:
import csv
with open(filename, 'rb') as open_file:
reader = csv.reader(open_file, delimiter=';')
for row in skip_headers_and_footers(reader, 2, 2):
column = row[1]
and the skip_headers_and_footers() generator has skipped the first two rows for you and will never yield the last two rows either.

Making a dictionary from file, first word is key in each line then other four numbers are to be a tuple value

This dictionary is supposed to take the three letter country code of a country, i.e, GRE for great britain, and then take the four consecutive numbers after it as a tuple. it should be something like this:
{GRE:(204,203,112,116)} and continue doing that for every single country in the list. The txt file goes down like so:
Country,Games,Gold,Silver,Bronze
AFG,13,0,0,2
ALG,15,5,2,8
ARG,40,18,24,28
ARM,10,1,2,9
ANZ,2,3,4,5 etc.;
This isn't actually code i just wanted to show it is formatted.
I need my program to skip the first line because it's a header. Here's what my code looks like thus far:
def medals(goldMedals):
infile = open(goldMedals, 'r')
medalDict = {}
for line in infile:
if infile[line] != 0:
key = line[0:3]
value = line[3:].split(',')
medalDict[key] = value
print(medalDict)
infile.close()
return medalDict
medals('GoldMedals.txt')
Your for loop should be like:
next(infile) # Skip the first line
for line in infile:
words = line.split(',')
medalDict[words[0]] = tuple(map(int, words[1:]))
A variation on a theme, I'd convert all the remaining cols to ints, and I'd use a namedtuple:
from collections import namedtuple
with open('file.txt') as fin:
# The first line names the columns
lines = iter(fin)
columns = lines.next().strip().split(',')
row = namedtuple('Row', columns[1:])
results = {}
for line in lines:
columns = line.strip().split(',')
results[columns[0]] = row(*(int(c) for c in columns[1:]))
# Results is now a dict to named tuples
This has the nice feature of 1) skipping the first line and 2) providing both offset and named access to the rows:
# These both work to return the 'Games' column
results['ALG'].Games
results['ALG'][0]
with open('path/to/file') as infile:
answer = {}
for line in infile:
k,v = line.strip().split(',',1)
answer[k] = tuple(int(i) for i in v.split(','))
I think inspectorG4dget's answer is the most readable... but for those playing code golf:
with open('medals.txt', 'r') as infile:
headers = infile.readline()
dict([(i[0], tuple(i[1:])) for i in [list(line.strip().split(',')) for line in infile]])

Replace character in line inside a file

I have these different lines with values in a text file
sample1:1
sample2:1
sample3:0
sample4:15
sample5:500
and I want the number after the ":" to be updated sometimes
I know I can split the name by ":" and get a list with 2 values.
f = open("test.txt","r")
lines = f.readlines()
lineSplit = lines[0].split(":",1)
lineSplit[1] #this is the value I want to change
im not quite sure how to update the lineSplit[1] value with the write functions
You can use the fileinput module, if you're trying to modify the same file:
>>> strs = "sample4:15"
Take the advantage of sequence unpacking to store the results in variables after splitting.
>>> sample, value = strs.split(':')
>>> sample
'sample4'
>>> value
'15'
Code:
import fileinput
for line in fileinput.input(filename, inplace = True):
sample, value = line.split(':')
value = int(value) #convert value to int for calculation purpose
if some_condition:
# do some calculations on sample and value
# modify sample, value if required
#now the write the data(either modified or still the old one) to back to file
print "{}:{}".format(sample, value)
Strings are immutable, meaning, you can't assign new values inside them by index.
But you can split up the whole file into a list of lines, and change individual lines (strings) entirely. This is what you're doing in lineSplit[1] = A_NEW_INTEGER
with open(filename, 'r') as f:
lines = f.read().splitlines()
for i, line in enumerate(lines):
if condition:
lineSplit = line.split(':')
lineSplit[1] = new_integer
lines[i] = ':'.join(lineSplit)
with open(filename, 'w') as f:
f.write('\n'.join(lines)
Maybe something as such (assuming that each first element before : is indeed a key):
from collections import OrderedDict
with open('fin') as fin:
samples = OrderedDict(line.split(':', 1) for line in fin)
samples['sample3'] = 'something else'
with open('output') as fout:
lines = (':'.join(el) + '\n' for el in samples.iteritems())
fout.writelines(lines)
Another option is to use csv module (: is a column delimiter in your case).
Assuming there is a test.txt file with the following content:
sample1:1
sample2:1
sample3:0
sample4:15
sample5:500
And you need to increment each value. Here's how you can do it:
import csv
# read the file
with open('test.txt', 'r') as f:
reader = csv.reader(f, delimiter=":")
lines = [line for line in reader]
# write the file
with open('test.txt', 'w') as f:
writer = csv.writer(f, delimiter=":")
for line in lines:
# edit the data here
# e.g. increment each value
line[1] = int(line[1]) + 1
writer.writerows(lines)
The contents of test.txt now is:
sample1:2
sample2:2
sample3:1
sample4:16
sample5:501
But, anyway, fileinput sounds more logical to use in your case (editing the same file).
Hope that helps.

Create an array from data in a table using Python

I want to get data from a table in a text file into a python array. The text file that I am using as an input has 7 columns and 31 rows. Here is an example of the first two rows:
10672 34.332875 5.360831 0.00004035881220 0.00000515052523 4.52E-07 6.5E-07
12709 40.837833 19.429158 0.00012010938453 -0.00000506426720 7.76E-06 2.9E-07
The code that I have tried to write isn't working as it is not reading one line at a time when it goes through the for loop.
data = []
f = open('hyadeserr.txt', 'r')
while True:
eof = "no"
array = []
for i in range(7):
line = f.readline()
word = line.split()
if len(word) == 0:
eof = "yes"
else:
array.append(float(word[0]))
print array
if eof == "yes": break
data.append(array)
Any help would be greatly appreciated.
A file with space-separated values is just a dialect of the classic comma-separated values (CSV) file where the delimiter is a space (), followed by more spaces, which can be ignored.
Happily, Python comes with a csv.reader class that understands dialects.
You should use this:
Example:
#!/usr/bin/env python
import csv
csv.register_dialect('ssv', delimiter=' ', skipinitialspace=True)
data = []
with open('hyadeserr.txt', 'r') as f:
reader = csv.reader(f, 'ssv')
for row in reader:
floats = [float(column) for column in row]
data.append(floats)
print data
If you don't want to use cvs here, since you don't really need it:
data = []
with open("hyadeserr.txt") as file:
for line in file:
data.append([float(f) for f in line.strip().split()])
Or, if you know for sure that the only extra chars are spaces and line ending \n, you can turn the last line into:
data.append([float(f) for f in line[:-1].split()])

Categories

Resources