Skipping an unknown number of lines to read the header (python pandas)

I have Excel data that I read in with python pandas:
import pandas as pd
data = pd.read_csv('..../file.txt', sep='\t' )
The mock data looks like this:
unwantedjunkline1
unwantedjunkline2
unwantedjunkline3
ID ColumnA ColumnB ColumnC
1 A B C
2 A B C
3 A B C
...
The data in this case contains 3 junk lines (lines I don't want to read in) before hitting the header, and sometimes it contains 4 or more such junk lines. So in this case I read in the data with:
data = pd.read_csv('..../file.txt', sep='\t', skiprows = 3 )
data looks like:
ID ColumnA ColumnB ColumnC
1 A B C
2 A B C
3 A B C
...
But the number of unwanted lines is different each time. Is there a way to read in a table file using pandas without 'skiprows=', using instead some option that matches the header so it knows to start reading from the header line? That way I wouldn't have to open each file, count how many unwanted lines it contains, and manually change the 'skiprows=' option.

If you know what the header startswith:
import os
import pandas as pd

def skip_to(fle, line, **kwargs):
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")
    with open(fle) as f:
        pos = 0
        cur_line = f.readline()
        # read until we hit the header line, remembering where each line started
        while not cur_line.startswith(line):
            pos = f.tell()
            cur_line = f.readline()
        # rewind to the start of the header line and let pandas read from there
        f.seek(pos)
        return pd.read_csv(f, **kwargs)
Demo:
In [18]: cat test.txt
1,2
3,4
The,header
foo,bar
foobar,foo
In [19]: df = skip_to("test.txt","The,header", sep=",")
In [20]: df
Out[20]:
The header
0 foo bar
1 foobar foo
By calling .tell we keep track of where the file pointer was for the previous line, so when we hit the header we seek back to that line and just pass the open file object to pandas.
Or, using the junk lines, if they all start with something in common:
def skip_to(fle, junk, **kwargs):
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")
    with open(fle) as f:
        pos = 0
        cur_line = f.readline()
        while cur_line.startswith(junk):
            pos = f.tell()
            cur_line = f.readline()
        f.seek(pos)
        return pd.read_csv(f, **kwargs)
df = skip_to("test.txt", "junk",sep="\t")

Another simple way to achieve a dynamic skiprows would be something like this, which worked for me:
# Open the file and read all lines into a list
with open('test.txt', encoding='utf-8') as readfile:
    ls_readfile = readfile.readlines()

# Find the skiprows number: the index of the first line that starts with 'ID'
skip = next(filter(lambda x: x[1].startswith('ID'), enumerate(ls_readfile)))[0]
print(skip)

# Import the file with the separator \t
df = pd.read_csv(r'test.txt', skiprows=skip, sep='\t')

Related

Python Pandas - Read csv with commented header line

I want to read and process a csv file with pandas. The file (as seen below) contains multiple header lines which are indicated by a # tag. I can import that file easily by using
import pandas as pd
file = "data.csv"
data = pd.read_csv(file, delimiter="\s+",
                   names=["Time", "Cd", "Cs", "Cl", "CmRoll", "CmPitch", "CmYaw",
                          "Cd(f)", "Cd(r)", "Cs(f)", "Cs(r)", "Cl(f)", "Cl(r)"],
                   skiprows=13)
However, I have a lot of such files with different header names and I don't want to type them (Time Cd Cs ...) out manually. Also, the number of commented lines differs between files. So I want to automate that task.
Do I have to use something like regular expression here, before passing the data into a pandas dataframe?
Thanks for any advice.
And yes, the header line also begins with a #.
data.csv:
# Force coefficients
# dragDir : (9.9735673312816520e-01 7.2660490528994301e-02 0.0000000000000000e+00)
# sideDir : (0.0000000000000000e+00 0.0000000000000000e+00 -1.0000000000000002e+00)
# liftDir : (-7.2660490528994315e-02 9.9735673312816520e-01 0.0000000000000000e+00)
# rollAxis : (9.9735673312816520e-01 7.2660490528994301e-02 0.0000000000000000e+00)
# pitchAxis : (0.0000000000000000e+00 0.0000000000000000e+00 -1.0000000000000002e+00)
# yawAxis : (-7.2660490528994315e-02 9.9735673312816520e-01 0.0000000000000000e+00)
# magUInf : 4.5000000000000000e+01
# lRef : 5.9399999999999997e-01
# Aref : 3.5639999999999999e-03
# CofR : (1.4999999999999999e-01 0.0000000000000000e+00 0.0000000000000000e+00)
#
# Time Cd Cs Cl CmRoll CmPitch CmYaw Cd(f) Cd(r) Cs(f) Cs(r) Cl(f) Cl(r)
5e-06 1.8990180226147195e+00 1.4919925634649792e-11 2.1950119509976829e+00 -1.1085971520784955e-02 -1.0863798447281650e+00 9.5910040927874810e-03 9.3842303978657482e-01 9.6059498282814471e-01 9.5910041002474442e-03 -9.5910040853275178e-03 1.1126130770676479e-02 2.1838858202270064e+00
1e-05 2.1428508927716594e+00 1.0045114197556737e-08 2.5051633252700962e+00 -1.2652317494411272e-02 -1.2367567798452046e+00 1.0822379290263353e-02 1.0587731288914184e+00 1.0840777638802410e+00 1.0822384312820453e-02 -1.0822374267706254e-02 1.5824882789843508e-02 2.4893384424802525e+00
...
What about extracting the header before you read the file?
We only assume that your header lines start with #. Extraction of the header as well as its position in the file is automated. We also ensure that no more lines than necessary are read (except the first data line).
with open(file) as f:
    line = f.readline()
    cnt = 0
    while line.startswith('#'):
        prev_line = line
        line = f.readline()
        cnt += 1
# print(prev_line)
header = prev_line.strip().lstrip('# ').split()

df = pd.read_csv(file, delimiter="\s+",
                 names=header,
                 skiprows=cnt)
With this, you can also process the other header lines. It also gives you the position of the header in the file.
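For example, if you also want the scalar metadata from the commented lines, a minimal sketch along the same lines could collect it into a dict (the '# name : value' layout of those lines is taken from the sample file above; treat that as an assumption):
# Sketch: gather the '# name : value' metadata lines into a dict.
# Assumes each metadata line looks like '# name : value' (values may be tuples in parentheses).
meta = {}
with open(file) as f:
    for raw in f:
        if not raw.startswith('#'):
            break  # metadata only appears in the commented block at the top
        stripped = raw.lstrip('#').strip()
        if ':' in stripped:
            key, _, value = stripped.partition(':')
            meta[key.strip()] = value.strip()

print(meta.get('magUInf'))  # e.g. '4.5000000000000000e+01'
print(meta.get('lRef'))     # e.g. '5.9399999999999997e-01'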
This should do it: it's easy and efficient, it keeps variables to a minimum, and it doesn't require any input aside from the filename.
with open(file, 'r') as f:
    for line in f:
        if line.startswith('#'):
            header = line
        else:
            break  # stop when there are no more '#' lines
header = header[1:].strip().split()
data = pd.read_csv(file, delimiter="\s+", comment='#', names=header)
You first open the file and read only the commented lines (this is fast and memory-efficient). The last commented line read is the header, which is then cleaned and converted to a list. Finally, you open the file with pandas.read_csv() using comment='#', which skips the commented lines, and pass names=header.
A little bit of regex could help. This is not the most beautiful of solutions so feel free to post a better solution.
Let's read the first 50 rows of the file to find the last occurrence of the hash, which should be the header line containing the column names.
^ asserts position at start of a line
# matches the character # literally (case sensitive)
Code:
import re
import pandas as pd

n_rows = 50
path_ = 'your_file_location'

with open(path_, 'r') as f:
    data = []
    for i, line in enumerate(f):
        if i >= n_rows:  # look at the first 50 rows only
            break
        if re.match('^#', line):
            data.append(line)

start_col = max(enumerate(data))[0]  # index of the last commented line
df = pd.read_csv(path_, sep='\s+', skiprows=start_col)  # use your actual delimiter.
# Time Cd Cs Cl CmRoll CmPitch \
0 0.000005 1.899018 1.491993e-11 2.195012 -0.011086 -1.086380 0.009591
1 0.000010 2.142851 1.004511e-08 2.505163 -0.012652 -1.236757 0.010822
CmYaw Cd(f) Cd(r) Cs(f) Cs(r) Cl(f) Cl(r)
0 0.938423 0.960595 0.009591 -0.009591 0.011126 2.183886 NaN
1 1.058773 1.084078 0.010822 -0.010822 0.015825 2.489338 NaN
Edit: handling the # in the column name.
We can do this in two steps: read the data and the header separately, and slice the stray '#' token out of the header.
First read in the file starting from the row after the header, with the header argument set to None so no header is inferred.
We can then set the column headers manually:
df = pd.read_csv(path_,sep='\s+',skiprows=start_col + 1, header=None)
df.columns = pd.read_csv(path_,sep='\s+',skiprows=start_col,nrows=0).columns[1:]
print(df)
Time Cd Cs Cl CmRoll CmPitch CmYaw \
0 0.000005 1.899018 1.491993e-11 2.195012 -0.011086 -1.086380 0.009591
1 0.000010 2.142851 1.004511e-08 2.505163 -0.012652 -1.236757 0.010822
Cd(f) Cd(r) Cs(f) Cs(r) Cl(f) Cl(r)
0 0.938423 0.960595 0.009591 -0.009591 0.011126 2.183886
1 1.058773 1.084078 0.010822 -0.010822 0.015825 2.489338
To simplify it, and to save time without using loops, you can create 2 dataframes: one for the #-commented rows and one for the rest.
From the commented rows take the last one - that's your header - and then merge it with the data dataframe using concat(). If it's necessary to assign the first row as the header, you can use df.columns = df.iloc[0], as sketched after the example below.
df = pd.DataFrame({
    'A': ['#test1 : (000000)', '#test1 (000000)', '#test1 (000000)',
          '#test1 (000000)', '#Time (000000)', '5e-06', '1e-05'],
})
print(df)
A
0 #test1 : (000000)
1 #test1 (000000)
2 #test1 (000000)
3 #test1 (000000)
4 #Time (000000)
5 5e-06
6 1e-05
df_header = df[df.A.str.contains('^#')]
print(df_header)
A
0 #test1 : (000000)
1 #test1 (000000)
2 #test1 (000000)
3 #test1 (000000)
4 #Time (000000)
df_data = df[~df.A.str.contains('^#')]
print(df_data)
A
5 5e-06
6 1e-05
df = pd.concat([df_header.iloc[[-1]], df_data]).reset_index(drop=True)
df.A = df.A.str.replace(r'^#', "", regex=True)
print(df)
A
0 Time (000000)
1 5e-06
2 1e-05
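If you then want to promote that first row to be the column header, as mentioned above, a minimal sketch continuing the toy example could be:
# Promote the first row (the former header line) to the column names,
# then drop it from the data and reset the index.
df.columns = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)
# df now has a single column named 'Time (000000)' with the rows '5e-06' and '1e-05'.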
Assuming that comments always start with a single '#' and the header is in the last commented line:
import csv
import pandas as pd

def read_comments(csv_file):
    for row in csv_file:
        if row[0] == '#':
            # strip the leading '#' and surrounding whitespace
            yield row.split('#')[1].strip()

def get_last_commented_line(filename):
    with open(filename, 'r', newline='') as f:
        decommented_lines = [line for line in csv.reader(read_comments(f))]
    # the last commented line is the header; its position is the number of rows to skip
    header = decommented_lines[-1]
    skiprows = len(decommented_lines)
    return header, skiprows

header, skiprows = get_last_commented_line(path)
pd.read_csv(path, names=header, skiprows=skiprows)
# Read the lines in the file
with open(file) as f:
    lines = f.readlines()
# The last commented line is the header
header = [line for line in lines if line.startswith('#')][-1]
# Strip the line and remove '#'
header = header[1:].strip().split()
df = pd.read_csv(file, delimiter="\s+", names=header, comment='#')

Remove blank cells from CSV file with python

I have a text file that I am converting to csv using python. The text file has columns that are set using several spaces. My code strips the line, converts 2 spaces in a row to commas, and then splits the lines again. When I do this, the columns don't line up because there are some columns that have more blank spaces than others. How can I add something to my code that will remove the blank cells in my csv file?
I have tried converting the csv file to a pandas dataframe, but when I run
import pandas as pd
df = pd.read_csv('old.Csv')
delim_whitespace=True
df.to_csv("New.Csv", index=False)
it returns an error ParserError: Error tokenizing data. C error: Expected 40 fields in line 10, saw 42
The code that is stripping the lines and splitting them is
import csv
txtfile = r"Old.txt"
csvfile = r"Old.Csv"
with open(txtfile, 'r') as infile, open(csvfile, 'w', newline='') as outfile:
    stripped = (line.strip() for line in infile)
    replace = (line.replace("  ", ",") for line in stripped if line)
    lines = (line.split(",") for line in replace if infile)
    writer = csv.writer(outfile)
    writer.writerows(lines)
One solution is to declare the column names beforehand, so as to force pandas to deal with rows that have a different number of columns. Something like this should work:
df = pd.read_csv('myfilepath', names = ['col1', 'col2', 'col3'])
You will have to adapt the separator and the column names / number of columns yourself.
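Alternatively, since the original text file is aligned with runs of spaces, a minimal sketch (assuming Old.txt is purely whitespace-separated, its first line holds the headers, and every row has the same number of fields) is to let pandas split on arbitrary whitespace instead of converting the spaces to commas first:
import pandas as pd

# Treat any run of whitespace as a single delimiter; this avoids the empty
# cells created by replacing a fixed number of spaces with commas.
df = pd.read_csv('Old.txt', sep=r'\s+')
df.to_csv('New.Csv', index=False)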
(Edited) The below code should work for your text file:
a b c d e
=============================
1 qwerty 3 4 5 6
2 ewer e r y i
3 asdfghjkutrehg c v b n
you can try:
import pandas as pd
df = pd.read_fwf('textfile.txt', delimiter=' ', header=0, skiprows=[1])
df.to_csv("New.csv", index=False)
print(df)
Unnamed: 0 a b c d e
0 1 qwerty 3 4 5 6
1 2 ewer e r y i
2 3 asdfghjkutrehg c v b n

How do I add a column header in the second row in a pandas dataframe?

I have a data frame from pandas and now I want to add column names, but only for the second row. Here is an example of my previous output:
Desired output:
My code:
data_line = open("file1.txt", mode="r")
lines = []
for line in data_line:
    lines.append(line)
for i, line in enumerate(lines):
    # print('{}={}'.format(i+1, line.strip()))
    file1_header = lines[0]
num_line = 1
Dictionary_File1 = {}
Value_File1 = data_type[0:6]
Value_File1_short = []
i = 1
for element in Value_File1:
    type = element.split(',')
    Value_File1_short.append(type[0] + ", " + type[1] + ", " + type[4])
    i += 1
Dictionary_File1[file1_header] = Value_File1_short
pd_file1 = pd.DataFrame.from_dict(Dictionary_File1)
You should have a look at pandas.read_csv. The header keyword parameter allows you to indicate which line in the file to use for the header names.
You could probably do it with something like:
pd.read_csv("file1.txt", header=1)
From my python shell I tested it out with:
>>> from io import StringIO # I use python3
>>> import pandas as pd
>>> data = """Type Type2 Type3
... A B C
... 1 2 3
... red blue green"""
>>> # StringIO below allows us to use "data" as input to read_csv
>>> # "sep" keyword is used to indicate how columns are separated in data
>>> df = pd.read_csv(StringIO(data), header=1, sep='\s+')
>>> df
A B C
0 1 2 3
1 red blue green
You can write a row using the csv module before writing your dataframe to the same file. Notice this won't help when reading back into Pandas, which doesn't work with "duplicate headers". You can create MultiIndex columns (see the sketch after the example below), but this isn't necessary for your desired output.
import pandas as pd
import csv
from io import StringIO
# input file
x = """A,B,C
1,2,3
red,blue,green"""
# replace StringIO(x) with 'file.txt'
df = pd.read_csv(StringIO(x))
with open('file.txt', 'w', newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Type', 'Type2', 'Type3'])
    df.to_csv(fout, index=False)
# read file to check output is correct
df = pd.read_csv('file.txt')
print(df)
# Type Type2 Type3
# 0 A B C
# 1 1 2 3
# 2 red blue green
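If you do want the two header rows back as MultiIndex columns instead, here is a sketch of the option mentioned above, reusing the file.txt written in the example:
# Read both header rows into a MultiIndex: level 0 is Type/Type2/Type3, level 1 is A/B/C.
df_multi = pd.read_csv('file.txt', header=[0, 1])
print(df_multi.columns.tolist())
# [('Type', 'A'), ('Type2', 'B'), ('Type3', 'C')]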
So if I understand properly, you have a file "file1.txt" containing your data, and a list containing the types of your data.
You want to add the list of types to the pandas DataFrame of your data. Correct?
If so, you can read the data from the txt file into a pandas.df using pandas.read_csv(), and then define the columns headers using df.columns.
So it would look something like:
df = pd.read_csv("file1.txt", header=None)
df.columns = data_type[0:6]
I hope this helps!
Cheers

Find element in set from reading file line

I have a text file with the delimiter |: file1.txt
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
3|C|2017-12-21
And the following sets (<type 'set'>):
id_set = set(['1','2'])
date_set = set(['2017-12-19', '2017-12-20'])
I just want to find the records in the file whose elements match the sets and write those records from file1.txt to output.txt.
Expected output: output.txt should contain the following data:
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
You can try out this solution:
id_set = {'1','2'}
date_set = {'2017-12-19', '2017-12-20'}
# open files for reading and writing
with open('file1.txt') as in_file, open('output.txt', 'w') as out_file:
    # write the header line
    out_file.write(next(in_file))
    # go over the remaining lines in the file
    for line in in_file:
        # extract id and date
        id, _, date = line.rstrip().split('|')
        # keep lines that have an id or a date in the sets
        if id in id_set or date in date_set:
            out_file.write(line)
Which gives the following output.txt:
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
If you are happy to use a 3rd party library, you can use Pandas:
import pandas as pd
from io import StringIO
mystr = StringIO("""ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
3|C|2017-12-21""")
# replace mystr with 'file1.txt'
df = pd.read_csv(mystr, sep='|')
# criteria
id_set = {'1', '2'}
date_set = {'2017-12-19', '2017-12-20'}
# apply criteria
df2 = df[df['ID'].astype(str).isin(id_set) | df['Date'].isin(date_set)]
print(df2)
# ID Name Date
# 0 1 A 2017-12-19
# 1 2 B 2017-12-20
# export to csv
df2.to_csv('file1_out.txt', sep='|')

Remove row from CSV that contains empty cell using Python

I am splitting a CSV file based on a column with dates into separate files. However, some rows do contain a date but the other cells are empty. I want to remove those rows that contain empty cells from the CSV, but I'm not sure how to do this.
Here is my code:
import collections
import csv
import sys

csv.field_size_limit(sys.maxsize)
with open(main_file, "r") as fp:
    root = csv.reader(fp, delimiter='\t', quotechar='"')
    result = collections.defaultdict(list)
    next(root)
    for row in root:
        year = row[0].split("-")[0]
        result[year].append(row)

for i, j in result.items():
    row_count = sum(1 for row in j)
    print(row_count)
    file_path = "%s%s-%s.csv" % (src_path, i, row_count)
    with open(file_path, 'w') as fp:
        writer = csv.writer(fp, delimiter='\t', quotechar='"')
        writer.writerows(j)
Pandas is perfect for this, especially if you want it to be easily adjusted to, say, other file formats. Of course one could consider it overkill.
To just remove rows with empty cells:
>>> import pandas as pd
>>> data = pd.read_csv('example.csv', sep='\t')
>>> print data
A B C
0 1 2 5
1 NaN 1 9
2 3 4 4
>>> data.dropna()
A B C
0 1 2 5
2 3 4 4
>>> data.dropna().to_csv('example_clean.csv')
I leave performing the splitting and saving into separate files using pandas as an exercise to start learning this great package if you want :)
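For completeness, here is a minimal sketch of that splitting step. It assumes the date column is the first column (as in your row[0]), that the dates parse with pd.to_datetime, and it reuses your main_file / src_path names and file-naming pattern:
import pandas as pd

# Drop rows with empty cells, then write one file per year of the first (date) column.
data = pd.read_csv(main_file, sep='\t')
data = data.dropna()
date_col = data.columns[0]
years = pd.to_datetime(data[date_col]).dt.year
for year, group in data.groupby(years):
    group.to_csv("%s%s-%s.csv" % (src_path, year, len(group)), sep='\t', index=False)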
This would skip all rows with at least one empty cell:
with open(main_file, "r") as fp:
    ....
    for row in root:
        if not all(map(len, row)):  # at least one cell in the row is empty
            continue
Pandas is the best option in Python for handling any type of data processing. For help, you can go through this link: http://pandas.pydata.org/pandas-docs/stable/10min.html
