Replacing and deleting columns from a csv using python

Replacing and deleting columns from a csv using python - python

Here is a code that I am writing
import csv
import openpyxl
def read_file(fn):
rows = []
with open(fn) as f:
reader = csv.reader(f, quotechar='"',delimiter=",")
for row in reader:
if row:
rows.append(row)
return rows
replace = {x[0]:x[1:] for x in read_file("replace.csv")}
delete = set( (row[0] for row in read_file("delete.csv")) )
result = []
input_file="input.csv"
with open(input_file) as f:
reader = csv.reader(f, quotechar='"')
for row in reader:
if row:
if row[7] in delete:
continue
elif row[7] in replace:
result.append(replace[row[7]])
else:
result.append(row)
with open ("done.csv", "w+", newline="") as f:
w = csv.writer(f,quotechar='"', delimiter= ",")
w.writerows(result)
here are my files:
input.csv:
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13
"-","-","-","-","-","-","-","aaaaa","-","-","bbbbb","-",","
"-","-","-","-","-","-","-","ccccc","-","-","ddddd","-",","
"-","-","-","-","-","-","-","eeeee","-","-","fffff","-",","
this is a 13 column csv. I am interested only in the 8th and the 11th fields.
this is my replace.csv:
"aaaaa","11111","22222"
delete.csv:
ccccc
so what I am doing is compare the first column of replace.csv(line by line) with the 8th column of input.csv and if they match then replace 8th column of input.csv with the second column of replace.csv and 11th column of input with the 3rd column of replace.csv
and for delete.csv it compares both files line by line and if match is found it deletes the entire row.
and if any line is not present in either replace.csv or delete.csv then print the line as it is.
so my desired output is:
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13
"-","-","-","-","-","-","-",11111,"-","-",22222,"-",","
"-","-","-","-","-","-","-","eeeee","-","-","fffff","-",","
but when I run this code it gives me an output like this:
c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13
11111,22222
where am I going wrong?
I am trying to make changes to my program that I had earlier posted a question about.Since the input file has changed I am trying to make changes to my program.
https://stackoverflow.com/a/54388144/9279313

#anuj
I think SafeDev's solution is optimal but if you don't want to go with pandas, just make little changes in your code.
for row in reader:
if row:
if row[7] in delete:
continue
elif row[7] in replace:
key = row[7]
row[7] = replace[key][0]
row[10]= replace[key][1]
result.append(row)
else:
result.append(row)
Hope this solves your issue.

It's actually quite simple. Instead of making it by scratch just use the panda library. From there it's easier to handle any dataset. This is how you would do it:
EDIT:
import pandas as pd
input_csv = pd.read_csv('input.csv')
replace_csv = pd.read_csv('replace.csv', header=None)
delete_csv = pd.read_csv('delete.csv')
r_lst = [i for i in replace_csv.iloc[:, 0]]
d_lst = [i for i in delete_csv]
input2_csv = pd.DataFrame.copy(input_csv)
for i, row in input_csv.iterrows():
if row['c8'] in r_lst:
input2_csv.loc[i, 'c8'] = replace_csv.iloc[r_lst.index(row['c8']), 1]
input2_csv.loc[i, 'c11'] = replace_csv.iloc[r_lst.index(row['c8']), 2]
if row['c8'] in d_lst:
input2_csv = input2_csv[input2_csv.c8 != row['c8']]
input2_csv.to_csv('output.csv', index=False)
This process can be made even more dynamic by turning it into a function that has parameters of column names and replacing 'c8' and 'c11' with those two parameters.

Related

How to delete a row in a CSV file if a cell is empty using Python

I want to go through large CSV files and if there is missing data I want to remove that row completely, This is only row specific so if there is a cell that = 0 or has no value then I want to remove the entire row. I want this to happen for all the columns so if any column has a black cell it should delete the row, and return the corrected data in a corrected csv.
import csv
with open('data.csv', 'r') as csvfile:
csvreader = csv.reader(csvfile)
for row in csvreader:
print(row)
if not row[0]:
print("12")
This is what I found and tried but it doesnt not seem to be working and I dont have any ideas about how to aproach this problem, help please?
Thanks!

Due to the way in which CSV reader presents rows of data, you need to know how many columns there are in the original CSV file. For example, if the CSV file content looks like this:
1,2
3,
4
Then the lists return by iterating over the reader would look like this:
['1','2']
['3','']
['4']
As you can see, the third row only has one column whereas the first and second rows have 2 columns albeit that one is (effectively) empty.
This function allows you to either specify the number of columns (if you know them before hand) or allow the function to figure it out. If not specified then it is assumed that the number of columns is the greatest number of columns found in any row.
So...
import csv
DELIMITER = ','
def valid_column(col):
try:
return float(col) != 0
except ValueError:
pass
return len(col.strip()) > 0
def fix_csv(input_file, output_file, cols=0):
if cols == 0:
with open(input_file, newline='') as indata:
cols = max(len(row) for row in csv.reader(indata, delimiter=DELIMITER))
with open(input_file, newline='') as indata, open(output_file, 'w', newline='') as outdata:
writer = csv.writer(outdata, delimiter=DELIMITER)
for row in csv.reader(indata, delimiter=DELIMITER):
if len(row) == cols:
if all(valid_column(col) for col in row):
writer.writerow(row)
fix_csv('original.csv', 'fixed.csv')

maybe like this
import csv
with open('data.csv', 'r') as csvfile:
csvreader = csv.reader(csvfile)
data=list(csvreader)
data=[x for x in data if '' not in x and '0' not in x]
you can then rewrite the the csv file if you like

Instead of using csv, you should use Pandas module, something like this.
import pandas as pd
df = pd.read_csv('file.csv')
print(df)
index = 1 #index of the row that you want to remove
df = df.drop(index)
print(df)
df.to_csv('file.csv')

Python "list index out of range" when I am converting a csv to docx

My csv have blank rows in every 17 rows (blank row in row 17,34....)
I tried to convert it to a docx. However, the "list index out of range" error came out. If I delete the blank row in the csv, it will be working fine. Even if I manually add it via excel and the docx function will be work fine.
It seems that the table.row cannot deal with the blank row in my csv.
row_cells[i].text = row[i]
IndexError: list index out of range
Would you please help me? Thank you in advance.
path = os.getcwd()
##Use glob.glob to get all the "txt.csv.csv" files.
allfiles = glob.glob(path + "/*.txt.csv.csv")
df_out_filename = 'Xunjian-Report.csv'
with open(df_out_filename, 'w+', newline='') as fout:
writer = csv.writer(fout)
for filename in allfiles:
with open(filename) as fin:
reader = csv.reader(fin)
writer.writerows(reader)
writer.writerow("[]") ##Insert blank row at the end
###To generate the docx
csvfile = 'Xunjian-Report.csv'
doc = docx.Document()
date = datetime.datetime.now()
with open(csvfile, newline='') as f:
csv_reader = csv.reader(f)
csv_headers = next(csv_reader)
csv_cols = len(csv_headers)
print(csv_cols)
# Adding Subject of the document and the date
doc.add_heading('Report', level=0)
doc.add_heading('Date: %s/%s/%s' % (date.day, date.month, date.year), level=1)
table = doc.add_table(rows=1, cols=csv_cols)
table.autofix=False
hdr_cells = table.rows[0].cells
for i in range(csv_cols):
hdr_cells[i].text = csv_headers[i]
for row in csv_reader:
row_cells = table.add_row().cells
for i in range(csv_cols):
row_cells[i].text = row[i]

If you row list does not have enough elements for row[i], you hit IndexError. If you have blank rows in your input, those will indeed be shorter (or even just empty list).
Depending on your processing needs you could, either only process as many columns as you have in each row, replace:
for i in range(csv_cols):
with:
for i in range(len(row)):
Or even:
for (idx, cell) in enumerate(csv_cols):
row_cells[idx].text = cell
Or skip rows that are empty list (but this is a bit less robust, former would also deal with uneven length of rows, should it be possible to hit that case):
for row in csv_reader:
Could read:
for row in csv_reader:
if not row: continue # skip empty row, go to the next one

Read in only rows in between certain strings Python

So I have a text file that I am trying to read with csv in python, however I only want the rows in between two rows that start with certain strings. I have no problems with just reading the data, I have:
import csv
with open('path to file','r') as inf:
reader = csv.reader(inf, delimiter=" ")
and to get all the data I can just loop through and append to a list:
raw_data=[]
for row in reader:
raw_data.append(row)
I know I can get the rows I want by doing something like:
for row in raw_data:
if row[0] == 'string1':
begin_idx = raw_data.index(row)
elif row[0] == 'string2':
end_idx = raw_data.index(row)
data=[]
for idx in range(begin_idx+1,end_idx):
data.append(raw_data[idx])
However, I was hoping to be able to do this all at once when I first loop through the text file, so if anyone has any ideas on how this could be done it would appreciated.
Note, the reason I am not just looking for index of the rows I want is because they are just a list of integers that will change each time I run this. The pdf to text conversion I run isn't extremely clean, so the row titles don't line up with the actual data for the row.

Iterator objects are nice in that they are just calling next() on the object like reader when using in
So this will allow you to go through this in one linear pass by looping through separately when you hit the starting string. Try this:
import csv
with open('path to file','r') as inf:
reader = csv.reader(inf, delimiter=" ")
data=[]
for row in reader:
if row[0] == 'string1':
for row in reader:
if row[0]=='string2':
break
data.append(row)

You can introduce a state variable into your for loop:
data = []
copying = False
for row in reader:
if copying:
data.append(row)
if row[0] == 'string1':
copying = True
if row[0] == 'string2':
copying = False

Iterating through particular rows in a csvFile in Python

I have a programming assignment that include csvfiles. So far, I only have a issue with obtaining values from specific rows only, which are the rows that the user wants to look up.
When I got frustrated I just appended each column to a separate list, which is very slow (when the list is printed for test) because each column has hundreds of values.
Question:
The desired rows are the rows whose index[0] == user_input. How can I obtain these particular rows only and ignore the others?

This should give you an idea:
import csv
with open('file.csv', 'rb') as f:
reader = csv.reader(f, delimiter=',')
user_rows = filter(lambda row: row[0] == user_input, reader)

Python has the module csv
import csv
rows=[]
for row in csv.reader(open('a.csv','r'),delimiter=','):
if(row[0]==user_input):
rows.append(row)

def filter_csv_by_prefix (csv_path, prefix):
with open (csv_path, 'r') as f:
return tuple (filter (lambda line : line.split(',')[0] == prefix, f.readlines ()))
for line in filter_csv_by_prefix ('your_csv_file', 'your_prefix'):
print (line)

How do create new column in csv file using python by shifting one row

I have CSV file like below. It is huge file with thousands of records.
input.csv
No;Val;Rec;CSR
0;10;1;1200
0;100;2;1300
0;100;3;1300
0;100;4;1400
0;10;5;1200
0;11;6;1200
I want to create output.csv file by adding new column "PSR" after 1st column "No". This column value depends on column "PSR" Value. For 1st row, "PSR" shall be zero. From next record on-wards, it depends on "CSR" value in previous row. If present and previous record CSR value is same, then "PSR" shall be zero. If not, PSR value shall have the previous CSR value. For exmple, Value of CSR in 2nd row is 1300 which is different to the value in 1st record ( it is 1200). So PSR value for 2nd row shall be 1200. Where in 2nd and 3rd row, CSR value is same. So PSR value for 3rd row shall be zero. So new value PSR depends on CSR value in present and previous field.
Output.csv
No;PCR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
My Approach:
Use csv.reader and iterate over the objects in a list. Copy 5th column to 2nd column in list. Shift it one row down.
Then check the values in 2nd and 5th column (PCR and CSR), if both values are same. Replace the PCR value with zero.
I have problem in getting 1st step coded. I am able to duplicate the column but not able to shift it. Also 2nd step is quite straightforward.
Also, I am not sure whether this approach is correct Any pointers/recommendation would be really helpful.
Note: I am not able to install Pandas on CentOS. So help without this module would be better.
My Code:
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
for rec in reader:
mylist.append(rec)
rec.insert(1, rec[3])
mylist.append(rec)
writer.writerows(mylist)

If your open to non-python solutions then awk could be a good option:
awk 'NR==1{$2="PSR;"$2}NR>1{$2=($4==a?0";"$2:+a";"$2);a=$4}1' FS=';' OFS=';' file
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Awk is distributed with pretty much all Linux distributions and was designed exactly for this kind of task. It will blaze through your file. Add a redirection to the end > output.csv to save the output in a file.
A simple python approach using the same logic:
#!/usr/bin/env python
last = "0"
with open('input.csv') as csv:
print next(csv).strip().replace(';', ';PSR;', 1)
for line in csv:
field = line.strip().split(';')
if field[3] == last: field.insert(1, "0")
else: field.insert(1, last)
last = field[4]
print ';'.join(field)
Produces the same output:
$ python parse.py
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Again just redirect the output to save it:
$ python parse.py > output.csv

Just code it as you explained it. Store the previous CSR and refer to it on the next loop through; just be sure to update it.
import csv
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
mylist.insert(1,'PCR')
prev_csr = 0
for rec in reader:
rec.insert(1,prev_csr)
mylist.append(rec)
prev_csr = rec[4]
writer.writerows(mylist)

with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
header = next(reader)
header.insert(1, 'PCR')
writer.writerow(header)
prevRow = next(reader)
prevRow.insert(1, '0')
writer.writerow(prevRow)
for row in reader:
if prevRow[-1] == row[-1]:
val = '0'
else:
val = prevRow[-1]
row.insert(1,val)
prevRow = row
writer.writerow(row)

Or, even easier using the DictReader and DictWriter capabilities of csv:
input_header = ['No','Val','Rec','CSR']
output_header = ['No','PCR','Val','Rec','CSR']
with open('input.csv', 'rb') as in_file, open('output.csv', 'wb') as out_file:
in_reader, out_writer = DictReader(in_file, input_header, delemeter =';'), DictWriter(out_file, output_header, delemeter =';')
in_reader.next() # skip the header
out_writer.writeheader() # place the output header
last_csr = None
for row in in_reader():
current_csr = row['CSR']
row['PCR'] = last_csr if current_csr != last_csr else 0
last_csr = current_csr
out_writer.writerow(row)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Replacing and deleting columns from a csv using python - python

Related

How to delete a row in a CSV file if a cell is empty using Python

Python "list index out of range" when I am converting a csv to docx

Read in only rows in between certain strings Python

Iterating through particular rows in a csvFile in Python

How do create new column in csv file using python by shifting one row

Categories

Resources