How to delete rows of a TSV file - Python

I have a TSV file and I'd like to delete the rows where the third value is not c:

1 7 c
5 2 q
4 5 a
5 0 c

So I'd like it to look like this:

1 7 c
5 0 c

So far I know how to separate the values; I just don't know how to delete a row based on the third tabbed value. This is what I have so far:

for i, line in enumerate(read_tsv):
    first = read_tsv[i][0]
    second = read_tsv[i][1]
    letter = read_tsv[i][2]
    if i == 2:

You can open the file, read/iterate it and filter out the unwanted rows, then open it again in write mode and write the filtered data back:
import csv

with open('filename.tsv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    data = [row for row in reader if row[2] == 'c']

with open('filename.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(data)
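
If you would rather not overwrite the original file in place, the same filtering can write to a separate output file instead (a minimal sketch; the output filename here is just a placeholder):

import csv

with open('filename.tsv', 'r', newline='') as f_in, open('filtered.tsv', 'w', newline='') as f_out:
    reader = csv.reader(f_in, delimiter='\t')
    writer = csv.writer(f_out, delimiter='\t')
    # keep only the rows whose third column is 'c'
    writer.writerows(row for row in reader if row[2] == 'c')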


Compare two csv files in python and retain headers of changes

I'm trying to compare two CSV files in Python and output the differences along with the headers of each column that changed. So far, with what I'm doing, it outputs all columns instead of just the ones with differences.
import csv

with open('firstfile.csv', 'r') as f1:
    file1 = f1.readlines()
with open('secondfile.csv', 'r') as f2:
    file2 = f2.readlines()

with open('results.csv', 'w') as outFile:
    outFile.write(file1[0])
    for line in file2:
        if line not in file1:
            outFile.write(line)
I think this code solves your problem:
import sys

with open('file1.csv', 'r') as f1:
    file1 = f1.readlines()
with open('file2.csv', 'r') as f2:
    file2 = f2.readlines()

delimiter = '\t'  # column delimiter in your files
headers_of_first_file = file1[0].strip().split(delimiter)
headers_of_second_file = file2[0].strip().split(delimiter)

# You can remove this check if you want to handle files with different columns,
# but then you have to add some more code in the next blocks.
different_headers = set(headers_of_first_file).symmetric_difference(headers_of_second_file)
if different_headers:
    print('Files have a difference in headers: ', different_headers)
    sys.exit(-1)

# Build a map {header: [all_values]} from the first file
first_file_map = {header: [] for header in headers_of_first_file}
for row in file1[1:]:
    for index, cell in enumerate(row.strip().split(delimiter)):
        first_file_map[headers_of_first_file[index]].append(cell)

# Check the second file against the map. Don't forget that columns may change order.
result = set()
for row in file2[1:]:
    for index, cell in enumerate(row.strip().split(delimiter)):
        if cell not in first_file_map[headers_of_second_file[index]]:
            result.add(headers_of_second_file[index])

with open('results.csv', 'w') as out_file:
    out_file.write('\t'.join(result))
UPD, example files ('\t' is the delimiter):

file1.csv:

Column1 Column2 Column3 Column5 Column4
1       2       3       5       4
10      20      30      50      40

file2.csv:

Column1 Column2 Column3 Column4 Column5
11      2       3       4       5
10      10      30      40      50
import csv

def compareList(l1, l2):
    # "Equal" only if both lists have the same length and match element by element
    if len(l1) == len(l2) and len(l1) == sum([1 for i, j in zip(l1, l2) if i == j]):
        return "Equal"
    else:
        return "Non equal"

file1 = "C:/Users/Sarvesh/Downloads/a.csv"
file2 = "C:/Users/Sarvesh/Downloads/b.csv"

with open(file1, 'r') as csv1, open(file2, 'r') as csv2:  # import CSV files
    import1 = csv1.readlines()
    import2 = csv2.readlines()

# creating a csv reader object with | as the delimiter
csv_reader = csv.reader(import1, delimiter='|')
# list to store the names of the columns
list_of_column_name1 = []
# loop to iterate through the rows of the csv
for row in csv_reader:
    # adding the first row
    list_of_column_name1.append(row)
    # breaking the loop after the first iteration itself
    break

csv_reader = csv.reader(import2, delimiter='|')
# list to store the names of the columns
list_of_column_name2 = []
# loop to iterate through the rows of the csv
for row in csv_reader:
    # adding the first row
    list_of_column_name2.append(row)
    # breaking the loop after the first iteration itself
    break

# printing the result
print("1st list of column names : ", list_of_column_name1[0])
print("2nd list of column names : ", list_of_column_name2[0])
print("First comparison", compareList(list_of_column_name1, list_of_column_name2))

How to add a new column at the end of a CSV/txt file in Python

I am looking for a script to add a new data column to an existing CSV file in Python. I have a file (e.g. file.csv) which has many rows and a few columns. From a for-loop calculation I get a new value (A in my code here) on each iteration, and I want to append those values as the last column of the existing CSV file. I used the code below.
for xxx in xxxx:
    A = xxx
    f = open("file.csv")
    data = [item for item in csv.reader(f)]
    f.close()
    new_column = [A]
    new_data = []
    for i, item in enumerate(data):
        try:
            item.append(new_column[i])
        except IndexError, e:
            item.append(A)
        new_data.append(item)
    f = open('outfilefinal1.csv', 'w')
    csv.writer(f).writerows(new_data)
    f.close()
It did append a new column as the last column, but the problem is that the whole column got the same value (the A value from the last loop). So how can I get the A value from each loop iteration into the last column? Thanks.
Example input file
1 2
2 4
0 9
4 8
A value from each loop
3
4
0
9
So the final file should show
1 2 3
2 4 4
0 9 0
4 8 9
but in my case it shows as
1 2 9
2 4 9
0 9 9
4 8 9
The problem with your code is that you rewrite the output file inside the outer for loop, so only the A value from the last iteration survives.
Is A really an array that does not depend on file.csv? Then you could do something like this:
import csv

A = compute_new_values_list()

with open('file.csv') as fpi, open('out.csv', 'w') as fpo:
    reader = csv.reader(fpi)
    writer = csv.writer(fpo)

    # optionally handle the csv header
    headers = next(reader)
    headers.append('new_column')
    writer.writerow(headers)

    for index, row in enumerate(reader):
        row.append(A[index])
        writer.writerow(row)
EDIT:
If you need a row of file.csv to compute your new value, you can use the same code; just compute the new value inside the for loop:
import csv

with open('file.csv') as fpi, open('out.csv', 'w') as fpo:
    reader = csv.reader(fpi)
    writer = csv.writer(fpo)

    # optionally handle the csv header
    headers = next(reader)
    headers.append('new_column')
    writer.writerow(headers)

    for row in reader:
        new_value = compute_from_row(row)
        row.append(new_value)
        writer.writerow(row)

Write and recode from one csv file to another

I am trying to select specific columns from a large tab-delimited CSV file and output only certain columns to a new CSV file. Furthermore, I want to recode the data as this happens. If the cell has a value of 0 then just output 0. However, if the cell has a value of greater than 0, then just output 1 (i.e., all values greater than 0 are coded as 1).
Here's what I have so far:
import csv

outputFile = open('output.csv', 'wb')
outputWriter = csv.writer(outputFile)
included_cols = range(9, 2844)

with open('source.txt', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        content = list(row[i] for i in included_cols)
        outputWriter.writerow(content)
The first issue I am having is that I want to also take from column 6. I wasn't sure how to write column 6 and then columns 9-2844.
Second, I wasn't sure how to do the recoding on the fly as I write the new CSV.
I wasn't sure how to write column 6 and then columns 9-2844.
included_cols = [6] + list(range(9,2844))
This works because you can add two lists together. Note that in Python3, range doesn't return a list, so we have to coerce it.
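
For example, on Python 3:

[6] + range(9, 2844)        # TypeError: can only concatenate list (not "range") to list
[6] + list(range(9, 2844))  # works: [6, 9, 10, ..., 2843]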
I wasn't sure how to do the recoding on the fly
content = list((1 if int(row[i]) > 0 else 0) for i in included_cols)
This works because of the conditional expression: 1 if int(row[i]) > 0 else 0. The general form A if cond else B evaluates to either A or B, depending upon the condition. (Note the int() call: csv.reader gives you strings, so convert before comparing to 0.)
Another form, which I think is "too clever by half", is content = list((int(row[i]) and 1) for i in included_cols). This works because the and operator always returns one or the other of its operands.
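
A quick illustration of why the int() conversion matters (the cells come back from csv.reader as strings, and a non-empty string such as '0' is truthy):

row = ['0', '3', '0']                      # what csv.reader hands you
[(1 if int(v) > 0 else 0) for v in row]    # [0, 1, 0]
[(int(v) and 1) for v in row]              # [0, 1, 0]
[(v and 1) for v in row]                   # [1, 1, 1] -- the zeros are lost because '0' is truthy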
This should work:
import csv

outputFile = open('output.csv', 'wb')
outputWriter = csv.writer(outputFile)
included_cols = [5] + list(range(8, 2844))  # you can just merge two lists

with open('source.txt', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    outputWriter.writerow(next(reader))  # write the header row unchanged
    for row in reader:  # the remaining rows
        content = [int(row[i]) if i == 5 else (0 if int(row[i]) == 0 else 1) for i in included_cols]
        outputWriter.writerow(content)

Remove row from CSV that contains empty cell using Python

I am splitting a CSV file into separate files based on a column with dates. However, some rows contain a date but the other cells are empty. I want to remove these rows with empty cells from the CSV, but I'm not sure how to do this.
Here's my code:
import collections
import csv
import sys

csv.field_size_limit(sys.maxsize)

with open(main_file, "r") as fp:
    root = csv.reader(fp, delimiter='\t', quotechar='"')
    result = collections.defaultdict(list)
    next(root)  # skip the header row
    for row in root:
        year = row[0].split("-")[0]
        result[year].append(row)

for i, j in result.items():
    row_count = sum(1 for row in j)
    print(row_count)
    file_path = "%s%s-%s.csv" % (src_path, i, row_count)
    with open(file_path, 'w') as fp:
        writer = csv.writer(fp, delimiter='\t', quotechar='"')
        writer.writerows(j)
Pandas is perfect for this, especially if you want the solution to be easily adjusted to, say, other file formats. Of course, one could consider it overkill.
To just remove rows with empty cells:
>>> import pandas as pd
>>> data = pd.read_csv('example.csv', sep='\t')
>>> print data
     A  B  C
0    1  2  5
1  NaN  1  9
2    3  4  4
>>> data.dropna()
   A  B  C
0  1  2  5
2  3  4  4
>>> data.dropna().to_csv('example_clean.csv')
I leave performing the splitting and saving into separate files using pandas as an exercise to start learning this great package if you want :)
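
If you do want to go that route, a minimal sketch might look like this (the input filename is a placeholder, the tab separator and the "date in the first column" assumption come from the question, and the output naming mirrors the year-rowcount pattern above; adjust as needed):

import pandas as pd

data = pd.read_csv('main_file.csv', sep='\t')
data = data.dropna()  # drop rows that have any empty cell

# group by the year part of the date in the first column and save each group
year = data.iloc[:, 0].str.split('-').str[0]
for y, group in data.groupby(year):
    group.to_csv('{}-{}.csv'.format(y, len(group)), sep='\t', index=False)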
This would skip all rows with at least one empty cell:
with open(main_file, "r") as fp:
    ....
    for row in root:
        if not all(map(len, row)):  # at least one cell is an empty string
            continue
Pandas is the best option in Python for handling any type of data processing. For help you can go through the 10-minute guide: http://pandas.pydata.org/pandas-docs/stable/10min.html

How to select every Nth row in CSV file using python

I have a CSV file with hundreds of rows, and I would like to select and export every 3 rows to a new CSV file, with the new output CSV file being named after the first row of the selection.
For example in the following CSV file....
1980 10 12
1 2 3 4 5 6 7
4 6 8 1 0 8 6
1981 10 12
2 4 9 7 5 4 1
8 9 3 8 3 7 3
I would like to select the first 3 rows and export them to a new CSV named "1980 10 12" (based on the first row), then select the next 3 rows and export them to a new CSV named "1981 10 12" (based on the first row of that group), and so on. I would like to do this using Python.
Using the csv module, plus itertools.islice() to select 3 rows each time:
import csv
import os.path
from itertools import islice
with open(inputfilename, 'rb') as infh:
    reader = csv.reader(infh)
    for row in reader:
        filename = row[0].replace(' ', '_') + '.csv'
        filename = os.path.join(directory, filename)
        with open(filename, 'wb') as outfh:
            writer = csv.writer(outfh)
            writer.writerow(row)
            writer.writerows(islice(reader, 2))
The writer.writerows(islice(reader, 2)) line takes the next 2 rows from the reader, copying them across to the writer CSV, after writing the current row (with the date) to the output file first.
You may need to adjust the delimiter argument for the csv.reader() and csv.writer() objects; the default is a comma, but you didn't specify the exact format and perhaps you need to set it to a '\t' tab instead.
If you are using Python 3, open the files with 'r' and 'w' text mode, and set newline='' for both; open(inputfilename, 'r', newline='') and open(filename, 'w', newline='').
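
Putting those adjustments together, a Python 3 version might look roughly like this (inputfilename and directory are the same placeholders as above; pass delimiter='\t' to the reader and writer if your file is tab-separated):

import csv
import os.path
from itertools import islice

with open(inputfilename, 'r', newline='') as infh:
    reader = csv.reader(infh)
    for row in reader:
        filename = os.path.join(directory, row[0].replace(' ', '_') + '.csv')
        with open(filename, 'w', newline='') as outfh:
            writer = csv.writer(outfh)
            writer.writerow(row)                  # the date row
            writer.writerows(islice(reader, 2))   # plus the next two rows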
import csv

with open("in.csv") as f:
    reader = csv.reader(f)
    chunks = []
    for ind, row in enumerate(reader, 1):
        chunks.append(row)
        if ind % 3 == 0:  # once we have three new rows, create a file using the first row as the name
            with open("{}.csv".format(chunks[0][0].strip()), "w") as f1:
                wr = csv.writer(f1)
                wr.writerows(chunks)  # write all three rows
            chunks = []  # reset chunks to an empty list
Using slight iterator trickery:
with open('in.csv', 'r') as infh:
    for block in zip(*[infh]*3):
        filename = block[0].strip() + '.csv'
        with open(filename, 'w') as outfh:
            outfh.writelines(block)
On Python 2.X you would use itertools.izip. The docs actually mention izip(*[iter(s)]*n) as an idiom for clustering a data series.
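
The reason this groups lines in threes is that the same iterator object is repeated three times, so each zip step pulls three consecutive items from it (a file object is its own iterator, which is why [infh]*3 works directly). A quick illustration with a plain list:

>>> s = [1, 2, 3, 4, 5, 6]
>>> list(zip(*[iter(s)]*3))
[(1, 2, 3), (4, 5, 6)]

Note that any trailing lines that don't fill a complete group of three are silently dropped by zip.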
