I have a huge CSV file with approximately 992 rows × 992 columns.
The file for example looks like this:
I need to create an output file that essentially contains a header and looks like below:
I tried to use csv.reader and csv.DictReader too, but I am getting stuck on removing the NA columns and on getting the column names into one list (or column) and the corresponding values into another.
I am not at all good at pandas and clueless in that aspect.
I tried:
def csv_reader():
    """Write the output header, then print debugging information per row.

    NOTE: the inner loop iterates over ``reader`` again, so ``column`` is
    bound to each *remaining row* (a list of strings), not to a cell of
    ``row``. A list never equals the string ``'NA'``, so the ``if`` branch
    cannot run, and nothing except the header is written to ``outfile``.
    """
    with open("/Users/svadali/Downloads/test_1.csv") as csv_infile, open("/Users/svadali/Downloads/result_file.txt", "w+") as outfile:
        reader = csv.reader(csv_infile, delimiter=',')
        file_writer = csv.writer(outfile, delimiter="\t")
        file_writer.writerow(["SPC", "SPCs_within_0.2_phylo_distance", "Phylo_Distances"])
        for row in reader:
            # Consumes every remaining row of the file on the first pass.
            for column in reader:
                print("this is row", row)
                print("this is column", column)
                if column == 'NA':
                    print("this non NA", column)
                    print("this is supposed to be non NA row", row)
                    break
I also tried transposing the data, but that did not yield the results I need either.
You can extract the names from the header, zip them with the distances in each row, filter those with invalid distances, and then zip them again to produce names and distances in separate columns:
# Convert the distance matrix into one line per row name:
#   <row name> TAB <comma-joined column names> TAB <comma-joined distances>
# keeping only the columns whose distance is not 'NA'.
# newline="" lets the csv module handle line endings itself.
with open("test_1.csv", newline="") as infile, \
        open("result_file.txt", "w", newline="") as outfile:
    reader = csv.reader(infile, delimiter=',')
    writer = csv.writer(outfile, delimiter="\t")
    writer.writerow(["SPC", "SPCs_within_0.2_phylo_distance", "Phylo_Distances"])
    # The first header cell labels the row-name column; the rest are names.
    # An empty input file yields no header and therefore no names.
    names = (next(reader, None) or [])[1:]
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; nothing to unpack.
            continue
        name, *distances = row
        kept = [(n, d) for n, d in zip(names, distances) if d != 'NA']
        # Always emit three columns, even when every distance was 'NA'.
        writer.writerow((
            name,
            ','.join(n for n, _ in kept),
            ','.join(d for _, d in kept),
        ))
Demo: https://replit.com/#blhsing/OutrageousInvolvedProtools
Related
Hey intelligent community,
I need a little bit of help because I think I can't see the wood for the trees.
I have two CSV files that look like this:
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.2
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5
I would like to compare both files and then write any changes like this:
Name,Number,Changes
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5;change: 3.5.2
So on every line when there is a difference in the number, i want to add this as a new column at the end of the line.
The files are formatted the same way, but sometimes one has a new row, which is why I think I have to map the keys.
I got this far, but now I am lost in my thoughts:
Python 3.10.9
import csv

# Reading the first csv and set mapping
with open('test1.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)
    file1_dict = {row[1]: row[0] for row in rows}

# Reading the second csv and set mapping
with open('test2.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)  # NOTE: rows now holds the rows of test2.csv
    file2_dict = {row[1]: row[0] for row in rows}

# comparing the keys and find the diff
for k in file1_dict:
    if file1_dict[k] != file2_dict[k]:
        file1_dict[k] = file2_dict[k]
        for row in rows:
            if row[1] == k:
                row.append(file2_dict[k])

# write the csv (not sure how to add the word "change:")
with open('test1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)
If I try this, I don't get a new column; it just "updates" the CSV file with the same columns.
For example, this code gives me the differing rows, but I am not able to add them to the existing file and row.
# Both inputs must stay open while the output is written: diff_rows is a lazy
# generator that only reads from them when writerows() consumes it.
with open('test1.csv', newline='') as fin1, open('test2.csv', newline='') as fin2:
    read1 = csv.reader(fin1)
    read2 = csv.reader(fin2)
    # Rows of test1.csv that differ from the row at the same position in
    # test2.csv; zip() stops at the end of the shorter file.
    diff_rows = (row1 for row1, row2 in zip(read1, read2) if row1 != row2)
    with open('test3.csv', 'w', newline='') as fout:
        writer = csv.writer(fout)
        writer.writerows(diff_rows)
Does someone have any tips or help for my problem? I read many answers on here but can't figure it out.
Thanks alot.
#bigkeefer
Thanks for your answer, i tried to change it for the delimiter ; but it gives an "list index out of range error".
# Map name -> number for each file (header row dropped), then build the
# output lines, appending "change: <old number>" where the numbers differ.
with open('test3.csv', 'r') as file1:
    reader = csv.reader(file1, delimiter=';')
    rows = list(reader)[1:]
    # NOTE: row[1] raises IndexError for any row with fewer than two fields;
    # csv.reader yields an empty list for a blank line.
    file1_dict = {row[0]: row[1] for row in rows}
with open('test4.csv', 'r') as file2:
    reader = csv.reader(file2, delimiter=';')
    rows = list(reader)[1:]
    file2_dict = {row[0]: row[1] for row in rows}
new_file = ["Name;Number;Changes\n"]
with open('output.csv', 'w') as nf:
    for key, value in file1_dict.items():
        if value != file2_dict[key]:
            new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
        else:
            new_file.append(f"{key};{value}\n")
    nf.writelines(new_file)
You will need to adapt this to overwrite your first file etcetera, as you mentioned above, but I've left it like this for your testing purposes. Hopefully this will help you in some way.
I've assumed you've actually got the headers above in each file. If not, remove the slicing on the list creations, and change the new_file variable assignment to an empty list ([]).
# Compare two ';'-delimited files keyed by their first column and write the
# second file's numbers, annotated with "change: <old number>" when they differ.
with open('f1.csv', 'r', newline='') as file1:
    reader = csv.reader(file1, delimiter=";")
    rows = list(reader)[1:]  # drop the header row
    # Skip blank or one-field rows: row[1] would raise IndexError on them.
    file1_dict = {row[0]: row[1] for row in rows if len(row) > 1}
with open('f2.csv', 'r', newline='') as file2:
    reader = csv.reader(file2, delimiter=";")
    rows = list(reader)[1:]  # drop the header row
    file2_dict = {row[0]: row[1] for row in rows if len(row) > 1}
new_file = ["Name,Number,Changes\n"]
for key, value in file1_dict.items():
    # A name missing from the second file is treated as unchanged instead of
    # raising KeyError.
    new_value = file2_dict.get(key, value)
    if value != new_value:
        new_file.append(f"{key};{new_value};change: {value}\n")
    else:
        new_file.append(f"{key};{value}\n")
# Rows that exist only in the second file have no old value to report.
new_file.extend(
    f"{key};{value}\n"
    for key, value in file2_dict.items()
    if key not in file1_dict
)
with open('new.csv', 'w') as nf:
    nf.writelines(new_file)
I would like to check for duplicates in a .csv file (structure below). Every value in this .csv has to be unique! You can find "a" three times, but it should be there only once.
###start
a
a;b;
d;e
f;g
h
i;
i
d;b
a
c;i
### end
The progress so far:
import os,glob
import csv
folder_path = "csv_entities/"
found_rows = set()  # rows (as tuples) that have already been written
for filepath in glob.glob(os.path.join(folder_path, "*.csv")):
    # NOTE: "newfile.csv" is reopened in "w" mode for every input file, so
    # it is truncated each time round the loop.
    with open(filepath) as fin, open("newfile.csv", "w") as fout:
        reader = csv.reader(fin, delimiter=";")
        writer = csv.writer(fout, delimiter=";")
        for row in reader:
            # delete empty list elements
            # NOTE: only the last element is dropped, wherever the "" is.
            if "" in row:
                row = row[:-1]
            # delete empty row
            if not row:
                continue
            row = tuple(row)  # make row hashable
            # don't write if row is there already!
            if row in found_rows:
                continue
            print(row)
            writer.writerow(row)
            found_rows.add(row)
Which results in this csv:
###start
a
a;b
d;e
f;g
h
i
d;b
c;i
###end
The most important question is right now: How can I get rid of the double values?
e.g in the second row there should be only "b" instead of "a;b", because "a" is already in the row before.
your mistake is to consider the rows themselves as unique elements. You have to consider cells as elements.
So use your marker set to mark elements, not rows.
Example with only one input file (using several input files with only one output file makes no sense)
# Copy input.csv to newfile.csv keeping only the first occurrence of every
# non-empty cell value; rows that end up empty are not written.
found_values = set()  # every cell value seen so far
with open("input.csv", newline="") as fin, open("newfile.csv", "w", newline="") as fout:
    reader = csv.reader(fin, delimiter=";")
    writer = csv.writer(fout, delimiter=";")
    for row in reader:
        new_row = []
        for value in row:
            # Mark each value as soon as it is kept, so a value repeated
            # inside the same row (e.g. "a;a") is written only once.
            if value and value not in found_values:
                found_values.add(value)
                new_row.append(value)
        if new_row:
            # new row isn't empty: write it
            writer.writerow(new_row)
the resulting csv file is:
a
b
d;e
f;g
h
i
c
I have an array LiveTick = ['ted3m index','US0003m index','USGG3m index'] and I am reading a CSV file book1.csv. I have to find the row which contains the values in csv.
For example, 15th row will contain ted3m index 500 | 600 and 20th row will contain US0003m index 800 | 900 and likewise.
I then have to get the values contained in the row and parse it for each value contained in array LiveTick. How do I proceed? Below is my sample code:
# Print every row of book1.csv that contains one of the LiveTick values as a
# whole cell.
with open('C:\\blp\\book1.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    # NOTE: outf is not defined anywhere in this snippet.
    writer = csv.writer(outf)
    for row in reader:
        # NOTE: the loop variable shadows the built-in list.
        for list in LiveTick:
            if list in row:
                print('Found: {}'.format(row))
You can use pandas, it's pretty fast and will do all reading, writing and filtering job for you out of the box:
import pandas as pd
df = pd.read_csv('C:\\blp\\book1.csv')
# Keep the rows whose value in the chosen column is one of the LiveTick entries.
filtered_df = df[df['your_column_name'].isin(LiveTick)]
# now you can save it
# index=False: otherwise to_csv writes the DataFrame's row index as an extra
# first column that the input file did not have.
filtered_df.to_csv('C:\\blp\\book_filtered.csv', index=False)
You have the right idea, but there are a few improvements you can make:
Instead of a nested for loop which doesn't short-circuit, use any to compare the first column to multiple values.
Write to your csv as you go along instead of just print. This is memory-efficient, as you hold in memory only one line at any one time.
Define outf as an open object in your with statement.
Do not shadow built-in list. Use another identifier, e.g. i, for elements in LiveTick.
Here's a demo:
# Copy to out.csv every row of in.csv whose first cell contains one of the
# LiveTick strings. The output is opened in text mode: csv.writer writes str,
# and open() rejects newline= in binary ('wb') mode.
with open('in.csv', 'r', newline='') as f, open('out.csv', 'w', newline='') as outf:
    reader = csv.reader(f, delimiter=',')
    writer = csv.writer(outf, delimiter=',')
    for row in reader:
        # "row and" skips blank lines, for which row[0] would raise IndexError.
        if row and any(i in row[0] for i in LiveTick):
            writer.writerow(row)
I need guidance on code to write a CSV file that drops rows with specific numbers in the first column [0]. My script writes a file, but it contains the rows that I am working to delete. I suspect that I may have an issue with the spreadsheet being read as one long string rather than ~150 rows.
import csv
Property_ID_To_Delete = {4472738, 4905985, 4905998, 4678278, 4919702, 4472936, 2874431, 4949190, 4949189, 4472759, 4905977, 4905995, 4472934, 4905982, 4906002, 4472933, 4905985, 4472779, 4472767, 4472927, 4472782, 4472768, 4472750, 4472769, 4472752, 4472748, 4472751, 4905989, 4472929, 4472930, 4472753, 4933246, 4472754, 4472772, 4472739, 4472761, 4472778}
with open('2015v1.csv', 'rt') as infile:
    with open('2015v1_edit.csv', 'wt') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            # NOTE: this compares one string with the whole set, which is
            # always unequal, so every row is written.
            if row[0] != Property_ID_To_Delete:
                writer.writerow(row)
Here is the data:
https://docs.google.com/spreadsheets/d/19zEMRcir_Impfw3CuexDhj8PBcKPDP46URZ9OA3uV9w/edit?usp=sharing
You need to check whether the id in the first column, converted to an
integer because your set contains integers, is in the set of ids to delete.
Write the row only if it is not in the set. Your code compares the id in the
first column with the whole set of ids to be deleted, and a string is never
equal to a set:
>>> '1' != {1}
True
Therefore, you get all rows in your output.
Change:
if row[0] != Property_ID_To_Delete:
into:
if int(row[0]) not in Property_ID_To_Delete:
EDIT
You need to write the header of your infile first, before trying to convert the first column entry into an integer:
# Copy 2015v1.csv to 2015v1_edit.csv, dropping every data row whose first
# column is an id in Property_ID_To_Delete.
# newline='' stops the csv module's own line endings from being translated again.
with open('2015v1.csv', 'rt', newline='') as infile, \
        open('2015v1_edit.csv', 'wt', newline='') as outfile:
    writer = csv.writer(outfile)
    reader = csv.reader(infile)
    # The header is not numeric, so copy it before any int() conversion.
    header = next(reader, None)  # None when the input file is empty
    if header is not None:
        writer.writerow(header)
    for row in reader:
        if not row:
            # Blank line: there is no row[0] to convert.
            continue
        if int(row[0]) not in Property_ID_To_Delete:
            writer.writerow(row)
I am trying to read a CSV file into a list and then sort it based on the first two columns of the list (first by first column and then by second column if the first column is the same). This is what I am doing:
def sortcsvfiles(inputfilename,outputfilename):
    """Sort the data rows of a CSV file by their first two columns as integers.

    The first row of ``inputfilename`` is treated as the header and is written
    back unchanged as the first row of ``outputfilename``.

    NOTE: the sort key indexes ``ro[0]`` and ``ro[1]``, so any data row with
    fewer than two fields (csv.reader yields ``[]`` for a blank line) raises
    IndexError.
    """
    list1=[]
    row1=[]
    with open(inputfilename,'rt') as csvfile1:
        reader=csv.reader(csvfile1)
        cnt=0
        for row in reader:
            if cnt==0: #skip first row as it contains header information
                row1=row
                cnt+=1
                continue
            list1.append((row))
    list1.sort(key=lambda ro: (int(ro[0]),int(ro[1])))
    list1.insert(0, row1)
    with open(outputfilename,'wt') as csvfile1:
        writer=csv.writer(csvfile1, lineterminator='\n')
        for row in list1:
            writer.writerow(row)
But I am getting the following error:
File "C:\Users\50004182\Documents\temp.py", line 37, in <lambda>
list1.sort(key=lambda ro: (int(ro[0]),int(ro[1])))
IndexError: list index out of range
How can I fix this?
You probably have an empty line in your file, perhaps the last one. For example, you can just ignore empty lines:
def sortcsvfiles(inputfilename,outputfilename):
    """Sort the data rows of a CSV file by their first two columns as integers.

    The first row is treated as the header and copied unchanged. Empty lines
    are ignored. An empty input file produces an empty output file.

    Raises:
        IndexError: if a non-empty data row has fewer than two fields.
        ValueError: if either of the first two fields is not an integer.
    """
    with open(inputfilename,'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Default of None avoids StopIteration when the file has no rows.
        header = next(reader, None)
        data = [row for row in reader if row] # ignore empty lines
    data.sort(key=lambda ro: (int(ro[0]),int(ro[1])))
    with open(outputfilename,'wt') as csvfile:
        writer=csv.writer(csvfile, lineterminator='\n')
        if header is not None:
            writer.writerow(header)
        writer.writerows(data)
The error occurs because you have at least one row that does not have 2 columns. It may have 1 or even 0 instead.
You could test for this before appending the row:
if len(row) > 1:
list1.append(row)
To sort all rows but skip the first header, you can use the next() function (see a previous answer of mine); using the sorted() function perhaps:
def sortcsvfiles(inputfilename, outputfilename):
    """Sort the data rows of a CSV file by their first two columns as integers.

    The first row is treated as the header and copied unchanged when it is
    non-empty. Data rows with fewer than two fields are dropped before sorting.

    Raises:
        ValueError: if either of the first two fields of a kept row is not
            an integer.
    """
    with open(inputfilename, 'rt', newline='') as csvfile1:
        reader = csv.reader(csvfile1)
        headers = next(reader, None)  # get one row, or None if there are no rows
        rows = sorted(
            (r for r in reader if len(r) > 1),
            key=lambda r: (int(r[0]), int(r[1])))

    with open(outputfilename, 'wt') as csvfile1:
        writer = csv.writer(csvfile1, lineterminator='\n')
        if headers:
            writer.writerow(headers)
        writer.writerows(rows)
I used writer.writerows() to write the whole list of sorted rows in one call.