Skip Row In CSV If Contains Strings From List - python

I have a list of approximately 500 strings that I want to check against a CSV file containing 25,000 rows. What I currently have seems to be getting stuck looping. I basically want to skip the row if it contains any of the strings in my string list and then extract other data.
stringList = [] #strings look like "AAA", "AAB", "AAC", etc.
with open('BadStrings.csv', 'r')as csvfile:
filereader = csv.reader(csvfile, delimiter=',')
for row in filereader:
stringToExclude = row[0]
stringList.append(stringToExclude)
with open('OtherData.csv', 'r')as csvfile:
filereader = csv.reader(csvfile, delimiter=',')
next(filereader, None) #Skip header row
for row in filereader:
for s in stringList:
if s not in row:
data1 = row[1]
Edit: Not an infinite loop, but looping is taking too long.

according to Niels I would change the 2 loop and iterate over the row itself and check if the current row entry is inside the "bad" list:
for row in filereader:
for s in row:
if s not in stringlist:
data1 = row[0]
And I also dont know what you want to do with data1 but you always change the object reference when an item is not in stringList.
You could use a list to add the items to a list with data1.append(item)

You could try something like this.
stringList = [] #strings look like "AAA", "AAB", "AAC", etc.
with open('BadStrings.csv', 'r')as csvfile:
filereader = csv.reader(csvfile, delimiter=',')
for row in filereader:
stringToExclude = row[0]
stringList.append(stringToExclude)
data1 = [] # Right now you are overwriting your data1 every time. I don't know what you want to do with it, but you could for exmaple add all row[1] to a list data1
with open('OtherData.csv', 'r')as csvfile:
filereader = csv.reader(csvfile, delimiter=',')
next(filereader, None) #Skip header row
for row in filereader:
found_s = False
for s in stringList:
if s in row:
found_s = True
break
if not found_s:
data1.append(row[1]) # Add row[1] to the list is no element of stringList is found in row.
Still probably not a huge performance improvement, but at least the for loop for s in stringList: will now stop after s is found.

Related

Compare two CSV files and write difference in the same file as an extra column in python

Hey intelligent community,
I need a little bit of help because i think i don't see the the wood in the trees.
i have to CSV files that look like this:
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.2
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5
I would like to compare both files and than write any changes like this:
Name,Number,Changes
AAC;2.2.3
AAF;2.4.4
ZCX;5.5.5;change: 3.5.2
So on every line when there is a difference in the number, i want to add this as a new column at the end of the line.
The Files are formated the same but sometimes have a new row so thats why i think i have to map the keys.
I come this far but now iam lost in my thoughts:
Python 3.10.9
import csv
Reading the first csv and set mapping
with open('test1.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file1_dict = {row[1]: row[0] for row in rows}
Reading the second csv and set mapping
with open('test2.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file2_dict = {row[1]: row[0] for row in rows}
comparing the keys and find the diff
for k in test1_dict:
if test1_dict[k] != test2:dict[k]
test1_dict[k] = test2_dict[k]
for row in rows:
if row[1] == k:
row.append(test2_dict[k])
#write the csv (not sure how to add the word "change:")
with open('test1.csv', 'w', newline ='') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(rows)
If i try this, i don't get a new column, it just "updates" the csv file with the same columns.
For example this code gives me the diff row but i'am not able to just add it to existing file and row.
with open('test1.csv') as fin1:
with open('test2.csv') as fin2:
read1 = csv.reader(fin1)
read2 = csv.reader(fin2)
diff_rows = (row1 for row1, row2 in zip(read1, read2) if row1 != row2)
with open('test3.csv', 'w') as fout:
writer = csv.writer(fout)
writer.writerows(diff_rows)
Does someone have any tips or help for my problem? I read many answers on here but can't figure it out.
Thanks alot.
#bigkeefer
Thanks for your answer, i tried to change it for the delimiter ; but it gives an "list index out of range error".
with open('test3.csv', 'r') as file1:
reader = csv.reader(file1, delimiter=';')
rows = list(reader)[1:]
file1_dict = {row[0]: row[1] for row in rows}
with open('test4.csv', 'r') as file2:
reader = csv.reader(file2, delimiter=';')
rows = list(reader)[1:]
file2_dict = {row[0]: row[1] for row in rows}
new_file = ["Name;Number;Changes\n"]
with open('output.csv', 'w') as nf:
for key, value in file1_dict.items():
if value != file2_dict[key]:
new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
else:
new_file.append(f"{key};{value}\n")
nf.writelines(new_file)
You will need to adapt this to overwrite your first file etcetera, as you mentioned above, but I've left it like this for your testing purposes. Hopefully this will help you in some way.
I've assumed you've actually got the headers above in each file. If not, remove the slicing on the list creations, and change the new_file variable assignment to an empty list ([]).
with open('f1.csv', 'r') as file1:
reader = csv.reader(file1, delimiter=";")
rows = list(reader)[1:]
file1_dict = {row[0]: row[1] for row in rows if row}
with open('f2.csv', 'r') as file2:
reader = csv.reader(file2, delimiter=";")
rows = list(reader)[1:]
file2_dict = {row[0]: row[1] for row in rows if row}
new_file = ["Name,Number,Changes\n"]
for key, value in file1_dict.items():
if value != file2_dict[key]:
new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
else:
new_file.append(f"{key};{value}\n")
with open('new.csv', 'w') as nf:
nf.writelines(new_file)

Create list from csv lines in python

I have a csv file with 2 rows and multiple lines.
import csv
with open('data.csv', 'r') as csv_file:
csv_reader = csv.reader(csv_file)
next(csv_reader)
for row in csv_reader:
print(row[0])
The output is:
row0line0
row0line1
row0line2
...
Is there a way i could further separate the rows into a list of individual cells?
Thanks
As I understand your csv file look like this:
row0line0
row1line1
...
If its possible i should reccomand to change it to:
row0 line0
row1 line1
...
(Add a space between the rows and the lines)
Then you can update your code to the code bellow to print only the rows and create two lists - one that contain the rows and another that contain the lines:
import csv
with open('data.csv', 'r') as csv_file:
csv_reader = csv.reader(csv_file)
next(csv_reader)
rows = []
lines = []
for item in csv_reader:
temp = item[0].split(" ")
rows.append(temp[0])
lines.append(temp[1])
print(temp[0])
If you mean that each row becomes an element of the list that goes this way:
with open('data.csv', 'r') as csv_file:
reader = csv.reader(csv_file)
data_list = [row[0] for row in reader]
Otherwise, if you want to create a list of the first elements of each line, you can do this:
with open('data.csv', 'r') as csv_file:
reader = csv.reader(csv_file)
row0_list = []
for row in reader:
row0_list.append(row[0])
I hope the problem is solved with this explanation.
My understanding is that you are asking to output all the data fields. csv_reader is already separating your rows into a list individual cells!
You current script reads the file one line at a time and prints the first item in each row with this line:
for row in csv_reader:
print(row[0])
Instead of printing row[0], which only prints the first field in the csv row, you can just print the row:
for row in csv_reader:
print(row)
That will output the field lists (from my sample csv):
['r0v0', 'r0v1', 'r0v2']
['r1v0', 'r1v1', 'r1v2']
If you want to print in a nicer format, you can use join:
for row in csv_reader:
print(", ".join(row))
Output:
r0v0, r0v1, r0v2
r1v0, r1v1, r1v2
My csv:
r0v0,r0v1,r0v2
r1v0,r1v1,r1v2

Check for unique elements of csv

I would like to check for duplicates in a .csv (structure bellow). Every value in this .csv has to be unique! You can find "a" thrice, but it should be there only once.
###start
a
a;b;
d;e
f;g
h
i;
i
d;b
a
c;i
### end
The progress so far:
import os,glob
import csv
folder_path = "csv_entities/"
found_rows = set()
for filepath in glob.glob(os.path.join(folder_path, "*.csv")):
with open(filepath) as fin, open("newfile.csv", "w") as fout:
reader = csv.reader(fin, delimiter=";")
writer = csv.writer(fout, delimiter=";")
for row in reader:
# delete empty list elements
if "" in row:
row = row[:-1]
#delete empt row
if not row:
continue
row = tuple(row) # make row hashable
# don't write if row is there already!
if row in found_rows:
continue
print(row)
writer.writerow(row)
found_rows.add(row)
Which results in this csv:
###start
a
a;b
d;e
f;g
h
i
d;b
c;i
###end
The most important question is right now: How can I get rid of the double values?
e.g in the second row there should be only "b" instead of "a;b", because "a" is already in the row before.
your mistake is to consider the rows themselves as unique elements. You have to consider cells as elements.
So use your marker set to mark elements, not rows.
Example with only one input file (using several input files with only one output file makes no sense)
found_values = set()
with open("input.csv") as fin, open("newfile.csv", "w",newline="") as fout:
reader = csv.reader(fin, delimiter=";")
writer = csv.writer(fout, delimiter=";")
for row in reader:
# delete empty list elements & filter out already seen elements
new_row = [x for x in row if x and x not in found_values]
# update marker set with row contents
found_values.update(row)
if new_row:
# new row isn't empty: write it
writer.writerow(new_row)
the resulting csv file is:
a
b
d;e
f;g
h
i
c

Python "list index out of range" when I am converting a csv to docx

My csv have blank rows in every 17 rows (blank row in row 17,34....)
I tried to convert it to a docx. However, the "list index out of range" error came out. If I delete the blank row in the csv, it will be working fine. Even if I manually add it via excel and the docx function will be work fine.
It seems that the table.row cannot deal with the blank row in my csv.
row_cells[i].text = row[i]
IndexError: list index out of range
Would you please help me? Thank you in advance.
path = os.getcwd()
##Use glob.glob to get all the "txt.csv.csv" files.
allfiles = glob.glob(path + "/*.txt.csv.csv")
df_out_filename = 'Xunjian-Report.csv'
with open(df_out_filename, 'w+', newline='') as fout:
writer = csv.writer(fout)
for filename in allfiles:
with open(filename) as fin:
reader = csv.reader(fin)
writer.writerows(reader)
writer.writerow("[]") ##Insert blank row at the end
###To generate the docx
csvfile = 'Xunjian-Report.csv'
doc = docx.Document()
date = datetime.datetime.now()
with open(csvfile, newline='') as f:
csv_reader = csv.reader(f)
csv_headers = next(csv_reader)
csv_cols = len(csv_headers)
print(csv_cols)
# Adding Subject of the document and the date
doc.add_heading('Report', level=0)
doc.add_heading('Date: %s/%s/%s' % (date.day, date.month, date.year), level=1)
table = doc.add_table(rows=1, cols=csv_cols)
table.autofix=False
hdr_cells = table.rows[0].cells
for i in range(csv_cols):
hdr_cells[i].text = csv_headers[i]
for row in csv_reader:
row_cells = table.add_row().cells
for i in range(csv_cols):
row_cells[i].text = row[i]
If you row list does not have enough elements for row[i], you hit IndexError. If you have blank rows in your input, those will indeed be shorter (or even just empty list).
Depending on your processing needs you could, either only process as many columns as you have in each row, replace:
for i in range(csv_cols):
with:
for i in range(len(row)):
Or even:
for (idx, cell) in enumerate(csv_cols):
row_cells[idx].text = cell
Or skip rows that are empty list (but this is a bit less robust, former would also deal with uneven length of rows, should it be possible to hit that case):
for row in csv_reader:
Could read:
for row in csv_reader:
if not row: continue # skip empty row, go to the next one

Compare two CSV files and look for matches Python

I have two CSV files that are like
CSV1
H1,H2,H3
arm,biopsy,forearm
heart,leg biopsy,biopsy
organs.csv
arm
leg
forearm
heart
skin
I need to compare both the files and get an output list like this [arm,forearm,heart,leg] but the script that I'm currently working on doesn't give me any output (I want leg also in the output, though it is mixed with biopsy in the same cell). Here's the code so far. How can I get all the matched words?
import csv
import io
alist, blist = [], []
with open("csv1.csv", "rb") as fileA:
reader = csv.reader(fileA, delimiter=',')
for row in reader:
alist.append(row)
with open("organs.csv", "rb") as fileB:
reader = csv.reader(fileB, delimiter=',')
for row in reader:
blist.append(row)
first_set = set(map(tuple, alist))
secnd_set = set(map(tuple, blist))
matches = set(first_set).intersection(secnd_set)
print matches
Try this:
import csv
alist, blist = [], []
with open("csv1.csv", "rb") as fileA:
reader = csv.reader(fileA, delimiter=',')
for row in reader:
for row_str in row:
alist += row_str.strip().split()
with open("organs.csv", "rb") as fileB:
reader = csv.reader(fileB, delimiter=',')
for row in reader:
blist += row
first_set = set(alist)
second_set = set(blist)
print first_set.intersection(second_set)
Basically, iterating through the csv file via csv reader returns a row which is a list of the items (strings) like this ['arm', 'biopsy', 'forearm'], so you have to sum lists to insert all of the items.
On the other hand, to remove duplications only one set conversion via the set() function is required, and the intersection method returns another set with the elements.
Change the part reading from csv1.csv to:
with open("csv1.csv", "rb") as fileA:
reader = csv.reader(fileA, delimiter=',')
for row in reader:
# append all words in cell
for word in row:
alist.append(word)
I would treat the CSV files as text files, get a lists of all the words in the first and the seconds, then iterate over the first list to see if any exactly match any in the second list.

Categories

Resources