I have a csv of 2 columns A and B. A contains words and B contains the word type. I want to append a count that increases when the cell contains either "." or "?" or "!". However they must only contain one "." or one "?" or one "!". It shouldn't increase when the cell contains "..." or "!!!???!"
I have created the code:
from csv import writer
from csv import reader
sentence_number = 1
with open('input.csv', 'r') as read_obj,\
open('output.csv', 'w', newline='') as write_obj:
csv_reader = reader(read_obj)
csv_writer = writer(write_obj)
for row in csv_reader:
if str(row[0])== "." or str(row[0])=="?" or str(row[0]) == "!":
sentence_number = sentence_number + 1
row.append(sentence_number)
csv_writer.writerow(row)
Edit: The original csv file is
This;Adverb
flower;Noun
is;Verb
pretty;Adjective
.;Punctuation
I;Pronoun
like;Verb
flowers;Noun
!;Punctuation
However it gives rows as
This;Adverb,1
flower;Noun,1
is;Verb,1
pretty;Adjective,1
.;Punctuation,1
I;Pronoun,1
like;Verb,1
flowers;Noun,1
!;Punctuation,1
Expected csv outcome is:
This;Adverb;1
flower;Noun;1
is;Verb;1
pretty;Adjective;1
.;Punctuation;1
I;Pronoun;2
like;Verb;2
flowers;Noun;2
!;Punctuation;2
Basically I want to recognize which sentence a word belongs to, i.e. "This" belongs to sentence 1. How can I achieve this?
Thank you in advance :)
Once you have determined the file you want to read, you read it with this line:
csv_reader = reader(read_obj)
However, reader doesn't return a string, but an object of this type:
<_csv.reader object at 0x000002145A5B71C0>
The problem occurs because you expect this line:
for row in csv_reader:
to iterate over the object storing in "row" a string with the content of each row. But what it actually stores is an array with the string inside, such as:
["This;Adverb"]
To solve this, you simply need to add another [0] when checking for the punctuation signs.
Besides that, I noticed another error that caused the number to be joined with a "," instead of a ";". It was due to row.append(sentence_number), so I replaced it with row += ";" + str(sentence_number).
Here's the code with the changes, i hope it helps:
from csv import writer
from csv import reader
# Number every word of a ';'-separated word list with the sentence it
# belongs to.  The counter goes up after a row whose word is exactly one of
# ".", "?" or "!".
sentence_number = 1
sentence_enders = {".", "?", "!"}
with open('input.csv', 'r', newline='') as read_obj, \
        open('output.csv', 'w', newline='') as write_obj:
    csv_reader = reader(read_obj)
    csv_writer = writer(write_obj)
    for row in csv_reader:
        if not row:
            # Blank line: there is no word to number.
            continue
        # The file is ';'-separated but is read with the default ','
        # delimiter, so each row arrives as a one-element list such as
        # ["This;Adverb"]; take the string out of the list.
        row = row[0]
        word = row.split(";", 1)[0]
        row += ";" + str(sentence_number)
        # Compare the whole word rather than its first character, so that
        # "..." or "!!!???!" does not start a new sentence.
        if word in sentence_enders:
            sentence_number = sentence_number + 1
        csv_writer.writerow([row])
Related
Hey intelligent community,
I need a little bit of help because I think I can't see the wood for the trees.
I have two CSV files that look like this:
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.2
Name,Number
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5
I would like to compare both files and then write any changes like this:
Name,Number,Changes
AAC;2.2.3
AAF;2.4.4
ZCX;3.5.5;change: 3.5.2
So on every line when there is a difference in the number, i want to add this as a new column at the end of the line.
The files are formatted the same but sometimes have a new row, so that's why I think I have to map the keys.
I have come this far, but now I am lost in my thoughts:
Python 3.10.9
import csv
# Reading the first csv and set mapping
with open('test1.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file1_dict = {row[1]: row[0] for row in rows}
# Reading the second csv and set mapping
with open('test2.csv', 'r') as csvfile:
reader= csv.reader(csvfile)
rows = list(reader)
file2_dict = {row[1]: row[0] for row in rows}
# comparing the keys and find the diff
for k in test1_dict:
if test1_dict[k] != test2:dict[k]
test1_dict[k] = test2_dict[k]
for row in rows:
if row[1] == k:
row.append(test2_dict[k])
#write the csv (not sure how to add the word "change:")
with open('test1.csv', 'w', newline ='') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(rows)
If I try this, I don't get a new column; it just "updates" the csv file with the same columns.
For example, this code gives me the diff row, but I am not able to just add it to the existing file and row.
with open('test1.csv') as fin1:
with open('test2.csv') as fin2:
read1 = csv.reader(fin1)
read2 = csv.reader(fin2)
diff_rows = (row1 for row1, row2 in zip(read1, read2) if row1 != row2)
with open('test3.csv', 'w') as fout:
writer = csv.writer(fout)
writer.writerows(diff_rows)
Does someone have any tips or help for my problem? I read many answers on here but can't figure it out.
Thanks alot.
#bigkeefer
Thanks for your answer, i tried to change it for the delimiter ; but it gives an "list index out of range error".
with open('test3.csv', 'r') as file1:
reader = csv.reader(file1, delimiter=';')
rows = list(reader)[1:]
file1_dict = {row[0]: row[1] for row in rows}
with open('test4.csv', 'r') as file2:
reader = csv.reader(file2, delimiter=';')
rows = list(reader)[1:]
file2_dict = {row[0]: row[1] for row in rows}
new_file = ["Name;Number;Changes\n"]
with open('output.csv', 'w') as nf:
for key, value in file1_dict.items():
if value != file2_dict[key]:
new_file.append(f"{key};{file2_dict[key]};change: {value}\n")
else:
new_file.append(f"{key};{value}\n")
nf.writelines(new_file)
You will need to adapt this to overwrite your first file etcetera, as you mentioned above, but I've left it like this for your testing purposes. Hopefully this will help you in some way.
I've assumed you've actually got the headers above in each file. If not, remove the slicing on the list creations, and change the new_file variable assignment to an empty list ([]).
import csv


def _load_numbers(path):
    """Return a {name: number} dict read from the ';'-separated file at path."""
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=";")
        next(reader, None)  # skip the header line
        # Rows with fewer than two fields (blank lines, for instance) are
        # ignored; indexing them raises "list index out of range".
        return {row[0]: row[1] for row in reader if len(row) >= 2}


file1_dict = _load_numbers('f1.csv')
file2_dict = _load_numbers('f2.csv')

new_file = ["Name,Number,Changes\n"]
for key, value in file1_dict.items():
    # A name that is missing from the second file is reported as unchanged
    # instead of raising KeyError.
    new_value = file2_dict.get(key, value)
    if value != new_value:
        new_file.append(f"{key};{new_value};change: {value}\n")
    else:
        new_file.append(f"{key};{value}\n")

with open('new.csv', 'w') as nf:
    nf.writelines(new_file)
source.csv as follows.
AB;CD
a;1;
b;2;
c;3;
target.csv as follows.
DE;FG;HI
1;e;1;
2;a;2;
3;f;3;
I need to do a vlookup using file handling mechanisms in python.
So need to update column 'FG' of 'target.csv' by looking up the column 'AB' of 'source.csv' and update with 'CD' column value of 'source.csv'.
So my desired output is like below.
DE;FG;HI
1;e;1;
2;1;2; #a is replaced with 1
3;f;3;
Without using pandas or any other method how I can approach this.
Below is how I approached this.
with open('D:/target.csv', "w+", encoding="utf-8") as Tgt_csvFile:
with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
for line in Src_csvFile:
fields = line.split(";")
x = fields[0]
for line_1 in Tgt_csvFile:
fields_1 = line_1.split(";")
y = fields[1]
if y == x:
update # not sure how to do this
else:
keep as it is
Appreciate on the support
This will solve your particular problem, but if the number of input/output columns changes you will need to adjust the logic accordingly.
It's also worth noting the trailing ; on each non-header row of your csv file will cause most packages to assume there is an extra column. I don't think you want that.
import csv

# Read in input, creating a dict where key is column 1 and value is column 2
with open('source.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile, delimiter=';')
    next(reader, None)  # skip the header so it is not used as a lookup entry
    # Rows with fewer than two fields (e.g. blank lines) cannot be indexed.
    s = {x[0]: x[1] for x in reader if len(x) >= 2}
print(s)

# If column 2 is a key in dict s update with value from dict
output = []
with open('target.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile, delimiter=';')
    header = next(reader, None)
    if header is not None:
        # The header row is copied through untouched.
        output.append(header)
    for row in reader:
        if len(row) > 1 and row[1] in s:
            row[1] = s[row[1]]
        output.append(row)

# Output to csv
with open('output.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=';')
    writer.writerows(output)
Here is my suggestion:
# Build the lookup table {column 1: column 2} from the source file.
with open('D:/source.csv', "r", encoding="utf-8") as Src_csvFile:
    source_lines = Src_csvFile.readlines()

lookup = {}
for line in source_lines[1:]:  # [1:] skips the header
    # Drop the line ending first so it can never end up inside a value.
    fields = line.rstrip('\r\n').split(';')
    if len(fields) > 1:
        lookup[fields[0]] = fields[1]

with open('D:/target.csv', "r", encoding="utf-8") as Tgt_csvFile:
    target_lines = Tgt_csvFile.readlines()

for index, line in enumerate(target_lines):
    if index == 0:
        continue  # keep the header as it is
    body = line.rstrip('\r\n')
    ending = line[len(body):]  # original line ending, restored below
    fields = body.split(';')
    # Blank or one-column lines have no second field to look up.
    if len(fields) > 1 and fields[1] in lookup:
        fields[1] = lookup[fields[1]]
        target_lines[index] = ';'.join(fields) + ending

# The target file is rewritten only after it has been read completely.
with open('D:/target.csv', "w", encoding="utf-8") as Tgt_csvFile:
    Tgt_csvFile.writelines(target_lines)
Output:
DE;FG;HI
1;e;1;
2;1;2;
3;f;3;
I have been given the task of writing a script to check the MX records of the data in a CSV file. I have started by trying to check it using regex, and before that I am trying to read the CSV file. I would also like to log the progress, so I am printing the row number it is on, but whenever I use the csv_reader object to calculate the row count I am unable to get inside the for loop.
import csv
with open('test_list.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
data = list(csv_reader)
row_count = len(data)
for row in csv_reader:
print({row[2]})
line_count += 1
print('Checking '+ str(line_count) +' of '+ str(row_count))
print('Processed lines :'+str(row_count))
I only get the result as
Processed lines : 40
New at python scripting. Please help
My test_list.csv look like this
fname, lname, email
bhanu2, singh2, bhanudoesnotexist#doesnotexit.com
bhanu2, singh2, bhanudoesnotexist#doesnotexit.com
bhanu2, singh2, bhanudoesnotexist#doesnotexit.com
bhanu2, singh2, bhanudoesnotexist#doesnotexit.com
Total 40 times continued
First of all, the CSV data itself has nothing to do with this problem.
Solution:
import csv
# Read all lines up front; the with-block closes the file afterwards.
with open("test_list.csv", "r", newline="") as csv_file:
    input_file = csv_file.readlines()
print(len(input_file))
# csv.reader accepts any iterable of lines, so the list can be parsed directly.
# list(csv_reader) is deliberately NOT called before the loop: it would
# exhaust the reader and leave the loop with nothing to iterate over.
csv_reader = csv.reader(input_file)
for line_count, row in enumerate(csv_reader, start=1):
    print({row[2]})
    print('Checking ' + str(line_count) + ' of ' + str(len(input_file)))
print('Processed lines :' + str(len(input_file)))
Problem Recognition:
with open('test_list.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
data = list(csv_reader)
row_count = len(data)
In your code, the line data = list(csv_reader) exhausts the reader, so there is nothing left for your for loop to iterate over.
To avoid that, you can read the csv file like this:
# Read all lines up front; the with-block closes the file afterwards.
with open("test_list.csv", "r", newline="") as csv_file:
    input_file = csv_file.readlines()
print(len(input_file))
then use csv.reader()
csv.reader returns an iterable, and when you use list(csv_reader) to read all the rows of the CSV, you have already exhausted the iterable, so when you want to iterate through csv_reader again with a for loop, it has nothing left to iterate.
Since you have a complete list of rows materialized in the variable data, you can simply iterate over it instead.
Change:
for row in csv_reader:
to:
for row in data:
I have a CSV file with contents:
scenario1,5,dosomething
scenario2,10,donothing
scenario3,8,dosomething
scenario4,5,donothing
I would like to take the contents of a variable to firstly see if it is in the first column, if true - I would like to get the row number where it is found and the entire line contents. There will be no duplicate values in column 1 of the csv.
I can partly do the first step which is to find if the variable is in the csv, returning the whole line.
import csv
filename = csv.reader(open('/file.csv', "rb"), delimiter=",")
v = 'scenario1'
for row in configfile:
if 'v' in row[0]:
print row
The results I receive would be:
['scenario1','5','dosomething']
But I need assistance with the second part please. This is to find the row number.
Try this:
import csv
# Print the matching row together with its 1-based row number.
with open("ooo.csv", "r", newline="") as f:
    reader = csv.reader(f)
    # start=1 makes line_num the human-readable row number directly.
    for line_num, content in enumerate(reader, start=1):
        # Empty rows are skipped; content[0] would raise IndexError on them.
        if content and content[0] == "scenario1":
            print(content, line_num)
Or without csv module:
# Same lookup without the csv module: split every line on ",".
with open("ooo.csv") as f:
    for line_num, line in enumerate(f, start=1):
        # Strip the line ending so the last field does not carry a "\n".
        data = line.rstrip("\r\n").split(",")
        if data[0] == "scenario1":
            print(data, line_num)
Output:
['scenario1', '5', 'dosomething'] 1
I have CSV file like below. It is huge file with thousands of records.
input.csv
No;Val;Rec;CSR
0;10;1;1200
0;100;2;1300
0;100;3;1300
0;100;4;1400
0;10;5;1200
0;11;6;1200
I want to create an output.csv file by adding a new column "PSR" after the 1st column "No". This column's value depends on the column "CSR" value. For the 1st row, "PSR" shall be zero. From the next record onwards, it depends on the "CSR" value in the previous row. If the present and previous records' CSR values are the same, then "PSR" shall be zero. If not, PSR shall have the previous CSR value. For example, the value of CSR in the 2nd row is 1300, which is different from the value in the 1st record (1200), so the PSR value for the 2nd row shall be 1200. In the 2nd and 3rd rows, the CSR value is the same, so the PSR value for the 3rd row shall be zero. So the new value PSR depends on the CSR value in the present and previous records.
Output.csv
No;PCR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
My Approach:
Use csv.reader and iterate over the objects in a list. Copy 5th column to 2nd column in list. Shift it one row down.
Then check the values in 2nd and 5th column (PCR and CSR), if both values are same. Replace the PCR value with zero.
I have problem in getting 1st step coded. I am able to duplicate the column but not able to shift it. Also 2nd step is quite straightforward.
Also, I am not sure whether this approach is correct Any pointers/recommendation would be really helpful.
Note: I am not able to install Pandas on CentOS. So help without this module would be better.
My Code:
with open('input.csv', 'r') as input, open('output.csv', 'w') as output:
reader = csv.reader(input, delimiter = ';')
writer = csv.writer(output, delimiter = ';')
mylist = []
header = next(reader)
mylist.append(header)
for rec in reader:
mylist.append(rec)
rec.insert(1, rec[3])
mylist.append(rec)
writer.writerows(mylist)
If you're open to non-python solutions then awk could be a good option:
awk 'NR==1{$2="PSR;"$2}NR>1{$2=($4==a?0";"$2:+a";"$2);a=$4}1' FS=';' OFS=';' file
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Awk is distributed with pretty much all Linux distributions and was designed exactly for this kind of task. It will blaze through your file. Add a redirection to the end > output.csv to save the output in a file.
A simple python approach using the same logic:
#!/usr/bin/env python
# Insert a "PSR" column after the first column of input.csv and print the
# result.  PSR is "0" when the CSR value equals the previous row's CSR,
# otherwise it is the previous row's CSR ("0" for the first data row).
last = "0"
with open('input.csv') as infile:
    header = next(infile, None)
    if header is not None:
        print(header.strip().replace(';', ';PSR;', 1))
    for line in infile:
        field = line.strip().split(';')
        if len(field) < 4:
            # Blank or short line: there is no CSR column to read.
            continue
        csr = field[3]  # read CSR before the insert shifts the columns
        field.insert(1, "0" if csr == last else last)
        last = csr
        print(';'.join(field))
Produces the same output:
$ python parse.py
No;PSR;Val;Rec;CSR
0;0;10;1;1200
0;1200;100;2;1300
0;0;100;3;1300
0;1300;100;4;1400
0;1400;10;5;1200
0;0;11;6;1200
Again just redirect the output to save it:
$ python parse.py > output.csv
Just code it as you explained it. Store the previous CSR and refer to it on the next loop through; just be sure to update it.
import csv
# Copy input.csv to output.csv with a new second column "PCR": "0" for the
# first data row and whenever CSR is unchanged, otherwise the previous CSR.
with open('input.csv', 'r', newline='') as infile, \
        open('output.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter=';')
    writer = csv.writer(outfile, delimiter=';')

    header = next(reader, None)
    if header is not None:
        # Insert into the header row itself, not into the list of rows.
        header.insert(1, 'PCR')
        writer.writerow(header)

    prev_csr = None  # None marks "no previous row yet"
    for rec in reader:
        if len(rec) < 4:
            # Blank or short row: there is no CSR column to read.
            continue
        csr = rec[3]  # read CSR before the insert shifts the columns
        if prev_csr is None or csr == prev_csr:
            rec.insert(1, '0')
        else:
            rec.insert(1, prev_csr)
        writer.writerow(rec)
        prev_csr = csr
import csv

# Copy input.csv to output.csv with a new second column "PCR".  The last
# column of each row (CSR) is compared with the last column of the row
# before it.
with open('input.csv', 'r', newline='') as in_file, \
        open('output.csv', 'w', newline='') as out_file:
    reader = csv.reader(in_file, delimiter=';')
    writer = csv.writer(out_file, delimiter=';')

    # next(..., None) keeps an empty or header-only file from raising
    # StopIteration.
    header = next(reader, None)
    if header is not None:
        header.insert(1, 'PCR')
        writer.writerow(header)

    # The first data row has no predecessor, so its PCR is always '0'.
    prevRow = next(reader, None)
    if prevRow is not None:
        prevRow.insert(1, '0')
        writer.writerow(prevRow)

    for row in reader:
        if not row:
            continue  # skip blank lines
        val = '0' if prevRow[-1] == row[-1] else prevRow[-1]
        row.insert(1, val)
        prevRow = row
        writer.writerow(row)
Or, even easier using the DictReader and DictWriter capabilities of csv:
from csv import DictReader, DictWriter

# Column names are supplied explicitly, so the file's own header line is
# read as an ordinary row and has to be skipped below.
input_header = ['No', 'Val', 'Rec', 'CSR']
output_header = ['No', 'PCR', 'Val', 'Rec', 'CSR']
with open('input.csv', 'r', newline='') as in_file, \
        open('output.csv', 'w', newline='') as out_file:
    in_reader = DictReader(in_file, input_header, delimiter=';')
    out_writer = DictWriter(out_file, output_header, delimiter=';')
    next(in_reader, None)  # skip the header
    out_writer.writeheader()  # place the output header
    last_csr = None  # None marks "no previous row yet"
    for row in in_reader:
        current_csr = row['CSR']
        # 0 for the first row and whenever CSR is unchanged; otherwise the
        # previous row's CSR.
        if last_csr is None or current_csr == last_csr:
            row['PCR'] = 0
        else:
            row['PCR'] = last_csr
        last_csr = current_csr
        out_writer.writerow(row)