How to skip the header row when editing a CSV file in Python

I have Python code to edit a column in a CSV file. It strips the leading zeros from the integers in column 5, then left-pads any value of 3 digits or fewer with zeros so every value is at least 4 digits long.
The problem I'm having is that the code chokes on the header row, which isn't an integer. Does anyone know how I can keep the header but adjust the code so that it doesn't look at the first line of the CSV file?
Here is the code:
import csv
import re
import os
import sys

with open('', 'r') as infile, open('', 'w') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    #firstline = True
    #for row in outfile:
    #    if outfile:
    #        firstline = False
    for row in reader:
        # strip all 0's from the front
        stripped_value = re.sub(r'^0+', '', row[5])
        # pad zeros on the left to smaller numbers to make them 4 digits
        row[5] = '%04d' % int(stripped_value)
        writer.writerow(row)

Add this before the loop:
# Python 2.x
writer.writerow(reader.next())
# Python 3.x
writer.writerow(next(reader))
This consumes the first row from the reader and returns it, so you write the header straight through to the output before the loop ever touches it.
However, in my opinion you should also make the code inside the loop resistant to non-numbers in that column (as in Al.Sal's answer).
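Put together, a sketch of the whole script might look like this (Python 3; the filenames are placeholders, since the question leaves them blank):

import csv
import re

# hypothetical filenames -- substitute your own paths
with open('input.csv', 'r', newline='') as infile, \
     open('output.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    # copy the header row through untouched
    writer.writerow(next(reader))
    for row in reader:
        # int() already ignores leading zeros; %04d pads back to 4 digits
        row[5] = '%04d' % int(row[5])
        writer.writerow(row)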

You could use an exception handler. The try is incredibly cheap; since you'd only have one header, the more expensive except won't get called enough to impact performance. Also, you would have a good way to handle non-number rows later on.
for row in reader:
    # strip all 0's from the front
    stripped_value = re.sub(r'^0+', '', row[5])
    # pad zeros on the left to smaller numbers to make them 4 digits
    try:
        row[5] = '%04d' % int(stripped_value)
    except ValueError:
        pass  # or do something, to avoid passing it silently
    writer.writerow(row)
Your code snippet with correct indentation:
import csv
import re

with open('', 'r') as infile, open('', 'w') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        # strip all 0's from the front
        stripped_value = re.sub(r'^0+', '', row[5])
        # pad zeros on the left to smaller numbers to make them 4 digits
        try:
            row[5] = '%04d' % int(stripped_value)
        except ValueError:
            pass  # or do something, to avoid passing it silently
        writer.writerow(row)
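One more detail worth flagging: in Python 3, the csv module's documentation recommends opening both files with newline='', otherwise you can get a spurious blank line between rows on Windows. A minimal pass-through sketch (hypothetical filenames):

import csv

with open('input.csv', 'r', newline='') as infile, \
     open('output.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for row in csv.reader(infile):
        writer.writerow(row)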

Related

Hash a column in CSV and output in Base64

Still getting my feet wet with Python, but my goal is to read a CSV file and hash a specific column using SHA-256, then output the result in Base64.
Here is an example of the conversion that needs to take place; the calculator I used can be found at https://www.liavaag.org/English/SHA-Generator/
Here is the code I have currently:
Here is the code I have currently
import hashlib
import csv
import base64

with open('File1.csv') as csvfile:
    with open('File2.csv', 'w') as newfile:
        reader = csv.DictReader(csvfile)
        for i, r in enumerate(reader):
            # writing csv headers
            if i == 0:
                newfile.write(','.join(r) + '\n')
            # hashing the 'CardNumber' column
            r['consumer_id'] = base64.b64encode(hashlib.sha256(r['consumer_id']).encode('utf-8')).digest()
            # writing the new row to the file with hashed 'CardNumber'
            newfile.write(','.join(r.values()) + '\n')
The error I receive is
r['consumer_id'] = base64.b64encode(hashlib.sha256(r['consumer_id']).encode('utf-8')).digest()
TypeError: Strings must be encoded before hashing
You are on the right track; you just need to take it one step at a time before doing it all at once, to see how the pieces fit together:
import hashlib
import base64
text = "1234567890"
encoded = text.encode('utf-8')
encoded = hashlib.sha256(encoded).digest()
encoded = base64.b64encode(encoded)
print(text, str(encoded, encoding="utf-8"))
That should give you:
1234567890 x3Xnt1ft5jDNCqERO9ECZhqziCnKUqZCKreChi8mhkY=
As a "one-liner":
r['consumer_id'] = str(base64.b64encode(hashlib.sha256(r['consumer_id'].encode('utf-8')).digest()), encoding="utf-8")
As you can see, your current attempt is close; it just has some misplaced parentheses to fix.
If you wanted to use this in a loop, say when iterating over a list of words or the rows of a csv you might do this:
import hashlib
import base64

def encode_text(text):
    encoded = text.encode('utf-8')
    encoded = hashlib.sha256(encoded).digest()
    encoded = base64.b64encode(encoded)
    return str(encoded, encoding="utf-8")

words = "1234567890 Hello World".split()
for word in words:
    print(word, encode_text(word))
Giving you:
1234567890 x3Xnt1ft5jDNCqERO9ECZhqziCnKUqZCKreChi8mhkY=
Hello GF+NsyJx/iX1Yab8k4suJkMG7DBO2lGAB9F2SCY4GWk=
World eK5kfcVUTSJxMKBoKlHjC8d3f7ttio8XAHRjo+zR1SQ=
Assuming the rest of your code works as you like, then:
import hashlib
import csv
import base64

def encode_text(text):
    encoded = text.encode('utf-8')
    encoded = hashlib.sha256(encoded).digest()
    encoded = base64.b64encode(encoded)
    return str(encoded, encoding="utf-8")

with open('File1.csv') as csvfile:
    with open('File2.csv', 'w') as newfile:
        reader = csv.DictReader(csvfile)
        for i, r in enumerate(reader):
            # writing csv headers
            if i == 0:
                newfile.write(','.join(r) + '\n')
            # hashing the 'CardNumber' column
            r['consumer_id'] = encode_text(r['consumer_id'])
            # writing the new row to the file with hashed 'CardNumber'
            newfile.write(','.join(r.values()) + '\n')
In addition to JonSG's answer about getting the hashing/encoding correct, I'd like to comment on how you're reading and writing the CSV files.
It took me a minute to understand how you're dealing with the header vs the body of the CSV here:
with open("File1.csv") as csvfile:
with open("File2.csv", "w") as newfile:
reader = csv.DictReader(csvfile)
for i, r in enumerate(reader):
print(i, r)
if i == 0:
newfile.write(",".join(r) + "\n") # writing csv headers
newfile.write(",".join(r.values()) + "\n")
At first, I didn't realize that calling join() on a dict would just give back the keys; then you move on to join the values. That's clever!
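For anyone else who hasn't seen that idiom: iterating (or joining) a dict yields its keys, while joining dict.values() yields the values. A two-line demonstration with a made-up row:

row = {"ID": "1234", "Email": "a@b.com"}
print(",".join(row))           # ID,Email  (the keys)
print(",".join(row.values()))  # 1234,a@b.com  (the values)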
I think it'd be clearer, and easier, to use the complementary DictWriter.
For clarity, I'm going to separate the reading, processing, and writing:
with open("File1.csv", newline="") as f_in:
reader = csv.DictReader(f_in, skipinitialspace=True)
rows = list(reader)
for row in rows:
row["ID"] = encode_text(row["ID"])
print(row)
with open("File2.csv", "w", newline="") as f_out:
writer = csv.DictWriter(f_out, fieldnames=rows[0])
writer.writeheader()
writer.writerows(rows)
In your case, you'll create your writer and need to give it the fieldnames. I just passed in the first row and the DictWriter() constructor used the keys from that dict to establish the header values. You need to explicitly call the writeheader() method, then you can write your (processed) rows.
I started with this File1.csv:
ID, Phone, Email
1234680000000000, 123-456-7890, johnsmith@test.com
and ended up with this File2.csv:
ID,Phone,Email
tO2Knao73NzQP/rnBR5t8Hsm/XIQVnsrPKQlsXmpkb8=,123-456-7890,johnsmith@test.com
That organization means all your rows are read into memory first. You mentioned having "thousands of entries", but for those 3 fields of data that'll only be a few hundred KB of RAM, maybe a MB of RAM.
If you do want to "stream" the data through, you'll want something like:
reader = csv.DictReader(f_in, skipinitialspace=True)
writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)

writer.writeheader()
for row in reader:
    row["ID"] = encode_text(row["ID"])
    writer.writerow(row)
In this example, I passed reader.fieldnames to the fieldnames= param of the DictWriter constructor.
For dealing with multiple files, I'll just open and close them myself, because the multiple with open(...) as x can look cluttered to me:
f_in = open("File1.csv", newline="")
f_out = open("File2.csv", "w", newline="")
...
f_in.close()
f_out.close()
I don't see much real benefit to the context managers for simple utility scripts like these: if the program exits, even because of an error, the interpreter will close the files anyway.
But the conventional wisdom is to use the with open(...) as x context managers, like you were. You can nest them, as you did, separate them with a comma, or, if you have Python 3.10+, use grouping parentheses for a cleaner look.
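For reference, a sketch of those three styles side by side (the comma form works on any Python 3; the parenthesized form needs 3.10+):

# nested
with open("File1.csv", newline="") as f_in:
    with open("File2.csv", "w", newline="") as f_out:
        pass

# comma-separated
with open("File1.csv", newline="") as f_in, open("File2.csv", "w", newline="") as f_out:
    pass

# parenthesized, Python 3.10+
with (
    open("File1.csv", newline="") as f_in,
    open("File2.csv", "w", newline="") as f_out,
):
    pass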

How to read csv data, strip spaces/tabs and write to new csv file?

I have a large (1.6 million rows+) .csv file that has some data with leading spaces, tabs, and trailing spaces, and maybe even trailing tabs. I need to read the data in, strip all of that whitespace, and then spit the rows back out into a new .csv file, preferably with the most efficient code possible and using only built-in modules in Python 3.7.
Here is what I have that is currently working, except it only spits out the header over and over and doesn't seem to take care of trailing tabs (not a huge deal, though, on trailing tabs):
def new_stripper(self, input_filename: str, output_filename: str):
    """
    new_stripper(self, filename: str):
    :param self: no idea what this does
    :param filename: name of file to be stripped, must have .csv at end of file
    :return: for now, it doesn't return anything...
    -still doesn't remove trailing tabs?? But it can remove trailing spaces
    -removes leading tabs and spaces
    -still needs to write to new .csv file
    """
    import csv
    csv.register_dialect('strip', skipinitialspace=True)
    reader = csv.DictReader(open(input_filename), dialect='strip')
    reader = (dict((k, v.strip()) for k, v in row.items() if v) for row in reader)
    for row in reader:
        with open(output_filename, 'w', newline='') as out_file:
            writer = csv.writer(out_file, delimiter=',')
            writer.writerow(row)

input_filename = 'testFile.csv'
output_filename = 'output_testFile.csv'
new_stripper(self='', input_filename=input_filename, output_filename=output_filename)
As written above, the code just prints the headers over and over in a single line. I've played around with the arrangement and indenting of the last four lines of the def with some different results, but the closest I've gotten is getting it to print the header row again and again on new lines each time:
...
# headers and headers for days
with open(output_filename, 'w', newline='') as out_file:
    writer = csv.writer(out_file, delimiter=',')
    for row in reader:
        writer.writerow(row)
EDIT1: Here's the result of the incorrect stripping. Some fields have leading spaces that weren't stripped, some have trailing spaces that weren't stripped. It seems like the left-most column was properly stripped of leading spaces, but not trailing spaces; same with the header row.
Update: Here's the solution I was looking for:
def get_data(self, input_filename: str, output_filename: str):
    import csv
    with open(input_filename, 'r', newline='') as in_file, open(output_filename, 'w', newline='') as out_file:
        r = csv.reader(in_file, delimiter=',')
        w = csv.writer(out_file, delimiter=',')
        for line in r:
            trim = (field.strip() for field in line)
            w.writerow(trim)

input_filename = 'testFile.csv'
output_filename = 'output_testFile.csv'
get_data(self='', input_filename=input_filename, output_filename=output_filename)
Don't make life complicated for yourself: "CSV" files are simply plain-text files and can be handled in a generic way:
with open('input.csv', 'r') as inf, open('output.csv', 'w') as of:
    for line in inf:
        trim = (field.strip() for field in line.split(','))
        of.write(','.join(trim) + '\n')
Alternatively, using the csv module:
import csv

with open('input.csv', 'r') as inf, open('output.csv', 'w') as of:
    r = csv.reader(inf, delimiter=',')
    w = csv.writer(of, delimiter=',')
    for line in r:
        trim = (field.strip() for field in line)
        w.writerow(trim)
Unfortunately I cannot comment, but I believe you might want to strip the whitespace from every entry in the csv (not just each line). If that is the case, then, based on Jan's answer, this might do the trick:
with open('file.csv', 'r') as inf, open('output.csv', 'w') as of:
    for line in inf:
        of.write(','.join(list(map(str.strip, line.split(',')))) + '\n')
It splits each line on commas into a list of values, strips the whitespace from every element, then joins them back up and writes the result to the output file.
Your final reader variable contains a generator of dicts, but your writer expects lists.
You can either use csv.DictWriter, or store the processed rows in a list first and then write them to the csv, including the headers via writer.writeheader().
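A sketch of the csv.DictWriter route (assuming every row is fully populated; the filenames are the ones from the question):

import csv

with open('testFile.csv', newline='') as in_file, \
     open('output_testFile.csv', 'w', newline='') as out_file:
    reader = csv.DictReader(in_file, skipinitialspace=True)
    # strip whitespace from the header names as well
    fieldnames = [name.strip() for name in reader.fieldnames]
    writer = csv.DictWriter(out_file, fieldnames=fieldnames)
    writer.writeheader()
    for row in reader:
        # strip both keys and values so they match the cleaned fieldnames
        writer.writerow({k.strip(): v.strip() for k, v in row.items()})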

Remove 1000's separator from column in CSV?

I have a Python script where I'm importing a csv that has commas in values over 1000. These values are strings in the csv. I need to remove the commas from the values and convert the strings to rounded floats inside the csv before it's imported into Python.
I've tried appending all the new values to a list to use with csv.writer, but I haven't been able to figure out how to have the writer replace only the values in the column that have commas. Here's what I have so far:
import csv

RoomReport = r'path_to_csv'
new_values_list = []

f = open(RoomReport, "r")
reader = csv.reader(f)
writer = csv.writer(f)

for row in reader:
    useable_area = row[7]
    if "," in useable_area:
        useable_area_no_comma = useable_area.replace(",", "")
        useable_area_rounded = int(round(float(useable_area_no_comma)))
        new_values_list.append(useable_area_rounded)
f.close()
f.close()
As I mentioned in a comment, this can only be done if the input csv file is formatted in a way that will allow the commas in the numbers to be differentiated from the commas between each one of them.
Here's an example of one way it could be done (by quoting all the values):
"0","1","2","3","4","5","6","7,123.6","8","9"
"0","1","2","3","4","5","6","1,000","8","9"
"0","1","2","3","4","5","6","20,000","8","9"
Here's code that will do what you want. It uses the locale.atof function to simplify cleaning up the number:
import csv
import locale

# Set locale to someplace that uses a comma for the thousands separator.
locale.setlocale(locale.LC_ALL, 'English_US.1252')

RoomReport = r'RoomReport.csv'
cleaned_report = r'RoomReport_cleaned.csv'

new_values_list = []
with open(RoomReport, "r", newline='') as inp:
    for row in csv.reader(inp):
        if "," in row[7]:
            row[7] = int(round(locale.atof(row[7])))
        new_values_list.append(row)

# Create cleaned-up output file.
with open(cleaned_report, "w", newline='') as outp:
    csv.writer(outp, quoting=csv.QUOTE_ALL).writerows(new_values_list)
The RoomReport_cleaned.csv it creates from the example input will contain this:
"0","1","2","3","4","5","6","7124","8","9"
"0","1","2","3","4","5","6","1000","8","9"
"0","1","2","3","4","5","6","20000","8","9"
Note that since the values in the output no longer have commas embedded in them, quoting all the fields is no longer necessary, so you could leave it out by not specifying csv.QUOTE_ALL.
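If you'd rather not depend on a named locale (the 'English_US.1252' name above is Windows-specific), a plain string replacement handles simple comma-grouped numbers just as well. A tiny sketch with a made-up value:

# locale-free: strip the grouping commas before converting
value = "7,123.6"
cleaned = int(round(float(value.replace(",", ""))))
print(cleaned)  # 7124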
maybe something like this?

import csv
import re
from io import StringIO
from sys import stdout

isnum = re.compile('^[0-9, ]+$')
non = re.compile('[, ]')

# build a small in-memory csv to demonstrate with
fd = StringIO()
out = csv.writer(fd)
out.writerow(['foo', '1,000,000', 19])
out.writerow(['bar', '1,234,567', 20])
fd.seek(0)

inp = csv.reader(fd)
out = csv.writer(stdout)
for row in inp:
    for i, x in enumerate(row):
        if isnum.match(x):
            row[i] = float(non.sub('', x))
    out.writerow(row)

CSV Writing to File Difficulties

I am supposed to add a specific label to my CSV file based on conditions. The CSV file has 10 columns, and the third, fourth, and fifth columns are the ones that affect the conditions the most; I add my label in the tenth column. I have code here which ended in an infinite loop:
import csv
import sys

w = open(sys.argv[1], 'w')
r = open(sys.argv[1], 'r')
reader = csv.reader(r)
writer = csv.writer(w)
for row in reader:
    if row[2] or row[3] or row[4] == '0':
        row[9] == 'Label'
    writer.writerow(row)
w.close()
r.close()
I do not know why it would end in an infinite loop.
EDIT: I made a mistake and my original infinite loop program had this line:
w = open(sys.argv[1], 'a')
I changed 'a' to 'w' but this ended up erasing the entire CSV file itself. So now I have a different problem.
You have two problems: in if row[2] or row[3] or row[4] == '0': only row[4] is actually compared against '0', and row[9] == 'Label' is a comparison, not an assignment. You can use any to check whether several variables equal the same value, and you must use = to assign a value. I would also recommend using with open.
Additionally, you can't read and write the same csv file at the same time, so you need to save your changes to a new csv file; you can then remove the original and rename the new one using os.remove and os.rename:
import csv
import sys
import os

with open('some_new_file.csv', 'w') as w, open(sys.argv[1], 'r') as r:
    reader, writer = csv.reader(r), csv.writer(w)
    for row in reader:
        if any(x == '0' for x in (row[2], row[3], row[4])):
            row[9] = 'Label'
        writer.writerow(row)

os.remove('{}'.format(sys.argv[1]))
os.rename('some_new_file.csv', '{}'.format(sys.argv[1]))
You can write to a tempfile.NamedTemporaryFile and just use in to test for the "0". Since you are matching a full string, not a substring, you won't save anything by using any (you create a tuple of three elements either way), so you may as well slice and test for membership. Then you just replace the original file with shutil.move:
import csv
import sys
from shutil import move
from tempfile import NamedTemporaryFile

with NamedTemporaryFile("w", dir=".", delete=False) as w, open(sys.argv[1]) as r:
    reader, writer = csv.reader(r), csv.writer(w)
    writer.writerows(row[:-1] + ['Label'] if "0" in row[2:5] else row
                     for row in reader)

move(w.name, sys.argv[1])
sys.argv[1] is also your file name, and it is already a string, so that is all you need to pass.
I think the problem is in these lines:

w = open(sys.argv[1], 'w')
r = open(sys.argv[1], 'r')

You are opening the same file for reading and writing, so try using a different file name for the output.
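A minimal sketch of that fix (the output file name is hypothetical; the row layout is the ten columns described in the question):

import csv
import sys

with open(sys.argv[1], 'r') as r, open('labeled.csv', 'w', newline='') as w:
    writer = csv.writer(w)
    for row in csv.reader(r):
        # label the row if any of columns 3-5 is '0'
        if '0' in row[2:5]:
            row[9] = 'Label'
        writer.writerow(row)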

Python loops through CSV, but writes header row twice

I have csv files with unwanted first characters in the header row except the first column.
The while loop strips the first character from the headers and writes the new header row to a new file (exiting via a counter). The else statement then writes the rest of the rows to the new file. The problem is that the else statement begins with the header row and writes it a second time. Is there a way to have else begin on the next line without breaking the for iterator? The actual files are 21 columns by 400,000+ rows. The unwanted character is a single space, but I used * in the example below to make it easier to see. Thanks for any help!
file.csv =
a,*b,*c,*d
1,2,3,4
import csv

reader = csv.reader(open('file.csv', 'rb'))
writer = csv.writer(open('file2.csv', 'wb'))
count = 0
for row in reader:
    while (count <= 0):
        row[1] = row[1][1:]
        row[2] = row[2][1:]
        row[3] = row[3][1:]
        writer.writerow([row[0], row[1], row[2], row[3]])
        count = count + 1
    else:
        writer.writerow([row[0], row[1], row[2], row[3]])
If you only want to change the header and copy the remaining lines without change:
with open('file.csv', 'r') as src, open('file2.csv', 'w') as dst:
    dst.write(next(src).replace(" ", ""))  # delete whitespace from the header
    dst.writelines(line for line in src)
If you want to do additional transformations, take a look at the other answers to this question.
If all you want to do is remove spaces, you can use:
string.replace(" ", "")
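One caveat with that approach: replace(" ", "") removes every space on the line, including any that belong inside the data, so it is only safe when you know the fields contain no legitimate spaces. A quick demonstration with a made-up header:

header = "a, *b, first name"
print(header.replace(" ", ""))  # a,*b,firstname -- the embedded space is gone too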
Hmm... it seems like your logic might be a bit backward. It's a bit cleaner, I think, to check whether you're on the first row first. Also, a slightly more idiomatic way to remove leading whitespace is to use the string's lstrip method with no arguments.
Why not use enumerate and check whether your row is the header?
import csv

reader = csv.reader(open('file.csv', 'rb'))
writer = csv.writer(open('file2.csv', 'wb'))
for i, row in enumerate(reader):
    if i == 0:
        writer.writerow([row[0],
                         row[1].lstrip(),
                         row[2].lstrip(),
                         row[3].lstrip()])
    else:
        writer.writerow([row[0], row[1], row[2], row[3]])
If you have 21 columns, you don't want to write out row[0] through row[20] by hand. Plus, you want to close your files after opening them. .next() gets your header, and strip() lets you flexibly remove unwanted leading and trailing characters.
import csv

file = 'file1.csv'
newfile = open('file2.csv', 'wb')
writer = csv.writer(newfile)

with open(file, 'rb') as f:
    reader = csv.reader(f)
    header = reader.next()
    newheader = []
    for c in header:
        newheader.append(c.strip(' '))
    writer.writerow(newheader)
    for r in reader:
        writer.writerow(r)

newfile.close()
