Writing output to csv file [in correct format] - python

I realize this question has been asked a million times and there is a lot of documentation on it. However, I am unable to output the results in the correct format.
The below code was adopted from: Replacing empty csv column values with a zero
# Save below script as RepEmptyCells.py
# Add #!/usr/bin/python to script
# Make executable by chmod +x prior to running the script on desired .csv file
# Below code will look through your .csv file and replace empty spaces with 0s
# This can be particularly useful for genetic distance matrices
import csv
import sys
reader = csv.reader(open(sys.argv[1], "rb"))
for row in reader:
for i, x in enumerate(row):
if len(x)< 1:
x = row[i] = 0
print(','.join(int(x) for x in row))
Currently, to get the correct output .csv file [i.e. in correct format] one can run the following command in bash:
#After making the script executable
./RepEmptyCells.py input.csv > output.csv # this produces the correct output
I've tried to use csv.writer function to produce the correctly formatted output.csv file (similar to ./RepEmptyCells.py input.csv > output.csv) without much luck.
I'd like to learn how to add this last part to the code to automate the process without having to do it in bash.
What I have tried:
f = open(output2.csv, 'w')
import csv
import sys
reader = csv.reader(open(sys.argv[1], "rb"))
for row in reader:
for i, x in enumerate(row):
if len(x)< 1:
x = row[i] = 0
f.write(','.join(int(x) for x in row))
f.close()
When looking at the raw files from this code and the one before, they look the same.
However, when I open them in either excel or iNumbers the latter (i.e. output2.csv) shows only a single row of the data.
Its important that both output.csv and output2.csv can be opened in excel.

2 options:
Just do a f.write('\n') after your current f.write statement.
Use csv.writer. You mention it but it isn't in your code.
writer = csv.writer(f)
...
writer.writerow([int(x) for x in row]) # Note difference in parameter format

An humble proposition
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys
# Use with statement to properly close files
# Use newline='' which is the right option for Python 3.x
with open(sys.argv[1], 'r', newline='') as fin, open(sys.argv[2], 'w', newline='') as fout:
reader = csv.reader(fin)
# You may need to redefine the dialect for some version of Excel that
# split cells on semicolons (for _Comma_ Separated Values, yes...)
writer = csv.writer(fout, dialect="excel")
for row in reader:
# Write as reading, let the OS do the caching alone
# Process the data as it comes in a generator, checking all cells
# in a row. If cell is empty, the or will return "0"
# Keep strings all the time: if it's not an int it would fail
# Converting to int will force the writer to convert it back to str
# anwway, and Excel doesn't make any difference when loading.
writer.writerow( cell or "0" for cell in row )
Sample in.csv
1,2,3,,4,5,6,
7,,8,,9,,10
Output out.csv
1,2,3,0,4,5,6,0
7,0,8,0,9,0,10

import csv
import sys
with open(sys.argv[1], 'rb') as f:
reader = csv.reader(f)
for row in reader:
print row.replace(' ', '0')
and I don't understand your need for using the shell and redirecting.
a csv writer is just:
with open('output.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(rows)

Related

Hash a column in CSV and output in Base64

Still getting my feet wet with Python, but my goal is to read a CSV file and hash a specific column using SHA256 then output in Base64.
Here is an example of the conversion that needs to take place
This calculator can be found at https://www.liavaag.org/English/SHA-Generator/
Here is the code I have currently
import hashlib
import csv
import base64
with open('File1.csv') as csvfile:
with open('File2.csv', 'w') as newfile:
reader = csv.DictReader(csvfile)
for i, r in enumerate(reader):
# writing csv headers
if i == 0:
newfile.write(','.join(r) + '\n')
# hashing the 'CardNumber' column
r['consumer_id'] = base64.b64encode(hashlib.sha256(r['consumer_id']).encode('utf-8')).digest()
# writing the new row to the file with hashed 'CardNumber'
newfile.write(','.join(r.values()) + '\n')
The error I receive is
r['consumer_id'] = base64.b64encode(hashlib.sha256(r['consumer_id']).encode('utf-8')).digest()
TypeError: Strings must be encoded before hashing
You are on the right track, just need to take it a step at a time before doing it all at once to see how it pieces together:
import hashlib
import base64
text = "1234567890"
encoded = text.encode('utf-8')
encoded = hashlib.sha256(encoded).digest()
encoded = base64.b64encode(encoded)
print(text, str(encoded, encoding="utf-8"))
That should give you:
1234567890 x3Xnt1ft5jDNCqERO9ECZhqziCnKUqZCKreChi8mhkY=
As a "one-liner":
r['consumer_id'] = str(base64.b64encode(hashlib.sha256(r['consumer_id'].encode('utf-8')).digest()), encoding="utf-8")
As you can see, your current use is close, but just has some parentheses opportunities to fix.
If you wanted to use this in a loop, say when iterating over a list of words or the rows of a csv you might do this:
import hashlib
import base64
def encode_text(text):
encoded = text.encode('utf-8')
encoded = hashlib.sha256(encoded).digest()
encoded = base64.b64encode(encoded)
return str(encoded, encoding="utf-8")
words = "1234567890 Hello World".split()
for word in words:
print(word, encode_text(word))
Giving you:
234567890 x3Xnt1ft5jDNCqERO9ECZhqziCnKUqZCKreChi8mhkY=
Hello GF+NsyJx/iX1Yab8k4suJkMG7DBO2lGAB9F2SCY4GWk=
World eK5kfcVUTSJxMKBoKlHjC8d3f7ttio8XAHRjo+zR1SQ=
Assuming the rest of your code works as you like, then:
import hashlib
import csv
import base64
def encode_text(text):
encoded = text.encode('utf-8')
encoded = hashlib.sha256(encoded).digest()
encoded = base64.b64encode(encoded)
return str(encoded, encoding="utf-8")
with open('File1.csv') as csvfile:
with open('File2.csv', 'w') as newfile:
reader = csv.DictReader(csvfile)
for i, r in enumerate(reader):
# writing csv headers
if i == 0:
newfile.write(','.join(r) + '\n')
# hashing the 'CardNumber' column
r['consumer_id'] = encode_text(r['consumer_id'])
# writing the new row to the file with hashed 'CardNumber'
newfile.write(','.join(r.values()) + '\n')
In addition to JonSG's answer about getting the hashing/encoding correct, I'd like to comment on how you're reading and writing the CSV files.
It took me a minute to understand how you're dealing with the header vs the body of the CSV here:
with open("File1.csv") as csvfile:
with open("File2.csv", "w") as newfile:
reader = csv.DictReader(csvfile)
for i, r in enumerate(reader):
print(i, r)
if i == 0:
newfile.write(",".join(r) + "\n") # writing csv headers
newfile.write(",".join(r.values()) + "\n")
At first, I didn't realize that calling join() on a dict would just give back the keys; then you move on to join the values. That's clever!
I think it'd be clearer, and easier, to use the complementary DictWriter.
For clarity, I'm going to separate the reading, processing, and writing:
with open("File1.csv", newline="") as f_in:
reader = csv.DictReader(f_in, skipinitialspace=True)
rows = list(reader)
for row in rows:
row["ID"] = encode_text(row["ID"])
print(row)
with open("File2.csv", "w", newline="") as f_out:
writer = csv.DictWriter(f_out, fieldnames=rows[0])
writer.writeheader()
writer.writerows(rows)
In your case, you'll create your writer and need to give it the fieldnames. I just passed in the first row and the DictWriter() constructor used the keys from that dict to establish the header values. You need to explicitly call the writeheader() method, then you can write your (processed) rows.
I started with this File1.csv:
ID, Phone, Email
1234680000000000, 123-456-7890, johnsmith#test.com
and ended up with this File2.csv:
ID,Phone,Email
tO2Knao73NzQP/rnBR5t8Hsm/XIQVnsrPKQlsXmpkb8=,123-456-7890,johnsmith#test.com
That organization means all your rows are read into memory first. You mentioned having "thousands of entries", but for those 3 fields of data that'll only be a few hundred KB of RAM, maybe a MB of RAM.
If you do want to "stream" the data through, you'll want something like:
reader = csv.DictReader(f_in, skipinitialspace=True)
writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
writer.writeheader()
for row in reader:
row["ID"] = encode_text(row["ID"])
writer.writerow(row)
In this example, I passed reader.fieldnames to the fieldnames= param of the DictWriter constructor.
For dealing with multiple files, I'll just open and close them myself, because the multiple with open(...) as x can look cluttered to me:
f_in = open("File1.csv", newline="")
f_out = open("File2.csv", "w", newline="")
...
f_in.close()
f_out.close()
I don't see any real benefit to the context managers for these simple utility scripts: if the program fails it will automatically close the files.
But the conventional wisdom is to use the with open(...) as x context managers, like you were. You could do nested, like you were, separate them with a comma, or if you have Python 3.10+ use grouping parenthesis for a cleaner look (also in that Q/A).

how can I use csv tools for zip text file?

update-my file.txt.zp is tab delimited and looks kind of like this :
file.txt.zp
I want to split the first col by : _ /
original post:
I have a very large zipped tab delimited file.
I want to open it, scan it one row at a time, split some of the col, and write it to a new file.
I got various errors (every time I fix one another pops)
This is my code:
import csv
import re
import gzip
f = gzip.open('file.txt.gz')
original = f.readlines()
f.close()
original_l = csv.reader(original)
for row in original_l:
file_l = re.split('_|:|/',row)
with open ('newfile.gz', 'w', newline='') as final:
finalfile = csv.writer(final,delimiter = ' ')
finalfile.writerow(file_l)
Thanks!
for this code i got the error:
for row in original_l:
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
so based on what I found here I added this after f.close():
original = original.decode('utf8')
and then got the error:
original = original.decode('utf8')
AttributeError: 'list' object has no attribute 'decode'
Update 2
This code should produce the output that you're after.
import csv
import gzip
import re
with gzip.open('file.txt.gz', mode='rt') as f, \
open('newfile.gz', 'w') as final:
writer = csv.writer(final, delimiter=' ')
reader = csv.reader(f, delimiter='\t')
_ = next(reader) # skip header row
for row in reader:
writer.writerow(re.split(r'_|:|/', row[0]))
Update
Open the gzip file in text mode because str objects are required by the CSV module in Python 3.
f = gzip.open('file.txt.gz', 'rt')
Also specify the delimiter when creating the csv.reader.
original_l = csv.reader(original, delimiter='\t')
This will get you past the first hurdle.
Now you need to explain what the data is, which columns you wish to extract, and what the output should look like.
Original answer follows...
One obvious problem is that the output file is constantly being overwritten by the next row of input. This is because the output file is opened in (over)write mode (`'w`` ) once per row.
It would be better to open the output file once outside of the loop.
Also, the CSV file delimiter is not specified when creating the reader. You said that the file is tab delimited so specify that:
original_l = csv.reader(original, delimiter='\t')
On the other hand, your code attempts to split each row using other delimiters, however, the rows coming from the csv.reader are represented as a list, not a string as the re.split() code would require.
Another problem is that the output file is not zipped as the name suggests.

Read CSV with comma as linebreak

I have a file saved as .csv
"400":0.1,"401":0.2,"402":0.3
Ultimately I want to save the data in a proper format in a csv file for further processing. The problem is that there are no line breaks in the file.
pathname = r"C:\pathtofile\file.csv"
with open(pathname, newline='') as file:
reader = file.read().replace(',', '\n')
print(reader)
with open(r"C:\pathtofile\filenew.csv", 'w') as new_file:
csv_writer = csv.writer(new_file)
csv_writer.writerow(reader)
The print reader output looks exactly how I want (or at least it's a format I can further process).
"400":0.1
"401":0.2
"402":0.3
And now I want to save that to a new csv file. However the output looks like
"""",4,0,0,"""",:,0,.,1,"
","""",4,0,1,"""",:,0,.,2,"
","""",4,0,2,"""",:,0,.,3
I'm sure it would be intelligent to convert the format to
400,0.1
401,0.2
402,0.3
at this stage instead of doing later with another script.
The main problem is that my current code
with open(pathname, newline='') as file:
reader = file.read().replace(',', '\n')
reader = csv.reader(reader,delimiter=':')
x = []
y = []
print(reader)
for row in reader:
x.append( float(row[0]) )
y.append( float(row[1]) )
print(x)
print(y)
works fine for the type of csv files I currently have, but doesn't work for these mentioned above:
y.append( float(row[1]) )
IndexError: list index out of range
So I'm trying to find a way to work with them too. I think I'm missing something obvious as I imagine that it can't be too hard to properly define the linebreak character and delimiter of a file.
with open(pathname, newline=',') as file:
yields
ValueError: illegal newline value: ,
The right way with csv module, without replacing and casting to float:
import csv
with open('file.csv', 'r') as f, open('filenew.csv', 'w', newline='') as out:
reader = csv.reader(f)
writer = csv.writer(out, quotechar=None)
for r in reader:
for i in r:
writer.writerow(i.split(':'))
The resulting filenew.csv contents (according to your "intelligent" condition):
400,0.1
401,0.2
402,0.3
Nuances:
csv.reader and csv.writer objects treat comma , as default delimiter (no need to file.read().replace(',', '\n'))
quotechar=None is specified for csv.writer object to eliminate double quotes around the values being saved
You need to split the values to form a list to represent a row. Presently the code is splitting the string into individual characters to represent the row.
pathname = r"C:\pathtofile\file.csv"
with open(pathname) as old_file:
with open(r"C:\pathtofile\filenew.csv", 'w') as new_file:
csv_writer = csv.writer(new_file, delimiter=',')
text_rows = old_file.read().split(",")
for row in text_rows:
items = row.split(":")
csv_writer.writerow([int(items[0]), items[1])
If you look at the documentation, for write_row, it says:
Write the row parameter to the writer’s file
object, formatted according to the current dialect.
But, you are writing an entire string in your code
csv_writer.writerow(reader)
because reader is a string at this point.
Now, the format you want to use in your CSV file is not clearly mentioned in the question. But as you said, if you can do some preprocessing to create a list of lists and pass each sublist to writerow(), you should be able to produce the required file format.

Reading data from one CSV and displaying parsed data on to another CSV file

I am very new to Python. I am trying to read a csv file and displaying the result to another CSV file. What I want to do is I want to write selective rows in the input csv file on to the output file. Below is the code I wrote so far. This code read every single row from the input file i.e. 1.csv and write it to an output file out.csv. How can I tweak this code say for example I want my output file to contain only those rows which starts with READ in column 8 and rows which are not equal to 0000 in column 10. Both of these conditions need to be met. Like start with READ and not equal to 0000. I want to write all these rows. Also this block of code is for a single csv file. Can anyone please tell me how I can do it for say 10000 csv files ? Also when I execute the code, I can see spaces between lines on my out csv. How can I remove those spaces ?
import csv
f1 = open("1.csv", "r")
reader = csv.reader(f1)
header = reader.next()
f2 = open("out.csv", "w")
writer = csv.writer(f2)
writer.writerow(header)
for row in reader:
writer.writerow(row)
f1.close()
f2.close()
Something like:
import os
import csv
import glob
class CSVReadWriter(object):
def munge(self, filename, suffix):
name,ext = os.path.split(filename)
return '{0}{1}.{2}'.format(name, suffix, ext)
def is_valid(self, row):
return row[8] == 'READ' and row[10] == '0000'
def filter_csv(fin, fout):
reader = csv.reader(fin)
writer = csv.writer(fout)
writer.write(reader.next()) # header
for row in reader:
if self.is_valid(row):
writer.writerow(row)
def read_write(self, iname, suffix):
with open(iname, 'rb') as fin:
oname = self.munge(filename, suffix)
with open(oname, 'wb') as fout:
self.filter_csv(fin, fout)
work_directory = r"C:\Temp\Data"
for filename in glob.glob(work_directory):
csvrw = CSVReadWriter()
csvrw.read_write(filename, '_out')
I've made it a class so that you can over ride the munge and is_valid methods to suit different cases. Being a class also means that you can store state better, for example if you wanted to output lines between certain criteria.
The extra spaces between lines that you mention are to do with \r\n carriage return and line feed line endings. Using open with 'wb' might resolve it.

Python- Read from Multiple Files

I have 125 data files containing two columns and 21 rows of data. Please see the image below:
and I'd like to import them into a single .csv file (as 250 columns and 21 rows).
I am fairly new to python but this what I have been advised, code wise:
import glob
Results = [open(f) for f in glob.glob("*.data")]
fout = open("res.csv", 'w')
for row in range(21):
for f in Results:
fout.write( f.readline().strip() )
fout.write(',')
fout.write('\n')
fout.close()
However, there is slight problem with the code as I only get 125 columns, (i.e. the force and displacement columns are written in one column) Please refer to the image below:
I'd very much appreciate it if anyone could help me with this !
import glob
results = [open(f) for f in glob.glob("*.data")]
sep = ","
# Uncomment if your Excel formats decimal numbers like 3,14 instead of 3.14
# sep = ";"
with open("res.csv", 'w') as fout:
for row in range(21):
iterator = (f.readline().strip().replace("\t", sep) for f in results)
line = sep.join(iterator)
fout.write("{0}\n".format(line))
So to explain what went wrong with your code, your source files use tab as a field separator, but your code uses comma to separate the lines it reads from those files. If your excel uses period as a decimal separator, it uses comma as a default field separator. The whitespace is ignored unless enclosed in quotes, and you see the result.
If you use the text import feature of Excel (Data ribbon => From Text) you can ask it to consider both comma and tab as valid field separators, and then I'm pretty sure your original output would work too.
In contrast, the above code should produce a file that will open correctly when double clicked.
You don't need to write your own program to do this, in python or otherwise. You can use an existing unix command (if you are in that environment):
paste *.data > res.csv
Try this:
import glob, csv
from itertools import cycle, islice, count
def roundrobin(*iterables):
"roundrobin('ABC', 'D', 'EF') --> A D E B F C"
# Recipe credited to George Sakkis
pending = len(iterables)
nexts = cycle(iter(it).next for it in iterables)
while pending:
try:
for next in nexts:
yield next()
except StopIteration:
pending -= 1
nexts = cycle(islice(nexts, pending))
Results = [open(f).readlines() for f in glob.glob("*.data")]
fout = csv.writer(open("res.csv", 'wb'), dialect="excel")
row = []
for line, c in zip(roundrobin(Results), cycle(range(len(Results)))):
splitline = line.split()
for item,currItem in zip(splitline, count(1)):
row[c+currItem] = item
if count == len(Results):
fout.writerow(row)
row = []
del fout
It should loop over each line of your input file and stitch them together as one row, which the csv library will write in the listed dialect.
I suggest to get used to csv module. The reason is that if the data is not that simple (simple strings in headings, and then numbers only) it is difficult to implement everything again. Try the following:
import csv
import glob
import os
datapath = './data'
resultpath = './result'
if not os.path.isdir(resultpath):
os.makedirs(resultpath)
# Initialize the empty rows. It does not check how many rows are
# in the file.
rows = []
# Read data from the files to the above matrix.
for fname in glob.glob(os.path.join(datapath, '*.data')):
with open(fname, 'rb') as f:
reader = csv.reader(f)
for n, row in enumerate(reader):
if len(rows) < n+1:
rows.append([]) # add another row
rows[n].extend(row) # append the elements from the file
# Write the data from memory to the result file.
fname = os.path.join(resultpath, 'result.csv')
with open(fname, 'wb') as f:
writer = csv.writer(f)
for row in rows:
writer.writerow(row)
The with construct for opening a file can be replaced by the couple:
f = open(fname, 'wb')
...
f.close()
The csv.reader and csv.writer are simply wrappers that parse or compose the line of the file. The doc says that they require to open the file in the binary mode.

Categories

Resources