I have a question about reading/writing a CSV file with the 'utf-8-sig' encoding in Python 2.7. My CSV header is
['\xef\xbb\xbfID;timestamp;CustomerID;Email']
There is a byte prefix ("\xef\xbb\xbfID") that I read from file A.csv, and I want to write the same prefix and header to file B.csv.
My print log shows:
['\xef\xbb\xbfID;timestamp;CustomerID;Email']
But in the actual output file the header looks like
ÔªøID;timestamp
Here is the code:
import csv
import shutil
import tempfile


def remove_gdpr_info_from_csv(file_path, file_name, temp_folder, original_header):
    new_temp_folder = tempfile.mkdtemp()
    new_temp_file = new_temp_folder + "/" + file_name
    # Blanked new file
    with open(new_temp_file, 'wb') as outfile:
        writer = csv.writer(outfile, delimiter=";")
        print original_header
        writer.writerow(original_header)
        # File from SFTP
        with open(file_path, 'r') as infile:
            reader = csv.reader(infile, delimiter=";")
            first_row = next(reader)
            # Locate the columns that must be blanked out
            gdpr_columns = set(first_row.index(name) for name in [
                'Email',
                'Contact Detractor (Q21)',
                'Contact Detractor (Q20)',
                'Contact Detractor (Q43)',
                'Contact Detractor(Q26)',
                'Contact Detractor(Q27)',
                'Contact Detractor(Q44)',
            ])
            # header_list, s3core, SPARKY_S3 and DESTINATION_PATH are
            # defined elsewhere in the module
            indexes = [first_row.index(name) for name in header_list]
            for row in reader:
                output_row = ['' if ind in gdpr_columns else row[ind]
                              for ind in indexes]
                writer.writerow(output_row)
    s3core.upload_files(SPARKY_S3, DESTINATION_PATH, new_temp_file)
    shutil.rmtree(temp_folder)
    shutil.rmtree(new_temp_folder)
'\xef\xbb\xbf' is the UTF-8 encoded version of the Unicode ZERO WIDTH NO-BREAK SPACE, U+FEFF. It is often used as a Byte Order Mark at the beginning of Unicode text files:
when a file starts with the 3 bytes '\xef\xbb\xbf', it is UTF-8 encoded
when it starts with the 2 bytes '\xff\xfe', it is UTF-16 little endian
when it starts with the 2 bytes '\xfe\xff', it is UTF-16 big endian
The 'utf-8-sig' encoding explicitly asks for this BOM to be written at the beginning of the file.
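For illustration, here is the BOM in an interactive Python 2 session; the Ôªø seen in the question's output is simply these three bytes displayed in the Mac Roman encoding:
>>> import codecs
>>> codecs.BOM_UTF8
'\xef\xbb\xbf'
>>> u'ID'.encode('utf_8_sig')
'\xef\xbb\xbfID'
>>> print '\xef\xbb\xbf'.decode('mac_roman')
Ôªø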
To process it automatically at read time of a csv file in Python 2, you can use the codecs module:
import codecs

with open(file_path, 'r') as infile:
    reader = csv.reader(codecs.EncodedFile(infile, 'utf-8', 'utf-8-sig'), delimiter=";")
EncodedFile wraps the original file object, decoding it as utf-8-sig (which skips the BOM) and re-encoding it as plain utf-8 with no BOM.
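A quick way to see the wrapper in action with an in-memory file (a sketch; the sample bytes are made up):
>>> import codecs, io
>>> raw = io.BytesIO('\xef\xbb\xbfID;Email\r\n')
>>> codecs.EncodedFile(raw, 'utf-8', 'utf_8_sig').read()
'ID;Email\r\n'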
You want to use the EncodedFile function from the codecs library, as in Serge Ballesta's answer.
However, on Python 2.7 the spelling utf-8-sig is not a supported alias for this codec; you need to use utf_8_sig. Additionally, the argument order matters: the output data encoding comes first and the file encoding second: codecs.EncodedFile(file, data_encoding, file_encoding=None, errors='strict')
Here's the full result:
import codecs

with open(file_path, 'r') as infile:
    reader = csv.reader(codecs.EncodedFile(infile, 'utf8', 'utf_8_sig'), delimiter=";")
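If file B must start with the same BOM as file A, one option (a sketch for Python 2, reusing new_temp_file from the question) is to write the three BOM bytes explicitly before the header row:
import codecs
import csv

with open(new_temp_file, 'wb') as outfile:
    outfile.write(codecs.BOM_UTF8)  # the three bytes '\xef\xbb\xbf'
    writer = csv.writer(outfile, delimiter=';')
    writer.writerow(['ID', 'timestamp', 'CustomerID', 'Email'])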
f.pk is basically a container of base64 + zip data which I need to import in Python and extract. The zip file is p.plist, so f.pk = path + name + data of p.plist.
I can't find any working encoding for open() or codecs.open() to open it as a str and save the output. The generated output.plist is always different from the original.
Encodings I have already tried include ASCII; UTF-x; Latin_1; ISO-x;
import codecs, os

with open('f.pk', 'r', encoding='Latin_1') as f:
    f_open = f.read()

with codecs.open('f.pk', 'r', encoding='zip') as f:
    f_open = f.read()

f2 = f_open[3:]  # skip dummy part
f3 = f2.split('-DATA-')
f4 = f3[1].split('-COMMENT-')

with open('output.plist', 'w') as f:
    print(f_out, file=f)
original.plist = 5e03964972def5b83880397b7377e6d1aea33e2b
output.plist = 6473aea0ae8bc75a04859effe1ee366de4cdd2d2
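(These look like SHA-1 checksums; a minimal sketch of how they can be reproduced with hashlib, assuming the two file names above:)
import hashlib

def sha1_of(path):
    # hash the raw bytes so no text codec can alter them
    with open(path, 'rb') as fh:
        return hashlib.sha1(fh.read()).hexdigest()

print(sha1_of('original.plist'))
print(sha1_of('output.plist'))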
I have analyzed both files in depth, with no success.
import os
import pickle

# read the container as raw bytes so no text codec can alter them
with open('file.pk', 'rb') as f:
    f_open = f.read()

# do something with the bytes here

with open('p.temp', 'wb') as f:
    pickle.dump(f_open, f)  # temp file

# reload the temp file
with open('p.temp', 'rb') as f:
    data = f.read()

# skip encodings and unwanted bytes
data = data[4:-3]

# save it
with open('p.plist', 'wb') as f:
    f.write(data)

os.remove('p.temp')
I get the error cryptography.fernet.InvalidToken when I try to decrypt the contents of my file at this line: exit1 = key.decrypt(listCipher[0]).
I have searched everywhere but found nothing about this problem. I tried replacing the list with ConfigParser, but it still doesn't work, and I don't think that is the problem. Any help is welcome.
from cryptography.fernet import Fernet
entry1 = "First_sentence"
entry2 = "Second_sentence"
entry3 = "Third_sentence"
##--- Key creation
firstKey = Fernet.generate_key()
file = open('.\\TEST\\key.key', 'wb')
file.write(firstKey)
file.close()
##--- Cipher entries
key = Fernet(firstKey)
chiffrentry1 = key.encrypt(bytes(entry1, "utf-8"))
chiffrentry2 = key.encrypt(bytes(entry2, "utf-8"))
chiffrentry3 = key.encrypt(bytes(entry3, "utf-8"))
listAll = [chiffrentry1, chiffrentry2, chiffrentry3]
##-- Write cipher text in file
with open('.\\TEST\\text_encrypt.txt', 'w') as pt:
    for ligne in listAll:
        pt.write("%s\n" % ligne)
##--- Recover file to decrypt cipher text
listCipher = []
with open('.\\TEST\\text_encrypt.txt', 'rb') as pt:
    for line in pt:
        listCipher.append(line.strip())
exit1 = key.decrypt(listCipher[0])
exit2 = key.decrypt(listCipher[1])
exit3 = key.decrypt(listCipher[2])
print(exit1)
print(exit2)
print(exit3)
The '%s\n' % ligne formatting is modifying your data. For instance, if I do the following:
>>> with open('afile.txt', 'w') as fh:
...     for i in range(2):
...         fh.write('%s\n' % b'hi there')
...
12
12
>>> with open('afile.txt', 'rb') as fh:
...     for line in fh:
...         print(line)
...
b"b'hi there'\n"
b"b'hi there'\n"
The issue here is the type conversions you are doing. Fernet's operations expect bytes, but you are storing the encrypted values as strings. When you convert a bytes object to a string, you don't get back exactly the bytes you started with. To avoid this, don't convert the types:
with open('.\\TEST\\text_encrypt.txt', 'wb') as pt:
    # join supports byte-strings; Fernet tokens are URL-safe base64,
    # so the b'\n' separator can never appear inside a token
    to_write = b'\n'.join(listAll)
    pt.write(to_write)

# Now I can read a bytes object directly
with open('.\\TEST\\text_encrypt.txt', 'rb') as fh:
    # this is a single bytes-string with b'\n' chars inside it
    contents = fh.read()

# byte-strings also support split
ciphers = contents.split(b'\n')
for cipher in ciphers:
    print(key.decrypt(cipher))
I am trying to pickle a Python object into a CSV file. I want to write the pickle of the object as the third column in my file. I use pickle to avoid writing custom serialization for my complex objects.
Code to write the CSV:
with open(self.file_path, 'a') as csv_file:
    wr = csv.writer(csv_file, delimiter='|')
    row = ['klines', symbol]
    row.extend(pickle.dumps(object))
    wr.writerow(row)
Code to read the CSV:
with open(self.simulation_file_name, 'r') as csv_file:
    line = csv_file.readline()
    while line != '':
        line = line.strip('\n')
        columns = line.split('|')
        event_type = columns.pop(0)
        symbol = columns.pop(0)
        pickled = columns.pop(0)
        klines = pickle.loads(pickled)
        line = csv_file.readline()  # advance to the next line
I get the following error:
TypeError: a bytes-like object is required, not 'str'
To write bytes/binary data in a text file such as a CSV, use base64 or a similar method to avoid escaping problems. The code below is simplified, and Python 3 is assumed.
import base64
import csv
import pickle

with open('a.csv', 'a', encoding='utf8') as csv_file:
    wr = csv.writer(csv_file, delimiter='|')
    pickle_bytes = pickle.dumps(obj)            # unsafe to write
    b64_bytes = base64.b64encode(pickle_bytes)  # safe to write but still bytes
    b64_str = b64_bytes.decode('utf8')          # safe and in utf8
    wr.writerow(['col1', 'col2', b64_str])

# the file contains
# col1|col2|gANdcQAu

with open('a.csv', 'r') as csv_file:
    for line in csv_file:
        line = line.strip('\n')
        b64_str = line.split('|')[2]  # take the pickled obj
        obj = pickle.loads(base64.b64decode(b64_str))  # retrieve
P.S. If you are not writing a utf8 file (e.g. an ASCII file), simply swap in the appropriate encoding.
P.S. Writing bytes in a CSV is possible, yet hardly elegant. One alternative is to dump a whole dict with the pickled objects as values and store only the keys in the CSV, as sketched below.
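A minimal sketch of that alternative (the file names objects.pkl and a.csv, and the payload, are hypothetical):
import csv
import pickle

# all binary payloads live in one pickle file, keyed by an id
objects = {'klines-BTCUSDT': {'open': 1, 'close': 2}}  # hypothetical payload
with open('objects.pkl', 'wb') as fh:
    pickle.dump(objects, fh)

# the CSV stores only the key, never raw bytes
with open('a.csv', 'w', encoding='utf8', newline='') as fh:
    csv.writer(fh, delimiter='|').writerow(['klines', 'BTCUSDT', 'klines-BTCUSDT'])

# reading side: look the object up by the key in the third column
with open('objects.pkl', 'rb') as fh:
    objects = pickle.load(fh)
with open('a.csv', newline='') as fh:
    for row in csv.reader(fh, delimiter='|'):
        obj = objects[row[2]]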
https://docs.google.com/file/d/0B1sEqo7wNB1-TlNEeXh6QldLT2c/edit
I am trying to write a program that removes the special characters in the txt file above. I already have a remover like this:
import codecs
import sys

chars = [u'\u001A', u'\u001C', u'\u001D', u'\u001E', u'\u0085']
input_file = sys.argv[1]
output_file = sys.argv[2]
ifile = codecs.open(input_file, encoding='utf-8', mode='rb')
ofile = codecs.open(output_file, encoding='utf-8', mode='wb')
for line in ifile:
    for ch in chars:
        if ch in line:
            line = line.replace(ch, '')
    ofile.write(line)
ifile.close()
ofile.close()
But it can't remove those characters from that txt; instead, it crashes. What should I do?
I would try with:
import codecs
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]
ifile = codecs.open(input_file, encoding='utf-8', mode='rb')
ofile = codecs.open(output_file, encoding='utf-8', mode='wb')
for line in ifile:
    for ch in line:
        try:
            ofile.write(ch.decode('utf-8'))
        except UnicodeDecodeError:
            pass
ifile.close()
ofile.close()
As a little hint, to make the code more Pythonic, have a look at with statements; see the sketch below.
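A minimal sketch of the same filtering loop rewritten with with statements (both files are then closed automatically, even if an error occurs):
import codecs
import sys

chars = [u'\u001A', u'\u001C', u'\u001D', u'\u001E', u'\u0085']
input_file = sys.argv[1]
output_file = sys.argv[2]

with codecs.open(input_file, encoding='utf-8', mode='rb') as ifile, \
        codecs.open(output_file, encoding='utf-8', mode='wb') as ofile:
    for line in ifile:
        for ch in chars:
            line = line.replace(ch, '')
        ofile.write(line)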
Here is my code for reading individual cells of one CSV file, but I want to read multiple CSV files one by one from a .txt file where the CSV file paths are stored.
import csv

ifile = open("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
data = list(csv.reader(ifile, delimiter=';'))
REQ = []
RES = []
n = len(data)
for i in range(n):
    x = data[i][1]
    y = data[i][2]
    REQ.append(x)
    RES.append(y)
for j in range(2, n):
    try:
        if REQ[j] != '' and RES[j] != '':  # ignore blank cells
            print REQ[j], ' ', RES[j]
    except:
        pass
And the CSV file paths are stored in a .txt file like:
C:\Desktop\Test_Specification\RDBI.csv
C:\Desktop\Test_Specification\ECUreset.csv
C:\Desktop\Test_Specification\RDTC.csv
and so on.
You can read stuff stored in files into variables. And you can use variables with strings in them anywhere you can use a literal string. So...
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()  # or was it trim()? I keep mixing them up
        ifile = open(file_name, 'rb')
        # ... the rest of your code goes here
Maybe we can fix this up a little...
import csv

with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()
        csv_file = csv.reader(open(file_name, 'rb'), delimiter=';')
        next(csv_file)  # skip the header row
        for record in csv_file:
            req = record[1]
            res = record[2]
            if len(req + res):
                print req, ' ', res
you just need to add a loop that reads your file containing the list of file paths around your first open statement, for example:
from __future__ import with_statement

with open("myfile_which_contains_file_path.txt") as f:
    for line in f:
        ifile = open(line.strip(), 'rb')  # strip the trailing newline
        # here the rest of your code
You need to use a raw string, since your path contains backslashes:
import csv

file_list = r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
with open(file_list) as f:
    for line in f:
        with open(line.strip(), 'rb') as the_file:
            reader = csv.reader(the_file, delimiter=';')
            for row in reader:
                req, res = row[1:3]
                if req and res:
                    print('{0} {1}'.format(req, res))