I'm getting attachment data from Gmail, and while writing the data to a file it throws an error:

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position 14: invalid continuation byte
service = build('gmail', 'v1', credentials=creds)
results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=10).execute()
messages = results.get('messages', [])
if not messages:
    print("No messages found.")
else:
    print("Message snippets:")
    count = 0
    if os.path.exists('token.pickle'):
        for message in messages:
            msg_dict = {}
            msgbody_dict = {}
            msg = service.users().messages().get(userId='me', id=message['id']).execute()
            ID = message['id']
            payld = msg['payload']
            headr = payld['headers']
            payload_data = payld['body']
            # print('type========================', payld)
            if 'parts' in payld.keys():
                payload_parts = msg['payload']['parts']
                for i in payload_parts:
                    payload_headers = i['headers']
                    for NAME in payload_headers:
                        if NAME['name'] == 'Content-Type':
                            content_type = NAME['value']
                            if 'text/html' in content_type or 'text/plain' in content_type:
                                partstype_dat = i['body']['data']
                                partstype_dat = partstype_dat.replace('-', '+')
                                partstype_dat = partstype_dat.replace('_', '/')
                                payldp = base64.urlsafe_b64decode(partstype_dat)
                                payldp = str(payldp, 'utf-8')
                                msgbody_dict['data'] = payldp
                                payload_pldata.append(msgbody_dict)
                            elif i['filename']:
                                attach_parts = i['body']
                                att_id = attach_parts['attachmentId']
                                att_data = service.users().messages().attachments().get(userId='me', messageId=ID, id=att_id).execute()
                                encydoc_data = att_data['data']
                                doc_data = base64.urlsafe_b64decode(encydoc_data.encode('UTF-8'))
                                dcdata = doc_data.decode('ASCII')
                                final_docdata = str(doc_data, 'UTF-8')
                                with open(i['filename'], "w") as f:
                                    f.write(final_docdata)
http://docs.python.org/howto/unicode.html#the-unicode-type
str = unicode(str, errors='replace')
or
str = unicode(str, errors='ignore')
Note: this will strip out (ignore) the characters in question, returning the string without them.
Personally it's my favorite, since I use it as protection against non-ASCII input, which is not allowed by my application.
Alternatively: use the open function from the codecs module to read in the file:
import codecs
with codecs.open(file_name, "r", encoding='utf-8', errors='ignore') as fdata:
    data = fdata.read()
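For binary attachments like the one in the question, though, the simpler fix is not to decode at all: base64-decode the attachment payload to bytes and write those bytes in binary mode. A minimal sketch, reusing the service, ID, att_id and i['filename'] values from the question:

import base64

att_data = service.users().messages().attachments().get(
    userId='me', messageId=ID, id=att_id).execute()

# The API returns the attachment body base64url-encoded; decode it to raw bytes.
doc_bytes = base64.urlsafe_b64decode(att_data['data'])

# PDFs, images, etc. are not UTF-8 text, so skip str(...) and write in binary mode.
with open(i['filename'], 'wb') as f:
    f.write(doc_bytes)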
The following script outputs files that are unreadable in .txt format. Please advise.
I took inspiration from: https://area.autodesk.com/m/drew.avis/tutorials/writing-and-reading-3ds-max-scene-sidecar-data-in-python
This is to turn a mako shark model into a mechanical robot.
import olefile

# set this to your file
f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

def cleanString(data, isArray=False):
    # remove first 6 bytes + last byte
    data = data[6:]
    if isArray:
        data = data[:-1]
    return data

with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "wb")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(s)
Please advise.
https://www.turbosquid.com/fr/3d-models/max-shortfin-mako-shark-rigged/991102#
I also tried this:
with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "w")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(cleanString(s, isArray=True).decode("utf-8"))
    # stream = ole.openstream('CustomFileStreamDataStorage/MyString')
    # myString = stream.read().decode('utf-16')
    # myString = cleanString(myString)
    # stream = ole.openstream('CustomFileStreamDataStorage/MyGeometry')
    # myGeometry = stream.read().decode('utf-16')
    # myGeometry = cleanString(myGeometry, isArray=True)
    # myGeometry = myGeometry.split('\x00')
    # stream = ole.openstream('CustomFileStreamDataStorage/MyLayers')
    # myLayers = stream.read().decode('utf-16')
    # myLayers = cleanString(myLayers, isArray=True)
    # myLayers = myLayers.split('\x00')
    # print ("My String: {}\nMy Geometry: {}\nMy Layers: {}".format (myString, myGeometry, myLayers))
What is the right encoding to decode from?
Exception has occurred: UnicodeDecodeError
'utf-8' codec can't decode bytes in position 4-5: invalid continuation byte
  File "C:\MRP\ALG_LIN.py", line 59, in <module>
    fout.write(cleanString(s, isArray = True).decode("utf-8"))

Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode characters in position 2-5: character maps to <undefined>
  File "C:\MRP\ALG_LIN.py", line 59, in <module>
    fout.write(cleanString(s, isArray = True).decode("utf-16"))
KR,
Ludo
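A sketch of one way around both exceptions, reusing the cleanString helper from the script above and assuming, as in the linked tutorial, that only the CustomFileStreamDataStorage sidecar strings are text (UTF-16), while every other stream in the .max file is binary data with no text encoding at all:

import olefile

f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

with olefile.OleFileIO(f) as ole:
    for entry in ole.listdir():
        data = ole.openstream(entry).read()
        if entry[0] == 'CustomFileStreamDataStorage':
            # Sidecar strings written by the tutorial's MAXScript are UTF-16 text.
            print(entry, cleanString(data.decode('utf-16')))
        else:
            # Binary streams: copy the bytes verbatim instead of decoding them.
            with open('_'.join(entry), 'wb') as fout:
                fout.write(data)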
I am reading email with imaplib in Python, which is working, but when I read the body part and store it in the database, the code sometimes returns an error while decoding the body. I identify the content type and charset of the body, but I don't understand how to handle every content type and charset: sometimes it is text/plain with utf-8, and in some mails it is ascii, ISO-8859, or windows-1252.
Please help me handle all charsets.
Below is the code I am currently using to read only the email body; if required I will provide all my code.
Expected result: convert/handle every charset of the email body into UTF-8 format, then into HTML to show it on the portal.
if email_message.is_multipart():
    html = None
    multipart = True
    for part in email_message.walk():
        print("%s, %s" % (part.get_content_type(), part.get_content_charset()))
        charset = part.get_content_charset()
        if part.get_content_charset() is None:
            # We cannot know the character set, so return decoded "something"
            text = part.get_payload(decode=True)
            continue
        if part.get_content_type() == 'text/plain' and part.get_content_charset() == 'utf-8':
            # print('text--->1')
            text = str(part.get_payload(decode=True))
            # text = html.decode("utf-8")
            # print(part.get_payload(decode=True))
        if part.get_content_type() == 'text/plain' and part.get_content_charset() != 'utf-8':
            # print('text--->2')
            html = part.get_payload(decode=True)
            # text1 = html.decode("utf-8")
            text1 = html.decode(part.get_content_charset()).encode('utf8')
        if part.get_content_type() == 'text/html' and part.get_content_charset() != 'windows-1252':
            html = part.get_payload(decode=True)
            # text1 = html.decode("utf-8")
            text1 = html.decode(part.get_content_charset()).encode('utf8')
        if part.get_content_type() == 'text/html' and part.get_content_charset() == 'windows-1252':
            html = part.get_payload(decode=True)
            text1 = html.decode("cp1252")
        # if part.get_content_type() == 'text/html' and part.get_content_charset() == 'windows-1252':
        #     html = part.get_payload(decode=True)
        #     text1 = html.decode("latin-1")
        # if text is not None:
        #     print(text.strip())
        #     prin('Rahul')
        # else:
        #     print("text") # print( html.strip())
        #     print(text1.strip())
        # print("text1")
        # print(text1)
    imageCount = 0
    imageKey = ''
    json_data = {}
    filedata = {}
    mydict1 = ''
    value = ''
    params = ''
    filename = ''
    newFileName = ''
    for part in email_message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        if part.get_content_type() == 'message/rfc822':
            part_string = (bytes(str(part), 'utf-8'))
            # part_string = bytes(str(part.get_payload(0)),'utf-8')
            print('EML Part')
            print(part_string)
            filename = part.get_filename()
            # filename = filename.replace('\r', '').replace('\n', '')
            # print(part_string)
            # print(('attachment wala'))
        else:
            part_string = part.get_payload(decode=True)
            # print(part_string)
            # print(('attachment wala'))
            filename = part.get_filename()
            # filename = filename.replace('\r', '').replace('\n', '')
        if filename is not None:
            filepart = []
            try:
                decodefile = email.header.decode_header(filename)
                print('decodefile')
                print(decodefile)
            except HeaderParseError:
                return filename
            #
            for line1, encoding1 in decodefile:
                enc = 'utf-8'
                # print(encoding)
                if encoding1 is not None:  # else encoding
                    print(type(line1))
                    filepart.append((line1.decode(encoding1)))
                    print('line')
                    print(line1)
                    print(filepart)
                    filename = ''.join(filepart)[:1023]
                else:
                    filename = filename
            dot_position = filename.rfind('.')
            file_prefix = filename[0: dot_position]
            file_suffix = filename[dot_position: len(filename)]
            print(filename)
            print(file_prefix)
            print(file_suffix)
            # filename = filename.decode('utf-8')
            # subject = ''
            file_prefix = file_prefix.replace('/', '_')
            now = datetime.datetime.now()
            timestamp = str(now.strftime("%Y%m%d%H%M%S%f"))
            print('timestamp--->')
            print(timestamp)
            newFileName = file_prefix + "_" + timestamp + file_suffix
            newFileName = newFileName.replace('\r', '').replace('\n', '').replace(',', '')
            filename = filename.replace('\r', '').replace('\n', '').replace(',', '')
            sv_path = os.path.join(svdir, newFileName)
            mydict = filename + '$$' + newFileName
            mydict1 = mydict1 + ',' + mydict
            # print(mydict1)
            value, params = cgi.parse_header(part.get('Content-Disposition'))
            print(value)
            if value == 'inline':
                imageCount = imageCount + 1
                print("newFileName-->" + newFileName)
                filedata[imageCount] = newFileName
                print(filedata)
                json_data = (filedata)
                # inlineImages = inlineImages + ',' + newFileName + '{{' + str(imageCount) + '}}'
                # print(json_data)
            # print('TYPE-->')
            # print(type(raw_email))
            # print(type(part.get_payload(decode=1)))
            # if type(part.get_payload(decode=1)) is None:
            #     print('message Type')
            if not os.path.isfile(sv_path):
                # print('rahul1')
                try:
                    fp = open(sv_path, 'wb')
                    fp.write(part_string)
                    fp.close()
                except TypeError:
                    pass
                fp.close()
else:
    print("%s, %s" % (email_message.get_content_type(), email_message.get_content_charset()))
    if email_message.get_content_charset() is None:
        # We cannot know the character set, so return decoded "something"
        text = email_message.get_payload(decode=True)
        continue
    if email_message.get_content_type() == 'text/plain' and email_message.get_content_charset() == 'utf-8':
        print('text--->1')
        text = str(email_message.get_payload(decode=True))
        # text = html.decode("utf-8")
        # print(part.get_payload(decode=True))
    if email_message.get_content_type() == 'text/plain' and email_message.get_content_charset() != 'utf-8':
        print('text--->2')
        html = email_message.get_payload(decode=True)
        # text1 = html.decode("utf-8")
        text1 = html.decode(email_message.get_content_charset()).encode('utf8')
    if email_message.get_content_type() == 'text/html' and email_message.get_content_charset() != 'windows-1252':
        html = email_message.get_payload(decode=True)
        # text1 = html.decode("utf-8")
        text1 = html.decode(email_message.get_content_charset()).encode('utf8')
    if email_message.get_content_type() == 'text/html' and email_message.get_content_charset() == 'windows-1252':
        html = email_message.get_payload(decode=True)
        text1 = html.decode("cp1252")
How to handle all charsets and content types when reading email with imaplib in Python
Simple answer:
Walk all message parts and apply the provided encoding setting. I see that you already do this (though I would rewrite your if-else cascades into something much simpler, as the stdlib implementation can deal with it just fine; your code is currently rather messed up). That will work with standard-conformant mail content. But as always, there are many screwed-up mail clients out there that don't care much about standards conformance (from good clients that break under certain circumstances to weakly scripted spam clients).
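As a rough sketch of what letting the stdlib deal with it can look like (Python 3.6+; assuming raw_email holds the raw RFC 822 bytes fetched over IMAP, as elsewhere in your code):

import email
from email import policy

msg = email.message_from_bytes(raw_email, policy=policy.default)

for part in msg.walk():
    if part.get_content_type() in ('text/plain', 'text/html'):
        try:
            # get_content() undoes the transfer encoding and applies the
            # declared charset for you, returning a str.
            text = part.get_content()
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong charset declaration: fall back to a lossy decode.
            text = part.get_payload(decode=True).decode('utf-8', errors='replace')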
Long answer:
It's impossible to get this right for all messages. Decoding will fail for various reasons. Whenever decoding fails for a part, the question is: what to do about it? You basically have these options:
1. Do nothing special, just go with the raw content.
You could just insert the raw byte content into your DB and give users that content. That's not very user-friendly, and probably not what you want if you have a big user base with business constraints coupled to it. It is still the much easier way to handle broken content. It's also the fallback if 2. still fails.
2. Try to decode content with some heuristics (see the sketch at the end of this answer).
Here the nasty coding starts: whenever decoding of a part fails, there was something wrong with the announced encoding and the actual content. So what can you do here? Not much besides inspecting the content, trying to find hints for the actual encoding (like pattern matching for UTF-8 bit masks), or even brute-force decoding. Clever heuristics might want to try out frequently seen encoding errors first (e.g. test for UTF-8 or 8-bit encodings like latin-1 early). A good rule of thumb does not exist here, as messed-up text encodings range from a merely mis-announced encoding type up to several 8-bit encodings mixed together. While the first can most likely be spotted, the latter can never be resolved even by the most advanced heuristics and should always fall back to the solution in 1.
3. Skip content.
Not recommended, as it is likely to withhold important data from the user. Do this only if you are sure that the content is rubbish.
If you want to go the heuristics approach, I suggest the following:
- start with standard-conformant handling; any message that follows the standard should be handled correctly (in a perfect world you are done here)
- implement 1. above as a general failover
- collect data about typical failures, either from your own users or by searching for typical faults on the internet (other mail clients have already identified those and handle them in a certain way)
- implement the heuristics in 2., going with the 80/20 rule (implement first the things most users would benefit from); everything else gets handled by 1.
- improve the heuristics over time
- in any case, try to avoid 3.
This is a very general answer to your question; if you have a particular issue, maybe you should address it in more detail.
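To make the heuristics in 2. concrete, a small sketch of such a fallback chain (best_effort_decode is only an illustrative helper, not a stdlib function):

def best_effort_decode(payload, declared_charset=None):
    """Bytes -> str: try the announced charset, then UTF-8, then a last resort that never fails."""
    for charset in (declared_charset, 'utf-8'):
        if not charset:
            continue
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            pass
    # Option 1 from above: keep the content rather than dropping it.
    # latin-1 never fails because every byte value maps to some character.
    return payload.decode('latin-1')

# e.g. text = best_effort_decode(part.get_payload(decode=True), part.get_content_charset())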
I am working on a prototype for securing deserialization by adding a signature to the serialized data, but when I try to concatenate the signature with the serialized data it throws an error.
with open(filename, 'w') as file_object:
    #Adding the signature to the data
    file_object.write(signature + serialized)
TypeError: can only concatenate str (not "bytes") to str
And if I try to convert the serialized data into a string it also throws an error
with open(filename, 'w') as file_object:
    #Adding the signature to the data
    serializedStr = serialized.decode('utf-8')
    file_object.write(signature + serializedStr)

    serializedStr = serialized.decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
How can I add the signature to the serialized data?
Full Code
import pickle
import json
import hashlib
import hmac

class User(object):
    def __init__(self, name):
        self.name = name

filename = 'user.file'
KEY = b'secret'
user = User('david')
serialized = pickle.dumps(user)

#calculate the signature
signature = hmac.new(KEY, serialized, hashlib.sha256).hexdigest()

with open(filename, 'w') as file_object:
    #Adding the signature to the data
    print(type(serialized))
    print(type(signature))
    #serializedStr = serialized.decode('utf-8')
    file_object.write(signature + serialized)

with open(filename, 'rb') as file_object:
    raw_data = file_object.read()
    if(len(raw_data) == len(signature)):
        read_signature = raw_data[:len(signature)]
        read_data = raw_data[len(signature):]
        computed_signature = hmac.new(KEY, read_data, hashlib.sha256).hexdigest()
        if hmac.compare_digest(computed_signature, read_signature):
            userDeserialized = pickle.loads(read_data)
            print(userDeserialized.name)
Use .digest() not .hexdigest() to get a byte string that can be prepended to the serialized data byte string. Open the file for binary read/write:
import pickle
import json
import hashlib
import hmac

class User(object):
    def __init__(self, name):
        self.name = name

filename = 'user.file'
KEY = b'secret'
user = User('david')
serialized = pickle.dumps(user)

#calculate the signature
signature = hmac.new(KEY, serialized, hashlib.sha256).digest()  # not .hexdigest()

with open(filename, 'wb') as file_object:  # binary write
    file_object.write(signature + serialized)

with open(filename, 'rb') as file_object:  # binary read
    raw_data = file_object.read()
    if len(raw_data) >= len(signature):  # need >= here
        read_signature = raw_data[:len(signature)]
        read_data = raw_data[len(signature):]
        computed_signature = hmac.new(KEY, read_data, hashlib.sha256).digest()  # not .hexdigest()
        if hmac.compare_digest(computed_signature, read_signature):
            userDeserialized = pickle.loads(read_data)
            print(userDeserialized.name)
Output:
david
try this
with open(filename, 'wb') as file_object:
    #Adding the signature to the data
    print(type(serialized))
    print(type(signature))
    #serializedStr = serialized.decode('utf-8')
    s = bytearray(signature, 'utf-8')  # signature from .hexdigest() is a str
    s.extend(serialized)               # serialized is already bytes
    file_object.write(s)
In Python 2.7:
AttributeError: 'module' object has no attribute 'detect_encoding'
For Python 2 & 3 compatibility, you can use:
from lib2to3.pgen2 import tokenize
with open("out.txt", "rb") as f:
    tokenize.detect_encoding(f.readline)[0]  # 'utf-8'
This function isn't available in Python 2.7; you can see it isn't listed at https://docs.python.org/2.7/library/tokenize.html. That said, I don't see any reason why the Python 3.6 version wouldn't work on Python 2.7, i.e.:
import re
from codecs import lookup, BOM_UTF8

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

coding, lines = detect_encoding(open("out.txt", 'rb').readline)
print(coding, lines)
I receive an email attachment and save it directly to the blobstore:
msg = email.message_from_string(self.request.body)
for part in msg.walk():
    ctype = part.get_content_type()
    if ctype in ['image/jpeg', 'image/png']:
        image_file = part.get_payload(decode=True)
        image_file_name = part.get_filename()
        if image_file_name[:11] == '=?KOI8-R?B?':
            image_file_name = base64.b64decode(image_file_name[11:]).decode('KOI8-R')
        if image_file_name[:10] == '=?UTF-8?B?':
            image_file_name = base64.b64decode(image_file_name[10:])
How do I handle Q in the filename? In another question I asked, it was said that Q indicates quoted-printable encoding. But what should I do to get a valid filename?
Upd. I've replaced the code with
filename_decoded = ''
for item in decode_header(filename):
    if item[1]:
        filename_decoded += item[0].decode(item[1])
    else:
        filename_decoded += item[0]
if filename_decoded != '':
    filename = filename_decoded
It helped me with =?windows-1251?Q?, but it fails with normal names: 'ascii' codec can't encode characters in position 11-14: ordinal not in range(128). How should I fix that?
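One way to make the loop robust, sketched with a hypothetical helper decode_mime_words: email.header.decode_header already understands both ?B? and ?Q? encoded words, but it returns byte strings plus a charset for encoded words and may return already-decoded text for plain runs, so check the type before decoding and join everything as text:

from email.header import decode_header

def decode_mime_words(raw_value, fallback='utf-8'):
    """Decode an RFC 2047 header value (both ?B? and ?Q? words) to text."""
    parts = []
    for data, charset in decode_header(raw_value):
        if isinstance(data, bytes):
            parts.append(data.decode(charset or fallback, errors='replace'))
        else:
            parts.append(data)  # already text, nothing to decode
    return u''.join(parts)

image_file_name = decode_mime_words(part.get_filename())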