How to handle =?windows-1251?Q? in the mail attachments? - python

I receive an email attachment and save it directly to the blobstore:
msg = email.message_from_string(self.request.body)
for part in msg.walk():
    ctype = part.get_content_type()
    if ctype in ['image/jpeg', 'image/png']:
        image_file = part.get_payload(decode=True)
        image_file_name = part.get_filename()
        if image_file_name[:11] == '=?KOI8-R?B?':
            image_file_name = base64.b64decode(image_file_name[11:]).decode('KOI8-R')
        if image_file_name[:10] == '=?UTF-8?B?':
            image_file_name = base64.b64decode(image_file_name[10:])
How do I handle Q in the filename? In another question I asked, I was told that Q indicates quoted-printable. But what should I do to get a valid filename?
Update: I've replaced the code with
filename_decoded = ''
for item in decode_header(filename):
    if item[1]:
        filename_decoded += item[0].decode(item[1])
    else:
        filename_decoded += item[0]
if filename_decoded != '':
    filename = filename_decoded
It helped me with =?windows-1251?Q?, but it fails with plain names: 'ascii' codec can't encode characters in position 11-14: ordinal not in range(128). How should I fix that?
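For what it's worth, here is a minimal sketch of a charset-agnostic filename decoder built on email.header.decode_header. It assumes Python 3 semantics, where decode_header can return a mix of bytes chunks (with a charset) and plain str chunks, which is exactly what the snippet above trips over:

from email.header import decode_header

def decode_mime_filename(raw_name):
    # decode_header yields (value, charset) pairs; unencoded chunks may come
    # back as plain str and must not be decoded a second time.
    decoded = []
    for value, charset in decode_header(raw_name):
        if isinstance(value, bytes):
            decoded.append(value.decode(charset or 'ascii', errors='replace'))
        else:
            decoded.append(value)
    return ''.join(decoded)

# Works for '=?windows-1251?Q?...?=', '=?KOI8-R?B?...?=', and plain ASCII names alike.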

Related

Python Trouble Parsing a .max translated to OLE File => output unreadable in text format

The following script outputs files that are unreadable as .txt. Please advise.
I took inspiration from: https://area.autodesk.com/m/drew.avis/tutorials/writing-and-reading-3ds-max-scene-sidecar-data-in-python
The goal is to turn a mako shark model into a mechanical robot.
import olefile

# set this to your file
f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

def cleanString(data, isArray=False):
    # remove first 6 bytes + last byte
    data = data[6:]
    if isArray:
        data = data[:-1]
    return data

with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "wb")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(s)
Please advise.
https://www.turbosquid.com/fr/3d-models/max-shortfin-mako-shark-rigged/991102#
I also tried this:
with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "w")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(cleanString(s, isArray=True).decode("utf-8"))
    # stream = ole.openstream('CustomFileStreamDataStorage/MyString')
    # myString = stream.read().decode('utf-16')
    # myString = cleanString(myString)
    # stream = ole.openstream('CustomFileStreamDataStorage/MyGeometry')
    # myGeometry = stream.read().decode('utf-16')
    # myGeometry = cleanString(myGeometry, isArray=True)
    # myGeometry = myGeometry.split('\x00')
    # stream = ole.openstream('CustomFileStreamDataStorage/MyLayers')
    # myLayers = stream.read().decode('utf-16')
    # myLayers = cleanString(myLayers, isArray=True)
    # myLayers = myLayers.split('\x00')
    # print("My String: {}\nMy Geometry: {}\nMy Layers: {}".format(myString, myGeometry, myLayers))
What is the right encoding to decode from?
Exception has occurred: UnicodeDecodeError
'utf-8' codec can't decode bytes in position 4-5: invalid continuation byte
  File "C:\MRP\ALG_LIN.py", line 59, in <module>
    fout.write(cleanString(s, isArray = True).decode("utf-8"))

Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode characters in position 2-5: character maps to <undefined>
  File "C:\MRP\ALG_LIN.py", line 59, in <module>
    fout.write(cleanString(s, isArray = True).decode("utf-16"))
KR,
Ludo
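Not a full answer to the question above, but a defensive sketch that may help narrow things down: OLE streams inside a .max file are not guaranteed to be text at all, so one option is to always dump the raw bytes and only write an additional readable .txt copy when a UTF-16 decode (the encoding the linked Autodesk sidecar-data tutorial uses for its strings) actually succeeds. The file path is the one from the question; the output file names are arbitrary choices for the sketch.

import olefile

f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

with olefile.OleFileIO(f) as ole:
    for entry in ole.listdir():
        raw = ole.openstream(entry).read()
        name = '_'.join(entry)
        # Always keep a verbatim binary copy of the stream.
        with open(name + '.bin', 'wb') as fout:
            fout.write(raw)
        # Only streams that really are UTF-16 text get a readable .txt copy.
        try:
            text = raw.decode('utf-16')
        except UnicodeDecodeError:
            continue
        with open(name + '.txt', 'w', encoding='utf-8') as fout:
            fout.write(text)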

How to handle all charset and content type when reading email from IMAP lib in Python

I am reading email with imaplib in Python, which works, but when I read the body part and store it in the database the code sometimes raises an error while decoding the body. I identify the content type and charset of the body, but I don't understand how to handle every content type and charset: sometimes it comes as text/plain with utf-8, and in some mail it is ascii/ISO-8859/windows-1252.
Please help me handle all charsets.
Below is the code I am currently using to read the email body; if required I will provide all my code.
Expected result: convert/handle all charsets of the email body into UTF-8, then into HTML to show it on the portal.
if email_message.is_multipart():
    html = None
    multipart = True
    for part in email_message.walk():
        print("%s, %s" % (part.get_content_type(), part.get_content_charset()))
        charset = part.get_content_charset()
        if part.get_content_charset() is None:
            # We cannot know the character set, so return decoded "something"
            text = part.get_payload(decode=True)
            continue
        if part.get_content_type() == 'text/plain' and part.get_content_charset() == 'utf-8':
            # print('text--->1')
            text = str(part.get_payload(decode=True))
            # text = html.decode("utf-8")
            # print(part.get_payload(decode=True))
        if part.get_content_type() == 'text/plain' and part.get_content_charset() != 'utf-8':
            # print('text--->2')
            html = part.get_payload(decode=True)
            # text1 = html.decode("utf-8")
            text1 = html.decode(part.get_content_charset()).encode('utf8')
        if part.get_content_type() == 'text/html' and part.get_content_charset() != 'windows-1252':
            html = part.get_payload(decode=True)
            # text1 = html.decode("utf-8")
            text1 = html.decode(part.get_content_charset()).encode('utf8')
        if part.get_content_type() == 'text/html' and part.get_content_charset() == 'windows-1252':
            html = part.get_payload(decode=True)
            text1 = html.decode("cp1252")
        # if part.get_content_type() == 'text/html' and part.get_content_charset() == 'windows-1252':
        #     html = part.get_payload(decode=True)
        #     text1 = html.decode("latin-1")
        # if text is not None:
        #     print(text.strip())
        #     prin('Rahul')
        # else:
        #     print("text")  # print( html.strip())
        #     print(text1.strip())
        #     print("text1")
        #     print(text1)

    imageCount = 0
    imageKey = ''
    json_data = {}
    filedata = {}
    mydict1 = ''
    value = ''
    params = ''
    filename = ''
    newFileName = ''
    for part in email_message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        if part.get_content_type() == 'message/rfc822':
            part_string = (bytes(str(part), 'utf-8'))
            # part_string = bytes(str(part.get_payload(0)),'utf-8')
            print('EML Part')
            print(part_string)
            filename = part.get_filename()
            # filename = filename.replace('\r', '').replace('\n', '')
            # print(part_string)
            # print(('attachment wala'))
        else:
            part_string = part.get_payload(decode=True)
            # print(part_string)
            # print(('attachment wala'))
            filename = part.get_filename()
            # filename = filename.replace('\r', '').replace('\n', '')
        if filename is not None:
            filepart = []
            try:
                decodefile = email.header.decode_header(filename)
                print('decodefile')
                print(decodefile)
            except HeaderParseError:
                return filename
            #
            for line1, encoding1 in decodefile:
                enc = 'utf-8'
                # print(encoding)
                if encoding1 is not None:  # else encoding
                    print(type(line1))
                    filepart.append((line1.decode(encoding1)))
                    print('line')
                    print(line1)
                    print(filepart)
                    filename = ''.join(filepart)[:1023]
                else:
                    filename = filename
            dot_position = filename.rfind('.')
            file_prefix = filename[0: dot_position]
            file_suffix = filename[dot_position: len(filename)]
            print(filename)
            print(file_prefix)
            print(file_suffix)
            # filename = filename.decode('utf-8')
            # subject = ''
            file_prefix = file_prefix.replace('/', '_')
            now = datetime.datetime.now()
            timestamp = str(now.strftime("%Y%m%d%H%M%S%f"))
            print('timestamp--->')
            print(timestamp)
            newFileName = file_prefix + "_" + timestamp + file_suffix
            newFileName = newFileName.replace('\r', '').replace('\n', '').replace(',', '')
            filename = filename.replace('\r', '').replace('\n', '').replace(',', '')
            sv_path = os.path.join(svdir, newFileName)
            mydict = filename + '$$' + newFileName
            mydict1 = mydict1 + ',' + mydict
            # print(mydict1)
            value, params = cgi.parse_header(part.get('Content-Disposition'))
            print(value)
            if value == 'inline':
                imageCount = imageCount + 1
                print("newFileName-->" + newFileName)
                filedata[imageCount] = newFileName
                print(filedata)
                json_data = (filedata)
                # inlineImages = inlineImages + ',' + newFileName + '{{' + str(imageCount) + '}}'
                # print(json_data)
            # print('TYPE-->')
            # print(type(raw_email))
            # print(type(part.get_payload(decode=1)))
            # if type(part.get_payload(decode=1)) is None:
            #     print('message Type')
            if not os.path.isfile(sv_path):
                # print('rahul1')
                try:
                    fp = open(sv_path, 'wb')
                    fp.write(part_string)
                    fp.close()
                except TypeError:
                    pass
                fp.close()
else:
    print("%s, %s" % (email_message.get_content_type(), email_message.get_content_charset()))
    if email_message.get_content_charset() is None:
        # We cannot know the character set, so return decoded "something"
        text = email_message.get_payload(decode=True)
        continue
    if email_message.get_content_type() == 'text/plain' and email_message.get_content_charset() == 'utf-8':
        print('text--->1')
        text = str(email_message.get_payload(decode=True))
        # text = html.decode("utf-8")
        # print(part.get_payload(decode=True))
    if email_message.get_content_type() == 'text/plain' and email_message.get_content_charset() != 'utf-8':
        print('text--->2')
        html = email_message.get_payload(decode=True)
        # text1 = html.decode("utf-8")
        text1 = html.decode(email_message.get_content_charset()).encode('utf8')
    if email_message.get_content_type() == 'text/html' and email_message.get_content_charset() != 'windows-1252':
        html = email_message.get_payload(decode=True)
        # text1 = html.decode("utf-8")
        text1 = html.decode(email_message.get_content_charset()).encode('utf8')
    if email_message.get_content_type() == 'text/html' and email_message.get_content_charset() == 'windows-1252':
        html = email_message.get_payload(decode=True)
        text1 = html.decode("cp1252")
Simple answer:
Walk all message parts and apply the encoding each part announces. I see that you already do this (though I would rewrite your if-else cascades into something much simpler, since the stdlib implementation can deal with this just fine; your code is currently rather messy). That will work for standards-conformant mail content. But as always, there are many broken mail clients out there that don't care much about standards conformance (from good clients that break under certain circumstances to sloppily scripted spam senders).
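To illustrate that simpler shape (a minimal sketch, not a drop-in replacement for your portal logic): get_payload(decode=True) already undoes the transfer encoding and get_content_charset() tells you the announced charset, so one decode call with a fallback covers every text part:

def extract_text(email_message):
    # Collect all text/plain and text/html parts as one decoded string.
    chunks = []
    for part in email_message.walk():
        if part.get_content_type() not in ('text/plain', 'text/html'):
            continue
        payload = part.get_payload(decode=True)  # undoes base64 / quoted-printable
        if payload is None:
            continue
        charset = part.get_content_charset() or 'utf-8'  # fall back if none announced
        try:
            chunks.append(payload.decode(charset, errors='replace'))
        except LookupError:  # sender announced a charset Python does not know
            chunks.append(payload.decode('utf-8', errors='replace'))
    return '\n'.join(chunks)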
Long answer:
It's impossible to get this right for all messages. Decoding will fail for various reasons. Whenever decoding fails for a part, the question is: what do you do about it? You basically have these options:
1. Do nothing special, just go with the raw content
You could just insert the raw byte content into your DB and give users that content. That's not very user friendly, and probably not what you want if you have a big user base with business constraints attached, but it is still by far the easier way to handle broken content. It is also the fallback if 2. still fails.
2. Try to decode content with some heuristics
Here the nasty coding starts: whenever decoding of a part fails, the announced encoding did not match the actual content. So what can you do? Not much beyond inspecting the content, trying to find hints about the actual encoding (like pattern matching for UTF-8 bit masks), or even brute-force decoding; a rough sketch of such a fallback chain follows this list. Clever heuristics try commonly seen mistakes first (e.g. test for UTF-8 or 8-bit encodings like latin-1 early on). There is no good rule of thumb here, because messed-up text encodings range from a merely mis-announced encoding type up to several 8-bit encodings mixed together. The first can most likely be spotted; the latter can never be resolved, even by the most advanced heuristics, and should always fall back to the solution in 1.
3. Skip content
Not recommended, as it is likely to withhold important data from the user. Do this only if you're sure that the content is rubbish.
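A rough sketch of the fallback chain from 2., with 1. as the last resort. The candidate list is an assumption; order and populate it from your own failure data, as suggested below:

COMMON_GUESSES = ('utf-8', 'cp1252', 'koi8-r')  # tune this from the failures you actually see

def decode_with_heuristics(payload, declared=None):
    # Try the announced charset first, then the common guesses, then give up.
    candidates = ([declared] if declared else []) + [c for c in COMMON_GUESSES if c != declared]
    for charset in candidates:
        try:
            return payload.decode(charset)
        except (UnicodeDecodeError, LookupError):
            continue
    # Option 1: keep the raw bytes when every guess fails (cp1252 rejects only a
    # handful of byte values, so in practice this line is rarely reached).
    return payload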
If you want to go the heuristics route, I suggest the following:
- Start with standards-conformant handling; any message that follows the standard should be handled correctly (in a perfect world you are done here).
- Implement 1. above as a general failover.
- Collect data about typical failures, either from your own users or by searching the internet for typical faults (other mail clients have already identified them and handle them in certain ways).
- Implement the heuristics in 2.; go with the 80/20 rule (first implement the things most users benefit from), and let 1. handle everything else.
- Improve the heuristics over time.
- In any case, try to avoid 3.
This is a very general answer to your question; if you have a particular issue, maybe you should describe it in more detail.

How to write attachment data?

I'm getting attachment data from Gmail; while writing the data, it throws an error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf0 in position 14: invalid continuation byte
service = build('gmail', 'v1', credentials=creds)
results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=10).execute()
messages = results.get('messages', [])
if not messages:
    print("No messages found.")
else:
    print("Message snippets:")
    count = 0
    if os.path.exists('token.pickle'):
        for message in messages:
            msg_dict = {}
            msgbody_dict = {}
            msg = service.users().messages().get(userId='me', id=message['id']).execute()
            ID = message['id']
            payld = msg['payload']
            headr = payld['headers']
            payload_data = payld['body']
            # print('type========================', payld)
            if 'parts' in payld.keys():
                payload_parts = msg['payload']['parts']
                for i in payload_parts:
                    payload_headers = i['headers']
                    for NAME in payload_headers:
                        if NAME['name'] == 'Content-Type':
                            content_type = NAME['value']
                    if 'text/html' in content_type or 'text/plain' in content_type:
                        partstype_dat = i['body']['data']
                        partstype_dat = partstype_dat.replace('-', '+')
                        partstype_dat = partstype_dat.replace('_', '/')
                        payldp = base64.urlsafe_b64decode(partstype_dat)
                        payldp = str(payldp, 'utf-8')
                        msgbody_dict['data'] = payldp
                        payload_pldata.append(msgbody_dict)
                    elif i['filename']:
                        attach_parts = i['body']
                        att_id = attach_parts['attachmentId']
                        att_data = service.users().messages().attachments().get(userId='me', messageId=ID, id=att_id).execute()
                        encydoc_data = att_data['data']
                        doc_data = base64.urlsafe_b64decode(encydoc_data.encode('UTF-8'))
                        dcdata = doc_data.decode('ASCII')
                        final_docdata = str(doc_data, 'UTF-8')
                        with open(i['filename'], "w") as f:
                            f.write(final_docdata)
http://docs.python.org/howto/unicode.html#the-unicode-type
str = unicode(str, errors='replace')
or
str = unicode(str, errors='ignore')
Note: this will strip out (ignore) the offending characters, returning the string without them.
Personally it's my favorite, since I use it as protection against non-ASCII input, which my application does not allow.
Alternatively, use the open method from the codecs module to read in the file:
import codecs

with codecs.open(file_name, "r", encoding='utf-8', errors='ignore') as fdata:
    data = fdata.read()  # offending bytes are silently dropped
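One more angle, offered as a guess rather than the definitive fix: attachments are very often binary (images, PDFs), in which case there is nothing to decode as text at all. A minimal sketch of the attachment branch, assuming base64 is already imported and reusing the service object, message id ID, and part dict i from the question:

att_id = i['body']['attachmentId']
att = service.users().messages().attachments().get(
    userId='me', messageId=ID, id=att_id).execute()
file_bytes = base64.urlsafe_b64decode(att['data'])  # bytes, not text
with open(i['filename'], 'wb') as f:                # binary mode: no codec involved
    f.write(file_bytes)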

decoding a .txt - 'utf-8' codec can't decode byte 0xf3

I am taking data (domains) from an Excel file into a text file and then checking the availability of the domains. The problem pops up when I try to use that text file after taking the data from the Excel file.
This is the data in the Excel file:
arete.cl
cbsanbernardo.cl
ludala.cl
puntotactico.cl
sunriseskateboard.cl
ellegrand.cl
turismosantodomingo.cl
delotroladof.cl
produccionesmandala.cl
So, basically, if I type the domains manually into the text file, the script works fine. But if I take the domains from the Excel file into the text file and then run the script, this error pops up:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 194: invalid continuation byte
The same happens if I try to check the domains directly from the Excel file.
So should I decode the .txt or the .xlsx? How can I do it?
#!/usr/bin/python
import io    # used for io.open below
import gzip  # used for gzip.open below
import pythonwhois
import openpyxl
from openpyxl import load_workbook
import os

pathx = 'path'
filex = 'file.xlsx'

print('**Availability of domains**')
os.chdir(pathx)
workbook = openpyxl.load_workbook(filex, data_only=True)
sheet = workbook.get_sheet_by_name('Dic')
domainsz = io.open(pathx + '\\domains.txt', 'a')
for i in range(1, 10):
    domainx = sheet["A" + str(i * 2)].value
    if domainx is not None:
        domainsz.write(domainx + '\n')
        print(domainx)
domainsz.close()

with gzip.open('domains.txt' + ".gz", "wb") as outfile:
    outfile.write(bytes(plaintext, 'UTF-8'))

domains = []
available = []
unavailable = []

def getDomains():
    with io.open('domains.txt', 'r', encoding='latin-1') as f:
        for domainName in f.read().splitlines():
            domains.append(domainName)

def run():
    for dom in domains:
        if dom is not None and dom != '':
            details = pythonwhois.get_whois(dom)
            if details['contacts']['registrant'] is not None:
                unavailable.append(dom)
            else:
                available.append(dom)

def printAvailability():
    print("-----------------------------")
    print("Unavailable Domains: ")
    print("-----------------------------")
    for un in unavailable:
        print(un)
    print("\n")
    print("-----------------------------")
    print("Available Domains: ")
    print("-----------------------------")
    for av in available:
        print(av)

if __name__ == "__main__":
    getDomains()
    run()
    printAvailability()
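If the failure really comes from the text file (it could also come from somewhere else, e.g. the whois response), one way to rule out an encoding mismatch is to pin the same encoding on both the write and the read. A minimal sketch, assuming Python 3; the two domains are stand-ins for the cell values read from the workbook:

import io

# Write the domain list with an explicit encoding...
with io.open('domains.txt', 'w', encoding='utf-8') as f:
    for domainx in ['arete.cl', 'ludala.cl']:
        f.write(domainx + '\n')

# ...and read it back with the exact same encoding, so bytes and codec always match.
with io.open('domains.txt', 'r', encoding='utf-8') as f:
    domains = [line.strip() for line in f if line.strip()]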

Python: Write encrypted data to file

I've made a chat app for school, and some people just write into the database. So my new project is to encrypt the resources, and I've made an encrypt function.
It works fine, but when I try to write the encrypted data to a file, I get an error message:
File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x94' in position 0:
character maps to <undefined>
How do I fix that problem?
Complete code:
def encrypts(data, step):
    newdata = ""
    i = 0
    while (len(data) > len(step)):
        step += step[i]
        i += 1
    if (len(data) < len(step)):
        step = step[:len(data)]
    for i in range(len(data)):
        a = ord(data[i])
        b = ord(step[i])
        newdata += chr(a+b)
    return newdata

file = open("C:/Users/David/Desktop/file.msg", "wb")
file.write(encrypts("12345", "code"))
Now, I finally solved my problem. The characters I created didn't exist in ASCII, so I changed my function:
def encrypts(data, step):
    newdata = ""
    i = 0
    while (len(data) > len(step)):
        step += step[i]
        i += 1
    if (len(data) < len(step)):
        step = step[:len(data)]
    for i in range(len(data)):
        a = ord(data[i])
        b = ord(step[i])
        newdata += chr(a+b-100)  # The "-100" fixed the problem.
    return newdata
When opening a file for writing or saving, try adding the 'b' character to the open mode. So instead of:
open("encryptedFile.txt", 'w')
use
open("encryptedFile.txt", 'wb')
This will open the file in binary mode, which is necessary when you modify characters the way you do, because you sometimes set them to values outside of the ASCII range.
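To make that concrete (a minimal sketch, assuming Python 3, where a file opened with 'wb' only accepts bytes): encode the encrypted string explicitly, so no locale-dependent codec such as cp1252 gets involved, and decode with the same codec when reading it back.

ciphertext = encrypts("12345", "code")

with open("encryptedFile.txt", "wb") as f:
    f.write(ciphertext.encode("utf-8"))   # explicit encoding, never the locale default

with open("encryptedFile.txt", "rb") as f:
    restored = f.read().decode("utf-8")   # same codec on the way back in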
Your problem is in the encoding of the file. Try this:
import codecs

inputFile = codecs.open('input.txt', 'rb', 'cp1251')
outFile = codecs.open('output.txt', 'wb', 'cp1251')
