Extract content from a file with mime multipart - python

I have a file that contains a TIFF image and an XML document in a multipart MIME document.
I would like to extract the image from this file.
How can I do that?
I have this code, but it takes a very long time to extract the image when the file is big (for example 30 MB), so it is not usable.
# Parse the multipart MIME file and write its image part to map.out.tiff.
# The input is opened in binary mode as in the original (Python 2) snippet;
# NOTE(review): on Python 3 a binary handle needs
# email.message_from_binary_file instead - confirm the target version.
with open("content_file.txt", "rb") as f:
    msg = email.message_from_file(f)

j = 0          # number of leaf parts (text or image) seen so far
image = False  # set once an image part has been written
for i in msg.walk():
    if i.is_multipart():
        # Containers carry no payload of their own; their children are
        # visited by walk().
        # print "MULTIPART: "
        continue
    if i.get_content_maintype() == 'text':
        j = j + 1
        continue
    if i.get_content_maintype() == 'image':
        image = True
        j = j + 1
        pl = i.get_payload(decode=True)
        # ``with`` closes the output file, which the original left open.
        with open("map.out.tiff", 'wb') as localFile:
            localFile.write(pl)
        continue

# Nothing to do when the message contained no image part.
if not image:
    sys.exit(0)
Thank you so much.

Solved:
def extract_mime_part_matching(stream, mimetype):
    """Decode and return the body of the first part of type *mimetype*.

    The message headers are read from *stream*.  When the message is
    ``multipart/*`` its parts are visited in order; each one is decoded
    into a fresh buffer and the scan stops at the first part whose type
    equals *mimetype*.  A part whose decoding raises ``ValueError`` is
    skipped.

    The contents of the most recently created buffer are returned, so a
    non-multipart message yields an empty string, and a multipart message
    without a matching part yields whatever was decoded for the last part
    visited.
    """
    envelope = mimetools.Message(stream)
    envelope_type = envelope.gettype()
    params = envelope.getplist()
    buf = StringIO.StringIO()
    if envelope_type.startswith("multipart/"):
        parts = multifile.MultiFile(stream)
        parts.push(envelope.getparam("boundary"))
        while parts.next():
            part_headers = mimetools.Message(parts)
            try:
                # Start from an empty buffer for every part.
                buf = StringIO.StringIO()
                mimetools.decode(parts, buf, part_headers.getencoding())
            except ValueError:
                continue
            if part_headers.gettype() == mimetype:
                break
        parts.pop()
    return buf.getvalue()
From:
http://docs.python.org/release/2.6.6/library/multifile.html
Thank you for the support.

It is not quite clear to me, why your code hangs. The indentation looks a bit wrong and opened files are not properly closed. You may also be low on memory.
This version works fine for me:
import email
import mimetypes
# Parse the message stored in email.txt.
with open('email.txt') as source:
    message = email.message_from_file(source)

# Write every image part to its own file.
for i, part in enumerate(message.walk()):
    if part.get_content_maintype() != 'image':
        continue
    filename = part.get_filename()
    if not filename:
        # No name in the headers: build one from the part index and an
        # extension guessed from the content type.
        ext = mimetypes.guess_extension(part.get_content_type())
        filename = 'image-%02d%s' % (i, ext or '.tiff')
    with open(filename, 'wb') as target:
        target.write(part.get_payload(decode=True))
(Partly taken from http://docs.python.org/library/email-examples.html#email-examples)

Related

Extract email attachments and retain modification/creation date?

I'm trying to extract files from emails via IMAP using Python 3.7 (on Windows, fyi) and each of my attempts shows extracted files with Modification & Creation Date = time of extraction (which is incorrect).
As full email applications have the ability to preserve that information, it must be stored somewhere. I also tried working with structs, thinking the information might be stored in binary, but had no luck.
import email
from email.header import decode_header
import imaplib
import os
SERVER = None
OUT_DIR = '/var/out'
IMP_SRV = 'mail.domain.tld'
IMP_USR = 'user#domain.tld'
IMP_PWD = 'hunter2'
def login_mail():
    """Connect to IMP_SRV over SSL and authenticate as IMP_USR.

    The connected client is stored in the module-level ``SERVER`` so the
    other helpers can use it.
    """
    global SERVER
    SERVER = imaplib.IMAP4_SSL(host=IMP_SRV)
    SERVER.login(user=IMP_USR, password=IMP_PWD)
def get_mail(folder='INBOX'):
    """Return every message stored in *folder* as parsed email objects.

    Uses the module-level ``SERVER`` connection, which must already be
    logged in (see ``login_mail``).

    Args:
        folder: Name of the mailbox to read.  It is selected before
            searching; the original code ignored this argument and never
            selected a mailbox at all.

    Returns:
        A list of ``email.message.Message`` objects, one per message UID
        for which the server returned a body.

    Raises:
        imaplib.IMAP4.error: If the server refuses to select *folder*.
    """
    # SEARCH and FETCH are only valid once a mailbox has been selected.
    status, _ = SERVER.select(folder)
    if status != 'OK':
        raise imaplib.IMAP4.error('cannot select folder %r' % (folder,))

    mails = []
    _, data = SERVER.uid('SEARCH', 'ALL')
    uids = data[0].split()
    for uid in uids:
        _, s = SERVER.uid('FETCH', uid, '(RFC822)')
        # A FETCH for a message that vanished in the meantime carries no
        # (envelope, body) tuple; skip it instead of failing on s[0][1].
        if not s or not isinstance(s[0], tuple):
            continue
        mails.append(email.message_from_bytes(s[0][1]))
    return mails
def parse_attachments(mail):
    """Write every ``application/octet-stream`` part of *mail* to OUT_DIR.

    The target file name is obtained from ``get_filename`` and the decoded
    payload of the part is written to it in binary mode.
    """
    octet_parts = (
        candidate
        for candidate in mail.walk()
        if candidate.get_content_type() == 'application/octet-stream'
    )
    for part in octet_parts:
        target = os.path.join(OUT_DIR, get_filename(part))
        with open(target, 'wb') as handle:
            handle.write(part.get_payload(decode=True))
def get_filename(part):
    """Return the decoded attachment file name of *part*.

    The name is read from the part's headers, every RFC 2047 encoded-word
    it contains is decoded (the original decoded only the first chunk and
    discarded the rest), and any directory components are removed.

    Args:
        part: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The bare file name as ``str``, or ``None`` when the part does not
        declare a file name.
    """
    filename = part.get_filename()
    if filename is None:
        # decode_header() cannot handle None; report "no name" instead.
        return None

    pieces = []
    for chunk, charset in decode_header(filename):
        # decode_header yields str for a header without encoded-words and
        # bytes (with an optional charset) otherwise.
        if isinstance(chunk, bytes):
            chunk = chunk.decode(charset or 'ascii')
        pieces.append(chunk)

    # Never let a name taken from a message point outside the output dir.
    return os.path.basename(''.join(pieces))
Can anyone tell me what I'm doing wrong and if it's somehow possible?
After getting said information it could be possible to modify the timestamps utilizing How do I change the file creation date of a Windows file?.
I was able to extract the creation-date and modification-date from the content-disposition header. Setting the file modified date is simple too.
attachment_creation_date = attachment.get_param('creation-date', None, 'content-disposition')
attachment_modification_date = attachment.get_param('modification-date', None, 'content-disposition')
Here's a more complete example that shows how to read these parameters if present:
def process_email_attachments(msg, output_directory):
    """Save every named attachment of *msg* into *output_directory*.

    For each attachment the ``creation-date`` and ``modification-date``
    parameters of its Content-Disposition header are read.  When a
    modification date is present and parseable, the saved file's
    timestamp is set from it via ``set_file_last_modified``.

    Args:
        msg: A message object providing ``iter_attachments()``.
        output_directory: Existing directory the files are written to.

    Attachments without a file name are skipped.  Attachments whose
    decoded payload is ``None`` (e.g. an attached multipart message) are
    reported and skipped without creating an empty file.
    """
    # ``import email`` alone does not guarantee the submodule is loaded.
    import email.utils

    for attachment in msg.iter_attachments():
        try:
            output_filename = attachment.get_filename()
        except AttributeError:
            print("Couldn't get attachment filename. Skipping.")
            continue

        # The name comes from the message itself and is untrusted: keep
        # only the last path component so it cannot escape the directory.
        output_filename = os.path.basename(output_filename or "")
        if not output_filename:
            continue

        # Read for completeness; setting a creation time is platform
        # dependent and is not done here.
        attachment_creation_date = attachment.get_param(
            'creation-date', None, 'content-disposition')
        attachment_modification_date = attachment.get_param(
            'modification-date', None, 'content-disposition')

        # Fetch the payload before opening the file so that a missing
        # payload does not leave an empty file behind.
        payload = attachment.get_payload(decode=True)
        if payload is None:
            print("Couldn't get payload for %s" % output_filename)
            continue

        output_file_full_path = os.path.join(output_directory, output_filename)
        with open(output_file_full_path, "wb") as of:
            of.write(payload)

        if attachment_modification_date is not None:
            try:
                attachment_modification_datetime = email.utils.parsedate_to_datetime(
                    attachment_modification_date)
            except (TypeError, ValueError):
                # Unparseable date: keep the file, leave its timestamp.
                print("Couldn't parse modification date for %s" % output_filename)
            else:
                set_file_last_modified(
                    output_file_full_path, attachment_modification_datetime)
def set_file_last_modified(file_path, dt):
    """Set the access and modification times of *file_path* to *dt*.

    *dt* is converted to seconds since the epoch with ``dt.timestamp()``
    and the same value is used for both timestamps.
    """
    epoch_seconds = dt.timestamp()
    os.utime(file_path, times=(epoch_seconds, epoch_seconds))
The second part of your question is how to set the file created date. This is platform dependent. There is already a separate question with answers demonstrating how to set the creation date on a Windows file: How do I change the file creation date of a Windows file?

How can I get an attached eml file from email message content using Python?

I am using Python 3.7 with the email and imaplib libraries to read emails and extract their content and attachments. All other attachments (Excel, CSV, PDF) are downloaded correctly, but when an email contains an .eml file as an attachment I get an error. Below is the code that reads the email content and attachments, followed by the error shown when an .eml file is received as an attachment.
The error occurs at the time of writing the .eml file.
At the time of writing, part.get_payload(decode=True) comes back empty (None) in the .eml case.
filename = part.get_filename()
if filename is not None:
dot_position = filename.find('.')
file_prefix = filename[0:dot_position]
file_suffix = filename[dot_position:len(filename)]
# print(dot_position)
# print(file_prefix)
# print(file_suffix)
now = datetime.datetime.now()
timestamp = str(now.strftime("%Y%m%d%H%M%S%f"))
newFileName = file_prefix + "_" + timestamp + file_suffix
sv_path = os.path.join(svdir, newFileName)
# allfiles = allfiles.append([{"oldfilename": filename, "newfilename": newFileName}])
mydict = filename + '$$' + newFileName
mydict1 = mydict1 + ',' + mydict
print(mydict1)
if not os.path.isfile(sv_path):
print("oldpath:---->" + sv_path)
# filename = os.rename(filename, filename + '_Rahul')
# sv_path = os.path.join(svdir, filename)
# print("Newpath:---->" + sv_path)
fp = open(sv_path, 'wb')
# print("Rahul")
print(part.get_payload(decode=True))
# try:
# newFileByteArray = bytearray(fp)
# if part.get_payload(decode=True) is not None:
fp.write(part.get_payload(decode=True))
# except (TypeError, IOError):
# pass
fp.close()
Error is
<class 'TypeError'> ReadEmailUsingIMAP.py 129
a bytes-like object is required, not 'NoneType'
Just to explain why this is happening (it hit me too), quoting the v. 3.5 library doc. (v2 says the same):
If the message is a multipart and the decode flag is True, then None is returned.
If your attachment is an .EML, it's almost always going to be multi-part, thus the None.
Jin Thakur's workaround is appropriate if you're only expecting .EML multipart attachments (not sure if there is any other use cases); it should have been accepted as an answer.
Use eml_parser
https://pypi.org/project/eml-parser/
import datetime
import json
import eml_parser
def json_serial(obj):
    """``default`` hook for ``json.dumps`` that serialises datetimes.

    Args:
        obj: The object ``json`` could not serialise on its own.

    Returns:
        The ISO 8601 string for a ``datetime.datetime``.

    Raises:
        TypeError: For any other type.  The original fell through and
            returned ``None``, which made ``json.dumps`` silently emit
            ``null`` for unsupported values.
    """
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    raise TypeError(
        "Object of type %s is not JSON serializable" % type(obj).__name__)
with open('sample.eml', 'rb') as fhdl:
raw_email = fhdl.read()
parsed_eml = eml_parser.eml_parser.decode_email_b(raw_email)
print(json.dumps(parsed_eml, default=json_serial))

To copy the attached file in an email.

I have been able to figure out how to get the name of the file attached to an email, but I am stuck after that. I have tried using os.path.join, which only joins the path of the folder I want to download to with the filename; it does not save the file. Please suggest something. Thanks.
# Connect, open the inbox and search for messages whose subject contains
# "qwerty".  The connection is consistently called ``mail``: the original
# bound it to ``m`` but used ``mail`` (a NameError) and later overwrote
# ``m`` with the parsed message.
mail = imaplib.IMAP4_SSL('outlook.office365.com', 993)
mail.login("UN", "PW")
mail.select("Inbox")
typ, msgs = mail.search(None, '(SUBJECT "qwerty")')
msgs = msgs[0].split()
for emailid in msgs:
    resp, data = mail.fetch(emailid, "(RFC822)")
    email_body = data[0][1]
    message = email.message_from_bytes(email_body)
    # Only multipart messages are examined for attachments.
    if message.get_content_maintype() != 'multipart':
        continue
    for part in message.walk():
        # Containers themselves are not attachments.
        if part.get_content_maintype() == 'multipart':
            continue
        # Parts without a Content-Disposition header are skipped.
        if part.get('Content-Disposition') is None:
            continue
        filename = part.get_filename()
        print(filename)
Following the sample from this link you can set the path when using the open function. (raw string by prefixing the string with r)
import os

# A raw string literal cannot end in a backslash (r'c:\tmp\folder\' is a
# syntax error), so the directory and the file name are combined with
# os.path.join instead.  ``with`` closes the file even if the write fails.
with open(os.path.join(r'c:\tmp\folder', filename), 'wb') as fp:
    fp.write(part.get_payload(decode=True))
print('%s saved!' % filename)

Trouble isolating emails when downloading via Python script

I have a script that fetches emails from my account, downloads the attachments, creates some HTML for an email blast program, and then zips them into a nice little archive. This works well when only one email is present in the inbox; however, the script hangs when multiple emails exist. I feel like this is because the section of the script that zips the files is not looping correctly. What I am trying to accomplish is one zip file for each email: 3 emails in the inbox = 3 separate zip files. I've done my best to reduce my code for maximum readability while still maintaining the core structure. Could anyone point me in the right direction here? Thanks!
Code:
for emailid in items:
resp, data = m.fetch(emailid, "(RFC822)")
email_body = data[0][1]
mail = email.message_from_string(email_body)
for part in mail.walk():
if part.get_content_type() == 'text/plain':
content = part.get_payload()
#do something/define variables from email contents
if mail.get_content_maintype() != 'multipart':
continue
for part in mail.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
counter = 1
if not filename:
filename = 'part-%03d%s' % (counter, 'bin')
counter += 1
att_path = os.path.join(detach_dir, filename)
if not os.path.isfile(att_path) :
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
path = 'C:\directory'
os.chdir(path)
for file in os.listdir('.'):
#download attachments
htmlFile = str(token)+'.html'
htmlCode = ('<html>HTML goes here</html>')
htmlData = open(os.path.join('C:\directory', htmlFile), 'w+')
htmlData.write(htmlCode)
print htmlFile+' Complete'
htmlData.close()
allFiles = [f for f in os.listdir('.')]
for file in allFiles:
archive = zipfile.ZipFile(token+'.zip', mode='a')
archive.write(file)
archive.close()
os.unlink(file)
UPDATE
Here is a link to the complete code: http://ideone.com/WEXv9P
There seems to be a mistake here:
counter = 1
if not filename:
filename = 'part-%03d%s' % (counter, 'bin')
counter += 1
Counter will always be 1 in this loop, you probably want to define it before the second
for part in mail.walk():
EDIT:
Okay, so I think the problem is at the last part of the code
allFiles = [f for f in os.listdir('.')]
for file in allFiles:
archive = zipfile.ZipFile(token+'.zip', mode='a')
archive.write(file)
archive.close()
os.unlink(file)
this will create a zip file for each part of the email
I think what you want to do is indent this out a level and change it to something more like this:
allFiles = [f for f in os.listdir(detach_dir) if not f.endswith(".zip")]
for file in allFiles:
archive = zipfile.ZipFile(token+'.zip', mode='a')
archive.write(file)
archive.close()
os.unlink(file)
That way it won't recursively zip other zip files or remove them

PySide QFTP put only uploading 35-40 bytes

When I use QFTP's put command to upload a file it only uploads around 40 bytes of the specified file. I'm catching the dataProgress signal and I'm getting the progress but the total size of the file is only read to be around 40 bytes. Is there anything wrong with my code, or is it a problem on the FTP server's side?
Here is my upload function:
def upload(self):
filename = QFileDialog.getOpenFileName(self, 'Upload File', '.')
fname = QIODevice(filename[0])
dataname = filename[0]
data = os.path.basename(dataname)
#data = data[data.find("/") + 1:]
print data
print fname
if not self.fileTree.currentItem():
self.qftp.put(fname, data)
elif "." in self.fileTree.currentItem().text(0):
self.qftp.put(fname, self.fileTree.currentItem().parent().text(0) + data)
elif self.fileTree.currentItem().text(0) == "/":
self.qftp.put(fname, data)
else:
return
Alright, figured out what I needed to do. I needed to create a QFile and read all of the bytes from that file and then pass that to the put command.
def upload(self):
    """Ask the user for a file and upload its contents through QFtp.

    The whole file is read into a QByteArray and passed to
    ``self.qftp.put``.  The remote name depends on the current item of
    ``self.fileTree``:

    * no current item, or the item "/": the bare file name is used;
    * an item whose text contains ".": the text of the item's parent is
      prepended to the file name;
    * any other item: nothing is uploaded.
    """
    filename = QFileDialog.getOpenFileName(self, 'Upload File', '.')
    if not filename[0]:
        # Empty path (presumably a cancelled dialog): nothing to upload.
        return

    data = QFile(filename[0])
    # Named flag instead of the magic number 1; also make sure the file
    # could actually be opened before reading from it.
    if not data.open(QFile.ReadOnly):
        return
    qdata = QByteArray(data.readAll())
    data.close()

    file = os.path.basename(filename[0])
    print(data)

    item = self.fileTree.currentItem()
    if not item or item.text(0) == "/":
        # The transfer type is left at put()'s default.
        self.qftp.put(qdata, file)
    elif "." in item.text(0):
        self.qftp.put(qdata, item.parent().text(0) + file)
    else:
        return
I'm guessing that data = os.path.basename(dataname) means data is always a string containing the name of the file. Try changing this to be an open fileobj by using data = open(os.path.basename(dataname), 'rb')
edit
Looking at PySide.QtNetwork.QFtp.put(data, file[, type=Binary]) and PySide.QtNetwork.QFtp.put(dev, file[, type=Binary]) - the order of arguments is data/dev then file - so it's the wrong way around in your code...

Categories

Resources