Python (3.5) - Constructing String to Save File - String Contains Escape Characters - python

I am using Python (3.5) to loop through some .msg files, extract data from them, which contains a url to download a file and a folder that the file should go into. I have successfully extracted the data from the .msg file but now when I try to piece together the absolute file path for the downloaded file, the format ends up weird, with backslashes and \t\r.
Here's a shortened view of the code:
# For every .msg file: open it through Outlook, pull four fields out of the
# body with a regex, then download the linked .mp4 to
# <script_dir>/<account>/<eventID>_<title>.mp4.
for file in files:
file_abs_path = script_dir + '/' + file
print(file_abs_path)
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
msg = outlook.OpenSharedItem(file_abs_path)
# Each match captures the text that follows the first ':' or '<' on a line.
# The leading \s* eats whitespace before the value, but the capture itself
# only stops at '>' or '\n', so a trailing '\t' or '\r' stays in the value.
pattern = re.compile(r'(?:^|(?<=\n))[^:<\n]*[:<]\s*([^>\n]*)', flags=re.DOTALL)
results = pattern.findall(msg.Body)
# results[0] -> eventID
# Keep only the leading run of characters up to the first '/' or whitespace.
regexID = re.compile(r'^[^\/\s]*', flags=re.DOTALL)
filtered = regexID.findall(results[0])
eventID = filtered[0]
# print(eventID)
# results[1] -> title
# Removes string.punctuation and turns spaces into underscores; other
# whitespace characters ('\t', '\r') are not touched by either step.
title = results[1].translate(str.maketrans('','',string.punctuation)).replace(' ', '_') #results[1]
# Drop non-ASCII characters: NFKD-decompose, encode to ASCII ignoring what
# does not fit, then decode back to str.
title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
title = title.decode('UTF-8')
#results[1]
print(title)
# results[2] -> account
# Keep only the leading run of characters up to the first '(' or whitespace.
regexAcc = re.compile(r'^[^\(\s]*', flags=re.DOTALL)
filtered = regexAcc.findall(results[2])
account = filtered[0]
account = unicodedata.normalize('NFKD', account).encode('ascii', 'ignore')
account = account.decode('UTF-8')
# print(account)
# results[3] -> downloadURL
downloadURL = results[3]
# print(downloadURL)
# Relative target: <account>/<eventID>_<title>.mp4, reduced to ASCII again.
rel_path = account + '/' + eventID + '_' + title + '.mp4'
rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
rel_path = rel_path.decode('UTF-8')
filename_abs_path = os.path.join(script_dir, rel_path)
# Download .mp4 from a url and save it locally under `file_name`:
# NOTE(review): the <account> sub-directory is not created anywhere in this
# snippet; open() presumably fails if it does not already exist - confirm.
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
# print item [ID - Title] when done
print('[Complete] ' + eventID + ' - ' + title)
del outlook, msg
So as you can see I have some regex that extracts 4 pieces of data from the .msg. Then I have to go through each one and do some further fine tuning, but then have what I need:
eventID
# 123456
title
# Name_of_item_with_underscord_no_punctuation
account
# nameofaccount
downloadURL
# http://download.com/basicurlandfile.mp4
So this is the data I get, and I've print() it off and it doesn't have any weird characters. But when I try to construct the path for the .mp4 (filename and directory):
downloadURL = results[3]
# print(downloadURL)
rel_path = account + '/' + eventID + '_' + title + '.mp4'
rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
rel_path = rel_path.decode('UTF-8')
filename_abs_path = os.path.join(script_dir, rel_path)
# Download .mp4 from a url and save it locally under `file_name`:
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
After doing this, the output I get from running the code is:
Traceback (most recent call last): File "sfaScript.py", line 65, in <module> with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
OSError: [Errno 22] Invalid argument: 'C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4'
TL;DR - QUESTION
So the filename_abs_path somehow got changed to
C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4
I need it to be
C:/Users/Kenny/Desktop/sfa_kenny_batch_1/accountnamehere/123456_Name_of_item_with_underscord_no_punctuation.mp4
Thanks for any help provided!

Looks like your regex captured a tab character (\t) and a carriage return (\r) at the end of title
A quickfix for this would be:
title = title.strip()
(before composing the filename)
which removes leading and trailing whitespace, including tab and carriage-return characters.

Related

Problem opening up text file of the downloadPath with gedit

After downloading the dependencies from Nexus, I have a download path for the data, but I wasn't able to open the resulting text file in gedit; it just stops responding. Why is this so?
for item in data["items"]:
for asset in item["assets"]:
fileurl = asset["downloadUrl"]
print(fileurl)
downloadPath = '/home/centos/'
filename = downloadPath + fileurl.split('/')[-1]# '\' for Windows
outfile = open(filename, "w")
outfile.write(str(urllib.request.urlopen(fileurl).read()))
outfile.close()
if data["continuationToken"] is None:
sys.exit()
else:
#construct pagination url and loop
url = baseurl + 'components?continuationToken=' + data["continuationToken"] + '&repository=' + downloadRepository
return

How to save file for every content_type rather than every uid with imaplib and email

I am successfully saving the content for each email with the following code, as a .txt, .html or .PDF file. However, I would like to save a version of every content_type, for each email (for each uid). Currently it is only saving one file type for every uid.
For example, an email with a PDF attachment is only currently saving the PDF. I would like it to save the PDF attachment along with the plain text content of the email, in 2 separate files.
Thanks for any help.
import imaplib
import email
import os
import mimetypes
# Connect to the IMAP server over SSL (port 993), log in and select the Inbox.
mail = imaplib.IMAP4_SSL('imap.secureserver.net',993)
mail.login('[user]', '[pw]')
mail.select('Inbox')
# UID search for every message; data[0] is split on whitespace into the
# list of UIDs that the loop below iterates over.
result, data = mail.uid('search', None, 'ALL')
item_list = data[0].split()
for item in item_list:
# Fetch the complete RFC822 message for this UID and parse it.
result2, email_data = mail.uid('fetch',item,'(RFC822)')
# NOTE(review): decoding as UTF-8 assumes the raw message is valid UTF-8 - confirm.
raw_email = email_data[0][1].decode("utf-8")
email_message = email.message_from_string(raw_email)
# Debug switch: set to True to list the attributes of the parsed message.
print_dir = False
if print_dir: print(dir(email_message)) #options, e.g. list of from, to etc.
from_ = email_message['From']
date_ = email_message['Date']
# walk() yields the message itself and then each of its sub-parts.
for part in email_message.walk():
# File-name prefix built from the UID, the Date header minus its last 15
# characters, and the From header. str(item)[2:-1] drops the first two
# characters and the last one - presumably the b'...' wrapper of a bytes UID.
# NOTE(review): the prefix is identical for every part of one message, so
# parts that share a content type map to the same file name - confirm.
option = str(item)[2:-1] + ' ' + date_[:-15] + ' ' + from_ + ' '
content_type = part.get_content_type()
print(str(item),' ',content_type)
# Pick the file name (prefix + extension) according to the content type.
if content_type == 'text/html':
filename = option + '.html'
elif content_type == 'text/plain':
filename = option + '.txt'
elif content_type == 'application/pdf':
attachment = part.get_filename() #attachment filename
filename = option + str(attachment)
else:
# Guesses the file type
ext = mimetypes.guess_extension(content_type)
if not ext:
ext = '.bin'
filename = option + ext
# Write the decoded payload into the current working directory.
# NOTE(review): get_payload(decode=True) returns None for multipart
# container parts, which fp.write() cannot accept - confirm handling.
save_path = os.getcwd() + '/' + filename
with open(save_path, 'wb') as fp:
fp.write(part.get_payload(decode=True))
^ For multitypes I would like to save a file with all the type extensions. Such as for 22382, a PDF and txt
^ Current Output files
I'm not fully sure, but I think your problem is in the for item in item_list: loop.
email_message would only end up being whatever the last item in that loop creates.
Would you need to push nearly everything in that loop 1 tab's worth out?
Also I'd assume you'd want to use part instead of item in this line: option = str(item)[2:-1] + ' ' + date_[:-15] + ' ' + from_ + ' '
Again, not fully sure, but hope this helps!

How can I get an attached eml file from email message content using Python?

I am using Python 3.7 with the email and IMAP libraries to read emails and extract their content and attachments. Attachments such as Excel, CSV and PDF files are downloaded correctly, but when an email carries a .eml file as an attachment I get an error. Below is the code that reads the email content and attachments, followed by the error shown when a .eml file is attached.
The error occurs at the point where the .eml file is written.
At that point part.get_payload(decode=True) comes back empty (None) for the .eml attachment.
filename = part.get_filename()
if filename is not None:
dot_position = filename.find('.')
file_prefix = filename[0:dot_position]
file_suffix = filename[dot_position:len(filename)]
# print(dot_position)
# print(file_prefix)
# print(file_suffix)
now = datetime.datetime.now()
timestamp = str(now.strftime("%Y%m%d%H%M%S%f"))
newFileName = file_prefix + "_" + timestamp + file_suffix
sv_path = os.path.join(svdir, newFileName)
# allfiles = allfiles.append([{"oldfilename": filename, "newfilename": newFileName}])
mydict = filename + '$$' + newFileName
mydict1 = mydict1 + ',' + mydict
print(mydict1)
if not os.path.isfile(sv_path):
print("oldpath:---->" + sv_path)
# filename = os.rename(filename, filename + '_Rahul')
# sv_path = os.path.join(svdir, filename)
# print("Newpath:---->" + sv_path)
fp = open(sv_path, 'wb')
# print("Rahul")
print(part.get_payload(decode=True))
# try:
# newFileByteArray = bytearray(fp)
# if part.get_payload(decode=True) is not None:
fp.write(part.get_payload(decode=True))
# except (TypeError, IOError):
# pass
fp.close()
Error is
<class 'TypeError'> ReadEmailUsingIMAP.py 129
a bytes-like object is required, not 'NoneType'
Just to explain why this is happening (it hit me too), quoting the v. 3.5 library doc. (v2 says the same):
If the message is a multipart and the decode flag is True, then None is returned.
If your attachment is an .EML, it's almost always going to be multi-part, thus the None.
Jin Thakur's workaround is appropriate if you're only expecting .EML multipart attachments (not sure if there is any other use cases); it should have been accepted as an answer.
Use eml_parser
https://pypi.org/project/eml-parser/
import datetime
import json
import eml_parser
def json_serial(obj):
    """Serialize values that ``json.dumps`` cannot encode natively.

    Intended to be passed as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: The object ``json`` failed to encode.

    Returns:
        The ISO 8601 string for a ``datetime.datetime``; ``None`` for any
        other type, which ``json.dumps`` then emits as ``null``.
    """
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    # Any other type keeps the original fall-through behaviour (JSON null).
    return None
# Read the raw message bytes; the file is closed as soon as they are in memory.
with open('sample.eml', 'rb') as fhdl:
    raw_email = fhdl.read()
# NOTE(review): newer eml_parser releases document
# eml_parser.EmlParser().decode_email_bytes(raw_email) instead of this
# module-level function - confirm against the installed version.
parsed_eml = eml_parser.eml_parser.decode_email_b(raw_email)
# json_serial is the default= hook for values json cannot encode itself.
print(json.dumps(parsed_eml, default=json_serial))

Python Email PDF from file Directory

I need to email a PDF and a generic cover letter from a file directory to the email address that matches a 5-digit code. The code is the first 5 characters of the PDF name, and a corresponding dataframe maps each 5-digit code to an email address. Is there an easy way to accomplish this? Thanks
# Loop through the email list; each row supplies AGCODE, FACILITY_NAME and
# FAC_EMAIL_ADDR.
for i in email_list.itertuples():
    # The agency-specific PDF is named after the row's code.
    PDF_name = i.AGCODE + '_2017 Net_Initial.pdf'
    cover_letter_name = 'CoverLetter.pdf'
    print(PDF_name)
    # Read the agency PDF and the shared cover letter as binary content.
    # dir_path is concatenated directly, so it must end with a path separator.
    with open(dir_path + PDF_name, 'rb') as f, open(dir_path + cover_letter_name, 'rb') as g:
        PDF_content = f.read()
        cl_content = g.read()
    PDF_att = FileAttachment(name=PDF_name, content=PDF_content)
    cl_att = FileAttachment(name=cover_letter_name, content=cl_content)
    # folder=a.sent: if you want a copy in the 'Sent' folder
    m = Message(
        account=a,
        folder=a.sent,
        subject=('Award Letter for ' + i.FACILITY_NAME + ' -- Agency Code: ' + i.AGCODE),
        body=body_of_email,
        to_recipients=[Mailbox(email_address=i.FAC_EMAIL_ADDR)],
    )
    # Attach the cover letter first, then the agency PDF.
    m.attach(cl_att)
    m.attach(PDF_att)
    # Send one email per row.
    m.send_and_save()
#========================

epub3 : how to add the mimetype at first in archive

I'm working on a script to create epub from html files, but when I check my epub I have the following error : Mimetype entry missing or not the first in archive
The Mimetype is present, but it's not the first file in the epub. Any idea how to put it in first place in any case using Python ?
Sorry, I don't have the time right now to give a detailed explanation, but here's a (relatively) simple epub processing program I wrote a while ago that shows how to do that.
epubpad.py
#!/usr/bin/env python3
"""Pad the ends of paragraph lines in an epub file with a single space char.

The rewritten archive always starts with an uncompressed ``mimetype`` entry.

Written by PM 2Ring 2013.05.12
"""
import os
import re
import sys
import zipfile

MIMETYPE_NAME = 'mimetype'
# Used only when the source archive has no mimetype entry at all.
EPUB_MIMETYPE = b'application/epub+zip'

# Names containing an (x)htm(l) extension are treated as text files.
_HTML_NAME = re.compile(r'\.[x]?htm[l]?')
# Lines ending in '>' or '}' do not need padding.
_EOL_NO_PAD = re.compile(br'[>}]$')


def bold(s):
    """Return *s* wrapped in the ANSI escape codes for bold text."""
    return "\x1b[1m%s\x1b[0m" % s


def report(attr, val):
    """Print a bold ``attr:`` label followed by the quoted value."""
    print("%s '%s'" % (bold(attr + ':'), val))


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _pad_lines(data):
    """Append a space to each non-empty line of *data* not ending in '>' or '}'."""
    padded = [
        line if (not line or _EOL_NO_PAD.search(line)) else line + b' '
        for line in data.splitlines()
    ]
    return b'\n'.join(padded)


def _default_newname(oldname):
    """Derive the output name: insert '_P' before the extension of *oldname*.

    Spaces are replaced by underscores in the file name only, so the
    directory part keeps pointing at an existing location.
    """
    directory, base = os.path.split(oldname)
    root, ext = os.path.splitext(base)
    return os.path.join(directory, (root + '_P' + ext).replace(' ', '_'))


def fixepub(oldname, newname):
    """Copy the epub *oldname* to *newname*, padding paragraph lines.

    The ``mimetype`` entry is written first and stored uncompressed, whatever
    its position in the source archive. Every (x)html entry whose name does
    not contain 'wrap' gets a trailing space on each non-empty line that does
    not end in '>' or '}'. All other entries are copied unchanged.

    Args:
        oldname: Path of the epub to read.
        newname: Path of the epub to create (overwritten if it exists).

    Raises:
        ValueError: If both paths are the same file, or container.xml has no
            ``full-path`` attribute.
        KeyError: If META-INF/container.xml or the package file is missing.
    """
    # Opening newname in 'w' mode would truncate the file we are reading.
    if os.path.abspath(oldname) == os.path.abspath(newname):
        raise ValueError("input and output must be different files: %r" % oldname)

    with zipfile.ZipFile(oldname, 'r') as oldz:
        nlist = oldz.namelist()
        if not nlist or nlist[0] != MIMETYPE_NAME:
            first = nlist[0] if nlist else ''
            print(bold('Warning!!!'), "First file is '%s', not 'mimetype'" % first)

        # Get the name of the contents (package) file from the container.
        container_xml = oldz.read('META-INF/container.xml').decode('utf-8', 'replace')
        contents = _first_group(r'full-path="(.*?)"', container_xml)
        if contents is None:
            raise ValueError("no full-path attribute in META-INF/container.xml")

        slash = contents.find('/')
        # No directory separator in the contents name means an empty dirname.
        dirname = contents[:slash + 1] if slash >= 0 else ''
        report("dirname", dirname)

        # Creator and title are informational only; tolerate their absence.
        package = oldz.read(contents).decode('utf-8', 'replace')
        creator = _first_group(r'<dc:creator.*>(.*)</dc:creator>', package)
        report("Creator", creator if creator is not None else '(not found)')
        title = _first_group(r'<dc:title>(.*)</dc:title>', package)
        report("Title", title if title is not None else '(not found)')

        # Find the names of all xhtml & html text files.
        htmnames = {
            name for name in nlist
            if _HTML_NAME.search(name) and 'wrap' not in name
        }

        if MIMETYPE_NAME in nlist:
            mimetype = oldz.read(MIMETYPE_NAME)
        else:
            mimetype = EPUB_MIMETYPE

        with zipfile.ZipFile(newname, 'w', zipfile.ZIP_DEFLATED) as newz:
            # The mimetype entry must come first and must not be compressed.
            newz.writestr(MIMETYPE_NAME, mimetype, compress_type=zipfile.ZIP_STORED)
            print(MIMETYPE_NAME, ' * stored')

            for fname in nlist:
                if fname == MIMETYPE_NAME:
                    continue  # already written as the first entry
                data = oldz.read(fname)
                if fname in htmnames:
                    print(fname, ' * text')
                    # Pad lines that are (hopefully) inside paragraphs...
                    data = _pad_lines(data)
                else:
                    print(fname)
                newz.writestr(fname, data)


def main():
    """Command-line entry point: ``epubpad.py OLDNAME [NEWNAME]``."""
    args = sys.argv[1:]
    oldname = args[0] if args else ''
    if not oldname:
        print('No filename given!')
        raise SystemExit

    newname = args[1] if len(args) > 1 else ''
    if not newname:
        newname = _default_newname(oldname)

    print("Processing '%s' to '%s' ..." % (oldname, newname))
    fixepub(oldname, newname)


if __name__ == '__main__':
    main()
FWIW, I wrote this program to process files for my simple e-reader that annoyingly joins paragraphs together if they don't end with white space.
The solution I've found:
delete the previous mimetype file
when creating the new archive, create a new mimetype entry before adding anything else (it should also be stored uncompressed): zipFile.writestr("mimetype", "application/epub+zip")
Why does it work : the mimetype is the same for all epub : "application/epub+zip", no need to use the original file.

Categories

Resources