Create a Generator to read email and process messages - python

I am trying to write some code to read my inbox and process some attachments if present. I decided this would be a good time to learn how generators work as I want to process all messages that have a particular subject. I have gotten to the point where I can get all the attachments and relevant subjects but I sort of had to fake it as the iterator in the for i in range . . . was not advancing so I am advancing the latest_email_id in the loop
def read_email_from_gmail():
try:
print 'got here'
mail = imaplib.IMAP4_SSL(SMTP_SERVER)
mail.login(FROM_EMAIL,FROM_PWD)
mail.select('inbox')
type, data = mail.search(None, 'ALL')
mail_ids = data[0]
id_list = mail_ids.split()
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
print latest_email_id
while True:
for i in range(latest_email_id,first_email_id - 1, -1):
latest_email_id -= 1
#do stuff to get attachment and subject
yield attachment_data, subject
except Exception, e:
print str(e)
for attachment, subject in read_email_from_gmail():
x = process_attachment(attachment)
y = process_subject(subject)
Is there a more pythonic way to advance through my in-box using a generator to hold state in the in-box?

I have learned a bit more about generators and played around with the code I started with so I have a function that uses a generator to send each relevant email message subject to the main function. This is what I have so far, and it works great for my needs
import imaplib
import email
FROM_EMAIL = 'myemail#gmail.com'
FROM_PWD = "mygmail_password"
SMTP_SERVER = "imap.gmail.com"
SMTP_PORT = 993
STOP_MESSAGES = set(['Could not connect to mailbox',
'No Messages or None Retrieved Successfully',
'Could not retrieve some message',
'Finished processing'])
def read_emails():
mail = imaplib.IMAP4_SSL(SMTP_SERVER)
mail.login(FROM_EMAIL,FROM_PWD)
mail.select('inbox')
con_status, data = mail.uid('search', None, "ALL")
if con_status != 'OK':
yield 'Could not connect to mailbox'
try:
mail_ids = data[0].split()
except Exception:
yield 'No Messages or None Retrieved Successfully'
print mail_ids
processed = []
while True:
for mail_id in mail_ids:
status, mdata = mail.uid('fetch', mail_id, '(RFC822)')
if status != 'OK':
yield 'Could not retrieve some message'
if mail_id in processed:
yield 'Finished processing'
raw_msg = mdata[0][1]
structured_msg = email.message_from_string(raw_msg)
msg_subject = structured_msg['subject']
processed.append(mail_id)
yield msg_subject
To access my messages one by one, I then use the following block to get my messages
for msg_subj in read_emails():
if msg_subj not in STOP_MESSAGES:
do some stuff here with msg_subj
else:
print msg_subj
break
I am accessing these messages by their uid as I will be deleting them later and would like to use the uid as the key to manage deletion. For me the trick was to collect the uid in the list named processed and then check to see if I was going to circle through them again because I was working with a uid that had already been processed.

Related

How would I use imaplib to check if an email is received from a certain email address and then trigger a selenium action?

My code so far
import base64
email_user = input('Email: ')
email_pass = input('Password: ')
M = imaplib.IMAP4_SSL('imap.gmail.com', 993)
M.login(email_user, email_pass)
M.select()
typ, message_numbers = M.search(None, 'ALL') # change variable name, and use new name in for loop
for num in message_numbers[0].split():
typ, data = M.fetch(num, '(RFC822)')
# num1 = base64.b64decode(num) # unnecessary, I think
print(data) # check what you've actually got. That will help with the next line
data1 = base64.b64decode(data[0][1])
print('Message %s\n%s\n' % (num, data1))
M.close()
M.logout()
My code currently prints out all of my emails
It prints out a load of text, but how would I refine it to see if an email from an email address is received and then trigger opening a website.

Finding hyperlinks in gmail email body with IMAP

The point of this script is to find hyperlinks in emails and automatically open them. I'm currently stuck on the search part.
The script can't seem to pick up the link from the body of the email. The hyperlink should look like
https://something.com/verify/c4b7668ad547922226426896f
is something wrong with my regex?
def process_mailbox(M):
rv, data = M.search(None, specific_email_addy)
if rv != 'OK':
print "No messages found!"
return
for num in data[0].split():
rv, data = M.fetch(num, '(RFC822)')
if rv != 'OK':
print "ERROR getting message", num
return
msg = email.message_from_string(data[0][1])
raw_email = data[0][1] # here's the body, which is raw headers and html and body of the whole email including headers and alternate payloads
msg = email.message_from_string(raw_email)
for part in msg.walk():
# each part is a either non-multipart, or another multipart message
# that contains further parts... Message is organized like a tree
if part.get_content_type() == 'text/html':
plain_text = part.get_payload()
link_pattern = re.compile('<a[^>]+href=\'(.*?)\'[^>]*>(.*)?</a>')
search = link_pattern.search(plain_text)
if search is not None:
print("Link found! -> " + search)
break
else:
print("No links were found.")

Python IMAP4 Don't Mark messages as read

I have a Python script to move messages with a certain subject. The messages get marked as read and I don't want them to be marked as read. What part of the script marks them as read and how do I make it not mark as read?
Also, I'm not too sure what I am doing yet, so if there is any redundant code or errors please let me know.
import getpass
from Crypto.Hash import MD5
import sys
import imaplib
import email
import re
password = getpass.getpass()
match = "redacted"
username = "redacted"
dest = "000"
pattern_uid = re.compile('\d+ \(UID (?P<uid>\d+)\)')
def md5(message):
hash = MD5.new()
hash.update(message)
return hash.hexdigest()
md5 = md5(password)
if md5 == match:
pass
else:
print "Mismatch"
sys.exit()
M = imaplib.IMAP4_SSL("mail.redacted.com", 993)
M.login(username, password)
M.select()
typ, data = M.search(None, 'ALL')
M.select('Inbox')
msgs = M.search(None, 'ALL')[1]
num_messages = len(msgs[0].split())
num_messages += 1
def parse_uid(data):
match = pattern_uid.match(data)
return match.group('uid')
for i in range(1, num_messages):
try:
typ, msg_data = M.fetch(str(i), '(RFC822)')
except:
pass
for response_part in msg_data:
if isinstance(response_part, tuple):
UID = M.fetch(str(i),'UID')
UID = UID[1]
try:
UID = parse_uid(UID[0])
except:
pass
msg = email.message_from_string(response_part[1])
for header in [ 'subject' ]:
if msg[header] == "Redacted":
result = M.uid('COPY', UID, dest)
if result[0] == 'OK':
mov, data = M.uid('STORE', UID, '+FLAGS', '(\Deleted)')
M.expunge()
M.close()
M.logout()
typ, msg_data = M.fetch(str(i), '(RFC822)')
Fetching a message body marks it as read. You'll want to use BODY.PEEK[].
Although, I don't know why you're fetching the whole message just to copy it. Why don't you just fetch the headers? Use BODY.PEEK[HEADERS].

How do I download only unread attachments from a specific gmail label?

I have a Python script adapted from Downloading MMS emails sent to Gmail using Python
import email, getpass, imaplib, os
detach_dir = '.' # directory where to save attachments (default: current)
user = raw_input("Enter your GMail username:")
pwd = getpass.getpass("Enter your password: ")
# connecting to the gmail imap server
m = imaplib.IMAP4_SSL("imap.gmail.com")
m.login(user,pwd)
m.select("[Gmail]/All Mail") # here you a can choose a mail box like INBOX instead
# use m.list() to get all the mailboxes
resp, items = m.search(None, 'FROM', '"Impact Stats Script"') # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp)
items = items[0].split() # getting the mails id
for emailid in items:
resp, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc
email_body = data[0][1] # getting the mail content
mail = email.message_from_string(email_body) # parsing the mail content to get a mail object
#Check if any attachments at all
if mail.get_content_maintype() != 'multipart':
continue
print "["+mail["From"]+"] :" + mail["Subject"]
# we use walk to create a generator so we can iterate on the parts and forget about the recursive headach
for part in mail.walk():
# multipart are just containers, so we skip them
if part.get_content_maintype() == 'multipart':
continue
# is this part an attachment ?
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
counter = 1
# if there is no filename, we create one with a counter to avoid duplicates
if not filename:
filename = 'part-%03d%s' % (counter, 'bin')
counter += 1
att_path = os.path.join(detach_dir, filename)
#Check if its already there
if not os.path.isfile(att_path) :
# finally write the stuff
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
I am filtering messages by subject and getting the attachments, but now I need to only get attachments from new emails. Can I modify the m.search() somehow to return only unread emails?
Try modifying this line:
resp, items = m.search(None, 'FROM', '"Impact Stats Script"')
to:
resp, items = m.search(None, 'UNSEEN', 'FROM', '"Impact Stats Script"')
The Python imaplib documentation shows just adding more search criteria, and the IMAP specification defines the UNSEEN search criteria:
UNSEEN
Messages that do not have the \Seen flag set.

python imaplib to get gmail inbox subjects titles and sender name

I'm using pythons imaplib to connect to my gmail account. I want to retrieve the top 15 messages (unread or read, it doesn't matter) and display just the subjects and sender name (or address) but don't know how to display the contents of the inbox.
Here is my code so far (successful connection)
import imaplib
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login('mygmail#gmail.com', 'somecrazypassword')
mail.list()
mail.select('inbox')
#need to add some stuff in here
mail.logout()
I believe this should be simple enough, I'm just not familiar enough with the commands for the imaplib library. Any help would be must appreciated...
UPDATE
thanks to Julian I can iterate through each message and retrieve the entire contents with:
typ, data = mail.search(None, 'ALL')
for num in data[0].split():
typ, data = mail.fetch(num, '(RFC822)')
print 'Message %s\n%s\n' % (num, data[0][1])
mail.close()
but I'm wanting just the subject and the sender. Is there a imaplib command for these items or will I have to parse the entire contents of data[0][1] for the text: Subject, and Sender?
UPDATE
OK, got the subject and sender part working but the iteration (1, 15) is done by desc order apparently showing me the oldest messages first. How can I change this? I tried doing this:
for i in range( len(data[0])-15, len(data[0]) ):
print data
but that just gives me None for all 15 iterations... any ideas? I've also tried mail.sort('REVERSE DATE', 'UTF-8', 'ALL') but gmail doesnt support the .sort() function
UPDATE
Figured out a way to do it:
#....^other code is the same as above except need to import email module
mail.select('inbox')
typ, data = mail.search(None, 'ALL')
ids = data[0]
id_list = ids.split()
#get the most recent email id
latest_email_id = int( id_list[-1] )
#iterate through 15 messages in decending order starting with latest_email_id
#the '-1' dictates reverse looping order
for i in range( latest_email_id, latest_email_id-15, -1 ):
typ, data = mail.fetch( i, '(RFC822)' )
for response_part in data:
if isinstance(response_part, tuple):
msg = email.message_from_string(response_part[1])
varSubject = msg['subject']
varFrom = msg['from']
#remove the brackets around the sender email address
varFrom = varFrom.replace('<', '')
varFrom = varFrom.replace('>', '')
#add ellipsis (...) if subject length is greater than 35 characters
if len( varSubject ) > 35:
varSubject = varSubject[0:32] + '...'
print '[' + varFrom.split()[-1] + '] ' + varSubject
this gives me the most recent 15 message subject and sender address in decending order as requested! Thanks to all who helped!
c.select('INBOX', readonly=True)
for i in range(1, 30):
typ, msg_data = c.fetch(str(i), '(RFC822)')
for response_part in msg_data:
if isinstance(response_part, tuple):
msg = email.message_from_string(response_part[1])
for header in [ 'subject', 'to', 'from' ]:
print '%-8s: %s' % (header.upper(), msg[header])
This should give you an idea on how to retrieve the subject and from?
This was my solution to get the useful bits of information from emails:
import datetime
import email
import imaplib
import mailbox
EMAIL_ACCOUNT = "your#gmail.com"
PASSWORD = "your password"
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(EMAIL_ACCOUNT, PASSWORD)
mail.list()
mail.select('inbox')
result, data = mail.uid('search', None, "UNSEEN") # (ALL/UNSEEN)
i = len(data[0].split())
for x in range(i):
latest_email_uid = data[0].split()[x]
result, email_data = mail.uid('fetch', latest_email_uid, '(RFC822)')
# result, email_data = conn.store(num,'-FLAGS','\\Seen')
# this might work to set flag to seen, if it doesn't already
raw_email = email_data[0][1]
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# Header Details
date_tuple = email.utils.parsedate_tz(email_message['Date'])
if date_tuple:
local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
local_message_date = "%s" %(str(local_date.strftime("%a, %d %b %Y %H:%M:%S")))
email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
email_to = str(email.header.make_header(email.header.decode_header(email_message['To'])))
subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
# Body details
for part in email_message.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
file_name = "email_" + str(x) + ".txt"
output_file = open(file_name, 'w')
output_file.write("From: %s\nTo: %s\nDate: %s\nSubject: %s\n\nBody: \n\n%s" %(email_from, email_to,local_message_date, subject, body.decode('utf-8')))
output_file.close()
else:
continue
For those looking for how to check mail and parse the headers, this is what I used:
def parse_header(str_after, checkli_name, mailbox) :
#typ, data = m.search(None,'SENTON', str_after)
print mailbox
m.SELECT(mailbox)
date = (datetime.date.today() - datetime.timedelta(1)).strftime("%d-%b-%Y")
#date = (datetime.date.today().strftime("%d-%b-%Y"))
#date = "23-Jul-2012"
print date
result, data = m.uid('search', None, '(SENTON %s)' % date)
print data
doneli = []
for latest_email_uid in data[0].split():
print latest_email_uid
result, data = m.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = data[0][1]
import email
email_message = email.message_from_string(raw_email)
print email_message['To']
print email_message['Subject']
print email.utils.parseaddr(email_message['From'])
print email_message.items() # print all headers
I was looking for a ready made simple script to list last inbox via IMAP without sorting through all messages. The information here is useful, though DIY and misses some aspects. First, IMAP4.select returns message count. Second, subject header decoding isn't straightforward.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import imaplib
import email
from email.header import decode_header
import HTMLParser
# to unescape xml entities
_parser = HTMLParser.HTMLParser()
def decodeHeader(value):
if value.startswith('"=?'):
value = value.replace('"', '')
value, encoding = decode_header(value)[0]
if encoding:
value = value.decode(encoding)
return _parser.unescape(value)
def listLastInbox(top = 4):
mailbox = imaplib.IMAP4_SSL('imap.gmail.com')
mailbox.login('mygmail#gmail.com', 'somecrazypassword')
selected = mailbox.select('INBOX')
assert selected[0] == 'OK'
messageCount = int(selected[1][0])
for i in range(messageCount, messageCount - top, -1):
reponse = mailbox.fetch(str(i), '(RFC822)')[1]
for part in reponse:
if isinstance(part, tuple):
message = email.message_from_string(part[1])
yield {h: decodeHeader(message[h]) for h in ('subject', 'from', 'date')}
mailbox.logout()
if __name__ == '__main__':
for message in listLastInbox():
print '-' * 40
for h, v in message.items():
print u'{0:8s}: {1}'.format(h.upper(), v)
BODY gets almost everything and marks the message as read.
BODY[<parts>] gets just those parts.
BODY.PEEK[<parts>] gets the same parts, but doesn't mark the message read.
<parts> can be HEADER or TEXT or HEADER.FIELDS (<list of fields>) or
HEADER.FIELDS.NOT (<list of fields>)
This is what I use: typ, data = connection.fetch(message_num_s, b'(BODY.PEEK[HEADER.FIELDS (SUBJECT FROM)])')
`
def safe_encode(seq):
if seq not in (list,tuple):
seq = [seq]
for i in seq:
if isinstance(i, (int,float)):
yield str(i).encode()
elif isinstance(i, str):
yield i.encode()
elif isinstance(i, bytes):
yield i
else:
raise ValueError
def fetch_fields(connection, message_num, field_s):
"""Fetch just the fields we care about. Parse them into a dict"""
if isinstance(field_s, (list,tuple)):
field_s = b' '.join(safe_encode(field_s))
else:
field_s = tuple(safe_encode(field_s))[0]
message_num = tuple(safe_encode(message_num))[0]
typ, data = connection.fetch(message_num, b'(BODY.PEEK[HEADER.FIELDS (%s)])'%(field_s.upper()))
if typ != 'OK':
return typ, data #change this to an exception if you'd rather
items={}
lastkey = None
for line in data[0][1].splitlines():
if b':' in line:
lastkey, value = line.strip().split(b':', 1)
lastkey = lastkey.capitalize()
#not all servers capitalize the same, and some just leave it
#as however it arrived from some other mail server.
items[lastkey]=value
else:
#subject was so long it ran onto the next line, luckily it didn't have a ':' in it so its easy to recognize.
items[lastkey]+=line
#print(items[lastkey])
return typ, items
`
You drop it into your code example: by replacing the call to 'mail.fetch()' with fetch_fields(mail, i, 'SUBJECT FROM') or fetch_fields(mail, i, ('SUBJECT' 'FROM'))
Adding to all the above answers.
import imaplib
import base64
import os
import email
if __name__ == '__main__':
email_user = "email#domain.com"
email_pass = "********"
mail = imaplib.IMAP4_SSL("hostname", 993)
mail.login(email_user, email_pass)
mail.select()
type, data = mail.search(None, 'ALL')
mail_ids = data[0].decode('utf-8')
id_list = mail_ids.split()
mail.select('INBOX', readonly=True)
for i in id_list:
typ, msg_data = mail.fetch(str(i), '(RFC822)')
for response_part in msg_data:
if isinstance(response_part, tuple):
msg = email.message_from_bytes(response_part[1])
print(msg['from']+"\t"+msg['subject'])
This will give you the email's from and subject name.

Categories

Resources