Python: Email PDF from file directory

I need to email a PDF and a generic cover letter from a file directory to the email address that matches a 5-digit code. The code is the first 5 characters of the PDF's filename, and a corresponding dataframe maps each 5-digit code to an email address. Is there an easy way to accomplish this? Thanks
# Loop through the email list (assumes an exchangelib Account `a`, a dataframe
# `email_list` with AGCODE / FACILITY_NAME / FAC_EMAIL_ADDR columns, and
# strings `dir_path` and `body_of_email` defined earlier).
from exchangelib import FileAttachment, Mailbox, Message

for i in email_list.itertuples():
    PDF_name = i.AGCODE + '_2017 Net_Initial.pdf'
    cover_letter_name = 'CoverLetter.pdf'
    print(PDF_name)
    # attach the cover letter and the PDF file:
    with open(dir_path + PDF_name, 'rb') as f, open(dir_path + cover_letter_name, 'rb') as g:
        # read the binary file contents
        PDF_content = f.read()
        cl_content = g.read()
        PDF_att = FileAttachment(name=PDF_name, content=PDF_content)
        cl_att = FileAttachment(name=cover_letter_name, content=cl_content)
    # folder=a.sent keeps a copy in the 'Sent' folder
    m = Message(
        account=a,
        folder=a.sent,
        subject='Award Letter for ' + i.FACILITY_NAME + ' -- Agency Code: ' + i.AGCODE,
        body=body_of_email,
        to_recipients=[Mailbox(email_address=i.FAC_EMAIL_ADDR)],
    )
    # attach files
    m.attach(cl_att)
    m.attach(PDF_att)
    # send each message and save a copy
    m.send_and_save()
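To drive that loop from the directory instead of the dataframe, one option is to scan the folder for PDFs, take the first five characters of each filename as the code, and look the address up in the dataframe. A minimal sketch, assuming the column names AGCODE and FAC_EMAIL_ADDR from the snippet above and that AGCODE is stored as a string:

import os

# Hypothetical helper loop: map each PDF in dir_path to its recipient
# address via the 5-digit code at the start of the filename.
for fname in os.listdir(dir_path):
    if not fname.lower().endswith('.pdf'):
        continue
    code = fname[:5]  # first 5 characters of the PDF name
    match = email_list.loc[email_list['AGCODE'] == code, 'FAC_EMAIL_ADDR']
    if match.empty:
        print('No email address found for code ' + code)
        continue
    recipient = match.iloc[0]
    # ... build and send the Message as in the loop above ...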

Related

Folder Compare using Python

I have to compare hundreds of files in two folders (directories). The name of the second file can be derived from the first file and vice versa. I was asked to develop a script so that we can do this task quickly. These were the requirements:
a) an HTML report showing the differences
b) a txt file showing the basic information, i.e., count, header, and trailer info.
I have written the following script in Python, but after processing 14 files there is no movement.
#!/usr/bin/env python3
# Path: folder_compare.py
# Take two folders as input and compare the same files in them using pandas
import os
import pandas as pd
import logging
import difflib

# function to append a message to the txt report named after the file passed in
def write_to_txt(file_name, message):
    # directory that holds the per-file reports
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file_name)
    os.makedirs(d_path, exist_ok=True)
    file_path = d_path + '/' + file_name + '.txt'
    # append, creating the file if it does not exist
    with open(file_path, 'a') as f:
        f.write(message)

def convert_windows_path_to_python(path):
    return path.replace("\\", "/")

# get the folders as input from the user
fol1 = input("Enter the first folder path: ")
fol2 = input("Enter the second folder path: ")
folder1 = convert_windows_path_to_python(fol1)
folder2 = convert_windows_path_to_python(fol2)

# function to derive the second file name from the first file name
def get_file_name(file_name):
    return file_name.replace('BZ1CV', 'BZ1DV')

# function to compare the two files and write the differences to an HTML report
def compare_files(file1, file2):
    # read the two files
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # get the size of the two files
    f1_size = os.path.getsize(file1)
    f2_size = os.path.getsize(file2)
    d_path = 'C:/Upgrade/File-Compare/Differences/' + os.path.basename(file1)
    os.makedirs(d_path, exist_ok=True)
    # if either file is larger than 10 MB, compare using pandas concat and drop_duplicates
    if f1_size > 10485760 or f2_size > 10485760:
        # keep only the rows that appear in exactly one of the two files
        difference = pd.concat([f1, f2]).drop_duplicates(keep=False)
        difference.to_html(d_path + '_diff.html')
    # otherwise compare the files using difflib.HtmlDiff
    else:
        with open(file1) as a, open(file2) as b:
            first_file_lines = a.readlines()
            second_file_lines = b.readlines()
        diff = difflib.HtmlDiff().make_file(first_file_lines, second_file_lines,
                                            file1, file2, context=True, numlines=0)
        with open(d_path + '_diff.html', 'w') as diff_report:
            diff_report.writelines(diff)
    logging.info('The files are compared successfully')

# Loop through the files in folder1 and compare them with the files in folder2,
# logging the headers, row counts, and trailers of both files as we go.
for file in os.listdir(folder1):
    file1 = folder1 + '/' + file
    file2 = folder2 + '/' + get_file_name(file)
    # if the second file does not exist in folder2, log the error and continue
    if not os.path.isfile(file2):
        logging.error('File not found: ' + os.path.basename(file2))
        continue
    f1 = pd.read_table(file1, encoding='unicode_escape', header=None)
    f2 = pd.read_table(file2, encoding='unicode_escape', header=None)
    # write the header (first row) of each data frame to the txt report
    f1_header = f1.iloc[0]
    f2_header = f2.iloc[0]
    write_to_txt(os.path.basename(file1), 'The headers of the first file are: ' + str(f1_header) + '\n')
    write_to_txt(os.path.basename(file1), 'The headers of the second file are: ' + str(f2_header) + '\n')
    # write the row count of each data frame to the txt report
    f1_rowcount = f1.shape[0]
    f2_rowcount = f2.shape[0]
    write_to_txt(os.path.basename(file1), 'The rowcount of the first file (including header and trailer rows) is: ' + str(f1_rowcount) + '\n')
    write_to_txt(os.path.basename(file1), 'The rowcount of the second file (including header and trailer rows) is: ' + str(f2_rowcount) + '\n')
    # write the trailer (last row) of each data frame to the txt report
    f1_footer = f1.iloc[-1]
    f2_footer = f2.iloc[-1]
    write_to_txt(os.path.basename(file1), 'The trailer of the first file is: ' + str(f1_footer) + '\n')
    write_to_txt(os.path.basename(file1), 'The trailer of the second file is: ' + str(f2_footer) + '\n')
    compare_files(file1, file2)
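There is no answer in the thread, but one hedged guess at the stall: difflib.HtmlDiff.make_file is extremely slow on inputs with many lines, so a file just under the 10 MB cutoff can take effectively forever. A minimal sketch of guarding the difflib branch inside compare_files by line count as well (the 2000-line threshold is an arbitrary assumption to tune for your data):

# Sketch: fall back to the pandas comparison when either file has too many
# lines for difflib.HtmlDiff, which can appear to hang on big inputs.
MAX_DIFFLIB_LINES = 2000  # assumed threshold, not from the original post

with open(file1) as a, open(file2) as b:
    first_file_lines = a.readlines()
    second_file_lines = b.readlines()
if max(len(first_file_lines), len(second_file_lines)) > MAX_DIFFLIB_LINES:
    difference = pd.concat([f1, f2]).drop_duplicates(keep=False)
    difference.to_html(d_path + '_diff.html')
else:
    diff = difflib.HtmlDiff().make_file(first_file_lines, second_file_lines,
                                        file1, file2, context=True, numlines=0)
    with open(d_path + '_diff.html', 'w') as diff_report:
        diff_report.writelines(diff)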

Use Tesseract OCR to extract text from a folder of scanned PDFs

I have code that extracts text from scanned PDF files / normal PDF files using Tesseract OCR. But I want my code to convert a whole folder of PDFs rather than a single PDF file, and to store the extracted text files in a folder that I choose.
See my code below:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

filePath = '/Users/CodingStark/scanned/scanned-file.pdf'
pages = convert_from_path(filePath, 500)
image_counter = 1
# iterate through all the pages stored above
for page in pages:
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')
    image_counter = image_counter + 1
filelimit = image_counter - 1
# create a text file to write the output
outfile = "scanned-file.txt"
f = open(outfile, "a")
# iterate from 1 to total number of pages
for i in range(1, filelimit + 1):
    filename = "page_" + str(i) + ".jpg"
    # recognize the text in the image as a string using pytesseract
    text = str(pytesseract.image_to_string(Image.open(filename)))
    text = text.replace('-\n', '')
    f.write(text)
# close the file after writing all the text
f.close()
I want to automate my code so it converts all the PDF files in the scanned folder, with the extracted text files going into a folder I choose. Also, is there a way to delete all the jpg files after the code runs? They take up a lot of disk space. Thank you so much!!
Updated with Answer
import glob
import os
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

def tesseractOCR_pdf(pdf):
    filePath = pdf
    pages = convert_from_path(filePath, 500)
    # counter used to name the image of each PDF page
    image_counter = 1
    # iterate through all the pages stored above
    for page in pages:
        # each page of the PDF is saved as page_1.jpg, page_2.jpg, ... page_n.jpg
        filename = "page_" + str(image_counter) + ".jpg"
        # save the image of the page on disk
        page.save(filename, 'JPEG')
        # increment the counter to update the filename
        image_counter = image_counter + 1
    # total number of pages
    filelimit = image_counter - 1
    # accumulate the recognized text here
    text = ""
    # iterate from 1 to total number of pages
    for i in range(1, filelimit + 1):
        # set the filename to recognize text from (page_1.jpg, page_2.jpg, ...)
        filename = "page_" + str(i) + ".jpg"
        # recognize the text in the image as a string using pytesseract
        text += str(pytesseract.image_to_string(Image.open(filename)))
        text = text.replace('-\n', '')
    # delete all the jpg files created above
    for i in glob.glob("*.jpg"):
        os.remove(i)
    return text

def tesseractOCR_img(img):
    filePath = img
    text = str(pytesseract.image_to_string(filePath, lang='eng', config='--psm 6'))
    text = text.replace('-\n', '')
    return text

def Tesseract_ALL(docDir, txtDir):
    if docDir == "":
        docDir = os.getcwd() + "\\"  # if no docDir passed in
    for doc in os.listdir(docDir):  # iterate through docs in the doc directory
        try:
            fileExtension = doc.split(".")[-1]
            if fileExtension == "pdf":
                pdfFilename = docDir + doc
                text = tesseractOCR_pdf(pdfFilename)  # get the text content of the pdf
                textFilename = txtDir + doc + ".txt"
                with open(textFilename, "w") as textFile:  # make the text file
                    textFile.write(text)
            else:
                # elif fileExtension in ("tif", "tiff", "jpg"):
                imgFilename = docDir + doc
                text = tesseractOCR_img(imgFilename)  # get the text content of the image
                textFilename = txtDir + doc + ".txt"
                with open(textFilename, "w") as textFile:  # make the text file
                    textFile.write(text)
        except Exception:
            print("Error in file: " + str(doc))
    # strip the original extension from names like report.pdf.txt -> report.txt
    for filename in os.listdir(txtDir):
        fileExtension = filename.split(".")[-2]
        if fileExtension == "pdf":
            os.rename(txtDir + filename, txtDir + filename.replace('.pdf', ''))
        elif fileExtension == "tif":
            os.rename(txtDir + filename, txtDir + filename.replace('.tif', ''))
        elif fileExtension == "tiff":
            os.rename(txtDir + filename, txtDir + filename.replace('.tiff', ''))
        elif fileExtension == "jpg":
            os.rename(txtDir + filename, txtDir + filename.replace('.jpg', ''))

# Run the functions, telling them where the documents are located and where
# the txt files should go (trailing slashes needed, the paths are concatenated).
docDir = "pdf_folder/"
txtDir = "text_folder/"
Tesseract_ALL(docDir, txtDir)
Here is the loop to read each PDF from a path:

import glob
import os

pdf_dir = "dir"  # folder containing the PDF files
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.PDF")):
    # put here what you want to do for each pdf file
    ...
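Since the page images are only intermediate artifacts, another option (not from the original answer) is to write them to a temporary directory that cleans itself up, so no jpg files are left behind. A sketch assuming the same pdf2image/pytesseract setup:

import os
import tempfile
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

def ocr_pdf_via_tempdir(pdf_path, dpi=500):
    """OCR one PDF, keeping page images in a self-cleaning temp directory."""
    text = ""
    with tempfile.TemporaryDirectory() as tmpdir:
        for n, page in enumerate(convert_from_path(pdf_path, dpi), start=1):
            img_path = os.path.join(tmpdir, "page_%d.jpg" % n)
            page.save(img_path, 'JPEG')
            text += str(pytesseract.image_to_string(Image.open(img_path)))
    # the jpg files are gone once the with-block exits
    return text.replace('-\n', '')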

How to save a file for every content_type rather than one file per uid, with imaplib and email

I am successfully saving the content of each email with the following code, as a .txt, .html, or .pdf file. However, I would like to save a version of every content_type for each email (for each uid). Currently it only saves one file per uid.
For example, an email with a PDF attachment currently only saves the PDF. I would like it to save the PDF attachment along with the plain-text content of the email, in 2 separate files.
Thanks for any help.
import imaplib
import email
import os
import mimetypes

mail = imaplib.IMAP4_SSL('imap.secureserver.net', 993)
mail.login('[user]', '[pw]')
mail.select('Inbox')
result, data = mail.uid('search', None, 'ALL')
item_list = data[0].split()
for item in item_list:
    result2, email_data = mail.uid('fetch', item, '(RFC822)')
    raw_email = email_data[0][1].decode("utf-8")
    email_message = email.message_from_string(raw_email)
    print_dir = False
    if print_dir:
        print(dir(email_message))  # options, e.g. list of from, to etc.
    from_ = email_message['From']
    date_ = email_message['Date']
    for part in email_message.walk():
        option = str(item)[2:-1] + ' ' + date_[:-15] + ' ' + from_ + ' '
        content_type = part.get_content_type()
        print(str(item), ' ', content_type)
        if content_type == 'text/html':
            filename = option + '.html'
        elif content_type == 'text/plain':
            filename = option + '.txt'
        elif content_type == 'application/pdf':
            attachment = part.get_filename()  # attachment filename
            filename = option + str(attachment)
        else:
            # guess the file extension from the content type
            ext = mimetypes.guess_extension(content_type)
            if not ext:
                ext = '.bin'
            filename = option + ext
    save_path = os.getcwd() + '/' + filename
    with open(save_path, 'wb') as fp:
        fp.write(part.get_payload(decode=True))
(Screenshot: for multipart messages I would like to save a file for each content type, e.g. for uid 22382 both a PDF and a txt.)
(Screenshot: the current output files, one per uid.)
I'm not fully sure, but I think your problem is in the for item in item_list: loop:
filename (and hence the saved file) only ends up being whatever the last part in the walk() loop creates.
Would you need to push nearly everything at the end of that loop one tab's worth in?
Also, I'd assume you'd want to use part instead of item in this line: option = str(item)[2:-1] + ' ' + date_[:-15] + ' ' + from_ + ' '
Again, not fully sure, but I hope this helps!
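Building on that suggestion, a minimal sketch of the usual fix (the structure is assumed, not confirmed by the thread): move the write inside the walk() loop so every part of every message gets its own file, and skip multipart container parts, whose decoded payload is None. It reuses the mail session and item_list from the question.

for item in item_list:
    result2, email_data = mail.uid('fetch', item, '(RFC822)')
    email_message = email.message_from_string(email_data[0][1].decode("utf-8"))
    from_ = email_message['From']
    date_ = email_message['Date']
    for part in email_message.walk():
        payload = part.get_payload(decode=True)
        if payload is None:
            continue  # multipart/* container parts carry no payload of their own
        option = str(item)[2:-1] + ' ' + date_[:-15] + ' ' + from_ + ' '
        content_type = part.get_content_type()
        if content_type == 'text/html':
            filename = option + '.html'
        elif content_type == 'text/plain':
            filename = option + '.txt'
        elif content_type == 'application/pdf':
            filename = option + str(part.get_filename())
        else:
            filename = option + (mimetypes.guess_extension(content_type) or '.bin')
        # one file per part, so a PDF attachment and the plain-text body
        # of the same uid land in separate files
        with open(os.path.join(os.getcwd(), filename), 'wb') as fp:
            fp.write(payload)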

Python: Multiple Text Files to Dataframe

I'm a little stuck on how exactly to proceed, so a little nudge would be very helpful.
I have ~1800 text files, emails actually, that are in a repeated format.
The structure of each file is as follows:
From: Person-1 [email#person-1.com]
Sent: Tuesday, April 18, 2017 11:24 AM
To: email#person-2.com
Subject: Important Subject
User,
Below is your search alert.
Target: text
Attribute: text
Label: abcdef
Time: Apr 18, 2017 11:24 EDT
Full Text: Text of various length exists here. Some files even have links. I'm not sure how I would capture a varied length field.
Recording: abcde & fghijk lmnop
That's the gist of it.
I would like to write that into a DF I can store as a CSV.
I would like to end up with maybe something like this?
| Target | Attribute | Label   | Time   | Full Text     | Recording   | Filename |
|--------|-----------|---------|--------|---------------|-------------|----------|
| text   | text      | abcdef  | (date) | (Full text..) | abcde & f.. | 1111.txt |
| text2  | text2     | abcdef2 | (date) | (Full text..) | abcde & f.. | 1112.txt |
Where the 2nd row is another text file.
I have code to go through all of the text files and print them. Here's that code:
# -*- coding: utf-8 -*-
import os
import sys

# Take all text files in workingDirectory and put them into a DF.
def convertText(workingDirectory, outputDirectory):
    if workingDirectory == "":
        workingDirectory = os.getcwd() + "\\"  # current working directory if workingDirectory is empty
    i = 0
    for txt in os.listdir(workingDirectory):  # iterate through text files in workingDirectory
        print("Processing File: " + str(txt))
        fileExtension = txt.split(".")[-1]
        if fileExtension == "txt":
            textFilename = workingDirectory + txt  # becomes: \PATH\example.txt
            f = open(textFilename, "r")
            data = f.read()  # read what is inside
            print(data)  # print to show it is readable
            # RegEx goes here?
            i += 1  # counter
    print("Successfully read " + str(i) + " files.")

def main(argv):
    workingDirectory = "../Documents/folder//"  # put your source directory of text files here
    outputDirectory = "../Documents//"  # where you want your converted files to go
    convertText(workingDirectory, outputDirectory)

if __name__ == "__main__":
    main(sys.argv[1:])
I guess I would need RegEx, maybe, to parse the files? What would you recommend?
I am not opposed to using R or something else, if it makes more sense.
Thank You.
Regex should be sufficient for your use case. Using the regex expression r"\sTarget:(.*)" you can match everything on the line that follows Target:; then, by creating a list of all the fields you wish to match and iterating over them, you build up a dictionary object that stores the value of each field.
Using the Python csv library you can create a CSV file and, for each .txt file in your directory, push a row of the matched dictionary fields with writer.writerow({'Target':'', 'Attribute':'', 'Time':'', 'Filename':'', 'Label':''}).
Example:
import os
import sys
import re
import csv

# Take all text files in workingDirectory and extract the fields into a CSV.
def convertText(workingDirectory, outputDirectory):
    with open(outputDirectory + 'emails.csv', 'w') as csvfile:  # opens \PATH\emails.csv
        fields = ['Target', 'Attribute', 'Label', 'Time', 'Full Text']  # fields to search for with regex
        csvfields = ['Target', 'Attribute', 'Label', 'Time', 'Full Text', 'Filename']  # the filename goes in the CSV header but is not matched with regex
        writer = csv.DictWriter(csvfile, delimiter=',', lineterminator='\n', fieldnames=csvfields)
        writer.writeheader()  # writes csvfields as the header row of the csv
        if workingDirectory == "":
            workingDirectory = os.getcwd() + "\\"  # current working directory if workingDirectory is empty
        i = 0
        for txt in os.listdir(workingDirectory):  # iterate through text files in workingDirectory
            print("Processing File: " + str(txt))
            fileExtension = txt.split(".")[-1]
            if fileExtension == "txt":
                textFilename = workingDirectory + txt  # becomes: \PATH\example.txt
                f = open(textFilename, "r")
                data = f.read()  # read what is inside
                fieldmatches = {'Filename': txt}
                for field in fields:
                    # e.g. r"\sTarget:(.*)" selects everything on the line after "Target:"
                    regex = "\\s" + field + ":(.*)"
                    match = re.search(regex, data)
                    if match:
                        fieldmatches[field] = match.group(1)
                # for each file, add the dict of fields and values as a row of the csv
                writer.writerow(fieldmatches)
                i += 1  # counter
        print("Successfully read " + str(i) + " files.")

def main(argv):
    workingDirectory = "../Documents/folder//"  # put your source directory of text files here
    outputDirectory = "../Documents//"  # where you want the CSV to go
    convertText(workingDirectory, outputDirectory)

if __name__ == "__main__":
    main(sys.argv[1:])
This should be fast enough for processing the files; on my machine it took less than a second:
Successfully read 1866 files.
Time: 0.6991933065852838
Hope this helps!
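Since the question asks for a DataFrame rather than just a CSV, a one-step follow-up sketch (pandas is assumed here; the answer above does not use it):

import pandas as pd

# Load the CSV produced by convertText into a DataFrame for further work.
df = pd.read_csv('../Documents/emails.csv')
print(df.head())
df.to_csv('../Documents/emails_clean.csv', index=False)  # re-export if needed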

Python (3.5) - Constructing String to Save File - String Contains Escape Characters

I am using Python (3.5) to loop through some .msg files and extract data from them, including a URL to download a file and a folder that the file should go into. I have successfully extracted the data from the .msg files, but when I try to piece together the absolute file path for the downloaded file, the format ends up weird, with double backslashes and \t\r.
Here's a shortened view of the code:
import os
import re
import shutil
import string
import unicodedata
import urllib.request
import win32com.client

# shortened view: `files` (list of .msg filenames) and `script_dir` are defined earlier
for file in files:
    file_abs_path = script_dir + '/' + file
    print(file_abs_path)
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    msg = outlook.OpenSharedItem(file_abs_path)
    pattern = re.compile(r'(?:^|(?<=\n))[^:<\n]*[:<]\s*([^>\n]*)', flags=re.DOTALL)
    results = pattern.findall(msg.Body)
    # results[0] -> eventID
    regexID = re.compile(r'^[^\/\s]*', flags=re.DOTALL)
    filtered = regexID.findall(results[0])
    eventID = filtered[0]
    # print(eventID)
    # results[1] -> title
    title = results[1].translate(str.maketrans('', '', string.punctuation)).replace(' ', '_')
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
    title = title.decode('UTF-8')
    print(title)
    # results[2] -> account
    regexAcc = re.compile(r'^[^\(\s]*', flags=re.DOTALL)
    filtered = regexAcc.findall(results[2])
    account = filtered[0]
    account = unicodedata.normalize('NFKD', account).encode('ascii', 'ignore')
    account = account.decode('UTF-8')
    # print(account)
    # results[3] -> downloadURL
    downloadURL = results[3]
    # print(downloadURL)
    rel_path = account + '/' + eventID + '_' + title + '.mp4'
    rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
    rel_path = rel_path.decode('UTF-8')
    filename_abs_path = os.path.join(script_dir, rel_path)
    # download the .mp4 from the url and save it locally
    with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    # print item [ID - Title] when done
    print('[Complete] ' + eventID + ' - ' + title)
    del outlook, msg
So, as you can see, I use some regex to extract 4 pieces of data from the .msg. I then do some further fine-tuning on each one and end up with what I need:
eventID
# 123456
title
# Name_of_item_with_underscord_no_punctuation
account
# nameofaccount
downloadURL
# http://download.com/basicurlandfile.mp4
So this is the data I get, and I've printed it off and it doesn't have any weird characters. But then I try to construct the path for the .mp4 (filename and directory):
downloadURL = results[3]
# print(downloadURL)
rel_path = account + '/' + eventID + '_' + title + '.mp4'
rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
rel_path = rel_path.decode('UTF-8')
filename_abs_path = os.path.join(script_dir, rel_path)
# download the .mp4 from the url and save it locally
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
After doing this, the output I get from running the code is:
Traceback (most recent call last):
  File "sfaScript.py", line 65, in <module>
    with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
OSError: [Errno 22] Invalid argument: 'C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4'
TL;DR - QUESTION
So the filename_abs_path somehow got changed to
C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4
I need it to be
C:/Users/Kenny/Desktop/sfa_kenny_batch_1/accountnamehere/123456_Name_of_item_with_underscord_no_punctuation.mp4
Thanks for any help provided!
Looks like your regex captured a tab character (\t) and a carriage-return character (\r) in title.
A quick fix would be:
title = title.strip()
(before composing the filename)
which removes leading and trailing whitespace, including tabs and carriage returns.
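If stray control characters can appear mid-string as well, a slightly more defensive variant (a sketch, not part of the original answer) is a small sanitizing helper applied to every path component:

import re

def sanitize_component(s):
    """Strip outer whitespace and remove any internal control characters."""
    s = s.strip()                       # drop leading/trailing \t, \r, \n, spaces
    return re.sub(r'[\t\r\n]+', '', s)  # remove any that survive mid-string

title = sanitize_component(title)
account = sanitize_component(account)
eventID = sanitize_component(eventID)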
