Unable to download all documents from eml file - python

I have a .eml file with 3 attachments in it. I was able to download one of the attachment but unable to download all the attachments.
import os
import email
import base64
# Get list of all files
files = [f for f in os.listdir('.') if os.path.isfile(f)]
# Create output directory
if os.path.exists("output"):
pass
else:
os.makedirs("output")
for eml_file in files:
if eml_file.endswith(".eml"):
with open(eml_file) as f:
email = f.read()
ext=".docx"
if ext is not "":
# Extract the base64 encoding part of the eml file
encoding = email.split(ext+'"')[-1]
if encoding:
# Remove all whitespaces
encoding = "".join(encoding.strip().split())
encoding = encoding.split("=", 1)[0]
# Convert base64 to string
if len(encoding) % 4 != 0: #check if multiple of 4
while len(encoding) % 4 != 0:
encoding = encoding + "="
try:
decoded = base64.b64decode(encoding)
except:
print(encoding)
for i in range(100):
print('\n')
# Save it as docx
path = os.path.splitext(eml_file)[0]
if path:
path = os.path.join("output", path + ext)
try:
os.remove(path)
except OSError:
pass
with open(path, "wb") as f:
f.write(decoded)
else:
print("File not done: " + eml_file)
How can I download all the attachments?
edit: I have initialized the eml_file still not downloading all files.

You import the email module. So why do you ignore it and try to write an email parser yourself? In addition:
You can use glob to list all files with a given extension.
Use should have used not operator in the condition: (if not os.path.exists("output"): os.makedirs("output")), but even this is not necessary, because makedirs has exist_ok parameter.
import os
import glob
import email
from email import policy
indir = '.'
outdir = os.path.join(indir, 'output')
os.makedirs(outdir, exist_ok=True)
files = glob.glob(os.path.join(indir, '*.eml'))
for eml_file in files:
# This will not work in Python 2
msg = email.message_from_file(open(eml_file), policy=policy.default)
for att in msg.iter_attachments():
# Tabs may be added for indentation and not stripped automatically
filename = att.get_filename().replace('\t', '')
# Here we suppose for simplicity sake that each attachment has a valid unique filename,
# which, generally speaking, is not true.
with open(os.path.join(outdir, filename), 'wb') as f:
f.write(att.get_content())

Related

Compare dictionary values

As part of a wider project (to learn) I am building a script to discover discovering the files recursively in a folder. Then adding the filename (including the path) and the size in bytes to a CSV file.
I've then loaded that CSV file as a python dictionary.
What I would like to do now, is have python parse over each value in the dictionary (which is the size) and compare it to all others in the dictionary. If it finds a match, I want it to show me which keys (file name) have the matching values. I'll then do an MD5 hash on those that appear to have the same size.
The code below is as far as I've got - can anyone assist please?
#!/usr/bin/env python3
import argparse
import os
import sys
import csv
import fnmatch
def verify_args():
parser = argparse.ArgumentParser(description='Compare files recursively.')
parser.add_argument('path', help='Location to begin file comparison from.')
check = parser.parse_args()
if os.path.isdir(check.path):
print(check.path,'is a valid path - continuing' + '\n')
else:
print(check.path,'is an invalid path - exiting' + '\n')
sys.exit()
return parser.parse_args()
def listfiles(file_path):
print ('Starting comparison')
pattern = '*'
with open('/tmp/foo','w') as fo:
fo.write('file,size' + '\n')
for root, dirs, files in os.walk(file_path):
for filename in fnmatch.filter(files, pattern):
fo.write(os.path.join(root, filename) + ',' + str(os.path.getsize(os.path.join(root, filename))) + '\n')
files = {}
with open('/tmp/foo') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
files[row['file']] = row['size']
x = files.keys()
print(x)
# Not sure now what to do
def main():
args = verify_args()
file_path = args.path
listfiles(file_path)
if __name__ == '__main__':
main()

python How do I import multiple .txt files in a folder to add characters to each .txt file?

There are text files of various names in the folder 'a'. I want to read all of these text files and add the letter 'b' to each text file. What should I do?
cwd = os.getcwd()
input_dir = os.path.join(cwd, "my .txt files dir")
sorts = sorted(glob(input_dir), key = lambda x:(len(x) , x))
for f in sorts :
f = open(input_dir, 'a')
data = "add text"
f.write(data)
f.close()
Append data to file:
- first: get all file in folder a.
- second: find extension with .txt.
- third: open it and do something('append', or 'rewrite').
Demo:
import os
# your .txt files dir
path = 'a'
# append data what you want
appendData = 'b'
fileNames = list(os.walk(path))[0][2]
fileNames.sort(key=len)
fileNums = len(fileNames)
# your dst file extension
fileExt = '.txt'
# # Extract extension from filename
# fileExt = os.path.splitext(fileNames[0])[1]
for fileName in fileNames:
if fileName.endswith(fileExt):
fileFullPath = os.path.join(path, fileName)
with open(fileFullPath, 'a') as f:
f.write(appendData)
Like the others said, this is an easy question that could easily be find on google. Anyway here's how to do it:
from os import listdir
from os.path import isfile, isdir, join
files = [file for file in listdir("files") if isfile(join("files", file))]
directories = [directory for directory in listdir("files") if isdir(join("files", directory))]
print(files)
for file_name in files:
try:
file = open("files/" + file_name, "a")
file.write("b")
file.close()
except IOError as err:
print("Could not open file because : ", err)
Replace "file" with the directory where your files are or the path to that directory like "directory0/directory1/directory_with_files"
Avoid to open files with
f = open(input_dir, 'a')
f.close()
Instead
with open(input_dir, 'a') as inputFile:
Do something
Also what you want is
import os
import glob # We will use this module to open only .txt files
path = 'your/path'
for filename in glob.glob(os.path.join(path, '*.txt'))
with open(filename, 'a') as inputFile:
inputFile.write('b')

How do I apply my python code to all of the files in a folder at once, and how do I create a new name for each subsequent output file?

The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solve your problem:
import os
sourcedir = 'pdfdir'
dl = os.listdir('pdfdir')
for f in dl:
fs = f.split(".")
if fs[1] == "pdf":
path_in = os.path.join(dl,f)
content = getPDFContent(path_in)
encoded = content.encode("utf-8")
path_out = os.path.join(dl,fs[0] + ".txt")
text_file = open(path_out, 'w')
text_file.write(encoded)
text_file.close()
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
"Parse a pdf into text"
content = getPDFContent(filename)
encoded = content.encode("utf-8")
## split of the pdf extension to add .txt instead.
(root, _) = os.path.splitext(filename)
text_file = open(root + ".txt", "w")
text_file.write(encoded)
text_file.close()
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
for path in glob.glob('*.pdf')
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folders and all subfolders for files that end with pdf. Than take those files and then create a text file for it.
It could be something like:
import os
def convert_PDF(path, func):
d = os.path.basename(path)
if os.path.isdir(path):
[convert_PDF(os.path.join(path,x), func) for x in os.listdir(path)]
elif d[-4:] == '.pdf':
funct(path)
# based entirely on your example code
def convert_to_txt(path):
content = getPDFContent(path)
encoded = content.encode("utf-8")
file_path = os.path.dirname(path)
# replace pdf with txt extension
file_name = os.path.basename(path)[:-4]+'.txt'
text_file = open(file_path +'/'+file_name, "w")
text_file.write(encoded)
text_file.close()
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)

Move file to new directory only if it contains specified string

I have 1 folder with thousands of files and I need to loop through every single file and see if that file contains a specific string, once it has concluded that it has a specific string, it must then be moved to the correct folder. So far I have:
for filename in glob.iglob('*.txt'):
f = open(filename)
s = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
if s.find('* Test Outcome : FAIL') != -1:
src_file = os.path.join(dirSTART, filename)
dst_file = os.path.join(dirFAIL, filename)
shutil.move(src_file, dst_file + filename)
At the moment, it only moves the first file but I know for a fact there's more.
Thanks
You can use the os module alone to do this.
import os
import shutil
source_dir = "this/is/source/folder"
dest_dir = "this/is/destination/folder"
for top, dirs, files in os.walk(source_dir):
for filename in files:
if not filename.endswith('.txt'):
continue
file_path = os.path.join(top, filename)
with open(file_path, 'r') as f:
if '* Test Outcome : FAIL' in f.read():
shutil.move(file_path, os.path.join(dest_dir, filename))
CAUTION: Since I don't know much about your code, I am assuming all of the files are txt, that they are small and the string you are matching will always be the same.
from re import compile
pattern = compile("\* Test Outcome : FAIL")
for filename in glob.iglob('*.txt'):
fl = open(filename, 'r')
for i in fl.readlines():
if pattern.search(i):
fl.close()
src_file = os.path.join(dirSTART, filename)
dst_file = os.path.join(dirFAIL, filename)
shutil.move(src_file, dst_file + filename)
break #To stop checking other lines
Use a contextmanager and with to open your files so they will be closed each time:
from mmap import mmap, ACCESS_READ
import contextlib
from os import path
from shutil import move
for filename in glob.iglob('*.txt'):
with open(filename) as f:
with contextlib.closing(mmap(f.fileno(), 0, access=ACCESS_READ)) as s:
if s.find('* Test Outcome : FAIL') != -1:
src_file = path.join(dirSTART, filename)
dst_file = path.join(dirFAIL, filename)
move(src_file, dst_file)
Try to do f.close() after s = mmap.mmap(...)
Are you on Linux? If so, might be quicker to this in a shell command with grep and mv.

I have a ".txt "file which consists of various filenames and I want to search each filename in a folder where these files are actually kept

Suppose I have a text file aiq_hits.txt.
Each line in this file corresponds a filename
ant1.aiq
ant2.aiq
ant3.aiq
ant4.aiq
I want to match each line of my textfile (ant1.aiq,ant2.aiq and so on) with filenames which are present at some specific place(R:\Sample) and extract matching files into some other place (R:\sample\wsa).
I have an idea that I need to use functions like os.walk() and fnmatch.fnmatch(), shutil.copy() but I am not able to implement them
My code:
import os
import shutil
import fnmatch
with open("aiq_hits.txt","r") as in_file:
for line in in_file:
I am stuck here
import os
import shutil
sourceDir = "R:\\Sample"
targetDir = "R:\\Sample\\wsa"
existingFiles = set(f for f in os.listdir(sourceDir) if os.path.isfile(os.path.join(sourceDir, f)))
infilepath = "aiq_hits.txt"
with open(infilepath) as infile:
for line in infile:
fname = line.strip()
if fname not in existingFiles: continue
shutil.move(os.path.join(sourceDir, fname), os.path.join(targetDir, fname))
I hope this will suffice:
import os
def match_files(url,file_read, dest):
f = open(file_read, 'rb')
file_list = os.listdir(url)
print(file_list)
saved_path = os.getcwd()
print("Current working directory is " + saved_path)
os.chdir(url)
match = []
for file_name in f:
file_name = file_name.strip()
if file_name in file_list:
match.append(file_name)
os.rename(os.path.join(url, file_name), os.path.join(dest, file_name))
os.chdir(saved_path)
print match
here, url is source directory or folder from which u want to match files, file_read is the name of file (with path) in which list of file names is given, dest is the destination folder.
this code moves the matching files from url to dest, i.e. these files won't remin in url after running the code.
Alternatively you could use the glob module which allows you to enter in a expression for the file name\extension which will then return a list that you can loop over.
I'd use this module if the source directory can have files with the same extension that you want to exclude from being looped over
Also I'm assuming that the file name list is not large and so storing it in a list wont be an issue
eg (I haven't tested the below )
from glob import glob
import os
import shutil
src = 'R:\\Sample'
dst = "R:\\Sample\\wsa"
in_file_list = "aiq_hits.txt"
list_Of_files = glob(os.path.join(src, 'ant*.aiq'))
data = []
with open(in_file_list) as reader:
data += reader.readlines()
for row in list_Of_files:
file_path, file_name = os.path.split(row)
if file_name in data:
shutil.copy2(row, os.path.join(dst, file_name))
# or if you want to move the file
# shutil.move(row, os.path.join(dst, file_name))

Categories

Resources