Copying files from directory via multiprocessing and shutil - Python

shutil provides one of the simplest ways to copy files/folders
from one directory to another.
A simple way of doing this is:
import shutil

# Source path
src = r'D:\source_path'
# Destination path
dest = r'C:\destination_path\new_folder'
# Copy the content of source to destination
destination = shutil.copytree(src, dest)
The problem with the above is that it copies each file one after another. For a directory containing a thousand files, let alone one on a distant server, this becomes difficult and time consuming.
Applying multiprocessing to this task should save a lot of pain and time.
I am aware of the basic use of multiprocessing features but am not sure how to proceed. I would start like this:
import multiprocessing
import os
import shutil

def copy_instance():
    destination = shutil.copytree(src, dest)

if __name__ == "__main__":
    # printing main program process id
    print("ID of main process: {}".format(os.getpid()))
    # creating processes
    p1 = multiprocessing.Process(target=copy_instance)
    # starting processes
    p1.start()
But this doesn't treat each file as a separate run. Any help, suggestions or links would be helpful.
Edit: I also tried the following, but couldn't make it work. Any suggestions?
import multiprocessing
import os
import shutil

def copy_instance(list):
    dest = r'C:\destination_path\new_folder'
    destination = shutil.copytree(list, dest)
    return destination

if __name__ == "__main__":
    # input list
    sec = r"D:\source_path"
    source = os.listdir(sec)
    list = []
    for ith in range(len(source)):
        list.append(str(sec) + "\\" + str(source[ith]))
    # creating a pool object
    p = multiprocessing.Pool()
    # map list to target function
    result = p.map(copy_instance, list)
    print(result)

What you coded doesn't solve the problem because you're not using multiprocessing properly. The last two lines just create a single process to copy the files, so it works as if you hadn't used multiprocessing at all. What you have to do is create multiple processes to copy the files. One solution is to create one process per file; to do that you'll have to add some steps and stop using copytree, as follows:
import shutil
import multiprocessing
import os

src = r'D:\source_path'
dest = r'C:\destination_path\new_folder'

def copy_instance(file):
    # printing the process id to SHOW that we're actually using MULTIPROCESSING
    print("ID of process: {}".format(os.getpid()))
    shutil.copy(os.path.join(src, file), dest)

if __name__ == "__main__":
    files = os.listdir(src)  # getting the files to copy
    for file in files:
        # creating a process per file
        p1 = multiprocessing.Process(target=copy_instance, args=(file,))
        # starting processes
        p1.start()
Make sure you have permission to copy into the dest directory, and try to use absolute paths for both the source and destination directories.
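If spawning one process per file is too heavy (a directory with thousands of files means thousands of processes), a pool with a fixed number of workers is a common alternative. This is only a sketch, reusing the src and dest paths from the question; the worker count of 8 is an arbitrary choice:

import os
import shutil
from multiprocessing import Pool

src = r'D:\source_path'
dest = r'C:\destination_path\new_folder'

def copy_one(filename):
    # copy a single file from src into dest
    shutil.copy(os.path.join(src, filename), dest)

if __name__ == "__main__":
    os.makedirs(dest, exist_ok=True)
    files = os.listdir(src)
    with Pool(processes=8) as pool:
        pool.map(copy_one, files)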

Related

How to copy list of files in parallel to list of destinations? Python

I have src_list and dst_list, two lists of the same length.
src_list contains paths to existing files.
dst_list contains paths to maybe existing files to maybe overwrite (not folders!).
src_list[i] should correspond to dst_list[i].
I want to copy every src_list[i] to dst_list[i], in (multiprocessing, not threading) parallel.
I want the call to be blocking, meaning all processes should be joined before moving on.
Examples of lists:
src_list = [
    r"/data/one/f1.txt",
    r"/data/one/f2.txt",
    r"/data/two/f3.txt",
    r"/data/three/f4.txt",
    r"/data/four/five/f5.txt",
]
dst_list = [
    r"/out1/f1.txt",
    r"/out2/two/three/pook.txt",
    r"/data/foo/bar/baz/f333.txt",
    r"/kiko/f4",
    r"/data/four/five/f5.txt",
]
Really, just paths to files. Please assume the lists are ~1000-5000 elements long.
What I seem to lack is the correct API call to parallelize over two iterables.
from concurrent.futures import ProcessPoolExecutor
from shutil import copyfile

with ProcessPoolExecutor() as executor:
    executor.map(copyfile, src_list, dst_list)
If you want to limit the number of processes working on this, you can pass a max_workers argument to ProcessPoolExecutor().
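One detail worth knowing: executor.map returns a lazy iterator, so an exception raised inside copyfile only surfaces when you consume the results. A small sketch with an explicit worker cap (8 is an arbitrary choice):

from concurrent.futures import ProcessPoolExecutor
from shutil import copyfile

with ProcessPoolExecutor(max_workers=8) as executor:
    # consuming the iterator re-raises any exception from a failed copy
    results = list(executor.map(copyfile, src_list, dst_list))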
You can try something like this:
from multiprocessing import Process
import shutil

def parallel_copy(src_lst, dst_list):
    if not src_lst or not dst_list or len(src_lst) != len(dst_list):
        raise ValueError('Cannot process inputs.')
    processes = [Process(target=shutil.copyfile, args=(src, dst)) for src, dst in zip(src_lst, dst_list)]
    [p.start() for p in processes]
    [p.join() for p in processes]
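Usage is a single call:

parallel_copy(src_list, dst_list)

Note that this starts one Process per file pair; with the ~1000-5000 element lists from the question that means thousands of simultaneous processes, so a bounded pool (as in the ProcessPoolExecutor answer above) usually scales better.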
Full answer, based on #tibipin's. This one also creates the dst folder structure if needed.
from concurrent.futures import ProcessPoolExecutor
import os
import shutil
from typing import Iterable

def copy_single_file_create_dirs(src, dst):
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copyfile(src, dst)

def copy_src_to_dst_overwrite_parallel(
    existing_src_files: Iterable[str],
    dst_files: Iterable[str],
    max_workers=None,
):
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        executor.map(copy_single_file_create_dirs, existing_src_files, dst_files)
The copy_single_file_create_dirs function can't be defined as an inner function because then it needs to be pickled which doesn't work. This is the cleanest I could get.
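A quick usage sketch, guarded for spawn-based platforms and reusing a couple of the paths from the question:

if __name__ == "__main__":
    src_list = [r"/data/one/f1.txt", r"/data/one/f2.txt"]
    dst_list = [r"/out1/f1.txt", r"/out2/two/three/pook.txt"]
    copy_src_to_dst_overwrite_parallel(src_list, dst_list, max_workers=4)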

Progress bar with multiprocessing

I use the multiprocessing package to run the function run_performance, which loads zip files that each contain several csv files.
I want to display a progress bar properly, with the number of csv files in each zipfile.
With my code, the display is incoherent/wrong:
My code:
from alive_progress import alive_bar
from contextlib import closing
from multiprocessing import Process
from zipfile import ZipFile
import zipfile
import os

def get_filepaths(directory):
    file_paths = []  # List which will store all of the full filepaths.
    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.
    return file_paths

def count_files_7z(myarchive):
    cnt_files = []
    with closing(ZipFile(myarchive)) as archive:
        for csv in archive.namelist():
            cnt_files.append(csv)
    return cnt_files

def run_performance(zipobj):
    zf = zipfile.ZipFile(zipobj)
    cnt = count_files_7z(zipobj)
    with alive_bar(len(cnt)) as bar:
        for f in zf.namelist():
            bar()
            with zf.open(f) as myfile:
                print(myfile)  # and do other things

list_dir = ["path_of_zipfiles"]  # folders containing the zip files
for idx1, folder in enumerate(list_dir):
    get_all_zips = get_filepaths(folder)
    for idx2, zip_file in enumerate(get_all_zips):
        with zipfile.ZipFile(zip_file) as zipobj:
            p = Process(target=run_performance, args=(zipobj.filename,))
            p.start()
    p.join()
My display:
|████▌ | ▄▆█ 1/9 [11%] in 0s (3.3/s, eta: 0s)|████▌ | ▄▆█ 1/9 [11%] in 0s (3.3/s, eta: 0s)|████▌ | ▄▆█ 1/9 [11%] in 0s (3.3/s, eta: 0s
...
If I place the line p.join() as the same indentation as p.start(), the display is correct, but the multiprocessing does not work anymore.
So the script takes too much time:
1m18s vs 0m14s
Desired output:
|████████████████████████████████████████| 1/1 [100%] in 2.4s (0.41/s)
|████████████████████████████████████████| 2/2 [100%] in 4.7s (0.43/s)
|████████████████████ | ▄▂▂ 1/2 [50%] in 2s (0.6/s, eta: 0s)
First, a few general comments concerning your code. In your main process you use a path to a file to open the zip archive just to retrieve back the original file name. That really does not make much sense. Then in count_files_7z you iterate over the return value of zf.namelist() to build a list of the files within the archive when zf.namelist() is already a list of those files. That does not make much sense either. You also use the context manager function closing to ensure that the archive is closed at the end of the block, but the with block itself is a context manager that serves the same purpose.
I tried installing alive-progress and the progress bars were a mess. This is a task better suited to multithreading rather than multiprocessing. Actually, it is probably better suited to serial processing, since doing concurrent I/O operations against your disk, unless it is a solid state drive, is probably going to hurt performance. You will gain performance if there is heavy CPU-intensive processing of the files you read. If that is the case, I have passed each thread a multiprocessing pool to which you can submit calls to apply, specifying functions in which you have placed your CPU-intensive code. But the progress bars should work better when done under multithreading rather than multiprocessing. Even then I could not get any sort of decent display with alive-progress, which admittedly I did not spend too much time on. So I have switched to using the more common tqdm module available from the PyPI repository.
Even with tqdm there is a problem in that when a progress bar reaches 100%, tqdm must be writing something (a newline?) that relocates the other progress bars. Therefore, what I have done is specified leave=False, which causes the bar to disappear when it reaches 100%. But at least you can see all the progress bars without distortion as they are progressing.
from multiprocessing.pool import Pool, ThreadPool
from threading import Lock
import tqdm
from zipfile import ZipFile
import os
import heapq
import time

def get_filepaths(directory):
    file_paths = []  # List which will store all of the full filepaths.
    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.
    return file_paths

def get_free_position():
    """ Return the minimum possible position """
    with lock:
        free_position = heapq.heappop(free_positions)
    return free_position

def return_free_position(position):
    with lock:
        heapq.heappush(free_positions, position)

def run_performance(zip_file):
    position = get_free_position()
    with ZipFile(zip_file) as zf:
        file_list = zf.namelist()
        with tqdm.tqdm(total=len(file_list), position=position, leave=False) as bar:
            for f in file_list:
                with zf.open(f) as myfile:
                    ...  # do things with myfile (perhaps myfile.read())
                    # for CPU-intensive tasks: result = pool.apply(some_function, args=(arg1, arg2, ... argn))
                    time.sleep(.005)  # simulate doing something
                bar.update()
    return_free_position(position)

def generate_zip_files():
    list_dir = ['path1', 'path2']
    for folder in list_dir:
        get_all_zips = get_filepaths(folder)
        for zip_file in get_all_zips:
            yield zip_file

# Required for Windows:
if __name__ == '__main__':
    N_THREADS = 5
    free_positions = list(range(N_THREADS))  # already a heap
    lock = Lock()
    pool = Pool()
    thread_pool = ThreadPool(N_THREADS)
    for result in thread_pool.imap_unordered(run_performance, generate_zip_files()):
        pass
    pool.close()
    pool.join()
    thread_pool.close()
    thread_pool.join()
The code above uses a multiprocessing thread pool arbitrarily limited in size to 5 just as a demo. You can increase or decrease N_THREADS to whatever value you want, but as I said, it may or may not help performance. If you want one thread per zip file then:
if __name__ == '__main__':
    zip_files = list(generate_zip_files())
    N_THREADS = len(zip_files)
    free_positions = list(range(N_THREADS))  # already a heap
    lock = Lock()
    pool = Pool()
    thread_pool = ThreadPool(N_THREADS)
    for result in thread_pool.imap_unordered(run_performance, zip_files):
        pass
    pool.close()
    pool.join()
    thread_pool.close()
    thread_pool.join()
In the Enlighten codebase there is an example of something similar. You would just substitute the process_files() function with your own.
It's a bit large to recreate here, but the idea is you should really only be doing console output in the main process and use some form of IPC to relay the information from subprocesses. The Enlighten example uses queues for IPC, which is pretty reasonable given it's only sending its current count.
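The same queue idea works with tqdm too. Below is a minimal sketch (not the Enlighten example itself) where the workers only push counts and the main process owns the console; worker_task and the placeholder zip paths are assumptions:

import multiprocessing as mp
from zipfile import ZipFile
import tqdm

def worker_task(zip_file, queue):
    # process each archive member, reporting one tick per member to the main process
    with ZipFile(zip_file) as zf:
        for name in zf.namelist():
            with zf.open(name) as member:
                member.read()  # stand-in for the real work
            queue.put(zip_file)  # one member of this archive is done

if __name__ == '__main__':
    zip_files = ['a.zip', 'b.zip']  # placeholder paths
    totals = {}
    for z in zip_files:
        with ZipFile(z) as zf:
            totals[z] = len(zf.namelist())
    bars = {z: tqdm.tqdm(total=totals[z], desc=z, position=i, leave=False)
            for i, z in enumerate(zip_files)}
    queue = mp.Queue()
    procs = [mp.Process(target=worker_task, args=(z, queue)) for z in zip_files]
    for p in procs:
        p.start()
    remaining = sum(totals.values())
    while remaining:
        bars[queue.get()].update()  # only the main process writes to the console
        remaining -= 1
    for p in procs:
        p.join()
    for bar in bars.values():
        bar.close()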
It seems that alive_bar remembers the position of the cursor when it was called, and starts drawing the bar from that point. When you start many processes, each one is not aware of the others and the output gets scrambled.
Indeed, there is an open issue on GitHub about this (see here). There are some hacky solutions for multithreading, but I don't think it will be easy to solve using multiprocessing, unless you implement some kind of interprocess communication, which will slow things down.

line 105, in spawn_main exitcode = _main(fd) while multiprocessing in a for loop

I had a look at many published issues without finding any insight into my current issue.
I am dealing with multiprocessing runs of an external code. This external code eats input files. The file names are gathered in a list that enables me to launch a pool task for each file. A path is also needed.
for i in range(len(file2run)):
    pool.apply_async(runcase, args=(file2run[i], filepath))
The runcase function launches one run for a given input file and analyses and saves the results in some folder.
It works fine whatever the length of file2run is. The external code runs on several processes (as many as maxCPU), defined in the pool with:
pool = multiprocessing.Pool(processes=maxCPU)
My issue is that I'd like to go a step further and integrate this in a for loop. In each loop iteration, several input files are created, and once all of the runs are finished a new set of input files is created and a pool is created again.
It works fine for two loops, but then I encounter the xxx line 105, in spawn_main exitcode = _main(fd) issue, with a bunch of messages above the error about a missing needed module. Same messages for 2 or 1000 input files in each loop...
So I guess it's about the pool creation, but is there a way of clearing the variables between runs? I have tried to create the pool initialization (with the number of CPUs) at the very beginning of the main function but the same issue arises... I have tried to make a sort of equivalent of MATLAB's clear all function, but always the same issue... And why does it work for two loops and not for the third one? Why is the 2nd one working?
Thanks in advance for any help (or to point out to the good already published issue).
Xavfa
Here is a try at an example that actually... works!
I copy/pasted my original script and made it much easier to share, for the sake of understanding the paradigm of my original attempt (the original one deals with objects of several kinds to build the input file and uses an embedded function of one of the objects to launch the external code with subprocess.check_all).
But the example keeps the overall paradigm of making input files in one folder and simulation results in another with the multiprocessing package.
The original still doesn't work, still at the third round of the loop (the if __name__ == '__main__': part of multiproc_test.py).
Here is one script (multiproc_test.py):
import os
import Simlauncher

def RunProcess(MainPath):
    file2run = Simlauncher.initiateprocess(MainPath)
    Simlauncher.RunMultiProc(file2run, MainPath, multi=True, maxcpu=0.7)

def LaunchProcess(nbcase):
    # example that builds the input files
    MainPath = os.getcwd()
    SimDir = os.path.join(os.getcwd(), 'SimFiles\\')
    if not os.path.exists(SimDir):
        os.mkdir(SimDir)
    for i in range(100):
        with open(SimDir + 'inputfile' + str(i) + '.mptest', 'w') as file:
            file.write('Hello World')
    RunProcess(MainPath)

if __name__ == '__main__':
    for i in range(1, 10):
        LaunchProcess(i)
        os.rename(os.path.join(os.getcwd(), 'SimFiles'), os.path.join(os.getcwd(), 'SimFiles' + str(i)))
Here is the other one (Simlauncher.py):
import multiprocessing as mp
import os

def initiateprocess(MainPath):
    filepath = MainPath + '\\SimFiles\\'
    listOfFiles = os.listdir(filepath)
    file2run = []
    for file in listOfFiles:
        if '.mptest' in file:
            file2run.append(file)
    return file2run

def runtestcase(file, filepath):
    filepath = filepath + '\\SimFiles'
    ResSimpath = filepath + '\\SimRes\\'
    if not os.path.exists(ResSimpath):
        os.mkdir(ResSimpath)
    with open(ResSimpath + 'Res_' + file, 'w') as res:
        res.write('I am done')
    print(file + ' is finished')

def RunMultiProc(file2run, filepath, multi, maxcpu):
    print('Launching cases :')
    nbcpu = mp.cpu_count()
    pool = mp.Pool(processes=int(nbcpu * maxcpu))
    for i in range(len(file2run)):
        pool.apply_async(runtestcase, args=(file2run[i], filepath))
    pool.close()
    pool.join()
    print('Done with this one !')
Any help is still needed...
By the way, the external code is EnergyPlus (for building energy simulation).
Xavier
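For reference, the pattern described above of creating the pool once inside the __main__ guard and reusing it across loop iterations might look roughly like the sketch below; it reuses the names from the example scripts and is otherwise an assumption, not a confirmed fix:

import multiprocessing as mp
import os
import Simlauncher

if __name__ == '__main__':
    nbcpu = mp.cpu_count()
    pool = mp.Pool(processes=int(nbcpu * 0.7))  # created once, outside the loop
    MainPath = os.getcwd()
    for i in range(1, 10):
        # ... build the input files for this round here ...
        file2run = Simlauncher.initiateprocess(MainPath)
        results = [pool.apply_async(Simlauncher.runtestcase, args=(f, MainPath))
                   for f in file2run]
        for r in results:
            r.wait()  # wait for this round before preparing the next one
    pool.close()
    pool.join()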

Python multiprocess/multithreading to speed up file copying

I have a program which copies large numbers of files from one location to another - I'm talking 100,000+ files (I'm copying 314g in image sequences at this moment). They're both on huge, VERY fast network storage RAID'd in the extreme. I'm using shutil to copy the files over sequentially and it is taking some time, so I'm trying to find the best way to optimize this. I've noticed some software I use effectively multi-threads reading files off of the network with huge gains in load times so I'd like to try doing this in python.
I have no experience with programming multithreading/multiprocessing - does this seem like the right area to proceed? If so what's the best way to do this? I've looked around a few other SO posts regarding threading file copying in python and they all seemed to say that you get no speed gain, but I do not think this will be the case considering my hardware. I'm nowhere near my IO cap at the moment and resources are sitting around 1% (I have 40 cores and 64g of RAM locally).
EDIT
Been getting some up-votes on this question (now a few years old) so I thought I'd point out one more thing to speed up file copies. In addition to the fact that you can easily 8x-10x copy speeds using some of the answers below (seriously!) I have also since found that shutil.copy2 is excruciatingly slow for no good reason. Yes, even in python 3+. It is beyond the scope of this question so I won't dive into it here (it's also highly OS and hardware/network dependent), beyond just mentioning that by tweaking the copy buffer size in the copy2 function you can increase copy speeds by yet another factor of 10! (however note that you will start running into bandwidth limits and the gains are not linear when multi-threading AND tweaking buffer sizes. At some point it does flat line).
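For the buffer-size tweak mentioned above, one way to do it (a sketch, not necessarily the exact change the author made) is to bypass copy2's default chunk size with shutil.copyfileobj and a larger length, then copy the metadata separately:

import shutil

def copy2_big_buffer(src, dst, buffer_size=16 * 1024 * 1024):
    # same effect as shutil.copy2, but with a 16 MiB copy buffer
    with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
        shutil.copyfileobj(fsrc, fdst, length=buffer_size)
    shutil.copystat(src, dst)  # copy2 = copy + copystat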
UPDATE:
I never did get Gevent working (first answer) because I couldn't install the module without an internet connection, which I don't have on my workstation. However I was able to decrease file copy times by a factor of 8 just using the built-in threading in Python (which I have since learned how to use), and I wanted to post it as an additional answer for anyone interested! Here's my code below; it is probably important to note that my 8x speedup will most likely differ from environment to environment due to your hardware/network setup.
import Queue, threading, os, time
import shutil

fileQueue = Queue.Queue()
destPath = 'path/to/cop'

class ThreadedCopy:
    totalFiles = 0
    copyCount = 0
    lock = threading.Lock()

    def __init__(self):
        with open("filelist.txt", "r") as txt:  # txt with a file per line
            fileList = txt.read().splitlines()
        if not os.path.exists(destPath):
            os.mkdir(destPath)
        self.totalFiles = len(fileList)
        print str(self.totalFiles) + " files to copy."
        self.threadWorkerCopy(fileList)

    def CopyWorker(self):
        while True:
            fileName = fileQueue.get()
            shutil.copy(fileName, destPath)
            fileQueue.task_done()
            with self.lock:
                self.copyCount += 1
                percent = (self.copyCount * 100) / self.totalFiles
                print str(percent) + " percent copied."

    def threadWorkerCopy(self, fileNameList):
        for i in range(16):
            t = threading.Thread(target=self.CopyWorker)
            t.daemon = True
            t.start()
        for fileName in fileNameList:
            fileQueue.put(fileName)
        fileQueue.join()

ThreadedCopy()
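The snippet above is Python 2 (Queue module, print statements). A rough Python 3 equivalent of the same worker-queue pattern, assuming the same filelist.txt layout, might look like this:

import os
import queue
import shutil
import threading

file_queue = queue.Queue()
dest_path = 'path/to/cop'

def copy_worker():
    while True:
        file_name = file_queue.get()
        shutil.copy(file_name, dest_path)
        file_queue.task_done()

def threaded_copy(file_list, n_threads=16):
    os.makedirs(dest_path, exist_ok=True)
    for _ in range(n_threads):
        threading.Thread(target=copy_worker, daemon=True).start()
    for file_name in file_list:
        file_queue.put(file_name)
    file_queue.join()  # block until every queued copy is done

if __name__ == '__main__':
    with open('filelist.txt') as txt:  # one path per line
        threaded_copy(txt.read().splitlines())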
How about using a ThreadPool?
import os
import glob
import shutil
from functools import partial
from multiprocessing.pool import ThreadPool

DST_DIR = '../path/to/new/dir'
SRC_DIR = '../path/to/files/to/copy'

# copy_to_mydir will copy any file you give it to DST_DIR
copy_to_mydir = partial(shutil.copy, dst=DST_DIR)

# list of files we want to copy
to_copy = glob.glob(os.path.join(SRC_DIR, '*'))

with ThreadPool(4) as p:
    p.map(copy_to_mydir, to_copy)
This can be parallelized by using gevent in Python.
I would recommend the following logic to achieve speeding up 100k+ file copying:
Put names of all the 100K+ files, which need to be copied in a csv file, for eg: 'input.csv'.
Then create chunks from that csv file. The number of chunks should be decided based on no.of processors/cores in your machine.
Pass each of those chunks to separate threads.
Each thread sequentially reads filename in that chunk and copies it from one location to another.
Here goes the python code snippet:
import sys
import os
import multiprocessing

from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

def _copyFile(file):
    # over here, you can put your own logic of copying a file from source to destination
    pass

def _worker(csv_file, chunk):
    f = open(csv_file)
    f.seek(chunk[0])
    for file in f.read(chunk[1]).splitlines():
        _copyFile(file)

def _getChunks(file, size):
    f = open(file)
    while 1:
        start = f.tell()
        f.seek(size, 1)
        s = f.readline()
        yield start, f.tell() - start
        if not s:
            f.close()
            break

if __name__ == "__main__":
    if len(sys.argv) > 1:
        csv_file_name = sys.argv[1]
    else:
        print "Please provide a csv file as an argument."
        sys.exit()
    no_of_procs = multiprocessing.cpu_count() * 4
    file_size = os.stat(csv_file_name).st_size
    file_size_per_chunk = file_size / no_of_procs
    pool = Pool(no_of_procs)
    for chunk in _getChunks(csv_file_name, file_size_per_chunk):
        pool.apply_async(_worker, (csv_file_name, chunk))
    pool.join()
Save the file as file_copier.py.
Open terminal and run:
$ ./file_copier.py input.csv
While re-implementing the code posted by @Spencer, I ran into the same error as mentioned in the comments below the post (to be more specific: OSError: [Errno 24] Too many open files).
I solved this issue by moving away from the daemonic threads and using concurrent.futures.ThreadPoolExecutor instead. This seems to handle the opening and closing of the files to copy in a better way. By doing so all the code stayed the same besides the threadWorkerCopy(self, filename_list: List[str]) method, which now looks like this:
def threadWorkerCopy(self, filename_list: List[str]):
    """
    This function initializes the workers to enable the multi-threaded process. The workers are handled automatically by
    ThreadPoolExecutor. More info about multi-threading can be found here: https://realpython.com/intro-to-python-threading/.
    A recurrent problem with the threading here was "OSError: [Errno 24] Too many open files". This was coming from the fact
    that daemon threads were not killed before the end of the script. Therefore, everything opened by them was never closed.

    Args:
        filename_list (List[str]): List containing the names of the files to copy.
    """
    # "cores" and "self.file_queue" are defined elsewhere in the class
    with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
        executor.submit(self.CopyWorker)

        for filename in filename_list:
            self.file_queue.put(filename)
        self.file_queue.join()  # program waits for this process to be done.
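If you don't need the queue/worker machinery at all, a simpler variant of the same idea is to let the executor do the fan-out itself. This is a sketch under the assumption that you have a flat list of source paths and a single destination directory:

import concurrent.futures
import os
import shutil

def copy_files(filename_list, dest_path, max_workers=16):
    os.makedirs(dest_path, exist_ok=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(shutil.copy, f, dest_path) for f in filename_list]
        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raise any copy error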
If you just want to copy a directory tree from one path to another, here's my solution, which is a little simpler than the previous solutions. It leverages multiprocessing.pool.ThreadPool and uses a custom copy function for shutil.copytree:
import shutil
from multiprocessing.pool import ThreadPool

class MultithreadedCopier:
    def __init__(self, max_threads):
        self.pool = ThreadPool(max_threads)

    def copy(self, source, dest):
        self.pool.apply_async(shutil.copy2, args=(source, dest))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pool.close()
        self.pool.join()

src_dir = "/path/to/src/dir"
dest_dir = "/path/to/dest/dir"

with MultithreadedCopier(max_threads=16) as copier:
    shutil.copytree(src_dir, dest_dir, copy_function=copier.copy)
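If the destination directory may already exist, Python 3.8+ also accepts dirs_exist_ok=True in the same call:

with MultithreadedCopier(max_threads=16) as copier:
    shutil.copytree(src_dir, dest_dir, copy_function=copier.copy, dirs_exist_ok=True)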

Run script with loop with different combinations of arguments using python multiprocessing

I'm sorry if this is a duplicate of another question, but I've read other threads that attempt to use multiprocessing and I have to say they only made me more confused (I'm a biologist attempting to deal with lots of data and files on a server and I'm not very familiar with the proper language. My bad!).
What I basically want is to run a loop inside a script simultaneously 5 times so I can take advantage of the fact that I have several CPUs in a server. This would be simple if I didn't have different combinations of arguments as input for this script. The script loops through files (different samples in my experiment) in my folder, creating output names based on the names of these files, and modifying a string that I submit to os.system to run a program. In my program call, I also need to specify a different reference file for each one of my samples and I was doing that by building a dictionary inside my script.
I call my script like this:
run_ProgramXPTO.py list.txt
Where in list.txt I have something like this, which specifies the path to a reference file for each sample file. Let's say I have 5 samples, so I would have:
sampleA /path/to/reference/lion.reference
sampleB /path/to/reference/cat.reference
sampleC /path/to/reference/tiger.reference
sampleD /path/to/reference/cow.reference
sampleE /path/to/reference/dog.reference
Then, inside this script, I add necessary extensions to sample names, create an output name and set an argument with path to reference. My call of this program would be:
do_this_for_me -input sampleA_call.vcf.gz -reference /path/to/reference/lion.reference -output sampleA_call.stats
I was trying to use multiprocessing to make this loop run 5 times simultaneously, but what is happening is that the same input file is run 5 times, instead of the program running 5 times with different input files. So I'm doing something wrong, and did not understand how to use multiprocessing from searching the web...
So, this is what I have so far inside my run_ProgramXPTO.py:
import sys
import os
import glob
import multiprocessing

# this reads a file with paths to references
list = sys.argv[1]

# this makes a dictionary from the input file where for each sample
# I now have a path to another file (reference) in my system
def make_PathDir(list):
    list = open(list, "r")
    mydir = {}
    for line in list:
        row = line.strip().split('\t')
        key = row[0]
        value = row[1]
        mydir.setdefault(key, value)
    return mydir

# call the program specifying, for each input, an output name
# and the path to reference file
def worker(x):
    for i in x:
        name1 = i.strip("./")
        name2 = name1.strip("_call.vcf.gz")
        output = str(name2 + "_call.stats")
        path = PathDir.get(name2)
        command = "bcftools stats -F %s -s - %s > %s" % (path, name1, output)
        os.system(command)
    return

PathDir = make_PathDir(list)

# and here, run my program 5 times for each input file
if __name__ == '__main__':
    jobs = []
    for i in range(5):
        f = glob.glob("./*_call.vcf.gz")
        p = multiprocessing.Process(target=worker, args=[f])
        jobs.append(p)
        p.start()
Many thanks in advance.
A Python 3.2+ solution (I missed the Python 2.7 tag). If it has to be Python 2, we can modify this. This should give you the idea in the meantime. It replaces some of your code with the easier, more Pythonic ways of doing them.
#!/usr/bin/env python3

import sys
import os
import glob
import argparse
import functools
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor as PoolExecutor

NUM_CONCURRENT_WORKERS = 5

def process_sample(sample_to_reference_map, input_filename):
    """Run bcftools stats on input_filename using the correct reference file"""
    # strip the suffix explicitly (str.rstrip strips characters, not a suffix)
    sample_basename = input_filename[:-len('_call.vcf.gz')]
    output_filename = '{}_call.stats'.format(sample_basename)
    reference_filename = sample_to_reference_map[sample_basename]
    command = 'bcftools stats -F {} -s - {} > {}'.format(
        reference_filename,
        input_filename,
        output_filename)
    os.system(command)

def process_args():
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('sample_map')
    return parser.parse_args()

def main():
    args = process_args()

    # Read sample to reference mapping
    with open(args.sample_map) as f:
        sample_to_reference_map = dict(line.strip().split() for line in f)

    # Create a worker function that has the map passed to it
    worker = functools.partial(process_sample, sample_to_reference_map)

    # Use a pool of workers to process samples
    with PoolExecutor(max_workers=NUM_CONCURRENT_WORKERS) as executor:
        # Get a list of sample files to process
        input_files = glob.glob('*_call.vcf.gz')

        # Queue a background job for each file, and keep a job-to-sample
        # map for status
        future_to_sample = {executor.submit(worker, f): f for f in input_files}

        # Print messages for each as they finish
        for future in concurrent.futures.as_completed(future_to_sample):
            print('{} completed'.format(future_to_sample[future]))

if __name__ == '__main__':
    main()
