Why is subprocess with waitpid crashing? - python

I am trying to download urls in parallel with the following:
def parallel_download_files(self, urls, filenames):
    pids = []
    for (url, filename) in zip(urls, filenames):
        pid = os.fork()
        if pid == 0:
            open(filename, 'wb').write(requests.get(url).content)
        else:
            pids.append(pid)
    for pid in pids:
        os.waitpid(pid, os.WNOHANG)
But when executing this with a list of urls and filenames, the machine's memory usage keeps growing until the system crashes. From the documentation, I thought waitpid would handle the children correctly with the options set to os.WNOHANG. This is the first time I am trying to parallelise with forks; I have been doing such tasks with concurrent.futures.ThreadPoolExecutor before.

Using os.fork() is far from ideal here, especially as you're not handling the two processes that each fork() creates: the child never exits, so after its download it falls through to the next loop iteration and forks again, and the number of processes keeps multiplying. Multithreading is far better suited to this use case.
For example:
from concurrent.futures import ThreadPoolExecutor as TPE
from requests import get as GET

def parallel_download_files(urls, filenames):
    def _process(t):
        url, filename = t
        try:
            (r := GET(url)).raise_for_status()
            with open(filename, 'wb') as output:
                output.write(r.content)
        except Exception as e:
            print('Failed: ', url, filename, e)
    with TPE() as executor:
        executor.map(_process, zip(urls, filenames))

urls = ['https://www.bbc.co.uk', 'https://news.bbc.co.uk']
filenames = ['www.txt', 'news.txt']
parallel_download_files(urls, filenames)
Note:
If any filenames are duplicated in the filenames list, you'll need a more complex strategy that ensures you never have more than one thread writing to the same file.
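If you do want to stay with os.fork(), the original code is missing two things: the child has to terminate with os._exit() so it never falls back into the loop and forks again, and the parent should reap children with a blocking waitpid (with os.WNOHANG the call returns immediately, whether or not the child has finished). A minimal sketch of that route, not a recommendation:

import os
import requests

def parallel_download_files(urls, filenames):
    pids = []
    for url, filename in zip(urls, filenames):
        pid = os.fork()
        if pid == 0:
            # child: download one file, then exit so it never re-enters the loop
            try:
                with open(filename, 'wb') as f:
                    f.write(requests.get(url).content)
            finally:
                os._exit(0)
        pids.append(pid)
    for pid in pids:
        os.waitpid(pid, 0)  # block until each child has exited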

Related

How to optimize the below code to read a very large number of files?

I have a folder containing about 5 million files and I have to read the content of each file so that I can form a dataframe. It takes a very long time to do that. Is there any way I can optimize the code below to speed up the process?
import os

new_list = []
file_name = []
count = 0
for root, dirs, files in os.walk('Folder_5M'):
    for file in files:
        count += 1
        file_name.append(file)
        with open(os.path.join(root, file), 'rb') as f:
            text = f.read()
            new_list.append(text)
This is an IO-bound task, so multi-threading is the tool for the job. In Python there are two common ways to do it: one uses a thread pool, and the other uses asyncio, which works with an event loop. The event loop usually has better performance; the challenge is to limit the number of tasks executing at the same time. Fortunately, Andrei wrote a very good solution for this.
This code creates an event loop that reads the files through it. The parameter MAX_NUMBER_OF_THREADS defines how many tasks can execute at the same time. Try playing with this number for better performance, as the best value depends on the machine that runs it.
import os
import asyncio

async def read_file(file_path: str) -> str:
    with open(file_path, "r") as f:
        return f.read()

async def gather_with_concurrency(n, *tasks):
    semaphore = asyncio.Semaphore(n)

    async def sem_task(task):
        async with semaphore:
            return await task

    return await asyncio.gather(*(sem_task(task) for task in tasks))

MAX_NUMBER_OF_THREADS = 100

file_name = []
file_path = []
for path, subdirs, files in os.walk("Folder_5M"):
    for name in files:
        file_path.append(os.path.join(path, name))
        file_name.append(name)

count = len(file_name)
tasks = [read_file(file) for file in file_path]
asyncio.run(gather_with_concurrency(MAX_NUMBER_OF_THREADS, *tasks))
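For the thread-pool route mentioned above, a minimal sketch with concurrent.futures (the folder name is taken from the question; max_workers is a value to tune for your machine):

import os
from concurrent.futures import ThreadPoolExecutor

def read_file(file_path):
    with open(file_path, "rb") as f:
        return f.read()

file_paths = []
for root, dirs, files in os.walk("Folder_5M"):
    for name in files:
        file_paths.append(os.path.join(root, name))

with ThreadPoolExecutor(max_workers=32) as executor:
    new_list = list(executor.map(read_file, file_paths))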
Here's an idea for how you could use multiprocessing for this.
Constructing a list of files resulting from os.walk is likely to be very fast. It's the processing of those files that's going to take time. With multiprocessing you can do a lot of that work in parallel.
Each process opens the given file, processes it and creates a dataframe. When all of the parallel processing has been carried out you then concatenate the returned dataframes. This last part will be CPU intensive and there's no way (that I can think of) that would allow you to share that load.
from pandas import DataFrame, concat
from os import walk
from os.path import join, expanduser
from multiprocessing import Pool

HOME = expanduser('~')

def process(filename):
    try:
        with open(filename) as data:
            df = DataFrame()
            # analyse your data and populate the dataframe here
            return df
    except Exception:
        return DataFrame()

def main():
    with Pool() as pool:
        filenames = []
        for root, _, files in walk(join(HOME, 'Desktop')):
            for file in files:
                filenames.append(join(root, file))
        ar = pool.map_async(process, filenames)
        master = concat(ar.get())
        print(master)

if __name__ == '__main__':
    main()
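As a purely illustrative way of filling in process(), assuming each file's raw text should become one row keyed by its path:

from pandas import DataFrame

def process(filename):
    try:
        with open(filename, errors='ignore') as data:
            # one row per file: the path and its text content
            return DataFrame({'path': [filename], 'text': [data.read()]})
    except Exception:
        return DataFrame()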

How to read and write the same file at the same time in python

There are three python programs: a writer program (writer.py) writes into the file output.txt, and two reader programs (reader_1.py, reader_2.py) read from the same output.txt file at the same time.
What is the best way to achieve synchronization between these three programs?
How can the reader programs avoid reading the output file while the other program is writing to it?
How do I handle the single-writer, multiple-readers problem efficiently in python?
I tried to implement the fcntl locking mechanism, but the module was not found in my python.
writer.py
#!/usr/bin/python
import subprocess
import time

cycle = 10
cmd = "ls -lrt"

def poll():
    with open("/home/output.txt", 'a') as fobj:
        fobj.seek(0)
        fobj.truncate()
        try:
            subprocess.Popen(cmd, shell=True, stdout=fobj)
        except Exception:
            print "Exception Occurred"

# Poll the Data
def do_poll():
    count = int(time.time())
    while True:
        looptime = int(time.time())
        if (looptime - count) >= cycle:
            count = int(time.time())
            print('Begin polling cycle')
            poll()
            print('End polling cycle')

def main():
    do_poll()

if __name__ == "__main__":
    main()
reader_1.py
#!/usr/bin/python
with open("/home/output10.txt", 'r') as fobj:
    f = fobj.read()
    print f
reader_2.py
#!/usr/bin/python
with open("/home/output10.txt", 'r') as fobj:
    f = fobj.read()
    print f
Note: reader_1.py and reader_2.py run continuously in a while loop, so the same file is accessed by all three programs at the same time.
Looking for ideas.
Solution #1: Added the fcntl locking mechanism to the writer.py program, but I am not sure this locks the file efficiently.
#!/usr/bin/python
import subprocess
import time
import os
import fcntl

report_cycle = 2
cmd = 'ls -lrt'

def poll(devnull):
    with open("/home/output10.txt", 'a') as fobj:
        try:
            fcntl.flock(fobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            print "flock() failed to hold an exclusive lock."
        fobj.seek(0)
        fobj.truncate()
        try:
            subprocess.call(cmd, shell=True, stdout=fobj, stderr=devnull)
        except Exception:
            print "Exception Occurred"
        # Unlock file
        try:
            fcntl.flock(fobj, fcntl.LOCK_UN)
        except:
            print "flock() failed to unlock file."

How to run some piece of code in background, after a function returns in python

I am trying to make a background process which will compress a combination of files. I have a Python project with a Flask API. I want compression to start when the Flask API receives a request, and the API response should not wait for the compression thread to finish.
I have this bunch of code, but it waits for the compression to complete:
@server.route("/api/dummy/zip", methods=['GET'])
def compress_file():
    print("zipping starts at", time.time())
    file_path = ""
    file_name = ""
    comObj = compression_handler.CompressionThread()
    comObj.__int__(file_name, file_path)
    logging.info("compression starts")
    print("zipping end at", time.time())
    return "zipping has took "
and the compression_handler python file:
import zipfile
import os
from os.path import basename
import threading
from Main import logging

class CompressionThread:
    def __int__(self, file_name, file_path):
        thread = threading.Thread(
            target=self.compress,
            args=(['/Users/nirmalsarswat/Desktop/jdk-8u171-macosx-x64.dmg',
                   '/Users/nirmalsarswat/Desktop/resume-nirmal-sarswat.pdf'],))
        thread.daemon = True
        thread.start()
        logging.info('thread has started, in progress')

    def compress(self, file_list):
        print("running thread in list")
        try:
            zip_path = '/Users/nirmalsarswat/Documents/newone.zip'
            print(zip_path)
            zip_file = zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED)
            for files in file_list:
                zip_file.write(files, basename(files))
            zip_file.close()
            logging.info("compression of files is done !!!")
        except Exception as e:
            print("exception occurred during compression %s" % e)
The output of the above code looks like this (console debugging):
> Zipping starts at xxxxx
> running thread in list \n
> compression of files is done !!! \n
> thread has started, in progress
> compression starts
> zipping end at xxxxxx
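For comparison, a sketch of the same fire-and-forget idea using a module-level concurrent.futures.ThreadPoolExecutor, so the route handler returns immediately (the paths and the server object below are illustrative stand-ins, not the asker's actual values):

import logging
import zipfile
from concurrent.futures import ThreadPoolExecutor
from os.path import basename

executor = ThreadPoolExecutor(max_workers=2)  # module-level pool, reused across requests

def compress(file_list, zip_path):
    try:
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for f in file_list:
                zip_file.write(f, basename(f))
        logging.info("compression of files is done !!!")
    except Exception as e:
        logging.exception("compression failed: %s", e)

@server.route("/api/dummy/zip", methods=['GET'])
def compress_file():
    executor.submit(compress, ['/tmp/a.pdf', '/tmp/b.dmg'], '/tmp/archive.zip')
    return "zipping has started"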

Downloading Many Images with Python Requests and Multiprocessing

I'm attempting to download a few thousand images using Python and the multiprocessing and requests libs. Things start off fine but about 100 images in, everything locks up and I have to kill the processes. I'm using python 2.7.6. Here's the code:
import re
import requests
import shutil
from multiprocessing import Pool
from urlparse import urlparse

def get_domain_name(s):
    domain_name = urlparse(s).netloc
    new_s = re.sub('\:', '_', domain_name)  # replace colons
    return new_s

def grab_image(url):
    response = requests.get(url, stream=True, timeout=2)
    if response.status_code == 200:
        img_name = get_domain_name(url)
        with open(IMG_DST + img_name + ".jpg", 'wb') as outf:
            shutil.copyfileobj(response.raw, outf)
        del response

def main():
    with open(list_of_image_urls, 'r') as f:
        urls = f.read().splitlines()
    urls.sort()
    pool = Pool(processes=4, maxtasksperchild=2)
    pool.map(grab_image, urls)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()
Edit: After changing the multiprocessing import to multiprocessing.dummy, to use threads instead of processes, I am still experiencing the same problem. It seems I am sometimes hitting a motion-JPEG stream instead of a single image, which is causing the associated problems. To deal with this issue I am using a context manager, and I created a FileTooBigException. While I haven't implemented a check that I've actually downloaded an image file, and some other housekeeping, I thought the code below might be useful for someone:
import os
import socket
from contextlib import closing

class FileTooBigException(requests.exceptions.RequestException):
    """File over LIMIT_SIZE"""

def grab_image(url):
    try:
        img = ''
        with closing(requests.get(url, stream=True, timeout=4)) as response:
            if response.status_code == 200:
                content_length = 0
                img_name = get_domain_name(url)
                img = IMG_DST + img_name + ".jpg"
                with open(img, 'wb') as outf:
                    for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                        outf.write(chunk)
                        content_length = content_length + CHUNK_SIZE
                        if content_length > LIMIT_SIZE:
                            raise FileTooBigException(response)
    except requests.exceptions.Timeout:
        pass
    except requests.exceptions.ConnectionError:
        pass
    except socket.timeout:
        pass
    except FileTooBigException:
        os.remove(img)
        pass
And, any suggested improvements welcome!
There is no point in using multiprocessing for I/O concurrency. In network I/O the thread involved mostly just waits, doing nothing, and Python threads are excellent for doing nothing. So use a thread pool instead of a process pool: each process consumes a lot of resources and is unnecessary for I/O-bound activities, while threads share the process state and are exactly what you are looking for.
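A minimal sketch of that swap, reusing grab_image() and list_of_image_urls from the question; multiprocessing.dummy exposes the same Pool API but backs it with threads (Python 2, as in the question):

from multiprocessing.dummy import Pool  # same API as multiprocessing.Pool, but threads

def main():
    with open(list_of_image_urls, 'r') as f:
        urls = f.read().splitlines()
    urls.sort()
    pool = Pool(8)  # 8 worker threads; tune to taste
    pool.map(grab_image, urls)
    pool.close()
    pool.join()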

How to parallelize file downloads?

I can download a file at a time with:
import urllib.request

urls = ['foo.com/bar.gz', 'foobar.com/barfoo.gz', 'bar.com/foo.gz']
for u in urls:
    urllib.request.urlretrieve(u)
I could try to subprocess it as such:
import subprocess
import os

def parallelized_commandline(command, files, max_processes=2):
    processes = set()
    for name in files:
        processes.add(subprocess.Popen([command, name]))
        if len(processes) >= max_processes:
            os.wait()
            processes.difference_update(
                [p for p in processes if p.poll() is not None])
    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

urls = ['http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.en.gz',
        'http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.cs.gz',
        'http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.de.gz']

parallelized_commandline('wget', urls)
Is there any way to parallelize urlretrieve without using os.system or subprocess to cheat?
Given that I must resort to the "cheat" for now, is subprocess.Popen the right way to download the data?
When using the parallelized_commandline() above, it's using multi-thread but not multi-core for the wget, is that normal? Is there a way to make it multi-core instead of multi-thread?
You could use a thread pool to download files in parallel:
#!/usr/bin/env python3
from multiprocessing.dummy import Pool # use threads for I/O bound tasks
from urllib.request import urlretrieve
urls = [...]
result = Pool(4).map(urlretrieve, urls) # download 4 files at a time
You could also download several files at once in a single thread using asyncio:
#!/usr/bin/env python3
import asyncio
import logging
from contextlib import closing
import aiohttp  # $ pip install aiohttp

@asyncio.coroutine
def download(url, session, semaphore, chunk_size=1<<15):
    with (yield from semaphore):  # limit number of concurrent downloads
        filename = url2filename(url)
        logging.info('downloading %s', filename)
        response = yield from session.get(url)
        with closing(response), open(filename, 'wb') as file:
            while True:  # save file
                chunk = yield from response.content.read(chunk_size)
                if not chunk:
                    break
                file.write(chunk)
        logging.info('done %s', filename)
    return filename, (response.status, tuple(response.headers.items()))

urls = [...]
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

with closing(asyncio.get_event_loop()) as loop, \
     closing(aiohttp.ClientSession()) as session:
    semaphore = asyncio.Semaphore(4)
    download_tasks = (download(url, session, semaphore) for url in urls)
    result = loop.run_until_complete(asyncio.gather(*download_tasks))
where url2filename() is defined here.
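On newer Python (3.7+ with a current aiohttp), the same idea can be written with async/await syntax; a sketch, still assuming url2filename() from the linked answer:

import asyncio
import aiohttp  # $ pip install aiohttp

async def download(url, session, semaphore, chunk_size=1<<15):
    async with semaphore:  # limit number of concurrent downloads
        filename = url2filename(url)
        async with session.get(url) as response:
            with open(filename, 'wb') as file:
                async for chunk in response.content.iter_chunked(chunk_size):
                    file.write(chunk)
        return filename

async def main(urls, limit=4):
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(download(url, session, semaphore) for url in urls))

urls = [...]
asyncio.run(main(urls))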
