I have a requirement to calculate the total number of POST requests sent to a server. My script uses one worker per JSON file, each file containing the POST data. Below is a rough code snippet.
statistics = 0

def load_from_file(some_arguments, filename):
    data_list = json.loads(open(filename).read())
    url = address + getUrl(filename, config)
    for data in data_list.get("results"):
        statistics += 1
        r = requests.post(url, data=json.dumps(data), headers=headers,
                          auth=HTTPBasicAuth(username, password))

def load_from_directory(some_arguments, directory):
    pool = mp.Pool(mp.cpu_count() * 2)
    func = partial(load_from_file, some_arguments)
    file_list = [f for f in listdir(directory) if isfile(join(directory, f))]
    pool.map(func, [join(directory, f) for f in file_list])
    pool.close()
    pool.join()
    print "total post requests", statistics
I want to print the total number of POST requests processed by this script. Is this the right way to do it?
Sharing memory is not so simple when using multiple processes, and I don't see the need to use the multiprocessing module here instead of threading. Multiprocessing is mostly used as a workaround for the Global Interpreter Lock (GIL).
In your example the work is I/O bound, so it will probably never saturate the CPU. If you insist on using multiprocessing instead of threading, I suggest taking a look at exchanging-objects-between-processes.
Otherwise, with threading you can simply share the global statistics variable between threads.
import threading

statistics = 0

def load_from_file(some_arguments, filename):
    global statistics
    data_list = json.loads(open(filename).read())
    url = address + getUrl(filename, config)
    for data in data_list.get("results"):
        statistics += 1
        r = requests.post(url, data=json.dumps(data), headers=headers,
                          auth=HTTPBasicAuth(username, password))

def load_from_directory(some_arguments, directory):
    threads = []
    func = partial(load_from_file, some_arguments)
    file_list = [f for f in listdir(directory) if isfile(join(directory, f))]
    for f in file_list:
        # args must be a tuple, hence the trailing comma
        t = threading.Thread(target=func, args=(join(directory, f),))
        t.start()
        threads.append(t)

    # Wait for threads to finish
    for thread in threads:
        thread.join()
    print "total post requests", statistics
Note: this currently spawns one thread per file in the directory, all at once. You might want to implement some kind of throttling for optimal performance.
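One way to get both the throttling and a counter that does not rely on the increment being atomic is a thread pool from multiprocessing.dummy (a thread-backed Pool, despite the module name) plus an explicit lock. A rough sketch, reusing the names from the snippet above (address, getUrl, config, headers and so on are assumed to exist as in the question); the pool size of 8 is an arbitrary choice:

import threading
from multiprocessing.dummy import Pool  # thread pool with the multiprocessing.Pool API

statistics = 0
statistics_lock = threading.Lock()  # make the shared counter explicitly thread-safe

def load_from_file(some_arguments, filename):
    global statistics
    data_list = json.loads(open(filename).read())
    url = address + getUrl(filename, config)
    for data in data_list.get("results"):
        with statistics_lock:
            statistics += 1
        requests.post(url, data=json.dumps(data), headers=headers,
                      auth=HTTPBasicAuth(username, password))

def load_from_directory(some_arguments, directory):
    func = partial(load_from_file, some_arguments)
    files = [join(directory, f) for f in listdir(directory) if isfile(join(directory, f))]
    pool = Pool(8)  # throttle: at most 8 files are processed concurrently
    pool.map(func, files)
    pool.close()
    pool.join()
    print("total post requests: %d" % statistics)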
Related
I have a folder containing about 5 million files and I have to read the content of each file so that I can form a dataframe. It takes a very long time to do that. Is there any way I can optimize the code below to speed up the process?
import os

new_list = []
file_name = []
count = 0
for root, dirs, files in os.walk('Folder_5M'):
    for file in files:
        count += 1
        file_name.append(file)
        with open(os.path.join(root, file), 'rb') as f:
            text = f.read()
            new_list.append(text)
This is an I/O-bound task, so multi-threading is the tool for the job. In Python there are two ways to implement it: one using a thread pool, and the other using asyncio, which works with an event loop. The event loop usually has better performance; the challenge is limiting the number of reads executing at the same time. Fortunately, Andrei wrote a very good solution for this.
This code creates an event loop that reads the files in several threads. The parameter MAX_NUMBER_OF_THREADS defines how many reads can be in flight at the same time. Try playing with this number for better performance, as the optimum depends on the machine that runs it.
import os
import asyncio

async def read_file(file_path: str) -> str:
    # open()/read() are blocking calls, so hand the actual read to a worker
    # thread (asyncio.to_thread requires Python 3.9+), otherwise nothing
    # overlaps and the semaphore has no effect.
    def _read() -> str:
        with open(file_path, "r") as f:
            return f.read()
    return await asyncio.to_thread(_read)

async def gather_with_concurrency(n, *tasks):
    semaphore = asyncio.Semaphore(n)

    async def sem_task(task):
        async with semaphore:
            return await task

    return await asyncio.gather(*(sem_task(task) for task in tasks))

MAX_NUMBER_OF_THREADS = 100

file_name = []
file_path = []
for path, subdirs, files in os.walk("Folder_5M"):
    for name in files:
        file_path.append(os.path.join(path, name))
        file_name.append(name)

count = len(file_name)
tasks = [read_file(path) for path in file_path]
contents = asyncio.run(gather_with_concurrency(MAX_NUMBER_OF_THREADS, *tasks))
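Since the point of the original question was to build a dataframe, the gathered contents can be combined afterwards. A minimal sketch, assuming pandas and that one row per file (name plus raw text) is enough; file_name and contents come from the snippet above:

import pandas as pd

df = pd.DataFrame({"file_name": file_name, "content": contents})
print(df.shape)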
Here's an idea for how you could use multiprocessing for this.
Constructing a list of files resulting from os.walk is likely to be very fast. It's the processing of those files that's going to take time. With multiprocessing you can do a lot of that work in parallel.
Each process opens the given file, processes it and creates a dataframe. When all of the parallel processing has been carried out, you then concatenate the returned dataframes. This last part will be CPU intensive, and there's no way (that I can think of) to share that load.
from pandas import DataFrame, concat
from os import walk
from os.path import join, expanduser
from multiprocessing import Pool

HOME = expanduser('~')

def process(filename):
    try:
        with open(filename) as data:
            df = DataFrame()
            # analyse your data and populate the dataframe here
            return df
    except Exception:
        return DataFrame()

def main():
    with Pool() as pool:
        filenames = []
        for root, _, files in walk(join(HOME, 'Desktop')):
            for file in files:
                filenames.append(join(root, file))
        ar = pool.map_async(process, filenames)
        master = concat(ar.get())
        print(master)

if __name__ == '__main__':
    main()
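For concreteness, here is one way the process() placeholder could be filled in, under the assumption (not stated in the question) that each file parses as a small CSV; swap in your real parsing logic:

from pandas import read_csv, DataFrame

def process(filename):
    try:
        # assumption: each file is a CSV; one dataframe per file
        return read_csv(filename)
    except Exception:
        return DataFrame()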
I am trying to create a ThreadPoolExecutor to pull data from a server; however, when the executor runs it makes many requests to the server at the same time, but the server can only handle 4 requests at a time. So my task is to make the ThreadPoolExecutor issue only 4 requests at any time. Below I have a minimum working example, but how can I limit the total number of requests in flight at a given time?
import requests
import concurrent.futures

# Files
files = [r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_CanESM2_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_CSIRO-Mk3-6-0_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_MIROC5_historical+rcp45_r3i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_MPI-ESM-LR_historical+rcp45_r3i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_MRI-CGCM3_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_GFDL-ESM2G_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]',
         r'https://data.pacificclimate.org/data/downscaled_gcms/tasmin_day_BCCAQv2+ANUSPLIN300_HadGEM2-ES_historical+rcp45_r1i1p1_19500101-21001231.nc.nc?tasmin[0:55114][152:152][290:290]']

# List of climate model names corresponding to the files
climate_model = ['CanESM2', 'CNRM-CM5', 'CSIRO-Mk3-6-0', 'CCSM4', 'MIROC5', 'MPI-ESM-LR', 'MRI-CGCM3', 'GFDL-ESM2G', 'HadGEM2-ES']

def min_function(url, climate_model):
    r = requests.get(url)
    filename = 'tasmin85_' + f'{climate_model}' + '.nc'
    with open(filename, 'wb') as f:
        f.write(r.content)

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    executor.map(min_function, files, climate_model)
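For what it's worth, ThreadPoolExecutor already provides the throttle being asked for: max_workers is the number of calls that can run at once, so raising it to 4 (an assumption about the intended limit, based on the question) keeps at most 4 requests in flight. A minimal sketch, reusing min_function, files and climate_model from the snippet above:

# the executor never runs more tasks than it has workers, so 4 workers
# means at most 4 simultaneous requests against the server
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(min_function, files, climate_model)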
I'm attempting to download a few thousand images using Python and the multiprocessing and requests libs. Things start off fine but about 100 images in, everything locks up and I have to kill the processes. I'm using python 2.7.6. Here's the code:
import re
import requests
import shutil
from multiprocessing import Pool
from urlparse import urlparse

def get_domain_name(s):
    domain_name = urlparse(s).netloc
    new_s = re.sub('\:', '_', domain_name)  # replace colons
    return new_s

def grab_image(url):
    response = requests.get(url, stream=True, timeout=2)
    if response.status_code == 200:
        img_name = get_domain_name(url)
        with open(IMG_DST + img_name + ".jpg", 'wb') as outf:
            shutil.copyfileobj(response.raw, outf)
        del response

def main():
    with open(list_of_image_urls, 'r') as f:
        urls = f.read().splitlines()
    urls.sort()
    pool = Pool(processes=4, maxtasksperchild=2)
    pool.map(grab_image, urls)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()
Edit: After changing the multiprocessing import to multiprocessing.dummy, to use threads instead of processes, I am still experiencing the same problem. It turns out I'm sometimes hitting a Motion JPEG stream instead of a single image, which causes the hang. To deal with this I'm using a context manager and I created a FileTooBigException. While I haven't implemented a check that I've actually downloaded an image file, plus some other housekeeping, I thought the code below might be useful for someone:
import os
import socket
from contextlib import closing

class FileTooBigException(requests.exceptions.RequestException):
    """File over LIMIT_SIZE"""

def grab_image(url):
    try:
        img = ''
        with closing(requests.get(url, stream=True, timeout=4)) as response:
            if response.status_code == 200:
                content_length = 0
                img_name = get_domain_name(url)
                img = IMG_DST + img_name + ".jpg"
                with open(img, 'wb') as outf:
                    for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                        outf.write(chunk)
                        content_length = content_length + CHUNK_SIZE
                        if(content_length > LIMIT_SIZE):
                            raise FileTooBigException(response)
    except requests.exceptions.Timeout:
        pass
    except requests.exceptions.ConnectionError:
        pass
    except socket.timeout:
        pass
    except FileTooBigException:
        os.remove(img)
        pass
And, any suggested improvements welcome!
There is no point in using multiprocessing for I/O concurrency. With network I/O the thread involved mostly just waits, doing nothing, and Python threads are excellent at doing nothing. So use a thread pool instead of a process pool. Each process consumes a lot of resources and is unnecessary for I/O-bound activities, while threads share the process state and are exactly what you are looking for.
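A minimal sketch of that swap, reusing grab_image() and list_of_image_urls from the question: multiprocessing.dummy exposes the same Pool API but backed by threads, so only the import changes (and the maxtasksperchild argument is dropped, since the thread pool does not accept it):

from multiprocessing.dummy import Pool  # same Pool API, but backed by threads

def main():
    with open(list_of_image_urls, 'r') as f:
        urls = f.read().splitlines()
    urls.sort()
    pool = Pool(processes=4)    # 4 worker threads instead of 4 processes
    pool.map(grab_image, urls)  # grab_image is the function from the question
    pool.close()
    pool.join()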
I have a file to download (the download path is extracted from JSON, e.g. http://testsite/abc.zip).
I need help making all 5 threads download the abc.zip file to the output directory, and the download has to be asynchronous or concurrent.
Currently, with the code below, it does download the file 5 times, but one by one (synchronously).
What I want is for the downloads to be simultaneous.
def dldr(file=file_url, outputdir=out1):
    local_fn = str(uuid.uuid4())
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    s = datetime.now()
    urllib.urlretrieve(file, outputdir + os.sep + local_fn)
    e = datetime.now()
    time_diff = e - s
    logger(out1, local_fn, time_diff)

for i in range(1, 6):
    t = threading.Thread(target=dldr())
    t.start()
I have read the "Requests with multiple connections" post and it's helpful, but it doesn't address the requirement of this question.
I use the threading module for the download threads.
I also use requests, but you can change that to urllib yourself.
import threading
import requests

def download(link, filelocation):
    r = requests.get(link, stream=True)
    with open(filelocation, 'wb') as f:
        for chunk in r.iter_content(1024):
            if chunk:
                f.write(chunk)

def createNewDownloadThread(link, filelocation):
    download_thread = threading.Thread(target=download, args=(link, filelocation))
    download_thread.start()

for i in range(0, 5):
    file = "C:\\test" + str(i) + ".png"
    print file
    createNewDownloadThread("http://stackoverflow.com/users/flair/2374517.png", file)
I can download a file at a time with:
import urllib.request

urls = ['foo.com/bar.gz', 'foobar.com/barfoo.gz', 'bar.com/foo.gz']
for u in urls:
    urllib.request.urlretrieve(u)
I could try to subprocess it as such:
import subprocess
import os

def parallelized_commandline(command, files, max_processes=2):
    processes = set()
    for name in files:
        processes.add(subprocess.Popen([command, name]))
        if len(processes) >= max_processes:
            os.wait()
            processes.difference_update(
                [p for p in processes if p.poll() is not None])
    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

urls = ['http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.en.gz',
        'http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.cs.gz',
        'http://www.statmt.org/wmt15/training-monolingual-nc-v10/news-commentary-v10.de.gz']

parallelized_commandline('wget', urls)
Is there any way to parallelize urlretrieve without using os.system or subprocess to cheat?
Given that I must resort to the "cheat" for now, is subprocess.Popen the right way to download the data?
When using parallelized_commandline() above, it's using multiple threads but not multiple cores for wget, is that normal? Is there a way to make it multi-core instead of multi-threaded?
You could use a thread pool to download files in parallel:
#!/usr/bin/env python3
from multiprocessing.dummy import Pool # use threads for I/O bound tasks
from urllib.request import urlretrieve
urls = [...]
result = Pool(4).map(urlretrieve, urls) # download 4 files at a time
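Note that urlretrieve(url) with no second argument saves to a temporary file with a generated name. If you want to choose the local filenames yourself, a tiny wrapper keeps the one-liner spirit; deriving the name from the last path component is just one possible convention:

import os.path
from multiprocessing.dummy import Pool
from urllib.request import urlretrieve

def fetch(url):
    filename = os.path.basename(url)   # naive choice of local name
    return urlretrieve(url, filename)

urls = [...]
result = Pool(4).map(fetch, urls)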
You could also download several files at once in a single thread using asyncio:
#!/usr/bin/env python3
import asyncio
import logging
from contextlib import closing
import aiohttp  # $ pip install aiohttp

@asyncio.coroutine
def download(url, session, semaphore, chunk_size=1<<15):
    with (yield from semaphore):  # limit number of concurrent downloads
        filename = url2filename(url)
        logging.info('downloading %s', filename)
        response = yield from session.get(url)
        with closing(response), open(filename, 'wb') as file:
            while True:  # save file
                chunk = yield from response.content.read(chunk_size)
                if not chunk:
                    break
                file.write(chunk)
        logging.info('done %s', filename)
    return filename, (response.status, tuple(response.headers.items()))

urls = [...]
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
with closing(asyncio.get_event_loop()) as loop, \
        closing(aiohttp.ClientSession()) as session:
    semaphore = asyncio.Semaphore(4)
    download_tasks = (download(url, session, semaphore) for url in urls)
    result = loop.run_until_complete(asyncio.gather(*download_tasks))
where url2filename() is defined here.
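On current Python versions the same approach is usually written with async/await and aiohttp's async context managers instead of the legacy @asyncio.coroutine / yield from style. A rough modern equivalent sketch, still assuming url2filename() from the linked answer:

import asyncio
import aiohttp  # $ pip install aiohttp

async def download(url, session, semaphore, chunk_size=1 << 15):
    async with semaphore:  # limit number of concurrent downloads
        filename = url2filename(url)
        async with session.get(url) as response:
            with open(filename, 'wb') as file:
                async for chunk in response.content.iter_chunked(chunk_size):
                    file.write(chunk)
        return filename, response.status

async def main(urls, limit=4):
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(download(u, session, semaphore) for u in urls))

# result = asyncio.run(main(urls))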