I am trying to modify the solution shown here: What is the fastest way to send 100,000 HTTP requests in Python? except that instead of checking header status I am making an API request which returns a dictionary and I would like the end result of all of these API requests to be a list of all of the dictionaries.
Here is my code -- consider that api_calls is a list that has each url to open for the json request...
import sys
import json
from threading import Thread
from Queue import Queue
from urllib2 import urlopen

concurrent = 200

def doWork():
    while True:
        url = q.get()
        result = makeRequest(url[0])
        doSomethingWithResult(result, url)
        q.task_done()

def makeRequest(ourl):
    try:
        api_call = urlopen(ourl).read()
        result = json.loads(api_call)
        return result, ourl
    except:
        return "error", ourl

def doSomethingWithResult(result, url):
    print(url, result)

# start the worker threads
q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

# feed the queue and wait for all work to finish
try:
    for url in api_calls:
        q.put(url)
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Like the linked example, this currently prints the url, result pair on each line successfully. What I would like to do instead is add each (url, result) to a list in its thread and then join them into one master list at the end. I cannot figure out how to maintain this master list and join the results at the end. Can anybody help with what I should modify in doSomethingWithResult? If I were doing one large loop, I would just have an empty list and append the result to it after each API request, but I do not know how to mimic this now that I am using threads.
I expect that a common response will be to use https://en.wikipedia.org/wiki/Asynchronous_I/O and if this is the suggestion, then I would appreciate somebody actually providing an example that accomplishes as much as the code I have linked above.
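For reference, a minimal sketch of the shared-list idea described above, under the same threading setup (list appends are atomic in CPython, but the lock makes the intent explicit; results and results_lock are names introduced here for illustration):

from threading import Lock

results = []            # master list shared by all worker threads
results_lock = Lock()

def doSomethingWithResult(result, url):
    with results_lock:
        results.append((url, result))

# once q.join() returns, `results` holds one (url, result) pair per request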
Use a ThreadPool instead. It does the heavy lifting for you. Here is a working example that fetches a few urls.
import json
import multiprocessing.pool
from urllib2 import urlopen

concurrent = 200

def makeRequest(ourl):
    try:
        api_call = urlopen(ourl).read()
        result = json.loads(api_call)
        return "success", ourl
    except:
        return "error", ourl

def main():
    api_calls = [
        'http://jsonplaceholder.typicode.com/posts/{}'.format(i)
        for i in range(1, 5)]

    # a thread pool that implements the process pool API.
    pool = multiprocessing.pool.ThreadPool(processes=concurrent)
    return_list = pool.map(makeRequest, api_calls, chunksize=1)
    pool.close()
    for status, data in return_list:
        print(data)

main()
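Note that pool.map() returns its results in the same order as api_calls, so if makeRequest is changed to return the parsed dictionary rather than the "success" marker, return_list is already the master list of (url, dictionary) pairs the question asks for. A sketch of that adaptation:

def makeRequest(ourl):
    try:
        result = json.loads(urlopen(ourl).read())
        return ourl, result          # keep each url paired with its parsed dictionary
    except Exception:
        return ourl, "error"

pool = multiprocessing.pool.ThreadPool(processes=concurrent)
master_list = pool.map(makeRequest, api_calls, chunksize=1)  # [(url, dict), ...]
pool.close()
pool.join()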
I am trying to run the two loops simultaneously. The second loop depends on the output of the first one: it fetches its input from the ids list, so it should not have to wait for the first loop to finish completely. I tried several libraries and methods but could not find a workable structure for this.
import time
import pandas as pd
import requests
import json
from matplotlib import pyplot
import seaborn as sns
import numpy as np

API_KEY = ''
df = pd.read_csv('lat_long file')

# get name and information of each place
id = df['id']
lat = df['latitude']
lon = df['longitude']

ids = []
loc = []
unit = []

print('First API now running')

def get_details(lat, lon):
    try:
        url = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" + str(lat) + ',' + str(lon) + '&key=' + API_KEY
        response = requests.get(url)
        data = json.loads(response.text)
        ids.append(data['results'][0]['place_id'])
    except Exception as e:
        print('This code is NOT running because of', e)
    return data

def get_deta(ids):
    url1 = "https://maps.googleapis.com/maps/api/place/details/json?language=en-US&placeid=" + str(ids) + "&key=" + API_KEY
    responsedata = requests.get(url1)
    data2 = json.loads(responsedata.text)
    if 'business_status' in data2['result'].keys():
        loc.append((data2['result']['business_status']))
    else:
        loc.append('0')
    flag = False
    if data2['result']:
        for level in data2['result']['address_components']:
            #if len(level['types']) > 1:
            if level['types'][0] == 'premise':
                flag = True
                unit.append(level['long_name'][4:])
    else:
        print(data2)
    if not flag:
        unit.append('0')
    return data2

def loop1():
    for i in range(len(id)):
        get_details(lat[i], lon[i])
    return

print('Second API now running')

def loop2():
    # printing and appending addresses to use them with the next API
    for i in range(50):
        get_deta(ids[i])
    return

loop1()
loop2()
It is not very clear what you are trying to achieve here. How exactly does the second API depend on the first?
To achieve concurrency you could use the asyncio library, which is designed to perform concurrent network requests efficiently. However, the requests library you are using is synchronous, so you would need to switch to an asynchronous one such as aiohttp.
Given that, you can communicate between two concurrent tasks using asyncio.Queue. Here is a draft of what your program could look like:
import asyncio
import aiohttp
import json

async def get_details(lat, lon, session: aiohttp.ClientSession, id_queue: asyncio.Queue):
    url: str = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={API_KEY}"
    async with session.get(url) as response:
        data = json.loads(await response.text())
        await id_queue.put(data['results'][0]['place_id'])

async def get_data(id, session: aiohttp.ClientSession, loc_queue: asyncio.Queue):
    # Network request and JSON decoding for the place-details API
    ...
    await loc_queue.put(data['result']['business_status'])

async def loop_1(coords, session, id_queue: asyncio.Queue):
    await asyncio.gather(
        *[get_details(lat, lon, session, id_queue) for lat, lon in coords]
    )

async def loop_2(session, id_queue: asyncio.Queue, loc_queue: asyncio.Queue):
    while True:
        id = await id_queue.get()
        await get_data(id, session, loc_queue)

async def main():
    # coords: list of (lat, lon) pairs built from the dataframe
    id_queue = asyncio.Queue(maxsize=100)
    loc_queue = asyncio.Queue(maxsize=100)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            loop_1(coords, session, id_queue),
            loop_2(session, id_queue, loc_queue),
        )

if __name__ == "__main__":
    asyncio.run(main())
I simplified your code for the purpose of the example. If you take a look at the main() function, the two loops are executed concurrently with asyncio.gather(). The first loop gets the details of all places concurrently (again with asyncio.gather) and feeds a shared queue, id_queue. The second loop waits for new ids to come up in the queue and processes them with the second API as soon as they are available. It then enqueues the results in a final queue, loc_queue.
You could extend this program by plugging a third API into this last queue and continuing the processing.
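One detail the draft leaves open is how loop_2 ever stops, since it waits on id_queue forever. A common way to handle that (a sketch, using a None sentinel that is not part of the original code) is to have loop_1 push a sentinel once all details have been fetched:

async def loop_1(coords, session, id_queue: asyncio.Queue):
    await asyncio.gather(
        *[get_details(lat, lon, session, id_queue) for lat, lon in coords]
    )
    await id_queue.put(None)   # signal loop_2 that no more ids will arrive

async def loop_2(session, id_queue: asyncio.Queue, loc_queue: asyncio.Queue):
    while True:
        id = await id_queue.get()
        if id is None:         # sentinel: loop_1 has finished
            break
        await get_data(id, session, loc_queue)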
I am trying to make my code run faster for finding Roblox account names. I tried using larger and larger event loops (each one basically built on top of the previous one), but that resulted in the same, if not worse, performance compared to using a single small event loop.
This code was supplied in another question of mine (with modifications from me here). It works great, but it still can take a good few minutes to handle larger quantities of accounts. Usually I wouldn't care, but I am trying to get to 100,000 accounts, so I need performance. Is this just how fast it can go? Or can we drive this even further? Is the answer just more CPU/memory? Better internet? Do I need network programming at all, or is there a faster, no-request way?
Code:
import asyncio
import aiohttp

async def find_account(url, session, id):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                r = await response.read()
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(r, 'html.parser')
                h2 = []
                for i in soup.find_all('h2'):
                    h2.append(i)
                print('Done')
                return str(list(list(h2)[0])[0]) + ' ' + str(url)
            else:
                return 'This account does not exist ID: {}'.format(id)
    except aiohttp.ServerDisconnectedError:
        print('Done')
        return find_account(url, session, id)

async def main(min_id, max_id):
    tasks = []
    async with aiohttp.ClientSession() as session:
        for id in range(min_id, max_id):
            url = f'https://web.roblox.com/users/{str(id)}/profile'
            tasks.append(asyncio.create_task(find_account(url=url, session=session, id=id)))
        return await asyncio.gather(*tasks)

from time import time

loop = asyncio.get_event_loop()
starting = int(input("Type Your Starting Id Number>> "))
ending = int(input("Type Your Ending Id Number>> "))
timer = time()
users = loop.run_until_complete(main(starting, ending))
users = [i for i in users if i != '1']
print(users)
print(time() - timer)
You could run BeautifulSoup in multiple processes to speed it up. For example, you can extract the part of find_account that does the parsing and pass that to a process pool executor:
import concurrent.futures

_pool = concurrent.futures.ProcessPoolExecutor()

def parse(html):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    h2 = []
    for i in soup.find_all('h2'):
        h2.append(i)
    return str(list(list(h2)[0])[0])

async def find_account(url, session, id):
    while True:
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    r = await response.read()
                    loop = asyncio.get_event_loop()
                    extracted = await loop.run_in_executor(_pool, parse, r)
                    print('Done')
                    return extracted + ' ' + str(url)
                else:
                    return 'This account does not exist ID: {}'.format(id)
        except aiohttp.ServerDisconnectedError:
            print('Done')
            # keep looping
On an unrelated note, your recursive call to find_account() was incorrect because it was missing an await. The above code fixes that and switches to a loop instead, which makes it a bit more explicit that the code is in fact looping.
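If the goal is on the order of 100,000 accounts, it can also help to cap how many requests are in flight at once instead of letting every task hit the network immediately; a minimal sketch using asyncio.Semaphore (the limit of 100 is an arbitrary assumption, and find_account_limited is a name introduced here):

import asyncio

sem = asyncio.Semaphore(100)   # assumed cap on concurrent requests

async def find_account_limited(url, session, id):
    # at most 100 coroutines get past this point at the same time
    async with sem:
        return await find_account(url, session, id)

# in main(), create tasks with find_account_limited(...) instead of find_account(...)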
I have a strange situation and cannot figure it out after lots of trial and error. I am using multi-threading (10 threads) for reading URLs (100 of them) and it works fine in most cases, but in some situations it gets stuck at the last thread. I waited to see if it would return, and it took a long time (1050 seconds), whereas the other nine threads returned within 25 seconds. It shows something is wrong with my code, but I can't figure out what. Any ideas?
Note1: It happens for both daemon and non-daemon threads.
Note2: The number of URLs and threads changes. I tried different numbers of URLs from 10-100 and various thread counts from 5-50.
Note3: The URLs are most of the time completely different.
import urllib2
import Queue
import threading
from goose import Goose
input_queue = Queue.Queue()
result_queue = Queue.Queue()
Thread Worker:
def worker(input_queue, result_queue):
    queue_full = True
    while queue_full:
        try:
            url = input_queue.get(False)
            # read the url using urllib2 and goose
            # process it
            # result_queue.put(updated value)
        except Queue.Empty:
            queue_full = False
Main process:
for url in urls:
    input_queue.put(url)

thread_count = 5

for t in range(thread_count):
    t = threading.Thread(target=worker, args=(input_queue, result_queue))
    t.start()

for url in urls:
    url = result_queue.get()  # updates url
The process gets blocked at the last result_queue.get() call.
NOTE: I am more interested in what I am doing wrong here, in case someone can point that out? Because I tend to think that I wrote correct code but apparently that's not the case.
You can use ThreadPoolExecutor from concurrent.futures.
import requests
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 50

def worker(url):
    response = requests.get(url)
    return response.content

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    results = executor.map(worker, urls)

for result in results:
    print(result)
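executor.map() yields results in the same order as urls, so zip(urls, results) keeps each URL paired with its result. If you prefer to process results as they finish, submit() with as_completed() is an alternative (a sketch built on the same worker function):

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # one Future per URL, remembering which URL it belongs to
    future_to_url = {executor.submit(worker, url): url for url in urls}
    for future in as_completed(future_to_url):
        print(future_to_url[future], len(future.result()))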
For example, here I take the URLs to be a list of numbers:
import urllib2
import Queue
import threading
#from goose import Goose

input_queue = Queue.Queue()
result_queue = Queue.Queue()

def worker(input_queue, result_queue):
    while not input_queue.empty():
        try:
            url = input_queue.get(False)
            updated_value = int(url) * 9
            result_queue.put(updated_value)
        except Queue.Empty:
            pass

urls = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for url in urls:
    input_queue.put(url)

thread_count = 5

threads = []
for i in range(thread_count):
    t = threading.Thread(target=worker, args=(input_queue, result_queue))
    t.start()
    threads.append(t)

# wait for every worker, not just the last one started
for t in threads:
    t.join()

for url in urls:
    try:
        url = result_queue.get()
        print url
    except Queue.Empty:
        pass
Output
9
18
27
36
45
54
63
72
81
I am working on creating an HTTP client which can generate hundreds of connections each second and send up to 10 requests on each of those connections. I am using threading so that concurrency can be achieved.
Here is my code:
def generate_req(reqSession):
    requestCounter = 0
    while requestCounter < requestRate:
        try:
            response1 = reqSession.get('http://20.20.1.2/tempurl.html')
            if response1.status_code == 200:
                client_notify('r')
        except (exceptions.ConnectionError, exceptions.HTTPError, exceptions.Timeout) as Err:
            client_notify('F')
            break
        requestCounter += 1

def main():
    for q in range(connectionPerSec):
        s1 = requests.session()
        t1 = threading.Thread(target=generate_req, args=(s1,))
        t1.start()
Issues:
1) It is not scaling above 200 connections/sec with requestRate = 1. I ran other available HTTP clients on the same client machine against the same server; those tests run fine and are able to scale.
2) When requestRate = 10, connections/sec drops to 30. Reason: not able to create the targeted number of threads every second.
For issue #2, the client machine is not able to create enough request sessions and start new threads. As soon as requestRate is set to more than 1, things start to fall apart.
I suspect it has something to do with the HTTP connection pooling that requests uses.
Please suggest what I am doing wrong here.
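On the connection-pooling suspicion: requests keeps a small per-session pool (10 connections by default), and it can be enlarged by mounting a bigger HTTPAdapter. This is a sketch of that knob, not a confirmed fix for this case:

import requests
from requests.adapters import HTTPAdapter

s1 = requests.session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)  # defaults are 10/10
s1.mount('http://', adapter)
s1.mount('https://', adapter)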
I wasn't able to get things to fall apart; however, the following code has some new features:
1) extended logging, including specific per-thread information
2) all threads join()ed at the end to make sure the parent process doesn't leave them hanging
3) multithreaded print tends to interleave the messages, which can be hard to read. This version routes the messages through client_notify() so a future version can collect them and print them cleanly.
source
import exceptions, requests, threading, time

requestRate = 1
connectionPerSec = 2

def client_notify(msg):
    return time.time(), threading.current_thread().name, msg

def generate_req(reqSession):
    requestCounter = 0
    while requestCounter < requestRate:
        try:
            response1 = reqSession.get('http://127.0.0.1/')
            if response1.status_code == 200:
                print client_notify('r')
        except (exceptions.ConnectionError, exceptions.HTTPError, exceptions.Timeout):
            print client_notify('F')
            break
        requestCounter += 1

def main():
    for cnum in range(connectionPerSec):
        s1 = requests.session()
        th = threading.Thread(
            target=generate_req, args=(s1,),
            name='thread-{:03d}'.format(cnum),
        )
        th.start()
    for th in threading.enumerate():
        if th != threading.current_thread():
            th.join()

if __name__ == '__main__':
    main()
output
(1407275951.954147, 'thread-000', 'r')
(1407275951.95479, 'thread-001', 'r')
The purpose of my program is to download files with threads. I define a unit size and use len/unit threads, where len is the length of the file to be downloaded.
Using my program the file can be downloaded, but the threads are not stopping, and I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime

class MyThread(threading.Thread):
    def __init__(self,func,args,name=''):
        threading.Thread.__init__(self);
        self.func = func;
        self.args = args;
        self.name = name;
    def run(self):
        apply(self.func,self.args);

url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF';
request = urllib2.Request(url);
response = urllib2.urlopen(request);
meta = response.info();
response.close();

unit = 1000000;
flen = int(meta.getheaders('Content-Length')[0]);
print flen;

if flen%unit == 0:
    bs = flen/unit;
else :
    bs = flen/unit+1;

blocks = range(bs);
cnt = {};
for i in blocks:
    cnt[i]=i;

def getStr(i):
    try:
        print 'Thread %d start.'%(i,);
        fout = open('a.zip','wb');
        fout.seek(i*unit,0);
        if (i+1)*unit > flen:
            request.add_header('Range','bytes=%d-%d'%(i*unit,flen-1));
        else :
            request.add_header('Range','bytes=%d-%d'%(i*unit,(i+1)*unit-1));
        #opener = urllib2.build_opener();
        #buf = opener.open(request).read();
        resp = urllib2.urlopen(request);
        buf = resp.read();
        fout.write(buf);
    except BaseException:
        print 'Error';
    finally :
        #opener.close();
        fout.flush();
        fout.close();
        del cnt[i];
        # filelen = os.path.getsize('a.zip');
        print 'Thread %d ended.'%(i),
        print cnt;
        # print 'progress : %4.2f'%(filelen*100.0/flen,),'%';

def main():
    print 'download at:',ctime();
    threads = [];
    for i in blocks:
        t = MyThread(getStr,(blocks[i],),getStr.__name__);
        threads.append(t);
    for i in blocks:
        threads[i].start();
    for i in blocks:
        # print 'this is the %d thread;'%(i,);
        threads[i].join();
    #print 'size:',os.path.getsize('a.zip');
    print 'download done at:',ctime();

if __name__=='__main__':
    main();
Could someone please help me understand why the threads aren't stopping?
I can't really address your code example because it is quite messy and hard to follow, but a potential reason you are seeing the threads not end is that a request will stall out and never finish. urllib2 allows you to specify timeouts for how long you will allow the request to take.
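For example, a request that would otherwise hang can be bounded like this (the 30-second value is arbitrary; request is the urllib2.Request object from the question):

import socket
import urllib2

try:
    buf = urllib2.urlopen(request, timeout=30).read()
except (urllib2.URLError, socket.timeout):
    buf = None  # the request stalled or failed; retry it or put the work back in the queue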
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Make the HTTP requests have a timeout. If the timeout expires, try again or put the work back into the queue.
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue

def worker(queue, results, lock):
    local_results = []
    while True:
        val = queue.get()
        if val is None:
            break
        # pretend to do work
        time.sleep(.1)
        local_results.append(val)
    with lock:
        results.extend(local_results)
    print threading.current_thread().name, "Done!"

num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []

for i in xrange(100):
    queue.put(i)

for _ in xrange(num_workers):
    # Use None as a sentinel to signal the threads to end
    queue.put(None)
    t = threading.Thread(target=worker, args=(queue, results, lock))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print sorted(results)
print "All done"