I have 500,000 URLs and want to get the response for each of them asynchronously.
import aiohttp
import asyncio

@asyncio.coroutine
def worker(url):
    response = yield from aiohttp.request('GET', url, connector=aiohttp.TCPConnector(share_cookies=True, verify_ssl=False))
    body = yield from response.read_and_close()
    print(url)

def main():
    url_list = []  # lakhs (hundreds of thousands) of URLs, extracted from a file
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait([worker(u) for u in url_list]))

main()
I want 200 connections at a time (200 concurrent), not more than that, because
when I run this program for 50 URLs (i.e. url_list[:50]) it works fine,
but if I pass the whole list, I get this error:
aiohttp.errors.ClientOSError: Cannot connect to host www.example.com:443 ssl:True
Future/Task exception was never retrieved future: Task()
Maybe the request frequency is too high and the server refuses to respond after some limit?
Yes, one can expect a server to stop responding after you send too much traffic to it (whatever the definition of "too much traffic" may be).
One way to limit the number of concurrent requests (to throttle them) in such cases is to use asyncio.Semaphore, similar in use to the semaphores used in multithreading: just like there, you create a semaphore and make sure the operation you want to throttle acquires that semaphore before doing the actual work and releases it afterwards.
For your convenience, asyncio.Semaphore implements the context manager protocol to make this even easier.
Most basic approach:
CONCURRENT_REQUESTS = 200

@asyncio.coroutine
def worker(url, semaphore):
    # Acquiring/releasing the semaphore using a context manager.
    with (yield from semaphore):
        response = yield from aiohttp.request(
            'GET',
            url,
            connector=aiohttp.TCPConnector(share_cookies=True,
                                           verify_ssl=False))
        body = yield from response.read_and_close()
        print(url)

def main():
    url_list = []  # lakhs of URLs, extracted from a file
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait([worker(u, semaphore) for u in url_list]))
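For reference, on current Python (3.7+) and aiohttp 3.x the same throttling idea is usually written with async/await, a single shared ClientSession and async with semaphore. This is only a sketch under those assumptions, with the 200-request limit and the empty url_list carried over from the question:

import asyncio
import aiohttp

CONCURRENT_REQUESTS = 200

async def worker(url, session, semaphore):
    # The semaphore caps how many requests are in flight at any moment.
    async with semaphore:
        async with session.get(url, ssl=False) as response:
            body = await response.read()
            print(url)

async def main():
    url_list = []  # load your URLs here
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(worker(u, session, semaphore) for u in url_list))

asyncio.run(main())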
Related
I am trying to open multiple web sessions and save the data into CSV files. I have written my code using a for loop and requests.get, but it takes very long to access the 90 web locations. Can anyone tell me how to run the whole process in parallel over loc_var?
The code is working fine; the only issue is that it runs one by one for each loc_var, which takes a very long time.
I want to access all the loc_var URLs from the for loop in parallel and perform the CSV write operations.
Below is the code:
import pandas as pd
import numpy as np
import os
import requests
import datetime
import zipfile
t=datetime.date.today()-datetime.timedelta(2)
server = [("A","web1",":5000","username=usr&password=p7Tdfr")]
'''List of all web_ips'''
web_1 = ["Web1","Web2","Web3","Web4","Web5","Web6","Web7","Web8","Web9","Web10","Web11","Web12","Web13","Web14","Web15"]
'''List of All location'''
loc_var =["post1","post2","post3","post4","post5","post6","post7","post8","post9","post10","post11","post12","post13","post14","post15","post16","post17","post18"]
for s, web, port, usr in server:
    login_url = 'http://' + web + port + '/api/v1/system/login/?' + usr
    print(login_url)
    s = requests.session()
    login_response = s.post(login_url)
    print("login Response", login_response)

    # Start accessing the web for each loc_var
    for mkt in loc_var:
        # Output is a CSV file
        com_actions_url = 'http://' + web + port + '/api/v1/3E+date(%5C%22' + str(t) + '%5C%22)and+location+%3D%3D+%27' + mkt + '%27%22&page_size=-1&format=%22csv%22'
        print("com_action_url", com_actions_url)
        r = s.get(com_actions_url)
        print("action", r)
        if r.ok == True:
            with open(os.path.join("/home/Reports_DC/", "relation_%s.csv" % mkt), 'wb') as f:
                f.write(r.content)
        # If the location is not accessible, try with another web from the web_1 list
        if r.ok == False:
            while r.ok == False:
                for web_2 in web_1:
                    login_url = 'http://' + web_2 + port + '/api/v1/system/login/?' + usr
                    com_actions_url = 'http://' + web_2 + port + '/api/v1/3E+date(%5C%22' + str(t) + '%5C%22)and+location+%3D%3D+%27' + mkt + '%27%22&page_size=-1&format=%22csv%22'
                    login_response = s.post(login_url)
                    print("login Response", login_response)
                    print("com_action_url", com_actions_url)
                    r = s.get(com_actions_url)
                    if r.ok == True:
                        with open(os.path.join("/home/Reports_DC/", "relation_%s.csv" % mkt), 'wb') as f:
                            f.write(r.content)
                        break
There are multiple approaches that you can take to make concurrent HTTP requests. Two that I've used are (1) multiple threads with concurrent.futures.ThreadPoolExecutor and (2) sending the requests asynchronously with asyncio/aiohttp.
To use a thread pool to send your requests in parallel, you would first generate a list of URLs that you want to fetch in parallel (in your case generate a list of login_urls and com_action_urls), and then you would request all of the URLs concurrently as follows:
from concurrent.futures import ThreadPoolExecutor
import requests
def fetch(url):
    # Catch HTTP errors/exceptions here
    page = requests.get(url)
    return page.text

pool = ThreadPoolExecutor(max_workers=5)

urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']  # Create a list of urls

for page in pool.map(fetch, urls):
    # Do whatever you want with the results ...
    print(page[0:100])
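Applied to your script, one possible way to use the thread pool is to keep the login as it is and parallelize only the per-location downloads. The sketch below assumes s is your logged-in requests session and that web, port, t and loc_var are the variables from your code; also note that sharing one requests.Session across threads is common practice but is not formally documented as thread-safe:

import os
from concurrent.futures import ThreadPoolExecutor

def fetch_location(mkt):
    # Build the per-location URL exactly as in the original loop.
    com_actions_url = ('http://' + web + port + '/api/v1/3E+date(%5C%22' + str(t) +
                       '%5C%22)and+location+%3D%3D+%27' + mkt +
                       '%27%22&page_size=-1&format=%22csv%22')
    r = s.get(com_actions_url)
    if r.ok:
        with open(os.path.join("/home/Reports_DC/", "relation_%s.csv" % mkt), 'wb') as f:
            f.write(r.content)
    return mkt, r.status_code

with ThreadPoolExecutor(max_workers=10) as executor:
    # map() runs fetch_location for every location, at most 10 at a time
    for mkt, status in executor.map(fetch_location, loc_var):
        print(mkt, status)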
Using asyncio/aiohttp is generally faster than the threaded approach above, but the learning curve is steeper. Here is a simple example (Python 3.7+):
import asyncio
import aiohttp
urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']
async def fetch(session, url):
    async with session.get(url) as resp:
        # Catch HTTP errors/exceptions here
        return await resp.text()

async def fetch_concurrent(urls):
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession() as session:
        tasks = []
        for u in urls:
            tasks.append(loop.create_task(fetch(session, u)))

        for result in asyncio.as_completed(tasks):
            page = await result
            # Do whatever you want with the results
            print(page[0:100])

asyncio.run(fetch_concurrent(urls))
But unless you are going to be making a huge number of requests, the threaded approach will likely be sufficient (and way easier to implement).
I have the following code:
with ThreadPoolExecutor(max_workers=num_of_pages) as executor:
    futh = [(executor.submit(self.getdata2, page, hed, data, apifolder, additional)) for page in pages]
    for data in as_completed(futh):
        datarALL = datarALL + data.result()
return datarALL
The num_of_pages isn't fixed, but usually it's around 250.
The getdata2 function creates GET requests and returns each page's results.
The problem is that all 250 pages (threads) are created together, which means 250 GET requests fired at the same time. This overloads the server, so I get a lot of retries due to delayed server responses, which abort the GET call and retry it. I want to avoid that.
I thought of creating some sort of lock which would prevent a thread/page from issuing the GET request if there are more than 10 active requests. In such a case it would wait until a slot is available.
Something like:
executing_now = []

def getdata2(...):
    ...
    while len(executing_now) > 10:
        sleep(10)
    executing_now.append(page)
    response = requests.get(url, data=data, headers=hed, verify=False)
    ....
    executing_now.remove(page)
    return ...
Is there an existing mechanism for this in Python? It requires the threads to check some shared state... and I want to avoid multithreading problems such as deadlocks etc.
Basically, I want to wrap the GET call with a limit on how many threads can execute it at the same time.
We can use a queue to "prepare" all your pages, and then you can limit your thread pool to any number of threads, since each thread will fetch the next needed page from the queue:
# preparing here all your page objects
pages_queue = queue.Queue()
[pages_queue.put(page) for page in pages]

# Thread pool - each thread will take one page from the queue, and when done, will fetch the next one
with ThreadPoolExecutor(max_workers=10) as executor:
    futh = [(executor.submit(self.getdata2, pages_queue, hed, data, apifolder, additional))]
    for data in as_completed(futh):
        datarALL = datarALL + data.result()
return datarALL

def getdata2(...):
    ...
    try:
        while True:  # non-blocking get will raise Empty when the queue is empty
            page = pages_queue.get_nowait()
            response = requests.get(page.url, data=data, headers=hed, verify=False)
            ....
            return ...
    except queue.Empty:
        pass
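To answer the "is there an existing mechanism" part directly: the standard library also has threading.Semaphore / threading.BoundedSemaphore, which is exactly the counter-plus-lock you sketched, without the sleep loop. A rough illustration, where getdata2, url, data and hed are taken from your snippet and everything else is assumed:

import threading
import requests

MAX_CONCURRENT_GETS = 10
gate = threading.BoundedSemaphore(MAX_CONCURRENT_GETS)

def getdata2(url, hed, data):
    # At most MAX_CONCURRENT_GETS threads can be inside this block at once;
    # the rest block here until a slot frees up.
    with gate:
        response = requests.get(url, data=data, headers=hed, verify=False)
    # process the response outside the gate so the slot is released quickly
    return response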
Situation:
I am trying to send an HTTP request to every domain listed in a specific file I have already downloaded, and to get the destination URL I was forwarded to.
Problem: Well, I followed a tutorial, but I get far fewer responses than expected. It's around 100 responses per second, whereas the tutorial claims 100,000 responses per minute.
The script also gets slower and slower after a couple of seconds, so that I end up getting just 1 response every 5 seconds.
Already tried: At first I thought the problem was that I ran it on a Windows server. After trying the script on my own computer, I realized it was only a little bit faster, but not by much. On another Linux server it behaved the same as on my computer (Unix, macOS).
Code: https://pastebin.com/WjLegw7K
work_dir = os.path.dirname(__file__)

async def fetch(url, session):
    try:
        async with session.get(url, ssl=False) as response:
            if response.status == 200:
                delay = response.headers.get("DELAY")
                date = response.headers.get("DATE")
                print("{}:{} with delay {}".format(date, response.url, delay))
            return await response.read()
    except Exception:
        pass

async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)

async def run():
    os.chdir(work_dir)
    for file in glob.glob("cdx-*"):
        print("Opening: " + file)
        opened_file = file
        tasks = []
        # create instance of Semaphore
        sem = asyncio.Semaphore(40000)
        with open(work_dir + '/' + file) as infile:
            seen = set()
            async with ClientSession() as session:
                for line in infile:
                    regex = re.compile(r'://(.*?)/')
                    domain = regex.search(line).group(1)
                    domain = domain.lower()
                    if domain not in seen:
                        seen.add(domain)
                    task = asyncio.ensure_future(bound_fetch(sem, 'http://' + domain, session))
                    tasks.append(task)
                    del line
                responses = asyncio.gather(*tasks)
                await responses
            infile.close()
        del seen
        del file

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
I really don't know how to fix this issue, especially since I'm very new to Python... but I have to get it to work somehow :(
It's hard to tell what is going wrong without actually debugging the code, but one potential problem is that file processing is serialized. In other words, the code never processes the next file until all the requests from the current file have finished. If there are many files and one of them is slow, this could be a problem.
To change this, define run along these lines:
async def run():
    os.chdir(work_dir)
    async with ClientSession() as session:
        sem = asyncio.Semaphore(40000)
        seen = set()
        pending_tasks = set()
        for f in glob.glob("cdx-*"):
            print("Opening: " + f)
            with open(f) as infile:
                lines = list(infile)
            for line in lines:
                domain = re.search(r'://(.*?)/', line).group(1)
                domain = domain.lower()
                if domain in seen:
                    continue
                seen.add(domain)
                task = asyncio.ensure_future(bound_fetch(sem, 'http://' + domain, session))
                pending_tasks.add(task)
                # ensure that each task removes itself from the pending set
                # when done, so that the set doesn't grow without bounds
                task.add_done_callback(pending_tasks.remove)
        # await the remaining tasks
        await asyncio.wait(pending_tasks)
Another important thing: silencing all exceptions in fetch() is bad practice because there is no indication that something has started going wrong (due to either a bug or a simple typo). This might well be the reason your script becomes "slow" after a while - fetch is raising exceptions and you're never seeing them. Instead of pass, use something like print(f'failed to get {url}: {e}') where e is the object you get from except Exception as e.
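For example, a version of fetch that reports failures instead of swallowing them could look like this (a sketch that keeps the structure of your original function):

async def fetch(url, session):
    try:
        async with session.get(url, ssl=False) as response:
            if response.status == 200:
                delay = response.headers.get("DELAY")
                date = response.headers.get("DATE")
                print("{}:{} with delay {}".format(date, response.url, delay))
            return await response.read()
    except Exception as e:
        # Log the failure instead of silently dropping it.
        print(f'failed to get {url}: {e!r}')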
Several additional remarks:
There is almost never a need to del local variables in Python; the garbage collector does that automatically.
You needn't close() a file opened using a with statement. with is designed specifically to do such closing automatically for you.
The code added domains to a seen set, but also processed an already seen domain. This version skips the domain for which it had already spawned a task.
You can create a single ClientSession and use it for the entire run.
I am using Python 2.7 on a Windows machine. I have an array of URLs accompanied by data and headers, so the POST method is required.
In a simple (sequential) execution it works well:
rescodeinvalid = []
success = []

for i in range(0, len(HostArray)):
    data = urllib.urlencode(post_data)
    req = urllib2.Request(HostArray[i], data)
    response = urllib2.urlopen(req)
    rescode = response.getcode()
    if rescode == 400:
        rescodeinvalid.append(HostArray[i])
    if rescode == 200:
        success.append(HostArray[i])
My question is: if HostArray is very large, then the loop takes a lot of time.
So, how do I check each URL of HostArray in multiple threads? If the response code of a URL is 200, I do one operation; I have arrays to store the 200 and 400 responses.
So, how do I do this with multithreading in Python?
If you want to do each one in a separate thread you could do something like:
import threading

rescodeinvalid = []
success = []

def post_and_handle(url, post_data):
    data = urllib.urlencode(post_data)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req)
    rescode = response.getcode()
    if rescode == 400:
        rescodeinvalid.append(url)  # append is thread-safe
    elif rescode == 200:
        success.append(url)  # append is thread-safe

workers = []
for i in range(0, len(HostArray)):
    t = threading.Thread(target=post_and_handle, args=(HostArray[i], post_data))
    t.start()
    workers.append(t)

# Wait for all of the requests to complete
for t in workers:
    t.join()
I'd also suggest using requests: http://docs.python-requests.org/en/latest/
as well as a thread pool:
Threading pool similar to the multiprocessing Pool?
Thread pool usage:
from multiprocessing.pool import ThreadPool

# Done here because this must be done in the main thread
pool = ThreadPool(processes=50)  # use a max of 50 threads

# do this instead of Thread(target=func, args=args, kwargs=kwargs)
pool.apply_async(func, args, kwargs)

pool.close()  # I think
pool.join()
Scrapy uses the Twisted library to call multiple URLs in parallel without the overhead of opening a new thread per request. It also manages an internal queue to accumulate, and even prioritize, the requests, and as a bonus you can restrict the number of parallel requests via the maximum-concurrent-requests setting. You can launch a Scrapy spider as an external process or from your code; just set the spider's start_urls = HostArray, as sketched below.
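For instance, a minimal spider along these lines; the spider name, the CONCURRENT_REQUESTS value and the use of FormRequest for the POST data are illustrative assumptions, while HostArray and post_data are the names from the question:

import scrapy

class HostSpider(scrapy.Spider):
    name = "hosts"  # hypothetical name
    custom_settings = {"CONCURRENT_REQUESTS": 50}  # cap parallel requests

    def start_requests(self):
        for url in HostArray:
            # FormRequest sends a POST with the given form data
            yield scrapy.FormRequest(url, formdata=post_data, callback=self.parse)

    def parse(self, response):
        # handle the response here
        self.logger.info("%s -> %s", response.url, response.status)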
Your case (basically processing a list into another list) looks like an ideal candidate for concurrent.futures (see for example this answer), or you may go all the way to Executor.map; see the sketch below. And of course use ThreadPoolExecutor to limit the number of concurrently running threads to something reasonable.
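A rough sketch of what Executor.map could look like here; check_host and the 10-thread limit are illustrative, HostArray and post_data come from the question, and on Python 2.7 concurrent.futures needs the futures backport package:

import urllib
import urllib2
from concurrent.futures import ThreadPoolExecutor

def check_host(url):
    data = urllib.urlencode(post_data)
    try:
        response = urllib2.urlopen(urllib2.Request(url, data))
        return url, response.getcode()
    except urllib2.HTTPError as e:
        # urlopen raises HTTPError for 4xx/5xx responses; e.code holds the status
        return url, e.code

rescodeinvalid = []
success = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # map() keeps the order of HostArray and never runs more than max_workers at once
    for url, code in executor.map(check_host, HostArray):
        if code == 400:
            rescodeinvalid.append(url)
        elif code == 200:
            success.append(url)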
I want to make a lot of URL requests to a REST web service, typically between 75,000 and 90,000. However, I need to throttle the number of concurrent connections to the web service.
I started playing around with grequests in the following manner, but quickly started chewing up open sockets.
concurrent_limit = 30
urls = buildUrls()
hdrs = {'Host' : 'hostserver'}
g_requests = (grequests.get(url, headers=hdrs) for url in urls)
g_responses = grequests.map(g_requests, size=concurrent_limit)
As this runs for a minute or so, I get hit with 'maximum number of sockets reached' errors.
As far as I can tell, each one of the requests.get calls in grequests uses its own session, which means a new socket is opened for each request.
I found a note on github referring how to make grequests use a single session. But this seems to effectively bottleneck all requests into a single shared pool. That seems to defeat the purpose of asynchronous http requests.
s = requests.session()
rs = [grequests.get(url, session=s) for url in urls]
grequests.map(rs)
Is it possible to use grequests or gevent.Pool in a way that creates a number of sessions?
Put another way: how can I make many concurrent HTTP requests, using either queuing or connection pooling?
I ended up not using grequests to solve my problem. I'm still hopeful it might be possible.
I used threading:
class MyAwesomeThread(Thread):
    """
    Threading wrapper to handle counting and processing of tasks
    """
    def __init__(self, session, q):
        self.q = q
        self.count = 0
        self.session = session
        self.response = None
        Thread.__init__(self)

    def run(self):
        """TASK RUN BY THREADING"""
        while True:
            url, host = self.q.get()
            httpHeaders = {'Host': host}
            self.response = self.session.get(url, headers=httpHeaders)
            # handle response here
            self.count += 1
            self.q.task_done()
        return

q = Queue()
threads = []

for i in range(CONCURRENT):
    session = requests.session()
    t = MyAwesomeThread(session, q)
    t.daemon = True  # allows us to send an interrupt
    threads.append(t)

## build urls and add them to the Queue
for url in buildurls():
    q.put_nowait((url, host))

## start the threads
for t in threads:
    t.start()
rs is a list of AsyncRequest objects; each AsyncRequest has its own session.
rs = [grequests.get(url) for url in urls]
grequests.map(rs)
for ar in rs:
    print(ar.session.cookies)
Something like this:
NUM_SESSIONS = 50

sessions = [requests.Session() for i in range(NUM_SESSIONS)]

reqs = []
i = 0
for url in urls:
    reqs.append(grequests.get(url, session=sessions[i % NUM_SESSIONS]))
    i += 1

responses = grequests.map(reqs, size=NUM_SESSIONS * 5)
That should spread the requests over 50 different sessions.