using python requests to speed up for loop - python

I have some Python code that I want to speed up using threads, but when I try, the same lines get printed more than once. Is there any way to speed it up without getting duplicate lines?
code
import requests
import json
# Load the host list once; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open("urls.json") as f:
    data = json.load(f)


def urls():
    """Request every host in ``data['urls']`` one after another.

    Each entry is prefixed with ``https://`` and fetched with a blocking
    GET; the response headers are printed before the next request starts.
    """
    for i in data['urls']:
        r = requests.get("https://" + i)
        print(r.headers)

You can use the ThreadPoolExecutor class from concurrent.futures. It is an efficient alternative to creating and managing Thread objects yourself.
You can change the max_workers value according to your task
Here is the piece of code:
import requests
from concurrent.futures import ThreadPoolExecutor
import json
with open("urls.json") as f:
    data = json.load(f)


def urls():
    """Fetch all hosts from ``data['urls']`` through a small thread pool.

    The full URL list is printed first; afterwards the headers of every
    response are printed, each followed by a newline string, in the same
    order as the input list.
    """
    targets = []
    for host in data['urls']:
        targets.append("https://" + host)
    print(targets)

    with ThreadPoolExecutor(max_workers=5) as pool:
        # map() hands results back in input order, so no line is
        # produced twice and nothing is reordered.
        for response in pool.map(requests.get, targets):
            print(response.headers)
            print("\n")

Make async or threaded calls.
So, you would do something like this:
import aiohttp
import asyncio
import time
start_time = time.time()


async def _fetch_name(session, number):
    """Download one pokemon record and return its ``name`` field."""
    pokemon_url = f'https://pokeapi.co/api/v2/pokemon/{number}'
    async with session.get(pokemon_url) as resp:
        pokemon = await resp.json()
        return pokemon['name']


async def main():
    """Fetch pokemon 1..150 concurrently and print their names in order.

    Awaiting each request inside the ``for`` loop would run them one at a
    time; scheduling them together with ``asyncio.gather`` lets the
    requests overlap while still returning results in input order.
    """
    async with aiohttp.ClientSession() as session:
        names = await asyncio.gather(
            *(_fetch_name(session, number) for number in range(1, 151))
        )
    for name in names:
        print(name)


asyncio.run(main())
Could also do multiprocessing as per the comment, but async is better for i/o type tasks.

Related

Parallelize checking of dead URLs

The question is quite simple: is it possible to test a list of URLs and store only the dead ones (response code > 400) in a list using asynchronous functions?
I previously used the requests library to do it and it works great, but I have a big list of URLs to test, and doing it sequentially takes more than an hour.
I have seen a lot of articles on how to make parallel requests using asyncio and aiohttp, but I did not find much about how to test URLs with these libraries.
Is it possible to do it?
Using multithreading you could do it like this:
import requests
from concurrent.futures import ThreadPoolExecutor
# Maps each problematic URL to either its status code or the exception text.
results = dict()


def test_url(url, timeout=10):
    """Probe ``url`` with a GET request and record it if it looks dead.

    The URL is added to ``results`` when the request succeeds with a
    status code >= 400, or when requests raises any RequestException
    (the exception text is stored in that case).

    Args:
        url: Address to test.
        timeout: Seconds to wait for the server; without a timeout a
            stalled host would block its worker thread indefinitely.
            A timeout is reported like any other request error.
    """
    try:
        r = requests.get(url, timeout=timeout)
        if r.status_code >= 400:
            results[url] = f'{r.status_code=}'
    except requests.exceptions.RequestException as e:
        results[url] = str(e)


def get_list_of_urls():
    """Return the URLs to check (hard-coded sample; probably read from a file in reality)."""
    return ['https://facebook.com', 'https://google.com', 'http://google.com/nonsense', 'http://goooglyeyes.org']


def main():
    """Check all URLs in a thread pool, then print the collected failures."""
    # Leaving the ``with`` block waits for every submitted check to finish.
    with ThreadPoolExecutor() as executor:
        executor.map(test_url, get_list_of_urls())
    print(results)


if __name__ == '__main__':
    main()
You could do something like this using aiohttp and asyncio.
Could be done more pythonic I guess but this should work.
import aiohttp
import asyncio
urls = ['url1', 'url2']


async def test_url(session, url):
    """Return ``url`` if it is dead, otherwise ``None``.

    A URL counts as dead when the server answers with a status >= 400
    (the same threshold as the threaded variant) or when the request
    cannot be completed at all (invalid URL, DNS/connection failure,
    timeout). Without the ``except`` a single unreachable host would
    abort the whole ``gather`` call.
    """
    try:
        async with session.get(url) as resp:
            if resp.status >= 400:
                return url
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return url
    return None


async def main():
    """Test every entry of ``urls`` concurrently and print the dead ones."""
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(test_url(session, url)) for url in urls]
        outcomes = await asyncio.gather(*tasks)
    # Live URLs come back as None; keep only the dead ones.
    dead_urls = [url for url in outcomes if url is not None]
    print(dead_urls)


asyncio.run(main())
Very basic example, but this is how I would solve it:
from aiohttp import ClientSession
from asyncio import create_task, gather, run
async def TestUrl(url, session):
    """Request ``url``; report and return it when the status is >= 400.

    Returns:
        ``url`` if the site answered with an error status, else ``None``.
        Returning the URL lets the caller actually collect the dead
        ones instead of receiving only ``None`` values.
    """
    async with session.get(url) as response:
        if response.status >= 400:
            r = await response.text()
            print(f"Site: {url} is dead, response code: {str(response.status)} response text: {r}")
            return url
    return None


async def TestUrls(urls):
    """Test all ``urls`` concurrently over one shared session.

    Returns:
        A list holding one inner list per gathered batch (currently a
        single batch); each inner entry is the dead URL or ``None``.
    """
    resultsList: list = []
    async with ClientSession() as session:
        # Maybe some rate limiting?
        partitionTasks: list = [
            create_task(TestUrl(url, session))
            for url in urls]
        # return_exceptions=False: the first request error propagates.
        resultsList.append(await gather(*partitionTasks, return_exceptions=False))
    # do stuff with the results or return?
    return(resultsList)


async def main():
    """Entry point; fill ``urls`` with the addresses to test."""
    urls = []
    test = await TestUrls(urls)


if __name__ == "__main__":
    run(main())
Try using a ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import requests
url_list = [
    "https://www.google.com",
    "https://www.adsadasdad.com",
    "https://www.14fsdfsff.com",
    "https://www.ggr723tg.com",
    "https://www.yyyyyyyyyyyyyyy.com",
    "https://www.78sdf8sf5sf45sf.com",
    "https://www.wikipedia.com",
    "https://www.464dfgdfg235345.com",
    "https://www.tttllldjfh.com",
    "https://www.qqqqqqqqqq456.com",
]


def check(url, timeout=10):
    """Print ``"<url> is ALIVE"`` when ``url`` answers with a status < 400.

    Args:
        url: Address to test.
        timeout: Seconds to wait before giving up on the host.

    Returns:
        True if the URL is alive, False if it returned an error status
        or the request failed. Request errors are handled here on
        purpose: an exception raised inside a submitted task would
        otherwise sit unnoticed in a future nobody reads.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return False
    if r.status_code < 400:
        print(f"{url} is ALIVE")
        return True
    return False


# Leaving the ``with`` block waits for all submitted checks to complete.
with ThreadPoolExecutor(max_workers=5) as e:
    for url in url_list:
        e.submit(check, url)
Multiprocessing could be the better option for your problem.
from multiprocessing import Process
from multiprocessing import Manager
import requests
def checkURLStatus(url, url_status):
    """Store ``"Active"`` or ``"Inactive"`` for ``url`` in ``url_status``.

    Args:
        url: Address to test.
        url_status: Shared mapping (a ``Manager().dict()`` in the driver
            below) that receives the verdict keyed by URL.

    A status code >= 400 or any request failure (unreachable host,
    timeout, ...) is recorded as "Inactive"; without the ``except`` an
    unreachable host would crash the child process and leave no entry.
    """
    try:
        res = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        url_status[url] = "Inactive"
        return
    if res.status_code >= 400:
        url_status[url] = "Inactive"
    else:
        url_status[url] = "Active"


if __name__ == "__main__":
    urls = [
        "https://www.google.com"
    ]
    manager = Manager()
    # to store the results for later usage
    url_status = manager.dict()
    procs = []
    # One process is started per URL.
    for url in urls:
        proc = Process(target=checkURLStatus, args=(url, url_status))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
    print(url_status.values())
url_status is a shared dictionary (created by the Manager) that stores the results coming from the separate processes. Refer to this page for more info.

Using multithreading for api requests in python

For my project I need to call an API and store the results in a list. However, I need to send more than 5000 requests with different parameter values, so it takes a huge amount of time to complete. Is there any way to send the requests in parallel so the process finishes quickly? I tried some threading code for this, but I could not figure out a way to solve it.
import requests
# Question snippet (indentation was lost in extraction): requests one URL per
# job id, strictly one after another, and collects the response bodies.
# Accumulates the text of every response, in request order.
res_list=[]
# NOTE(review): the trailing "....." is a placeholder for the remaining ids,
# so this line is not valid Python as written.
l=[19821, 29674 , 41983, 40234 ,.....] # Nearly 5000 items for now and the count may increase in future
for i in l:
# Build the request URL for this job id.
URL ="https://api.something.com/?key=xxx-xxx-xxx&job_id={0}".format(i)
# Blocking call: the next id is not requested until this one has returned.
res = requests.get(url=URL)
res_list.append(res.text)
Probably, you just need to make your queries asynchronously. Something like that:
import asyncio
import aiohttp
NUMBERS = [1, 2, 3]


async def _fetch(session, num):
    """Request one item and return ``(status, body_text)``."""
    async with session.get(f'http://httpbin.org/get?{num}') as resp:
        return resp.status, await resp.text()


async def call():
    """Fetch every entry of ``NUMBERS`` concurrently and print the results.

    Awaiting each request inside the loop would serialize them; using
    ``asyncio.gather`` lets all requests be in flight together. Results
    come back in input order, so status and body are printed per number
    in the same sequence as before.
    """
    async with aiohttp.ClientSession() as session:
        responses = await asyncio.gather(
            *(_fetch(session, num) for num in NUMBERS)
        )
    for status, body in responses:
        print(status)
        print(body)


if __name__ == '__main__':
    # asyncio.run creates the event loop and also closes it afterwards.
    asyncio.run(call())

Multithreading Python Requests Through Tor

The following code is my attempt at doing python requests through tor, this works fine, however I am interested in adding multithreading to this.
So I would like to simultaneously do about 10 different requests and process their outputs. What is the simplest and most efficient way to do this?
def onionrequest(url, onionid):
    """Request ``url`` through the local SOCKS proxy and record it if it answers 200.

    Args:
        url: Host (without scheme); ``http://`` is prepended.
        onionid: Identifier supplied by the caller; not used here.

    Returns:
        None. On a 200 response ``url`` is appended to the module-level
        ``listofallonions`` list; request failures are ignored.
    """
    # The context manager closes the session's connections when done.
    with requests.session() as session:
        session.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050',
        }
        onionurlforrequest = "http://" + url
        try:
            r = session.get(onionurlforrequest, timeout=15)
        except requests.RequestException:
            # Unreachable or timed-out hosts are simply skipped.
            return None
        # Comparison, not assignment: ``=`` here was a syntax error.
        if r.status_code == 200:
            listofallonions.append(url)
I would recommend using the following packages to achieve this: asyncio, aiohttp, aiohttp_socks
example code:
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector
async def fetch(session, url):
    """Return the response body of ``url`` as text."""
    async with session.get(url) as response:
        return await response.text()


async def main(urls):
    """Download all ``urls`` concurrently through the SOCKS proxy and print each body.

    Args:
        urls: Iterable of URL strings.
    """
    # NOTE(review): this answer targets port 9150, while the question's
    # code uses localhost:9050; adjust to whichever port your proxy listens on.
    connector = ProxyConnector.from_url('socks5://localhost:9150', rdns=True)
    # rdns is an option of the proxy connector only; ClientSession does
    # not accept it, so it is passed to from_url() alone.
    async with aiohttp.ClientSession(connector=connector) as session:
        htmls = await asyncio.gather(*(fetch(session, url) for url in urls))
    for html in htmls:
        print(html)


if __name__ == '__main__':
    urls = [
        'http://python.org',
        'https://google.com',
        # ... add further URLs here
    ]
    asyncio.run(main(urls))
Using asyncio can get a bit daunting at first, so you might need to practice for a while before you get the hang of it.
If you want a more in-depth explanation of the difference between synchronous and asynchronous, check out this question.

Run Parallel Request session in python

I am trying to open multiple web sessions and save the data to CSV. I have written my code using a for loop and requests.get, but it takes very long to access the 90 web locations. Can anyone tell me how to run the whole process in parallel over loc_var?
The code works fine; the only issue is that it processes loc_var one entry at a time, which takes a long time.
I want to access all the loc_var URLs from the for loop in parallel and write the CSV files.
Below is the Code:
import pandas as pd
import numpy as np
import os
import requests
import datetime
import zipfile
# Question script (indentation was lost in extraction): logs in to a server and
# downloads one CSV report per location, sequentially, retrying against a list
# of alternative hosts when a download fails.
# Report date: two days before today.
t=datetime.date.today()-datetime.timedelta(2)
# Each entry: (label, host, port suffix, login query string).
# NOTE(review): the credentials are embedded in the login URL query string.
server = [("A","web1",":5000","username=usr&password=p7Tdfr")]
'''List of all web_ips'''
# Fallback hosts tried when the primary host does not return an OK response.
web_1 = ["Web1","Web2","Web3","Web4","Web5","Web6","Web7","Web8","Web9","Web10","Web11","Web12","Web13","Web14","Web15"]
'''List of All location'''
loc_var =["post1","post2","post3","post4","post5","post6","post7","post8","post9","post10","post11","post12","post13","post14","post15","post16","post17","post18"]
for s,web,port,usr in server:
login_url='http://'+web+port+'/api/v1/system/login/?'+usr
print (login_url)
# The loop variable `s` (the label) is overwritten here by the HTTP session.
s= requests.session()
login_response = s.post(login_url)
print("login Responce",login_response)
# Start accessing the web host for each entry of loc_var
for mkt in loc_var:
# Output is a CSV file
com_actions_url='http://'+web+port+'/api/v1/3E+date(%5C%22'+str(t)+'%5C%22)and+location+%3D%3D+%27'+mkt+'%27%22&page_size=-1&format=%22csv%22'
print("com_action_url",com_actions_url)
# Blocking request: locations are fetched strictly one after another.
r = s.get(com_actions_url)
print("action",r)
if r.ok == True:
# Save the raw response body as relation_<location>.csv
with open(os.path.join("/home/Reports_DC/", "relation_%s.csv"%mkt),'wb') as f:
f.write(r.content)
# If the location is not accessible, try the hosts from the web_1 list
if r.ok == False:
# NOTE(review): nesting is not recoverable from this text; if `break` below
# only leaves the `for` loop, this `while` repeats the whole host list until
# some host returns an OK response -- confirm against the original file.
while r.ok == False:
for web_2 in web_1:
login_url='http://'+web_2+port+'/api/v1/system/login/?'+usr
com_actions_url='http://'+web_2+port+'/api/v1/3E+date(%5C%22'+str(t)+'%5C%22)and+location+%3D%3D+%27'+mkt+'%27%22&page_size=-1&format=%22csv%22'
# Log in to the fallback host with the same session before retrying.
login_response = s.post(login_url)
print("login Responce",login_response)
print("com_action_url",com_actions_url)
r = s.get(com_actions_url)
if r.ok == True:
with open(os.path.join("/home/Reports_DC/", "relation_%s.csv"%mkt),'wb') as f:
f.write(r.content)
break
There are multiple approaches that you can take to make concurrent HTTP requests. Two that I've used are (1) multiple threads with concurrent.futures.ThreadPoolExecutor or (2) send the requests asynchronously using asyncio/aiohttp.
To use a thread pool to send your requests in parallel, you would first generate a list of URLs that you want to fetch in parallel (in your case generate a list of login_urls and com_action_urls), and then you would request all of the URLs concurrently as follows:
from concurrent.futures import ThreadPoolExecutor
import requests
def fetch(url):
    """Download ``url`` with a blocking GET and return the body as text."""
    # Catch HTTP errors/exceptions here
    page = requests.get(url)
    return page.text


urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']  # Create a list of urls

# The context manager shuts the pool down once all results are consumed,
# so no worker threads are left behind.
with ThreadPoolExecutor(max_workers=5) as pool:
    # map() yields the pages in the same order as ``urls``.
    for page in pool.map(fetch, urls):
        # Do whatever you want with the results ...
        print(page[0:100])
Using asyncio/aiohttp is generally faster than the threaded approach above, but the learning curve is more complicated. Here is a simple example (Python 3.7+):
import asyncio
import aiohttp
urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']


async def fetch(session, url):
    """Return the body of ``url`` as text using the shared ``session``."""
    # Catch HTTP errors/exceptions here
    async with session.get(url) as resp:
        return await resp.text()


async def fetch_concurrent(urls):
    """Fetch all ``urls`` at once and print the first 100 characters of each page.

    Pages are handled in completion order, not input order.
    """
    async with aiohttp.ClientSession() as session:
        # create_task schedules on the running loop; no need to look the
        # loop up manually inside a coroutine.
        tasks = [asyncio.create_task(fetch(session, u)) for u in urls]
        for result in asyncio.as_completed(tasks):
            page = await result
            # Do whatever you want with results
            print(page[0:100])


asyncio.run(fetch_concurrent(urls))
But unless you are going to be making a huge number of requests, the threaded approach will likely be sufficient (and way easier to implement).

Send Simultaneous Requests python (all at once)

I'm trying to create a script that sends over 1000 requests to one page at the same time. However, using the requests library with threading (1000 threads), it seems to complete the first 50 or so requests within 1 second, whereas the other 9950 take considerably longer. I measured it like this:
# Question fragment (indentation was lost in extraction; Python 2 syntax).
# Prints a string while holding queueLock; queueLock is not defined in this
# fragment.
def print_to_cmd(strinng):
queueLock.acquire()
# Python 2 print statement.
print strinng
queueLock.release()
# Time a single blocking GET; `header` is not defined in this fragment.
start = time.time()
resp = requests.get('http://test.net/', headers=header)
end = time.time()
# Report the elapsed seconds for this one request.
print_to_cmd(str(end-start))
I'm thinking requests library is limiting how fast they are getting sent.
Does anybody know a way in Python to send requests all at the same time? I have a VPS with 200mb upload, so bandwidth is not the issue; it is something to do with Python or the requests library limiting it. They all need to hit the website within 1 second of each other.
Thanks for reading and I hope somebody can help.
I have generally found that the best solution is to use an asynchronous library like tornado. The easiest solution that I found however is to use ThreadPoolExecutor.
import requests
from concurrent.futures import ThreadPoolExecutor
def get_url(url):
    """Perform a blocking GET on ``url`` and hand back the response object."""
    response = requests.get(url)
    return response


with ThreadPoolExecutor(max_workers=50) as pool:
    # Up to 50 requests are in flight at once; results keep input order.
    print([*pool.map(get_url, list_of_urls)])
I know this is an old question, but you can now do this using asyncio and aiohttp.
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """GET ``url`` and return the response body as text.

    Args:
        url: Address to request.
        session: Shared aiohttp session.
        **kwargs: Passed through to ``session.request``.

    Raises:
        aiohttp.ClientResponseError: If the status is 400 or higher.
    """
    # The context manager releases the connection even when
    # raise_for_status() raises.
    async with session.request(method="GET", url=url, **kwargs) as resp:
        resp.raise_for_status()
        return await resp.text()


async def make_requests(url: str, count: int = 1000, **kwargs) -> None:
    """Send ``count`` GET requests to ``url`` concurrently.

    Args:
        url: Address to request.
        count: Number of requests to send (default 1000).
        **kwargs: Passed through to every ``fetch_html`` call.
    """
    # The default connector allows 100 simultaneous connections;
    # limit=0 removes that cap so all requests can be in flight at once.
    connector = aiohttp.TCPConnector(limit=0)
    async with ClientSession(connector=connector) as session:
        # range(count) yields exactly ``count`` requests
        # (range(1, 1000) sent only 999).
        tasks = [
            fetch_html(url=url, session=session, **kwargs)
            for _ in range(count)
        ]
        results = await asyncio.gather(*tasks)
        # do something with results


if __name__ == "__main__":
    asyncio.run(make_requests(url='http://test.net/'))
You can read more about it and see an example here.
Assuming that you know what you are doing, I first suggest that you implement a backoff policy with jitter to prevent a predictable "thundering herd" from hitting your server. That said, you should consider doing some threading:
import threading
class FuncThread(threading.Thread):
    """Thread that runs ``target(*args)`` when started.

    Usage: ``FuncThread(func, arg1, arg2).start()``.
    """

    def __init__(self, target, *args):
        """Store the callable and its positional arguments.

        Args:
            target: Callable executed in the new thread.
            *args: Positional arguments passed to ``target``.
        """
        # Hand target/args to Thread.__init__ itself. Setting
        # self._target / self._args first and then calling a bare
        # Thread.__init__(self) lets the base class reset both to their
        # defaults (None / ()), so run() would try to call None.
        super().__init__(target=target, args=args)
so that you would do something like
t = FuncThread(doApiCall, url)
t.start()
where your method doApiCall is defined like this
def doApiCall(self, url):

Categories

Resources