Guaranteed timeout when using Python requests module - python

I have a Python application which uses the threading and requests modules for processing many pages. The basic function for page downloading looks like this:
def get_page(url):
    error = None
    data = None
    max_page_size = 10 * 1024 * 1024
    try:
        s = requests.Session()
        s.max_redirects = 10
        s.keep_alive = False
        r = s.get('http://%s' % url if not url.startswith('http://') else url,
                  headers=headers, timeout=10.0, stream=True)
        raw_data = io.BytesIO()
        size = 0
        for chunk in r.iter_content(4096):
            size += len(chunk)
            raw_data.write(chunk)
            if size > max_page_size:
                r.close()
                raise SpyderError('too_large')
        fetch_result = 'ok'
    finally:
        del s
It works well in most cases, but sometimes the application freezes because of a very slow connection to some servers or other network problems. How can I set up a global, guaranteed timeout for the whole function? Should I use asyncio or coroutines?
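One way to bound the whole call without rewriting it around asyncio is the thread-join trick shown in the last answer on this page: run get_page in a worker thread and abandon it if it misses a hard deadline. A minimal sketch, assuming get_page is changed to return its result and that leaving a stuck daemon thread behind is acceptable:
import threading

def get_page_with_deadline(url, deadline=30.0):
    # Run get_page in a daemon thread; the thread is not killed on timeout,
    # it is simply abandoned and the caller moves on.
    result = []
    worker = threading.Thread(target=lambda: result.append(get_page(url)))
    worker.daemon = True
    worker.start()
    worker.join(deadline)
    if worker.is_alive():
        return None  # hard deadline exceeded; treat as a fetch failure
    return result[0]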

Related

Python Multi Processing printing statement multiple times

I've written this code that downloads a file from the internet and saves it to my computer.
To make it more efficient, I added multiprocessing to my code to be able to download multiple files at the same time, and it works. However, it keeps printing the progress bar I added again and again.
What I want is for the progress bars to display once and keep updating, like they did before the multiprocessing functionality was added. I've added my code below to reproduce the issue.
from multiprocessing import Process
from alive_progress import alive_bar
import requests
import time
import os

def download(url):
    curr_dir = os.getcwd()
    x = requests.head(url)
    y = requests.head(x.headers['Location'])
    file_size = int(int(y.headers['content-length']) / 1024)
    chunk_size = 1024

    def compute():
        response = requests.get(url, stream=True)
        with open(curr_dir + '\\' + str(time.time()) + '.mp4', 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                yield 1024

    with alive_bar(file_size, bar='classic2', spinner='classic') as bar:
        for i in compute():
            bar()
    print("Downloaded!")

if __name__ == '__main__':
    processess = []
    num_processess = 2
    for i in range(num_processess):  # 'links' is assumed to be defined elsewhere
        process = Process(target=download, args=(links[i],))
        processess.append(process)
    for process in processess:
        process.start()
    for process in processess:
        process.join()
alive-progress doesn't support showing and updating multiple progress bars. You have to use another library, such as tqdm.
The following is an example of using tqdm for your scenario. The key point is to call tqdm.set_lock() to specify a synchronization mechanism for inter-process coordination and to control the positions of the progress bars via the position argument of tqdm().
import multiprocessing
import tqdm

def download(url, id, tqdm_lock):
    ...
    tqdm.tqdm.set_lock(tqdm_lock)
    with tqdm.tqdm(total=file_size, position=id) as bar:
        for i in compute():
            bar.update(1)
        bar.clear()
    ...

if __name__ == '__main__':
    tqdm_lock = multiprocessing.RLock()
    processess = []
    num_processess = 2
    links = [...]
    for i in range(num_processess):
        process = multiprocessing.Process(target=download, args=(links[i], i, tqdm_lock))
        processess.append(process)
    for process in processess:
        process.start()
    for process in processess:
        process.join()
Update 2
If you want multiple progress bars, then I would use the tqdm package.
This is how I would approach it:
First, find out for each URL how many CHUNK_SIZE chunks there are. CHUNK_SIZE is set to 1024, but consider increasing this for large files. A potential issue is that the 'content-length' header key is not always present; in that case, the URL is considered to consist of a single chunk and the progress bar will be updated only once, when the entire file has been downloaded.
Each submitted task then creates a progress bar whose size is the number of chunks determined in the first step and which is assigned a specific position based on its task number. The chunks are then retrieved and the progress bar is updated. The logic is predicated on the size of the fetched file never varying when the content-length key is present in the headers; that is, the size does not change between the head and get requests, so that the progress bar total set from the head request matches the actual number of chunks read when the download is done.
In the code below I have commented out the code pertaining to writing the downloaded files to disk and removed the compute generator function, which now seems unnecessary. I have also added a delay between successive chunk fetches so that the progress bar does not advance too fast:
import requests
from tqdm import tqdm

CHUNK_SIZE = 1024

def get_number_of_chunks(url):
    r = requests.head(url, allow_redirects=True)
    headers = r.headers
    if 'content-length' in headers:
        n_chunks, remainder = divmod(int(headers['content-length']), CHUNK_SIZE)
        if remainder:
            n_chunks += 1
    else:
        n_chunks = 1
    return n_chunks

def download(task_number, url):
    n_chunks = get_number_of_chunks(url)
    response = requests.get(url, stream=True)
    #with open(str(time.time()) + '.mp4', 'wb') as f:
    if True:
        with tqdm(total=n_chunks, position=task_number) as bar:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                #f.write(chunk)
                if n_chunks != 1:
                    bar.update(1)
                # For demo purposes:
                import time
                time.sleep(.1)
            if n_chunks == 1:
                bar.update(1)

if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool

    links = [
        'http://localhost/friends/images/nav.png',
        'http://localhost/friends/images/race.jpg',
    ]
    n_writers = len(links)
    pool = ThreadPool(n_writers)
    pool.starmap(download, enumerate(links))
    pool.close()
    pool.join()
Multiprocessing Version
If you must use multiprocessing, then thanks to relent95, who showed the way:
import requests
from tqdm import tqdm

CHUNK_SIZE = 1024

def init_pool_processes(lock):
    """
    Note: The lock only needs to be set once for each pool process.
    """
    tqdm.set_lock(lock)

def get_number_of_chunks(url):
    r = requests.head(url, allow_redirects=True)
    headers = r.headers
    if 'content-length' in headers:
        n_chunks, remainder = divmod(int(headers['content-length']), CHUNK_SIZE)
        if remainder:
            n_chunks += 1
    else:
        n_chunks = 1
    return n_chunks

def download(task_number, url):
    n_chunks = get_number_of_chunks(url)
    response = requests.get(url, stream=True)
    #with open(str(time.time()) + '.mp4', 'wb') as f:
    if True:
        with tqdm(total=n_chunks, position=task_number) as bar:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                #f.write(chunk)
                if n_chunks != 1:
                    bar.update(1)
                # For demo purposes:
                import time
                time.sleep(.1)
            if n_chunks == 1:
                bar.update(1)

if __name__ == '__main__':
    from multiprocessing import Pool, Lock

    links = [
        'http://localhost/friends/images/nav.png',
        'http://localhost/friends/images/race.jpg',
    ]
    n_writers = len(links)
    pool = Pool(n_writers, initializer=init_pool_processes, initargs=(Lock(),))
    pool.starmap(download, enumerate(links))
    pool.close()
    pool.join()

Python Multithreaded HTTP crawler - Closing connection and hanging of the program

I wrote this crawler in Python; it dumps several parameters to a JSON output file based on the input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is ca. 5 million items. At the beginning it processes at a rate of ca. 50 iterations per second, but after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this related to the code or to network limits? I suspect the software, since when I restart it, it starts again at a high rate (ca. 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially regarding speed and crawling throughput.
Code in question:
import re
import sys
import urllib2
import pprint
from threading import Thread
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        #print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()
try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better; it does not seem to hang anymore after some time. Performance is 50 reqs/sec on a home laptop and ca. 100 reqs/sec on a VPS.
from threading import Thread
import httplib, sys
import re
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        realsock = response.fp._sock.fp._sock
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error" + str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()
try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be automatically garbage collected, but you will be better off closing them yourself, especially as you are doing this in a tight loop.
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
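For illustration only, here is a hedged sketch of what the fetching part of getStatus might look like with requests instead of urllib2; the shared session, the 10-second timeout, and the fetch helper name are assumptions, not part of the original code:
import requests

session = requests.Session()  # reuses TCP connections between calls

def fetch(ourl):
    # requests follows redirects and returns the connection to the pool once
    # the body is read; the explicit timeout prevents a single slow host from
    # stalling a worker thread indefinitely.
    r = session.get("http://" + ourl.strip(), timeout=10)
    server = r.headers.get("Server", "")
    powered_by = r.headers.get("X-Powered-By", "")
    return r.text, server, powered_by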
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow only one parallel connection per IP address.
2.) Try to send randomized, browser-like HTTP headers (user-agent, referer, ...) to avoid triggering the web server's scraping protection, if any is set (see the sketch after this list).
3.) Use a mature parallel HTTP library, like pycurl (which has MultiCurl) or requests (grequests). They generally perform faster.
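As a rough illustration of point 2, request headers could be randomized per request along these lines; the User-Agent strings below are placeholders, not a vetted list:
import random
import requests

USER_AGENTS = [
    # Placeholder values; substitute real, current browser User-Agent strings.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]

def browser_like_headers(referer=None):
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
    }
    if referer:
        headers["Referer"] = referer
    return headers

# e.g. requests.get("http://" + domain, headers=browser_like_headers(), timeout=10)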

using python stream downloading file with server limit

I tried to download a file from a server using Python. Sometimes the file is very large, and I would like to have a progress bar; one way to do this that I can come up with is to download in a stream, so that I can print the progress. Currently I have tried the standard urlopen, urlretrieve, and the requests module (with stream on).
Obviously, urlopen cannot download a file as a stream; the requests module supports this, however the server has a limit on the number of files I can download at one time (its limit is 1). So every time I try to use requests, it only gets the web page telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name=None, download=True):
    cookies = {}
    s = requests.Session()
    s.config['keep_alive'] = True
    #add your own cookies here, I have a specific function I call
    #for my application but yours is different
    r = s.get(url, cookies=cookies, stream=True)
    if not r.ok:
        print "error in downloading"
        return -1
    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            #failing download
            return -1
        else:
            if not temp:
                return -1
            else:
                file_name = temp.split("filename=")[-1]
                return_obj["filename"] = file_name
    #print "File size:", file_size
    #print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name
    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count * chunk_size / float(file_size) * 100.0
                    fraction = int(percent / 5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('=' * fraction, percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                #close up the stream
                r.close()

python-requests with multithreading

I am working on creating an HTTP client which can generate hundreds of connections each second and send up to 10 requests on each of those connections. I am using threading so that concurrency can be achieved.
Here is my code:
def generate_req(reqSession):
    requestCounter = 0
    while requestCounter < requestRate:
        try:
            response1 = reqSession.get('http://20.20.1.2/tempurl.html')
            if response1.status_code == 200:
                client_notify('r')
        except (exceptions.ConnectionError, exceptions.HTTPError, exceptions.Timeout) as Err:
            client_notify('F')
            break
        requestCounter += 1

def main():
    for q in range(connectionPerSec):
        s1 = requests.session()
        t1 = threading.Thread(target=generate_req, args=(s1,))
        t1.start()
Issues:
It does not scale above 200 connections/sec with requestRate = 1. I ran other available HTTP clients on the same client machine against the same server; those tests run fine and are able to scale.
When requestRate = 10, connections/sec drops to 30.
Reason: it is not able to create the targeted number of threads every second.
For issue #2, the client machine is not able to create enough request sessions and start new threads. As soon as requestRate is set to more than 1, things start to fall apart.
I suspect it has something to do with the HTTP connection pooling which requests uses.
Please suggest what I am doing wrong here.
I wasn't able to get things to fall apart, however the following code has some new features:
1) extended logging, including specific per-thread information
2) all threads are join()ed at the end to make sure the parent process doesn't leave them hanging
3) multithreaded print tends to interleave the messages, which can be unwieldy. This version returns the message tuples so a future version can accept them and print them clearly.
source
import exceptions, requests, threading, time

requestRate = 1
connectionPerSec = 2

def client_notify(msg):
    return time.time(), threading.current_thread().name, msg

def generate_req(reqSession):
    requestCounter = 0
    while requestCounter < requestRate:
        try:
            response1 = reqSession.get('http://127.0.0.1/')
            if response1.status_code == 200:
                print client_notify('r')
        except (exceptions.ConnectionError, exceptions.HTTPError, exceptions.Timeout):
            print client_notify('F')
            break
        requestCounter += 1

def main():
    for cnum in range(connectionPerSec):
        s1 = requests.session()
        th = threading.Thread(
            target=generate_req, args=(s1,),
            name='thread-{:03d}'.format(cnum),
        )
        th.start()
    for th in threading.enumerate():
        if th != threading.current_thread():
            th.join()

if __name__ == '__main__':
    main()
output
(1407275951.954147, 'thread-000', 'r')
(1407275951.95479, 'thread-001', 'r')
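On the connection-pooling suspicion: requests' default HTTPAdapter keeps at most 10 connections per host, which matters when many threads share one Session. A hedged sketch of enlarging the pool; sharing a single Session across threads is an assumption here, since the original code creates one session per thread:
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount('http://', adapter)
session.mount('https://', adapter)
# Threads can now share `session` and reuse up to 100 keep-alive connections per host.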

How to perform time limited response download with python requests?

When downloading a large file with Python, I want to put a time limit not only on the connection process, but also on the download.
I am trying the following Python code:
import requests
r = requests.get('http://ipv4.download.thinkbroadband.com/1GB.zip', timeout = 0.5, prefetch = False)
print r.headers['content-length']
print len(r.raw.read())
This does not work (the download is not time limited), as correctly noted in the docs: https://requests.readthedocs.org/en/latest/user/quickstart/#timeouts
This would be great if it was possible:
r.raw.read(timeout = 10)
The question is, how to put a time limit to the download?
And the answer is: do not use requests, as it is blocking. Use non-blocking network I/O, for example eventlet:
import eventlet
from eventlet.green import urllib2
from eventlet.timeout import Timeout

url5 = 'http://ipv4.download.thinkbroadband.com/5MB.zip'
url10 = 'http://ipv4.download.thinkbroadband.com/10MB.zip'
urls = [url5, url5, url10, url10, url10, url5, url5]

def fetch(url):
    response = bytearray()
    with Timeout(60, False):
        response = urllib2.urlopen(url).read()
    return url, len(response)

pool = eventlet.GreenPool()
for url, length in pool.imap(fetch, urls):
    if (not length):
        print "%s: timeout!" % (url)
    else:
        print "%s: %s" % (url, length)
Produces expected results:
http://ipv4.download.thinkbroadband.com/5MB.zip: 5242880
http://ipv4.download.thinkbroadband.com/5MB.zip: 5242880
http://ipv4.download.thinkbroadband.com/10MB.zip: timeout!
http://ipv4.download.thinkbroadband.com/10MB.zip: timeout!
http://ipv4.download.thinkbroadband.com/10MB.zip: timeout!
http://ipv4.download.thinkbroadband.com/5MB.zip: 5242880
http://ipv4.download.thinkbroadband.com/5MB.zip: 5242880
When using Requests' prefetch=False parameter, you get to pull in arbitrary-sized chunks of the response at a time (rather than all at once).
What you'll need to do is tell Requests not to preload the entire request and keep track of how much time you've spent reading so far, while fetching small chunks at a time. You can fetch a chunk using r.raw.read(CHUNK_SIZE). Overall, the code will look something like this:
import requests
import time

CHUNK_SIZE = 2**12  # Bytes
TIME_EXPIRE = time.time() + 5  # Seconds

r = requests.get('http://ipv4.download.thinkbroadband.com/1GB.zip', prefetch=False)

data = ''
buffer = r.raw.read(CHUNK_SIZE)
while buffer:
    data += buffer
    buffer = r.raw.read(CHUNK_SIZE)
    if TIME_EXPIRE < time.time():
        # Quit after 5 seconds.
        data += buffer
        break

r.raw.release_conn()

print "Read %s bytes out of %s expected." % (len(data), r.headers['content-length'])
Note that this might sometimes use a bit more than the 5 seconds allotted as the final r.raw.read(...) could lag an arbitrary amount of time. But at least it doesn't depend on multithreading or socket timeouts.
Run the download in a thread which you can then abandon if it is not finished on time.
import requests
import threading

URL = 'http://ipv4.download.thinkbroadband.com/1GB.zip'
TIMEOUT = 0.5

def download(return_value):
    return_value.append(requests.get(URL))

return_value = []

download_thread = threading.Thread(target=download, args=(return_value,))
download_thread.start()
download_thread.join(TIMEOUT)

if download_thread.is_alive():
    print 'The download was not finished on time...'
else:
    print return_value[0].headers['content-length']
