I am new to the gevent package, which allows us to run things asynchronously. I found this example code at https://sdiehl.github.io/gevent-tutorial/ that makes async API calls. The original code is:
import gevent.monkey
gevent.monkey.patch_socket()

import gevent
import urllib2
import simplejson as json

def fetch(pid):
    response = urllib2.urlopen('http://json-time.appspot.com/time.json')
    result = response.read()
    json_result = json.loads(result)
    datetime = json_result['datetime']

    print('Process %s: %s' % (pid, datetime))
    return json_result['datetime']

def synchronous():
    for i in range(1,10):
        fetch(i)

def asynchronous():
    threads = []
    for i in range(1,10):
        threads.append(gevent.spawn(fetch, i))
    gevent.joinall(threads)

print('Synchronous:')
synchronous()

print('Asynchronous:')
asynchronous()
Because the API that was used to get the time doesn't work anymore, I had to make some changes. This is the code that I edited:
import gevent
from urllib.request import urlopen  # this is the alternative for urllib2
import requests
# import simplejson as json

def fetch(pid):
    response = urlopen('https://just-the-time.appspot.com/')
    result = response.read()
    datetime = result.decode('utf-8')

    print('Process %s: %s' % (pid, datetime))
    return datetime

def synchronous():
    for i in range(1,10):
        fetch(i)

def asynchronous():
    threads = []
    for i in range(1,10):
        threads.append(gevent.spawn(fetch, i))
    gevent.joinall(threads)

print('Synchronous:')
synchronous()

print('Asynchronous:')
asynchronous()
I tried making requests to the time API directly and it works fine. But when I apply the 'monkey patch' from gevent, it gives this error:
TypeError: _wrap_socket() argument 'sock' must be _socket.socket, not SSLSocket
You can try removing the monkey patch and it works fine. But I am not sure whether, without the monkey patch, the async part actually runs asynchronously, because the synchronous and asynchronous results are pretty much identical. Can someone explain what is going on here?
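For what it's worth, that TypeError usually means the socket patching was partial or happened too late, e.g. patch_socket() without patch_ssl(), or patching after the ssl module has already been imported, so an SSL socket ends up wrapping an unpatched socket. A minimal, untested sketch that applies the full patch before any other import (same just-the-time endpoint as above):

from gevent import monkey
monkey.patch_all()  # patch socket, ssl, etc. before anything else is imported

import gevent
from urllib.request import urlopen

def fetch(pid):
    result = urlopen('https://just-the-time.appspot.com/').read()
    print('Process %s: %s' % (pid, result.decode('utf-8')))

gevent.joinall([gevent.spawn(fetch, i) for i in range(1, 10)])

Without any patching, each urlopen call blocks the whole process, so the greenlets still run one after another, which is why the synchronous and asynchronous outputs look so similar.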
Related
I would like to run this script as an automatic service that runs every minute, every day, with Twisted (I first tried to make it a daemon, but that seemed too difficult and I didn't find good tutorials for it; I already tried crontab, but that's not what I'm looking for).
Has anyone ever done that with Twisted? I can't find a tutorial for my kind of script (getting data from a DB table and putting it into another table of the same DB). I also have to keep the logs in a file, but that won't be the most difficult part.
from twisted.enterprise import adbapi
from twisted.internet import task
import logging
from datetime import datetime
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks

"""
Test DB : This File do database connection and basic operation.
"""

log = logging.getLogger("Test DB")

dbpool = adbapi.ConnectionPool("MySQLdb", db="xxxx", user="guza", passwd="vQsx7gbblal8aiICbTKP", host="192.168.15.01")

class MetersCount():

    def getTime(self):
        log.info("Get Current Time from System.")
        time = str(datetime.now()).split('.')[0]
        return time

    def getTotalMeters(self):
        log.info("Select operation in Database.")
        getMetersQuery = """ SELECT count(met_id) as totalMeters FROM meters WHERE DATE(met_last_heard) = DATE(NOW()) """
        return dbpool.runQuery(getMetersQuery).addCallback(self.getResult)

    def getResult(self, result):
        print ("Receive Result : ")
        print (result)
        # general purpose method to receive result from defer.
        return result

    def insertMetersCount(self, meters_count):
        log.info("Insert operation in Database.")
        insertMetersQuery = """ INSERT INTO meter_count (mec_datetime, mec_count) VALUES (NOW(), %s)"""
        return dbpool.runQuery(insertMetersQuery, [meters_count])

    def checkDB(self):
        d = self.getTotalMeters()
        d.addCallback(self.insertMetersCount)
        return d

a = MetersCount()
a.checkDB()
reactor.run()
If you want to run a function once a minute, have a look at LoopingCall. It takes a function, and runs it at intervals unless told to stop.
You would use it something like this (which I haven't tested):
from twisted.internet.task import LoopingCall
looper = LoopingCall(a.checkDB)
looper.start(60)
The documentation is at the link.
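Putting that together with the code from the question (untested; it assumes the MetersCount class and dbpool defined above), the end of the script might look like this:

from twisted.internet import reactor
from twisted.internet.task import LoopingCall

a = MetersCount()

# Run checkDB right away and then once every 60 seconds until stopped.
looper = LoopingCall(a.checkDB)
looper.start(60)

reactor.run()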
I'm trying to write a web parser script using the requests module. Here is my current code:
import requests
import subprocess
import json
import sys
import threading
import time
from Queue import Queue

numberOfViewers = int(sys.argv[1])
builderThreads = int(sys.argv[2])

startTime = time.time()

numberOfSockets = 0
concurrent = 25
urls = []
urlsUsed = []

def getURL(): # Get tokens
    output = subprocess.Popen(["livestreamer", "twitch.tv/CHANNEL_NAME", "-j"],
                              stdout=subprocess.PIPE).communicate()[0]
    return json.loads(output)['streams']['worst']['url'] # Parse json and return the URL parameter

def build(): # Builds a set of tokens, aka viewers
    global numberOfSockets
    global numberOfViewers

    while True:
        if numberOfSockets < numberOfViewers:
            numberOfSockets += 1
            print ("Building viewers " + str(numberOfSockets) + "/" + str(numberOfViewers))
            urls.append(getURL())

def view(): # Opens connections to send views
    global numberOfSockets

    while True:
        url = q.get()
        requests.head(url)

        if (url in urlsUsed):
            urls.remove(url)
            urlsUsed.remove(url)
            numberOfSockets -= 1
        else:
            urlsUsed.append(url)
            q.task_done()

            if __name__ == '__main__':
                for i in range(0, builderThreads):
                    threading.Thread(target = build).start()

                while True:
                    while (numberOfViewers != numberOfSockets): # Wait until sockets are built
                        time.sleep(1)

                    q = Queue(concurrent * 2)
                    for i in range(concurrent):
                        try:
                            t = threading.Thread(target=view)
                            t.daemon = True
                            t.start()
                        except:
                            print ('thread error')
                    try:
                        for url in urls:
                            print (url)
                            q.put(url.strip())
                        q.join()
                    except KeyboardInterrupt:
                        sys.exit(1)
But when I run the code, it says:
Traceback (most recent call last):
  File "C:\Users\flamelier\Desktop\Twitch.py", line 1, in <module>
    import requests
ImportError: No module named 'requests'
Why am I getting this error? How do I install this module?
Will this error keep appearing for every script I run from now on?
How can I prevent similar errors in the future?
Requests is a third-party module. You should first install it using pip or easy_install.
You have to run pip3 install requests, because requests doesn't come with Python by default; it is a third-party library.
Even after you have pip3-installed requests, the code shown won't do anything. The
if __name__ == "__main__"
test and everything after it is part of an else block in the view function. Back this line and the block that follows out to the left margin.
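A minimal, untested sketch of the structure that answer describes, with a dummy worker so it runs on its own (Python 2, to match the question's Queue import):

import threading
from Queue import Queue

q = Queue()

def view():
    # Worker: only the queue-draining loop belongs inside the function.
    while True:
        url = q.get()
        print("would send a request to " + url)
        q.task_done()

# The main-program block sits at the left margin (module level),
# not inside view(), so it actually runs when the script starts.
if __name__ == '__main__':
    t = threading.Thread(target=view)
    t.daemon = True
    t.start()

    for url in ["http://example.com/a", "http://example.com/b"]:
        q.put(url)
    q.join()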
I have an AWS Lambda function which calls a set of URLs using pool.map. The problem is that if one of the URLs returns anything other than a 200, the Lambda function fails and immediately retries, and it retries the ENTIRE Lambda function. I'd like it to retry only the failed URLs, and if (after a second try) it still fails them, call a fixed URL to log an error.
This is the code as it currently sits (with some details removed), working only when all URLs succeed:
from __future__ import print_function

import urllib2
from multiprocessing.dummy import Pool as ThreadPool
import hashlib
import datetime
import json

print('Loading function')

def lambda_handler(event, context):
    f = urllib2.urlopen("https://example.com/geturls/?action=something");
    data = json.loads(f.read());
    urls = [];
    for d in data:
        urls.append("https://"+d+".example.com/path/to/action");

    # Make the Pool of workers
    pool = ThreadPool(4);

    # Open the urls in their own threads
    # and return the results
    results = pool.map(urllib2.urlopen, urls);

    # close the pool and wait for the work to finish
    pool.close();
    return pool.join();
I tried reading the official documentation but it seems to be lacking a bit in explaining the map function, specifically explaining return values.
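For reference, pool.map blocks until every worker has finished and returns their return values as a list in input order; if any worker raises, map() re-raises that exception and the other results are lost. A tiny sketch (threads via multiprocessing.dummy, so plain functions work):

from multiprocessing.dummy import Pool as ThreadPool

def square(n):
    return n * n

pool = ThreadPool(2)
print(pool.map(square, [1, 2, 3]))  # -> [1, 4, 9], in input order
pool.close()
pool.join()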
Using the urlopen documentation I've tried modifying my code to the following:
from __future__ import print_function

import urllib2
from multiprocessing.dummy import Pool as ThreadPool
import hashlib
import datetime
import json

print('Loading function')

def lambda_handler(event, context):
    f = urllib2.urlopen("https://example.com/geturls/?action=something");
    data = json.loads(f.read());
    urls = [];
    for d in data:
        urls.append("https://"+d+".example.com/path/to/action");

    # Make the Pool of workers
    pool = ThreadPool(4);

    # Open the urls in their own threads
    # and return the results
    try:
        results = pool.map(urllib2.urlopen, urls);
    except URLError:
        try:                  # try once more before logging error
            urllib2.urlopen(URLError.url);   # TODO: figure out which URL errored
        except URLError:      # log error
            urllib2.urlopen("https://example.com/error/?url="+URLError.url);

    # close the pool and wait for the work to finish
    pool.close();
    return true; # always return true so we never duplicate successful calls
I'm not sure if I'm handling the exceptions the right way, or if I'm even writing the Python exception syntax correctly. Again, my goal is for it to retry only the failed URLs, and if (after a second try) it still fails them, to call a fixed URL to log an error.
I figured out the answer thanks to a "lower-level" look at this question I posted here.
The answer was to create my own custom wrapper around the urllib2.urlopen function, since each thread needed its own try/except rather than wrapping the whole pool.map call. That function looked like so:
def my_urlopen(url):
    try:
        return urllib2.urlopen(url)
    except urllib2.URLError:
        urllib2.urlopen("https://example.com/log_error/?url="+url)
        return None
I put that above the lambda_handler function definition; then I could replace the whole try/except inside it, from this:
try:
    results = pool.map(urllib2.urlopen, urls);
except URLError:
    try:                  # try once more before logging error
        urllib2.urlopen(URLError.url);
    except URLError:      # log error
        urllib2.urlopen("https://example.com/error/?url="+URLError.url);
To this:
results = pool.map(my_urlopen, urls);
Q.E.D.
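The stated goal also included one retry before logging. A hedged, untested variant of the same wrapper (the example.com error endpoint is the placeholder used above):

import urllib2

def my_urlopen(url, retries=1):
    # Try the URL, retry once on failure, then report the failing URL
    # to the placeholder logging endpoint and give up on it.
    for _ in range(retries + 1):
        try:
            return urllib2.urlopen(url)
        except urllib2.URLError:
            continue
    try:
        urllib2.urlopen("https://example.com/error/?url=" + url)
    except urllib2.URLError:
        pass  # don't let the logging call itself take down the handler
    return None

With this, results contains a mix of response objects and None values, so successful URLs are never re-fetched.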
I have some links in a database which I want to download in parallel. I tried doing it serially, but it took too much time. I have around 1877 links.
I tried this code for running the downloads in parallel, but it throws an error: failed: 'tuple' object has no attribute 'read'
#!/usr/bin/env python

import urllib
from stream import ThreadPool

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def retrieve(urls):
    for url in urls:
        print url,' '
        res = urllib.urlretrieve(url).read()
        yield url, res

if __name__ == '__main__':
    retrieved = URLs >> ThreadPool(retrieve, poolsize=7)
    for url, content in retrieved:
        print '%r is %d bytes' % (url, len(content))
    for url, exception in retrieved.failure:
        print '%r failed: %s' % (url, exception)
I tried this as well:
import urllib
import tldextract
from multiprocessing.pool import ThreadPool

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def dwld(url):
    print url
    res = urllib.urlopen(url).read()
    filename = tldextract.extract(url)
    with open(filename.domain, 'wb') as fh:
        fh.write(res)
    return url

pool = ThreadPool(processes = 4)
pool.map(dwld, URLs)
This gives me:
Traceback (most recent call last):
  File "dwld_thread.py", line 26, in <module>
    pool.map(dwld, URLs)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 148, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 422, in get
    raise self._value
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
I have no idea what that stream.ThreadPool is that you're using, or what its API is… but the problem is obvious:
res = urllib.urlretrieve(url).read()
If you look at the doc for urlretrieve:
Return a tuple (filename, headers) where filename is the local file name under which the object can be found…
You obviously can't call read on that. If you want to download to a local file, using this legacy API, and then read that file, you can:
filename, headers = urllib.urlretrieve(url)
with open(filename) as f:
    res = f.read()
But why? Just use urllib2.urlopen, which "returns a file-like object with two additional methods", so you can just call read on it, and you won't be creating a temporary file, and you're not using an old function that wasn't quite designed right that nobody has maintained in years.
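For example, a tiny untested sketch with one of the URLs from the question:

import urllib2

# urlopen returns a file-like response object, so read() works directly
# and nothing is written to a temporary file on disk.
res = urllib2.urlopen('http://www.cnn.com/').read()
print '%d bytes' % len(res)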
But Python has a nice ThreadPoolExecutor built into the standard library. And if you look at the very first example they show you, it's exactly what you're trying to do.
Unfortunately, you're using Python 2.x, which doesn't have the concurrent.futures module. Fortunately, there is a backport on PyPI that works with 2.5+.
Python also has multiprocessing.dummy.Pool (also available under the undocumented, but probably more readable, name multiprocessing.ThreadPool). But if you're willing to go outside the stdlib for some module that you apparently aren't sure how to use and that I've never heard of, I'm guessing you won't have any problem using futures. So:
import futures
import urllib2

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def load_url(url):
    return urllib2.urlopen(url).read()

if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=7) as executor:
        fmap = dict((executor.submit(load_url, url), url) for url in URLs)
        for f in futures.as_completed(fmap):
            url = fmap[f]
            try:
                content = f.result()
            except Exception as exception:
                print '%r failed: %s' % (url, exception)
            else:
                print '%r is %d bytes' % (url, len(content))
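For comparison, an untested sketch of the multiprocessing.dummy.Pool route mentioned above; errors are returned instead of raised so one bad URL doesn't abort the whole map:

import urllib2
from multiprocessing.dummy import Pool as ThreadPool

URLs = [  # a few of the URLs from the question
    'http://www.cnn.com/',
    'http://slashdot.org/',
    'http://nonexistant.website.at.baddomain/',
]

def load_url(url):
    try:
        return url, urllib2.urlopen(url).read()
    except Exception as exception:
        return url, exception

pool = ThreadPool(len(URLs))  # one worker thread per URL, as in the futures example
for url, result in pool.map(load_url, URLs):
    if isinstance(result, Exception):
        print '%r failed: %s' % (url, result)
    else:
        print '%r is %d bytes' % (url, len(result))
pool.close()
pool.join()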
urllib.urlretrieve(url).read() should be urllib.urlopen(url).read()
from threading import *
from time import sleep

# if Python2:
import urllib
# if Python3:
# import urllib.request

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

class worker(Thread):
    def __init__(self, link):
        Thread.__init__(self)
        self.link = link
        self.start()

    def run(self):
        # if Python2:
        res = urllib.urlopen(self.link).read()  # as mentioned by @DhruvPathak
        # if Python3:
        # res = urllib.request.urlopen(self.link).read()

        # store the fetched data in a file named after the link
        # (slashes replaced so the URL is usable as a filename)
        with open(self.link.replace('/', '_'), 'wb') as fh:
            fh.write(res)

for url in URLs:
    while len(enumerate()) > 500:  # throttle: keep at most ~500 live threads
        sleep(0.25)
    worker(url)

while len(enumerate()) > 1:
    sleep(0.25)  # wait for all threads to finish
What about using multiprocessing?
Sample code:
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import urllib
from multiprocessing import Pool
import os

POOL = 8
PDFS_DOWNLOAD_DIR = 'pdfs'
PDF_LINKS = sys.argv[1]

class DownloadFiles(object):
    def __init__(self):
        self.pdf_links = self.read_links_from_file()
        self.create_download_dir()

    def create_download_dir(self):
        try:
            if not os.path.exists(PDFS_DOWNLOAD_DIR):
                os.makedirs(PDFS_DOWNLOAD_DIR)
        except IOError as e:
            exit()

    def read_links_from_file(self):
        try:
            with open(PDF_LINKS, 'r') as f:
                return list(set([x.strip() for x in f]))
        except (IndexError, IOError) as e:
            exit()

    def get_file(self, link):
        filename = link.split('/')[-2]
        print('Downloading file --> "{filename}"'.format(
            filename=filename
        ))
        urllib.urlretrieve(link, filename='{pdfs_data}/{filename}'.format(
            pdfs_data=PDFS_DOWNLOAD_DIR,
            filename=filename
        ))

    def download(self):
        pool = Pool(POOL)
        pool.map(self.get_file, self.pdf_links)
        pool.close()
        pool.join()
        print('\nSuccessfully downloaded files from given source!\n')

d = DownloadFiles()
d.download()
I'm trying to resolve a list of hostnames. The problem is that when I hit a non-existent domain, it slows down the whole process. The code is a trivial for loop:
for domain in domains:
    try:
        if socket.gethostbyname(domain.split('#')[1]):
            file1.write(domain)
        else:
            file2.write(domain)
    except socket.gaierror:
        pass
I was wondering if there is a simple way to parallelize what is inside the for loop.
You could use one of the examples from gevent, dns_mass_resolve.py. There is also the useful possibility of setting a timeout for all queries.
from __future__ import with_statement
import sys
import gevent
from gevent import socket
from gevent.pool import Pool

N = 1000
# limit ourselves to max 10 simultaneous outstanding requests
pool = Pool(10)
finished = 0

def job(url):
    global finished
    try:
        try:
            ip = socket.gethostbyname(url)
            print ('%s = %s' % (url, ip))
        except socket.gaierror:
            ex = sys.exc_info()[1]
            print ('%s failed with %s' % (url, ex))
    finally:
        finished += 1

with gevent.Timeout(2, False):
    for x in xrange(10, 10 + N):
        pool.spawn(job, '%s.com' % x)
    pool.join()

print ('finished within 2 seconds: %s/%s' % (finished, N))
I don't know a simple solution. Using multiple threads/processes would be complicated and probably wouldn't help that much, because your execution speed is bound by IO. Therefore I would have a look at an async library like Twisted. There is a resolve method in IReactorCore: http://twistedmatrix.com/documents/12.2.0/api/twisted.internet.interfaces.IReactorCore.html
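An untested sketch of that Twisted approach (assuming a plain list of hostnames rather than the '#'-separated lines from the question):

from twisted.internet import reactor, defer

def report(ip, domain):
    print '%s = %s' % (domain, ip)

def report_error(failure, domain):
    print '%s failed: %s' % (domain, failure.getErrorMessage())

def resolve_all(domains):
    deferreds = []
    for domain in domains:
        d = reactor.resolve(domain)  # returns a Deferred, so lookups overlap
        d.addCallback(report, domain)
        d.addErrback(report_error, domain)
        deferreds.append(d)
    return defer.DeferredList(deferreds)

domains = ['example.com', 'nonexistant.website.at.baddomain']
resolve_all(domains).addCallback(lambda _: reactor.stop())
reactor.run()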
import thread

def resolve_one_domain(domain):
    ...

for domain in domains:
    thread.start_new_thread(resolve_one_domain, [domain])