I have some links in a database which I want to download in parallel. I tried doing it serially, but it took too much time. I have around 1877 links.
I tried this code for running the downloads in parallel, but it throws an error: failed: 'tuple' object has no attribute 'read'
#!/usr/bin/env python

import urllib
from stream import ThreadPool

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def retrieve(urls):
    for url in urls:
        print url,' '
        res = urllib.urlretrieve(url).read()
        yield url, res

if __name__ == '__main__':
    retrieved = URLs >> ThreadPool(retrieve, poolsize=7)
    for url, content in retrieved:
        print '%r is %d bytes' % (url, len(content))
    for url, exception in retrieved.failure:
        print '%r failed: %s' % (url, exception)
I tried this as well:
import urllib
import tldextract
from multiprocessing.pool import ThreadPool

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def dwld(url):
    print url
    res = urllib.urlopen(url).read()
    filename = tldextract.extract(url)
    with open(filename.domain, 'wb') as fh:
        fh.write(res)
    return url

pool = ThreadPool(processes = 4)
pool.map(dwld, URLs)
Gives me
Traceback (most recent call last):
  File "dwld_thread.py", line 26, in <module>
    pool.map(dwld, URLs)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 148, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 422, in get
    raise self._value
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
I have no idea what that stream.ThreadPool is that you're using, or what its API is… but the problem is obvious:
res = urllib.urlretrieve(url).read()
If you look at the doc for urlretrieve:
Return a tuple (filename, headers) where filename is the local file name under which the object can be found…
You obviously can't call read on that. If you want to download to a local file, using this legacy API, and then read that file, you can:
filename, headers = urllib.urlretrieve(url)
with open(filename) as f:
    res = f.read()
But why? Just use urllib2.urlopen, which "returns a file-like object with two additional methods", so you can just call read on it, you won't be creating a temporary file, and you won't be using an old function that wasn't quite designed right and that nobody has maintained in years.
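For comparison, a minimal sketch of the urlopen route (no temporary file involved):

import urllib2

url = 'http://www.cnn.com/'           # any of the URLs above
res = urllib2.urlopen(url).read()     # file-like object, so read() works directly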
But Python has a nice ThreadPoolExecutor built into the standard library. And if you look at the very first example they show you, it's exactly what you're trying to do.
Unfortunately, you're using Python 2.x, which doesn't have the concurrent.futures module. Fortunately, there is a backport on PyPI that works with 2.5+.
Python also has multiprocessing.dummy.Pool (also available under the undocumented, but probably more readable, name multiprocessing.ThreadPool). But if you're willing to go outside the stdlib for some module that you apparently aren't sure how to use and that I've never heard of, I'm guessing you won't have any problem using futures. So:
import futures
import urllib2

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

def load_url(url):
    return urllib2.urlopen(url).read()

if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=7) as executor:
        fmap = dict((executor.submit(load_url, url), url) for url in URLs)
        for f in futures.as_completed(fmap):
            url = fmap[f]
            try:
                content = f.result()
            except Exception as exception:
                print '%r failed: %s' % (url, exception)
            else:
                print '%r is %d bytes' % (url, len(content))
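The multiprocessing.dummy.Pool route mentioned above would look roughly like this (a sketch, not code from the question; the URLs list is shortened here but is the same idea as above):

from multiprocessing.dummy import Pool  # thread-backed Pool with the multiprocessing.Pool API
import urllib2

URLs = ['http://www.cnn.com/', 'http://www.bbc.co.uk/']  # shortened; same list as above

def fetch(url):
    try:
        return url, urllib2.urlopen(url).read(), None
    except Exception as e:
        return url, None, e

pool = Pool(7)
for url, content, error in pool.imap_unordered(fetch, URLs):
    if error is None:
        print '%r is %d bytes' % (url, len(content))
    else:
        print '%r failed: %s' % (url, error)
pool.close()
pool.join()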
urllib.urlretrieve(url).read() should be urllib.urlopen(url).read()
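Applied to the retrieve function from the first snippet, that change would look like this (a sketch; the rest of the stream pipeline stays as posted):

import urllib

def retrieve(urls):
    for url in urls:
        print url, ' '
        # urlopen returns a file-like object you can read();
        # urlretrieve returns a (filename, headers) tuple instead
        res = urllib.urlopen(url).read()
        yield url, res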
from threading import *
from time import sleep

# if Python2:
import urllib
# if Python3:
# import urllib.request

URLs = [
    'http://www.cnn.com/',
    'http://www.bbc.co.uk/',
    'http://www.economist.com/',
    'http://nonexistant.website.at.baddomain/',
    'http://slashdot.org/',
    'http://reddit.com/',
    'http://news.ycombinator.com/'
]

class worker(Thread):
    def __init__(self, link):
        Thread.__init__(self)
        self.link = link
        self.start()

    def run(self):
        # if Python2:
        res = urllib.urlopen(self.link).read()  # urlopen, not urlretrieve, as mentioned by @DhruvPathak
        # if Python3:
        # res = urllib.request.urlopen(self.link).read()
        with open(self.link.replace('/', '_'), 'wb') as fh:
            fh.write(res)  # store the fetched data in a file named after the link

for url in URLs:
    while len(enumerate()) > 500:
        sleep(0.25)  # throttle: never more than 500 live threads
    worker(url)

while len(enumerate()) > 1:
    sleep(0.25)  # wait for all threads to finish
What about using multiprocessing?
Sample code:
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import urllib
from multiprocessing import Pool
import os

POOL = 8
PDFS_DOWNLOAD_DIR = 'pdfs'
PDF_LINKS = sys.argv[1]


class DownloadFiles(object):
    def __init__(self):
        self.pdf_links = self.read_links_from_file()
        self.create_download_dir()

    def create_download_dir(self):
        try:
            if not os.path.exists(PDFS_DOWNLOAD_DIR):
                os.makedirs(PDFS_DOWNLOAD_DIR)
        except IOError as e:
            exit()

    def read_links_from_file(self):
        try:
            with open(PDF_LINKS, 'r') as f:
                return list(set([x.strip() for x in f]))
        except (IndexError, IOError) as e:
            exit()

    def get_file(self, link):
        filename = link.split('/')[-2]
        print('Downloading file --> "{filename}"'.format(
            filename=filename
        ))
        urllib.urlretrieve(link, filename='{pdfs_data}/{filename}'.format(
            pdfs_data=PDFS_DOWNLOAD_DIR,
            filename=filename
        ))

    def download(self):
        pool = Pool(POOL)
        pool.map(self.get_file, self.pdf_links)
        pool.close()
        pool.join()
        print('\nSuccessfully downloaded files from given source!\n')


d = DownloadFiles()
d.download()
Related
I am new to the gevent package, which allows us to run things asynchronously. I found this example code at https://sdiehl.github.io/gevent-tutorial/ that makes async API calls. The original code is:
import gevent.monkey
gevent.monkey.patch_socket()

import gevent
import urllib2
import simplejson as json

def fetch(pid):
    response = urllib2.urlopen('http://json-time.appspot.com/time.json')
    result = response.read()
    json_result = json.loads(result)
    datetime = json_result['datetime']

    print('Process %s: %s' % (pid, datetime))
    return json_result['datetime']

def synchronous():
    for i in range(1,10):
        fetch(i)

def asynchronous():
    threads = []
    for i in range(1,10):
        threads.append(gevent.spawn(fetch, i))
    gevent.joinall(threads)

print('Synchronous:')
synchronous()

print('Asynchronous:')
asynchronous()
Because the API used to fetch the time no longer works, I had to make some changes. This is the code after my edits:
import gevent
from urllib.request import urlopen  # this is the alternative for urllib2
import requests
#import simplejson as json

def fetch(pid):
    response = urlopen('https://just-the-time.appspot.com/')
    result = response.read()
    datetime = result.decode('utf-8')

    print('Process %s: %s' % (pid, datetime))
    return datetime

def synchronous():
    for i in range(1,10):
        fetch(i)

def asynchronous():
    threads = []
    for i in range(1,10):
        threads.append(gevent.spawn(fetch, i))
    gevent.joinall(threads)

print('Synchronous:')
synchronous()

print('Asynchronous:')
asynchronous()
I tried making requests to the time API directly, and it works fine. But when I apply the 'monkey patch' from gevent, it gives this error: TypeError: _wrap_socket() argument 'sock' must be _socket.socket, not SSLSocket
You can try removing the monkey patch and it works fine. But I am not sure whether the async part actually runs asynchronously without the monkey patch, because the synchronous and asynchronous results are pretty much identical. Can someone explain what is going on here?
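A commonly recommended remedy, offered here as a sketch based on gevent's general guidance rather than a verified fix for this exact traceback, is to apply the monkey patch to everything (including ssl) before any network modules are imported:

# Patch as early as possible, before urllib/ssl are imported, so that the
# ssl module sees gevent's socket class; patch_socket() alone leaves ssl
# wrapping the unpatched socket type, which can trigger this TypeError.
from gevent import monkey
monkey.patch_all()

import gevent
from urllib.request import urlopen
# ... rest of the script unchanged

gevent's documentation recommends calling monkey.patch_all() as one of the very first lines of the program.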
I'm trying to write a web parser script using requests module. Here is my current code:
import requests
import subprocess
import json
import sys
import threading
import time
from Queue import Queue

numberOfViewers = int(sys.argv[1])
builderThreads = int(sys.argv[2])

startTime = time.time()

numberOfSockets = 0
concurrent = 25
urls = []
urlsUsed = []

def getURL(): # Get tokens
    output = subprocess.Popen(["livestreamer", "twitch.tv/CHANNEL_NAME", "-j"],
                              stdout=subprocess.PIPE).communicate()[0]
    return json.loads(output)['streams']['worst']['url'] # Parse json and return the URL parameter

def build(): # Builds a set of tokens, aka viewers
    global numberOfSockets
    global numberOfViewers
    while True:
        if numberOfSockets < numberOfViewers:
            numberOfSockets += 1
            print ("Building viewers " + str(numberOfSockets) + "/" + str(numberOfViewers))
            urls.append(getURL())

def view(): # Opens connections to send views
    global numberOfSockets
    while True:
        url = q.get()
        requests.head(url)
        if (url in urlsUsed):
            urls.remove(url)
            urlsUsed.remove(url)
            numberOfSockets -= 1
        else:
            urlsUsed.append(url)
        q.task_done()

if __name__ == '__main__':
    for i in range(0, builderThreads):
        threading.Thread(target = build).start()

    while True:
        while (numberOfViewers != numberOfSockets): # Wait until sockets are built
            time.sleep(1)

        q = Queue(concurrent*2)
        for i in range(concurrent):
            try:
                t = threading.Thread(target=view)
                t.daemon = True
                t.start()
            except:
                print ('thread error')
        try:
            for url in urls:
                print (url)
                q.put(url.strip())
            q.join()
        except KeyboardInterrupt:
            sys.exit(1)
But when I run the code, it says:
Traceback (most recent call last):
  File "C:\Users\flamelier\Desktop\Twitch.py", line 1, in <module>
    import requests
ImportError: No module named 'requests'
Why am I getting this error? How do I install this module?
Will this error keep repeating for all the scripts henceforth?
How can I prevent such similar errors in the future?
Requests is a third-party module. You should first install it using pip or easy_install.
You have to run pip3 install requests, since requests doesn't come with Python by default; it is a third-party library.
Even after you have pip3-installed requests, the code shown won't do anything. As originally indented, the
if __name__ == "__main__"
test and everything after it were part of an else block in the view function. Dedent that line and the block that follows to the left margin, as shown above.
I want a Python script that reads a file in an infinite loop (until it is stopped from the keyboard or the process is killed).
That input file is appended to dynamically, at both the top and the bottom.
The Python script should have 5 threads reading the file and removing the lines they have read.
I am having problems doing that: either a line from the input file is read more than once (which should not happen), or the threads don't update the file properly.
#!/usr/bin/env python

from multiprocessing.pool import ThreadPool
from time import time as timer
from urllib2 import urlopen
import requests
import os

session = requests.Session()
rawBody = "\r\n"
i = 0
k = 0

lines = [line.rstrip() for line in open('input.txt')]
urls = lines

def fetch_url(url):
    global k
    try:
        print url
        return url, None, None
    except Exception as e:
        return url, None, e

start = timer()
results = ThreadPool(5).imap_unordered(fetch_url, urls)
for url, html, error in results:
    if error is None:
        #print ""
        i = i + 1
        #print("%r fetched in %ss" % (url, timer() - start))
    else:
        i = i + 1
        print error
        #print("error fetching %r: %s" % (url, error))

#print("Elapsed Time: %s" % (timer() - start,))
Here is a real example of how I want to use the script: the input file contains a large number of URLs of image files, and I want to check every URL for GPS data. The script needs to run continuously, even when the input file is empty, and wait for new URLs to be appended. I need the threads so that several URLs can be processed at once, since doing them one by one takes too long.
And I need a decent-quality script, not a "quick and dirty" sample.
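One possible shape for this, as a rough sketch only: serialize access to the file with a lock so that a line can never be handed to two threads, and let five workers loop over it. Here process_url is a placeholder for the actual GPS check, and input.txt is assumed to already exist:

import threading
import time

file_lock = threading.Lock()

def pop_line(path='input.txt'):
    """Atomically remove and return the first line of the file (None if empty)."""
    with file_lock:
        with open(path, 'r+') as f:
            lines = f.readlines()
            if not lines:
                return None
            f.seek(0)
            f.truncate()
            f.writelines(lines[1:])
            return lines[0].strip()

def process_url(url):
    pass  # placeholder: fetch the image and check it for GPS data

def worker():
    while True:
        url = pop_line()
        if url is None:
            time.sleep(1)   # file is empty; wait for new URLs to be appended
        else:
            process_url(url)

for _ in range(5):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

while True:
    time.sleep(1)           # keep the main thread alive until killed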
I'm writing a program to get the domains hosted on the same server, and it can also scan the web directories of each one.
#!/usr/bin/env python
#encoding = utf-8

import threading
import urllib,urllib2,httplib
from urllib2 import Request, urlopen, URLError
import Queue,sys
import re

concurrent = 5
url = sys.argv[1]

class Scanner(threading.Thread):
    def __init__(self, work_q):
        threading.Thread.__init__(self)
        self.work_q = work_q

    def getdomains(self):
        doreq = Request('http://www.logontube.com/website/'+ url)
        response = urlopen(doreq)
        html = response.read()
        response.close()
        domains = re.findall('<br><a href=\"(.*?)\" target=\"_blank\"',html)
        return domains

    def run(self):
        alldomains = self.getdomains()
        pathline = [line.rstrip() for line in open("path.txt")]
        while True:
            for aim in alldomains:
                for path in pathline:
                    path = self.work_q.get()
                    req = Request(aim+path)
                    try:
                        response = urlopen(req)
                    except URLError, e:
                        if hasattr(e, 'reason'):
                            print aim+path,'Not Found'
                        elif hasattr(e,'code'):
                            print aim+path,'Not Found'
                    else:
                        try:
                            logs = open('log.txt',"a+")
                        except(IOError):
                            print "[x] Failed to create log file"
                        print aim+path,"Found"
                        logs.writelines(aim+path+"\n")
                        logs.close()

def main():
    work_q = Queue.Queue()
    paths = [line.rstrip() for line in open("path.txt")]
    for i in range(concurrent):
        t = Scanner(work_q)
        t.setDaemon(True)
        t.start()
    for path in paths:
        work_q.put(path)
    work_q.join()

main()
The problem is that this program only loops over the paths, so I can only get the scan result of one website.
I've found the problem:
for path in paths:
    work_q.put(path)  # the program finishes once it has put all the paths
If you want to help me test this program, you may need some website directory paths (save them as path.txt):
/default.asp
/index.asp
/index.htm
/index.html
/index.jsp
/index.php
/admin.asp
/admin.php
/admin.shtml
/admin.txt
/admin_admin.asp
/config.asp
/inc/
/login.asp
/login.jsp
/login.php
/login/
/phpinfo.php
/readme.txt
/robots.txt
/test.asp
/test.html
/test.txt
/test.php
/news/readme.txt
/addmember/
You need a:
while 1:
    pass
or something else that waits until your threads have completed before it exits.
What is happening is that you are starting the threads, but you are also terminating the main thread, so you never get to see the results of your threads.
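A less CPU-hungry alternative, as a sketch rather than part of the original code: have each worker call task_done() on the queue, so that the existing work_q.join() in main() blocks until every path has been processed. Here self.check is a hypothetical helper wrapping the urlopen/logging logic above:

def run(self):
    alldomains = self.getdomains()
    while True:
        path = self.work_q.get()
        for aim in alldomains:
            self.check(aim + path)  # hypothetical helper doing the urlopen/logging shown above
        self.work_q.task_done()     # lets work_q.join() in main() return once the queue is drained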
What is the standard practice in Python when I have a command-line application taking one argument which is either
a URL to a web page
or
a path to an HTML file somewhere on disk
(only one of the two)?
Is the following code sufficient?
if "http://" in sys.argv[1]:
print "URL"
else:
print "path to file"
import urlparse

def is_url(url):
    return urlparse.urlparse(url).scheme != ""

is_url(sys.argv[1])
Depends on what the program must do. If it just prints whether it got a URL, sys.argv[1].startswith('http://') might do. If you must actually use the URL for something useful, do
from urllib2 import urlopen
try:
    f = urlopen(sys.argv[1])
except ValueError:  # invalid URL
    f = open(sys.argv[1])
Larsmans' answer might work, but it doesn't check whether the user actually specified an argument.
import urllib
import sys

try:
    arg = sys.argv[1]
except IndexError:
    print "Usage: " + sys.argv[0] + " file/URL"
    sys.exit(1)

try:
    site = urllib.urlopen(arg)
except ValueError:
    file = open(arg)