How can I improve downloading speed with python urllib.request?

How can I improve downloading speed with urllib.request? I want to download images from the web, and it works, but it takes too long: about 42 seconds to execute the download_album_arts() function. What can I do about that? Can I use multiprocessing or something similar?
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib import request

URL = 'https://music.bugs.co.kr/chart/track/day/total'
PATH = os.getcwd() + '/static/images/'

# Scraping html code
def get_html(target_url):
    _html = ""
    response = requests.get(target_url)
    if response.status_code == 200:
        _html = response.text
    return _html

# parse image urls and save them in a list
def get_image_url():
    html = get_html(URL)
    soup = BeautifulSoup(html, 'html.parser')
    img_url = []
    for image in soup.select('a.thumbnail > img'):
        if image.has_attr('src'):
            img_url.append(image.get('src'))
        else:
            continue
    return img_url

# download album art into the static/images directory
def download_album_arts():
    images = get_image_url()
    for i in range(0, 100):
        url = images[i]
        file_name = PATH + str(i + 1) + '.png'
        request.urlretrieve(url, file_name)

# delete all album art
def delete_album_art():
    path = os.getcwd() + '/static/images'
    if os.path.exists(path):
        shutil.rmtree(path)
        os.mkdir(path)
    else:
        os.mkdir(path)
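Since most of the 42 seconds is spent waiting on the network rather than on the CPU, one option is to fetch the images concurrently with a thread pool instead of one at a time. A minimal sketch, reusing the get_image_url() and PATH names from the code above (the worker count of 8 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor
from urllib import request

def download_one(args):
    # each worker downloads a single image; the downloads overlap
    # because the threads spend most of their time blocked on network I/O
    index, url = args
    request.urlretrieve(url, PATH + str(index + 1) + '.png')

def download_album_arts_concurrently():
    images = get_image_url()
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(download_one, enumerate(images[:100]))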

Related

Download all files with extension from a page

I am trying to download all netcdf (.nc) files here:
https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/
import urllib3
from bs4 import BeautifulSoup
site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('.nc')
However, list_urls is empty after running this code. How can I fix it?
Here is what I did: soup.find_all(text=lambda t: ".nc" in t). It works fine, with a progress bar as well :)
import sys
import requests
import urllib3
import humanize
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
link_urls = soup.find_all(text=lambda t: ".nc" in t)

for link in link_urls:
    download_link = "{}2000/{}".format(base_url, link)
    r = requests.get(download_link, stream=True)
    total_length = r.headers.get('content-length')
    print("\nDownloading: {}\nTotalSize: {}".format(download_link, humanize.naturalsize(total_length)))
    with open(link, "wb") as f:
        print("Downloading %s" % link)
        if total_length is None:  # no content length header
            f.write(r.content)
        else:
            dl = 0
            total_length = int(total_length)
            for data in r.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.flush()

Python WebScraper - object has no attribute 'urlretrieve'

I am trying to create a Python web scraper that downloads a certain number of images from a url to my current directory. However, for the following code:
urllib.request.urlretrieve(each, filename)
When running the program, it says: AttributeError: 'function' object has no attribute 'urlretrieve'
Here is the full code:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

url = 'https://unsplash.com/s/photos/download'

def download_imgs(url, amountOfImgs):
    html = urlopen(url).read()
    # parsing the html from the url
    page_soup = soup(html, "html.parser")
    images = [img for img in page_soup.findAll('img')]
    counter = 0
    # compiling the unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        if(counter <= amountOfImgs):
            filename = each.split('/')[-1]
            urllib.request.urlretrieve(each, filename)
            counter += 1
        else:
            return image_links

print(download_imgs(url, 5))
It looks like when you imported just urlopen, you missed everything else. You could fix that with
from urllib.request import urlopen, urlretrieve
but I did it a bit differently: I got the html using the requests.get method, which removes the need for urlopen. If you want to use mine, I know it worked:
import urllib.request
from bs4 import BeautifulSoup as soup
import requests

url = 'https://unsplash.com/s/photos/download'

def download_imgs(url, amountOfImgs):
    req = requests.get(url)
    html = req.text
    # parsing the html from the url
    page_soup = soup(html, "html.parser")
    images = [img for img in page_soup.findAll('img')]
    counter = 0
    # compiling the unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        if(counter <= amountOfImgs):
            filename = each.split('/')[-1]
            urllib.request.urlretrieve(each, filename)
            counter += 1
        else:
            return image_links

print(download_imgs(url, 5))

Unknown URL Type: Image Scraping

My error:
File "C:/Users/hp dv4/PycharmProjects/project/imagescrap.py", line
22, in
imagefile.write(urllib.request.urlopen(img_src).read())
ValueError: unknown url type: '/img/logo_with_text.png'
I am getting this error while crawling the specified website, whereas the same code works fine with some other websites.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

i = 1
soup = make_soup("http://ioe.edu.np/")
unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for img_src in unique_srcs:
    filename = str(i)
    i = i + 1
    imagefile = open(filename + '.png', 'wb')
    imagefile.write(urllib.request.urlopen(img_src).read())
    imagefile.close()
The above code will encounter one more error: you are trying to save every file with a .png extension, which may make the files unreadable.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

base_url = "http://ioe.edu.np/"
soup = make_soup(base_url)
unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for i, img_src in enumerate(unique_srcs):
    print(img_src)
    filename = str(i)
    extension = img_src.split('.')[-1]
    with open(filename + '.' + extension, 'wb') as f:
        f.write(urllib.request.urlopen(base_url + img_src).read())
A few idiomatic Python suggestions:
use enumerate instead of trying to manage a counter.
use the with-open construct, which takes care of closing your file.
One other thing you could do to further improve: use a set instead of a list, so that you don't download the same file twice (see the sketch below).
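For example, the deduplication step could be written with a set (a minimal sketch reusing the soup and unique_srcs names from the code above):

unique_srcs = set()
for img in soup.findAll('img'):
    src = img.get('src')
    if src:
        unique_srcs.add(src)  # a set silently ignores duplicate src values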
As the error message says:
unknown url type: '/img/logo_with_text.png'
Add http://ioe.edu.np/ in front of img_src and it should work.
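A slightly more robust way to build the full URL is urllib.parse.urljoin, which resolves relative paths against the base and leaves absolute URLs untouched. A small self-contained sketch, with the src value taken from the error message for illustration:

from urllib.parse import urljoin

base_url = "http://ioe.edu.np/"
img_src = "/img/logo_with_text.png"  # example relative src from the error message

# urljoin resolves the relative path against the base url,
# avoiding the edge cases of plain string concatenation
full_url = urljoin(base_url, img_src)
print(full_url)  # http://ioe.edu.np/img/logo_with_text.png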

When trying to extract meta data out of images on webpages, keeps returning {}, why?

I've looked at the exifread documentation and it says the metadata is returned as a dictionary, but the problem is that it returns nothing except {}. I don't know if that means there is no metadata in the image or I made a newbie mistake. Anyway, I've spent a good chunk of time looking at my code and the documentation but still can't find the solution; any help would be appreciated :)
Code:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys

def get_images(target):
    colorama.init()
    print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
    req = urllib2.Request(target)
    resp = urllib2.urlopen(req)
    page = resp.read()
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if "www" in src or "http" in src or "https" in src:
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(src, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
        else:
            s = target + src
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(s, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
    return

def main():
    target = raw_input("Enter the target: ")
    print ("\n")
    get_images(target)
    time.sleep(5)
    sys.exit()

if __name__ == "__main__":
    main()
The problem is you were not passing a base url; you need to pass the host and then join it to the src, unless you get an absolute url from the src attribute.
The following code demonstrates a working example, I used requests in place of urllib but the logic is the same:
import bs4
import sys
import os
import requests
import exifread
from urlparse import urljoin

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            # process_file takes the already-open binary file object
            tags = exifread.process_file(f)
            print (tags)

def main():
    target = "http://www.exiv2.org/sample.html"
    # need base to join to relative src
    base = "http://www.exiv2.org/"
    get_images(target, base)

if __name__ == "__main__":
    main()
You will get the exif data for the one image on the page that has some.
A PIL example:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            try:
                img = PIL.Image.open(f)
                exif_data = img._getexif()
                print(exif_data)
            except AttributeError as e:
                print("No exif data for {}".format(name))
                os.remove(name)
os.remove(name) will remove files that have no exif data; if you don't want that to happen, remove that line.

Save image from url to special folder

I want to save images from a url to a special folder, for example 'my_images', not to the default location (where my *.py file is). Is it possible to do that? Currently my code saves all images to the folder with the *.py file.
Here is my code:
import urllib.request
from bs4 import BeautifulSoup
import re
import os

BASE_URL = 'https://fachowiec.com/sklep/pl/products/index?Products_page=1&pageSize=15'

def get_domain(url):
    domain = re.findall(r'https:\W\W\w+\.\w+', url)
    return domain[0]

def get_html(url):
    request = urllib.request.urlopen(url)
    return request.read()

def get_img(html):
    soup = BeautifulSoup(html)
    img_box = []
    imgs = soup.find_all('div', class_='pthumb')
    for img in imgs:
        img_box.append(get_domain(BASE_URL) + img.img['src'])
    for img in img_box:
        urllib.request.urlretrieve(img, os.path.basename(img))

def main():
    get_img(get_html('https://fachowiec.com/sklep/pl/products/index?Products_page=1&pageSize=15'))

if __name__ == '__main__':
    main()
def get_img(html):
    soup = BeautifulSoup(html)
    img_box = []
    imgs = soup.find_all('div', class_='pthumb')
    for img in imgs:
        img_box.append(get_domain(BASE_URL) + img.img['src'])
    my_path = '/home/<username>/Desktop'  # use whatever path you like
    for img in img_box:
        urllib.request.urlretrieve(img, os.path.join(my_path, os.path.basename(img)))
You should add the pathname as the second parameter of urllib.request.urlretrieve. Something like below:
urllib.request.urlretrieve(img, "PATH"+os.path.basename(img))
The second argument, if present, specifies the file location to copy to (if absent, the location will be a tempfile with a generated name).
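For example, to save everything into a 'my_images' folder next to the script, one option is to create the directory first and join it into the target path (a minimal sketch; the folder name and sample URL are placeholders):

import os
import urllib.request

my_path = 'my_images'
os.makedirs(my_path, exist_ok=True)  # create the target folder if it doesn't exist yet

img = 'https://fachowiec.com/path/to/image.jpg'  # hypothetical image URL for illustration
urllib.request.urlretrieve(img, os.path.join(my_path, os.path.basename(img)))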
