Beautifulsoup download all .zip files from Google Patent Search - python

What I am trying to do is use BeautifulSoup to download every zip file from the Google Patent archive. Below is the code that I've written thus far, but I am having trouble getting the files to download into a directory on my desktop. Any help would be greatly appreciated.
from bs4 import BeautifulSoup
import urllib2
import re
import pandas as pd
url = 'http://www.google.com/googlebooks/uspto-patents-grants.html'
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
soup.prettify()
path = open('/Users/username/Desktop/', "wb")
for name in soup.findAll('a', href=True):
print name['href']
linkpath = name['href']
rq = urllib2.request(linkpath)
res = urllib2.urlope
The result that I am supposed to get, is that all of the zip files are supposed to download into a specific dir. Instead, I am getting the following error:
> #2015 --------------------------------------------------------------------------- AttributeError Traceback (most recent call last)
> <ipython-input-13-874f34e07473> in <module>() 17 print name['href'] 18
> linkpath = name['href'] ---> 19 rq = urllib2.request(namep) 20 res =
> urllib2.urlopen(rq) 21 path.write(res.read()) AttributeError: 'module'
> object has no attribute 'request' –

In addition to using a non-existent request entity from urllib2, you don't output to a file correctly - you can't just open the directory, you have to open each file for output separately.
Also, the 'Requests' package has a much nicer interface than urllib2. I recommend installing it.
Note, that, today anyway, the first .zip is 5.7Gb so streaming to a file is essential.
Really, you want something more like this:
from BeautifulSoup import BeautifulSoup
import requests
# point to output directory (it must already exist; keep the trailing slash
# because file names are appended to it directly)
outpath = 'D:/patent_zips/'
url = 'http://www.google.com/googlebooks/uspto-patents-grants.html'
mbyte = 1024 * 1024

# Single-argument print(...) with %-formatting runs on Python 2 and Python 3.
print('Reading: %s' % url)
html = requests.get(url).text
soup = BeautifulSoup(html)

print('Processing: %s' % url)
for name in soup.findAll('a', href=True):
    zipurl = name['href']
    if not zipurl.endswith('.zip'):
        continue
    outfname = outpath + zipurl.split('/')[-1]
    # stream=True defers the body download, so a multi-gigabyte archive is
    # never held in memory all at once.
    r = requests.get(zipurl, stream=True)
    if r.status_code != requests.codes.ok:
        r.close()
        continue
    # Content-Length is optional (e.g. chunked responses); fall back to 0
    # instead of raising KeyError.
    fsize = int(r.headers.get('content-length', 0))
    print('Downloading %s (%sMb)' % (outfname, fsize // mbyte))
    with open(outfname, 'wb') as fd:
        # 1 MB chunks keep the number of write calls reasonable for huge files
        for chunk in r.iter_content(chunk_size=mbyte):
            if chunk:  # ignore keep-alive chunks
                fd.write(chunk)
    # no fd.close() needed: the with-block has already closed the file

This is your problem:
rq = urllib2.request(linkpath)
urllib2 is a module and it has no request entity/attribute in it.
I see a Request class in urllib2, but I'm unsure if that's what you intended to actually use...

Related

Web scraping images are in a unsupported format

I have been trying to scrape some images using BeautifulSoup in Python and I am facing a problem: I can successfully scrape each link and store the file in the folder, but the saved images are in an unsupported format.
res = requests.get('https://books.toscrape.com/')
res.raise_for_status()
file = open('op.html', 'wb')
for i in res.iter_content(10000):
file.write(i)
os.makedirs('images', exist_ok=True)
newfile=open("op.html",'rb')
data=newfile.read()
soup=BeautifulSoup(data,'html.parser')
for link in soup.find_all('img'):
ll=link.get('src')
ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
for down in res.iter_content(1000):
ima.write(down)
It says file format not supported even though it's in a jpeg format
output image in a folder
This line for down in res.iter_content(1000): is not iterating the image from ll - it is reiterating the html result. Your OS may recognize the file from the extension (.jpeg), but this is only because of the filename - not the content (which is not JPEG, but HTML, and hence the error).
You should make another request for the image itself, so it can be fetched and stored:
from urllib.parse import urljoin

for link in soup.find_all('img'):
    ll = link.get('src')
    if not ll:
        # <img> tag without a src attribute: nothing to fetch
        continue
    # urljoin builds URLs; os.path.join is for filesystem paths and would
    # insert backslashes on Windows.
    img_rs = requests.get(urljoin('https://books.toscrape.com/', ll))  # <-- this line
    # the with-block closes (and flushes) every image file
    with open(os.path.join('images', os.path.basename(ll)), 'wb') as ima:
        for down in img_rs.iter_content(1000):  # <-- and iterate on the result
            ima.write(down)
The reason for saving the HTML is obscure. So, ignoring that part of the code in question, it comes down to this:
import requests
from os.path import join, basename
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

# One Session is shared by the page request and every image request.
with requests.Session() as session:
    r = session.get(URL)
    r.raise_for_status()
    # The page is parsed once, before `r` is reused for the image responses.
    image_tags = BS(r.text, 'lxml').find_all('img')
    for image in image_tags:
        src = image['src']
        # src may be relative, so resolve it against the page URL
        r = session.get(urljoin(URL, src), stream=True)
        r.raise_for_status()
        destination = join(TARGET_DIR, basename(src))
        with open(destination, 'wb') as t:
            for chunk in r.iter_content(chunk_size=8192):
                t.write(chunk)
In terms of performance, this can be significantly enhanced by multithreading
Your problem is that after you find the URL of the image you don't do anything with it; instead you save the whole initial response, which is just the HTML of the whole page. Try something like this instead:
import shutil
from urllib.parse import urljoin

base_url = 'https://books.toscrape.com/'
res = requests.get(base_url)
res.raise_for_status()

# Close the HTML file before reading it back; otherwise buffered bytes may
# not have reached the disk yet and the parser sees a truncated page.
with open('op.html', 'wb') as html_file:
    for i in res.iter_content(10000):
        html_file.write(i)

os.makedirs('images', exist_ok=True)
with open('op.html', 'rb') as newfile:
    data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')

for link in soup.find_all('img'):
    ll = link.get('src')
    if not ll:
        # <img> tag without a src attribute: nothing to fetch
        continue
    ima = os.path.join('images', os.path.basename(ll))
    # urljoin resolves the relative src; os.path.join is for filesystem paths
    current_img = urljoin(base_url, ll)
    img_res = requests.get(current_img, stream=True)
    # have the raw stream undo gzip/deflate so the saved bytes are the image
    img_res.raw.decode_content = True
    with open(ima, 'wb') as f:
        shutil.copyfileobj(img_res.raw, f)
    # release the streamed connection
    img_res.close()

Python - Downloading images using Wget. How to add a string to each file?

I'm using the following Python code to download images from a certain website. It's part of a code that I'm using to make a web scraper.
for url in links:
# Invoke wget download method to download specified url image.
local_image_filename = wget.download(url)
# Print out local image file name.
local_image_filename
continue
It's working well, but I want to know if it's possible to add a string as a prefix to each file...
My idea is to get the page title via XPath and add it as a prefix for each file.
I don't know where to add a string in this code. Can someone help me?
For example, I'm downloading these files:
logo.jpg, plans.jpg, circle.jpg
And I need to add a prefix, like these:
Beautiful_Plan_logo.jpg, Beautiful_Plan_plans.jpg, Beautiful_Plan_circle.jpg
Following I'll put the entire code:
import requests
import bs4 as bs
import urllib.request
import wget
##################################################
# getting url images #
##################################################
url = "https://tyreehouseplans.com/shop/house-plans/blackberry-blossom/"
opener = urllib.request.build_opener()
opener.add_headers = [{'User-Agent' : 'Mozilla'}]
urllib.request.install_opener(opener)
raw = requests.get(url).text
soup = bs.BeautifulSoup(raw, 'html.parser')
imgs = soup.find_all('img')
links = []
for img in imgs:
link = img.get('src')
links.append(link)
print(links)
################################################
# downloading images #
################################################
for url in links:
# Invoke wget download method to download specified url image.
local_image_filename = wget.download(url)
# Print out local image file name.
local_image_filename
continue
Thank you for any help!
python module wget has an option out, which determines the name of the output file. For example, the following script downloads 3 images, adding a prefix Beautiful_Plan_.
import wget
base_url = 'https://homepages.cae.wisc.edu/~ece533/images/'
image_names = ['airplane.png', 'arctichare.png', 'baboon.png']
prefix = 'Beautiful_Plan_'
for image_name in image_names:
wget.download(base_url + image_name, out = prefix + image_name)
you can use shutil for this
import shutil
prefix = "prefix_"
#your piece of code
for url in links:
# Invoke wget download method to download specified url image.
local_image_filename = wget.download(url)
# Print out local image file name.
local_image_filename
shutil.copy(local_image_filename, prefix+local_image_filename)
use os.rename as per this documentation
I wrote code for making a separate file with the extra information up front, followed by a separator.
import requests
import bs4 as bs
import urllib.request
import wget
##################################################
# getting url images #
##################################################
url = "https://tyreehouseplans.com/shop/house-plans/blackberry-blossom/"
opener = urllib.request.build_opener()
opener.add_headers = [{'User-Agent': 'Mozilla'}]
urllib.request.install_opener(opener)
raw = requests.get(url).text
soup = bs.BeautifulSoup(raw, 'html.parser')
imgs = soup.find_all('img')
links = []
for img in imgs:
link = img.get('src')
links.append(link)
# print(links)
################################################
# downloading images #
################################################
for url in links:
# Invoke wget download method to download specified url image.
try:
local_image_filename = wget.download(url)
except ValueError:
break
# Print out local image file name.
print(local_image_filename)
with open(local_image_filename, 'r') as myFile:
try:
data = myFile.read()
except UnicodeDecodeError:
data = "UNICODE DECODE ERROR"
except ValueError:
data = "VALUE ERROR"
print(data)
print(type(data))
myFile.close()
newSaveString = str(local_image_filename) + "SeperatorOfSomeKind" + str(data)
newFileName = "NEW_" + local_image_filename
with open(newFileName, 'w') as myFile:
myFile.write(newSaveString)
myFile.close()
continue

Python: downloading xml files in batch returns a damaged zip file

Drawing inspiration from this post, I am trying to download a bunch of xml files in batch from a website:
import urllib2
url='http://ratings.food.gov.uk/open-data/'
f = urllib2.urlopen(url)
data = f.read()
with open("C:\Users\MyName\Desktop\data.zip", "wb") as code:
code.write(data)
The zip file is created within seconds, but as I attempt to access it, an error window comes up:
Windows cannot open the folder.
The Compressed (zipped) Folder "C:\Users\MyName\Desktop\data.zip" is invalid.
What am I doing wrong here?
you are not opening file handles inside the zip file:
import urllib2
from bs4 import BeautifulSoup
import zipfile
url='http://ratings.food.gov.uk/open-data/'
fileurls = []
f = urllib2.urlopen(url)
mainpage = f.read()
soup = BeautifulSoup(mainpage, 'html.parser')
tablewrapper = soup.find(id='openDataStatic')
for table in tablewrapper.find_all('table'):
for link in table.find_all('a'):
fileurls.append(link['href'])
with zipfile.ZipFile("data.zip", "w") as code:
for url in fileurls:
print('Downloading: %s' % url)
f = urllib2.urlopen(url)
data = f.read()
xmlfilename = url.rsplit('/', 1)[-1]
code.writestr(xmlfilename, data)
You are doing nothing to encode this as zip file. If instead you choose to open it in a plain text editor such as notepad it should show you the raw xml.

"File Does Not Exist" when dynamically creating files for PDF download with Requests in Python 2.7

I'm trying to dynamically download pdf's from a web site. I am sure I'm listing them correctly but I am not sure I'm doing the actual file I/O correctly. I get the following error
File "download.py", line 22, in <module>
with open("'"+url+"'", "wb") as pdf:
IOError: [Errno 2] No such file or directory: "'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf'"
Here is my code:
import requests
import re
from bs4 import BeautifulSoup
origin = requests.get("http://freehaven.net/anonbib")
soup=BeautifulSoup(origin.text)
results = soup.find_all(href=re.compile("(http).*(pdf)"))
for link in results:
url = (link.get('href'))
r = requests.get(url)
with open("'"+url+"'", "wb") as pdf:
try:
pdf.write(r.content)
finally:
pdf.close
If url is set to 'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf', your code fails because it is trying to open a file with that name on your filesystem.
Instead, try something like this:
fileForUrl = '/tmp/' + url.split('/')[-1]
with open(fileForUrl, 'wb') as pdf:
# Rest of the code as before

How to save an image locally using Python whose URL address I already know?

I know the URL of an image on Internet.
e.g. http://www.digimouth.com/news/media/2011/09/google-logo.jpg, which contains the logo of Google.
Now, how can I download this image using Python without actually opening the URL in a browser and saving the file manually.
Python 2
Here is a more straightforward way if all you want to do is save it as a file:
import urllib
urllib.urlretrieve("http://www.digimouth.com/news/media/2011/09/google-logo.jpg", "local-filename.jpg")
The second argument is the local path where the file should be saved.
Python 3
As SergO suggested the code below should work with Python 3.
import urllib.request
urllib.request.urlretrieve("http://www.digimouth.com/news/media/2011/09/google-logo.jpg", "local-filename.jpg")
import urllib
resource = urllib.urlopen("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
output = open("file01.jpg","wb")
output.write(resource.read())
output.close()
file01.jpg will contain your image.
I wrote a script that does just this, and it is available on my github for your use.
I utilized BeautifulSoup to allow me to parse any website for images. If you will be doing much web scraping (or intend to use my tool) I suggest you sudo pip install BeautifulSoup. Information on BeautifulSoup is available here.
For convenience here is my code:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import urllib
# use this image scraper from the location that
#you want to save scraped images to
def make_soup(url):
html = urlopen(url).read()
return BeautifulSoup(html)
def get_images(url):
soup = make_soup(url)
#this makes a list of bs4 element tags
images = [img for img in soup.findAll('img')]
print (str(len(images)) + "images found.")
print 'Downloading images to current working directory.'
#compile our unicode list of image links
image_links = [each.get('src') for each in images]
for each in image_links:
filename=each.split('/')[-1]
urllib.urlretrieve(each, filename)
return image_links
#a standard call looks like this
#get_images('http://www.wookmark.com')
This can be done with requests. Load the page and dump the binary content to a file.
import os
import requests
url = 'https://apod.nasa.gov/apod/image/1701/potw1636aN159_HST_2048.jpg'
page = requests.get(url)
f_ext = os.path.splitext(url)[-1]
f_name = 'img{}'.format(f_ext)
with open(f_name, 'wb') as f:
f.write(page.content)
Python 3
urllib.request — Extensible library for opening URLs
from urllib.error import HTTPError
from urllib.request import urlretrieve
try:
urlretrieve(image_url, image_local_path)
except FileNotFoundError as err:
print(err) # something wrong with local path
except HTTPError as err:
print(err) # something wrong with url
I made a script expanding on Yup.'s script. I fixed some things. It will now bypass 403:Forbidden problems. It won't crash when an image fails to be retrieved. It tries to avoid corrupted previews. It gets the right absolute urls. It gives out more information. It can be run with an argument from the command line.
# getem.py
# python2 script to download all images in a given url
# use: python getem.py http://url.where.images.are
from bs4 import BeautifulSoup
import urllib2
import shutil
import requests
from urlparse import urljoin
import sys
import time
def make_soup(url):
req = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
html = urllib2.urlopen(req)
return BeautifulSoup(html, 'html.parser')
def get_images(url):
soup = make_soup(url)
images = [img for img in soup.findAll('img')]
print (str(len(images)) + " images found.")
print 'Downloading images to current working directory.'
image_links = [each.get('src') for each in images]
for each in image_links:
try:
filename = each.strip().split('/')[-1].strip()
src = urljoin(url, each)
print 'Getting: ' + filename
response = requests.get(src, stream=True)
# delay to avoid corrupted previews
time.sleep(1)
with open(filename, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
except:
print ' An error occured. Continuing.'
print 'Done.'
if __name__ == '__main__':
url = sys.argv[1]
get_images(url)
A solution which works with Python 2 and Python 3:
try:
from urllib.request import urlretrieve # Python 3
except ImportError:
from urllib import urlretrieve # Python 2
url = "http://www.digimouth.com/news/media/2011/09/google-logo.jpg"
urlretrieve(url, "local-filename.jpg")
or, if the additional requirement of requests is acceptable and if it is a http(s) URL:
def load_requests(source_url, sink_path):
    """
    Load a file from an URL (e.g. http).

    Parameters
    ----------
    source_url : str
        Where to load the file from.
    sink_path : str
        Where the loaded file is stored.

    Returns
    -------
    bool
        True when the file was written; False when the server did not
        answer with HTTP 200 (nothing is written in that case).
    """
    import requests

    # The context manager releases the streamed connection even when the
    # status is not 200 or writing fails part-way.
    with requests.get(source_url, stream=True) as r:
        if r.status_code != 200:
            return False
        with open(sink_path, 'wb') as f:
            # Iterating the response object directly yields 128-byte pieces;
            # request larger chunks explicitly.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return True
Using requests library
import requests
import shutil,os
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
currentDir = os.getcwd()
path = os.path.join(currentDir,'Images')#saving images to Images folder
def ImageDl(url):
    """Download ``url`` into the module-level ``path`` folder.

    The download is attempted up to 5 times. A non-200 answer counts as a
    failed attempt, so it can neither loop forever nor be reported as a
    success.

    Parameters
    ----------
    url : str
        Address of the image; the text after the last '/' becomes the
        local file name.

    Returns
    -------
    bool
        True once the file has been written, False after 5 failed attempts.
    """
    filename = url.split('/')[-1]
    # open() does not create missing folders, so make sure the target exists
    os.makedirs(path, exist_ok=True)
    for _ in range(5):  # retry 5 times
        try:
            r = requests.get(url, headers=headers, stream=True, timeout=5)
            # turn 4xx/5xx answers into an exception -> counted as a failure
            r.raise_for_status()
            with open(os.path.join(path, filename), 'wb') as f:
                # let the raw stream undo gzip/deflate before copying
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        except Exception as e:
            # Deliberately broad: this is the retry boundary, and reading
            # r.raw can raise low-level errors that are not requests
            # exceptions.
            print(e)
        else:
            print(filename)
            return True
    return False
ImageDl(url)
Use a simple python wget module to download the link. Usage below:
import wget
wget.download('http://www.digimouth.com/news/media/2011/09/google-logo.jpg')
This is very short answer.
import urllib
urllib.urlretrieve("http://photogallery.sandesh.com/Picture.aspx?AlubumId=422040", "Abc.jpg")
Version for Python 3
I adjusted the code of @madprops for Python 3
# getem.py
# python3 script to download all images in a given url
# use: python getem.py http://url.where.images.are
from bs4 import BeautifulSoup
import urllib.request
import shutil
import requests
from urllib.parse import urljoin
import sys
import time
def make_soup(url):
    """Fetch ``url`` and return its HTML parsed as a BeautifulSoup tree.

    A browser-like User-Agent header is sent with the request.
    """
    req = urllib.request.Request(url, headers={'User-Agent' : "Magic Browser"})
    # The with-block closes the HTTP response once the parser has consumed it.
    with urllib.request.urlopen(req) as html:
        return BeautifulSoup(html, 'html.parser')
def get_images(url):
    """Download every ``<img>`` referenced by the page at ``url``.

    Files are written to the current working directory, named after the
    last path segment of each image address. A failure on one image is
    reported and the loop moves on to the next one.
    """
    soup = make_soup(url)
    image_links = [img.get('src') for img in soup.findAll('img')]
    print(str(len(image_links)) + " images found.")
    print('Downloading images to current working directory.')
    for each in image_links:
        if not each:
            # <img> tag without a src attribute: nothing to download
            continue
        try:
            filename = each.strip().split('/')[-1].strip()
            # src may be relative; resolve it against the page address
            src = urljoin(url, each)
            print('Getting: ' + filename)
            response = requests.get(src, stream=True)
            # do not save an HTML error page under an image file name
            response.raise_for_status()
            # delay to avoid corrupted previews
            time.sleep(1)
            # let the raw stream undo gzip/deflate before copying
            response.raw.decode_content = True
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still stops
            # the script.
            print(' An error occured. Continuing.')
    print('Done.')
if __name__ == '__main__':
get_images('http://www.wookmark.com')
Late answer, but for python>=3.6 you can use dload, i.e.:
import dload
dload.save("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
if you need the image as bytes, use:
img_bytes = dload.bytes("http://www.digimouth.com/news/media/2011/09/google-logo.jpg")
install using pip3 install dload
Something fresh for Python 3 using Requests:
Comments in the code. Ready to use function.
import requests
from os import path
def get_image(image_url):
    """
    Get image based on url.

    The image is stored in the "images" folder next to this script; that
    folder must already exist.

    :param image_url: Full address of the image.
    :return: Image name if everything OK, False otherwise
    """
    image_name = path.split(image_url)[1]
    try:
        image = requests.get(image_url)
    except OSError:  # Little too wide, but work OK, no additional imports needed. Catch all conection problems
        return False
    if image.status_code != 200:  # we could have retrieved error page
        # Explicit False (previously an implicit None) to honour the
        # documented return value.
        return False
    base_dir = path.join(path.dirname(path.realpath(__file__)), "images")  # Use your own path or "" to use current working directory. Folder must exist.
    with open(path.join(base_dir, image_name), "wb") as f:
        f.write(image.content)
    return image_name
get_image("https://apod.nasddfda.gov/apod/image/2003/S106_Mishra_1947.jpg")
this is the easiest method to download images.
import requests
from slugify import slugify
img_url = 'https://apod.nasa.gov/apod/image/1701/potw1636aN159_HST_2048.jpg'
img = requests.get(img_url).content
img_file = open(slugify(img_url) + '.' + str(img_url).split('.')[-1], 'wb')
img_file.write(img)
img_file.close()
If you don't already have the url for the image, you could scrape it with gazpacho:
from gazpacho import Soup
base_url = "http://books.toscrape.com"
soup = Soup.get(base_url)
links = [img.attrs["src"] for img in soup.find("img")]
And then download the asset with urllib as mentioned:
from pathlib import Path
from urllib.request import urlretrieve as download
directory = "images"
Path(directory).mkdir(exist_ok=True)
link = links[0]
name = link.split("/")[-1]
download(f"{base_url}/{link}", f"{directory}/{name}")
# import the required libraries from Python
import pathlib
import urllib.request

# Using pathlib, specify where the image is to be saved
downloads_path = pathlib.Path.home() / "Downloads"

# Form a full image path by joining the path to the
# images' new name (pathlib does the joining, so no os import is needed)
picture_path = str(downloads_path / "new-image.png")

# "/home/User/Downloads/new-image.png"

# Using "urlretrieve()" from urllib.request save the image.
# The URL needs an explicit scheme: a scheme-less "//example.com/..." makes
# urlretrieve() raise ValueError (unknown url type).
urllib.request.urlretrieve("https://example.com/image.png", picture_path)

# urlretrieve() takes in 2 arguments
# 1. The URL of the image to be downloaded
# 2. The image new name after download. By default, the image is saved
#    inside your current working directory
Ok, so, this is my rudimentary attempt, and probably total overkill.
Update if needed, as this doesn't handle any timeouts, but, I got this working for fun.
Code listed here: https://github.com/JayRizzo/JayRizzoTools/blob/master/pyImageDownloader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# Created Syst: MAC OSX High Sierra 21.5.0 (17G65)
# Created Plat: Python 3.9.5 ('v3.9.5:0a7dcbdb13', 'May 3 2021 13:17:02')
# Created By : Jeromie Kirchoff
# Created Date: Thu Jun 15 23:31:01 2022 CDT
# Last ModDate: Thu Jun 16 01:41:01 2022 CDT
# =============================================================================
# NOTE: Doesn't work on SVG images at this time.
# I will look into this further: https://stackoverflow.com/a/6599172/1896134
# =============================================================================
import requests # to get image from the web
import shutil # to save it locally
import os # needed
from os.path import exists as filepathexist # check if file paths exist
from os.path import join # joins path for different os
from os.path import expanduser # expands current home
from pyuser_agent import UA # generates random UserAgent
class ImageDownloader(object):
    """URL ImageDownloader.

    Input : Full Image URL
    Output: Image saved to your ~/Pictures/JayRizzoDL folder.

    Constructing an instance performs the download immediately (see main()).
    """

    def __init__(self, URL: str):
        """Store the URL, prepare the target folder and run the download."""
        self.url = URL
        self.headers = {"User-Agent": UA().random}
        self.currentHome = expanduser('~')
        # The trailing "" keeps the trailing separator the attributes had.
        self.desktop = join(self.currentHome, "Desktop", "")
        self.download = join(self.currentHome, "Downloads", "")
        self.pictures = join(self.currentHome, "Pictures", "JayRizzoDL", "")
        self.outfile = ""
        self.filename = ""
        self.response = ""
        self.rawstream = ""
        self.createdfilepath = ""
        self.imgFileName = ""
        # Create the JayRizzoDL folder if needed. makedirs also creates a
        # missing ~/Pictures parent and tolerates an already existing folder.
        os.makedirs(self.pictures, exist_ok=True)
        self.main()

    def getFileNameFromURL(self, URL: str):
        """Parse the URL for the name after the last forward slash."""
        # Use the argument itself, not self.url, so the method works for any
        # URL it is handed.
        NewFileName = URL.strip().split('/')[-1].strip()
        return NewFileName

    def getResponse(self, URL: str):
        """Stream the URL and return the response.

        Raises requests.HTTPError for 4xx/5xx answers so that an error page
        is never saved as an image.
        """
        self.response = requests.get(URL, headers=self.headers, stream=True)
        self.response.raise_for_status()
        return self.response

    def gocreateFile(self, name: str, response):
        """Write the streamed body to <pictures folder>/<name>; return the path."""
        self.outfile = join(self.pictures, name)
        # let the raw stream undo gzip/deflate before copying
        response.raw.decode_content = True
        with open(self.outfile, 'wb') as outFilePath:
            shutil.copyfileobj(response.raw, outFilePath)
        return self.outfile

    def main(self):
        """Derive the file name, fetch the image and save it."""
        self.filename = self.getFileNameFromURL(self.url)
        self.rawstream = self.getResponse(self.url)
        self.createdfilepath = self.gocreateFile(self.filename, self.rawstream)
        print(f"File was created: {self.createdfilepath}")
        return
if __name__ == '__main__':
# Example when calling the file directly.
ImageDownloader("https://stackoverflow.design/assets/img/logos/so/logo-stackoverflow.png")
Download an image file while guarding against the common errors:
import requests
import validators
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
def is_downloadable(url):
    """Return True when ``url`` is well-formed and can actually be opened.

    Returns False when validation fails or when opening the URL raises
    HTTPError / URLError.
    """
    # validators.url() reports failure with a falsy result object rather
    # than the literal False, so test truthiness instead of `== False`.
    if not validators.url(url):
        return False
    try:
        # the with-block closes the probe response right away
        with urlopen(Request(url)):
            return True
    except (HTTPError, URLError):
        return False
# File_data contains the list of image addresses; each row is indexed as
# row[0] (used as a file-name prefix) and row[1] (the URL).
for row in File_data:
    url = row[1]
    try:
        if not is_downloadable(url):
            continue
        r = requests.get(url, allow_redirects=True)
        # rsplit(...)[-1] is the text after the last '/'. Unlike the
        # `url.find('/')` truth test it can never leave fname undefined or
        # carry over the name from a previous iteration.
        fname = url.rsplit('/', 1)[-1]
        fname = pth + row[0] + "$" + fname  # Destination to save image file
        # the with-block closes (and flushes) the output file
        with open(fname, 'wb') as out_file:
            out_file.write(r.content)
    except Exception as e:
        # best-effort batch: report the problem and go on with the next row
        print(e)

Categories

Resources