This is similar to "Try to scrape image from image url (using python urllib) but get html instead", but the solution given there does not work for me.
from BeautifulSoup import BeautifulSoup
import urllib2
import requests
img_url='http://7-themes.com/data_images/out/79/7041933-beautiful-backgrounds-wallpaper.jpg'
# First request: redirects are not followed so the Location header of the
# response can be read. Indexing r.headers['location'] raises KeyError when
# the response carries no Location header.
r = requests.get(img_url, allow_redirects=False)
headers = {}
headers['Referer'] = r.headers['location']
# Second request: same URL, now sent with the Referer taken from that header.
r = requests.get(img_url, headers=headers)
# Whatever body came back is written out verbatim as the .jpg file.
with open('7041933-beautiful-backgrounds-wallpaper.jpg', 'wb') as fh:
    fh.write(r.content)
The downloaded file is still an HTML page, not an image.
Your referrer was not being set correctly. I have hard coded the referrer and it works fine
from BeautifulSoup import BeautifulSoup
import urllib2
import requests
img_url = 'http://7-themes.com/data_images/out/79/7041933-beautiful-backgrounds-wallpaper.jpg'
# The Referer is hard-coded to the HTML page that embeds this wallpaper.
headers = {'Referer': 'http://7-themes.com/7041933-beautiful-backgrounds-wallpaper.html'}
# A single request is enough: the earlier header-less GET was overwritten
# without its result ever being used, so it has been dropped.
r = requests.get(img_url, headers=headers, allow_redirects=False)
with open('7041933-beautiful-backgrounds-wallpaper.jpg', 'wb') as fh:
    fh.write(r.content)
I found that the root cause in my code is that the Referer field in the header still points to an HTML page, not to the image.
So I changed the Referer field to img_url, and this works.
from BeautifulSoup import BeautifulSoup
import urllib2
import urllib
import requests
img_url = 'http://7-themes.com/data_images/out/79/7041933-beautiful-backgrounds-wallpaper.jpg'
# Send the image URL itself as the Referer of the request.
headers = {'Referer': img_url}
r = requests.get(img_url, headers=headers)
# Save the response body as the image file.
with open('7041933-beautiful-backgrounds-wallpaper.jpg', 'wb') as fh:
    fh.write(r.content)
Related
Why does the result output the last url?
Is something wrong with my code?
import requests as uReq
from bs4 import BeautifulSoup as soup
import numpy as np
#can i use while loop instead for?
for page in np.arange(1,15):
    url = uReq.get('https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9'.format(page)).text
#have used for loop,but result is the last url
# NOTE(review): indentation reconstructed - only the request above is taken to
# be inside the loop, so `url` is overwritten on every pass and the parsing
# below only ever sees the HTML of the last page.
page_soup = soup(url,"html.parser")
# NOTE(review): {"class: ","row detail_row"} is a set literal, not a dict.
info = page_soup.findAll("div",{"class: ","row detail_row"})
#Do all the url return output in one file?
filename = "wheel.csv"
# NOTE(review): the file is opened in "w" mode and never closed in this snippet.
file = open(filename,"w",encoding="utf-8")
You should check the indentation of the code that follows the for loop: if it is not inside the loop body, the variable url is replaced on every iteration and only the last page is retained.
import requests as uReq
from bs4 import BeautifulSoup as soup
import numpy as np
filename = "wheel.csv"
# Open the CSV once in append mode; the with-block closes (and so saves) the
# file even if one of the requests below fails.
with open(filename, "a", encoding="utf-8") as file:
    for page in np.arange(1, 15):
        url = uReq.get('https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9'.format(page)).text
        # Everything below runs once per page because it is inside the loop.
        page_soup = soup(url, "html.parser")
        # attrs must be a dict mapping attribute name -> value; the previous
        # {"class: ","row detail_row"} was a set literal.
        info = page_soup.findAll("div", {"class": "row detail_row"})
        # append the results to the csv file
        ...  # code for writing in the csv file
Then, you will find everything in your file. Be aware that you should also close the file for saving it.
Try this!!
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests
# The page number placeholder must actually be filled in; pages 1-14 match the
# np.arange(1, 15) range used in the question.
url_template = 'https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9'
urls = [url_template.format(page) for page in range(1, 15)]
# Compiled once, as a raw string: hrefs made only of letters, digits and hyphens.
href_pattern = re.compile(r"^([a-zA-Z0-9\-])+$")
links = []
for url in urls:
    # One fetch per page, sent with a browser-like User-Agent.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as page:
        html_page = page.read()
    soup = BeautifulSoup(html_page, features="html.parser")
    product_list = soup.select_one('ol.list_products')
    if product_list is None:
        # select_one returns None when the page has no such list; skip it
        # instead of failing with AttributeError on .findAll.
        continue
    for link in product_list.findAll('a', attrs={'href': href_pattern}):
        links.append(link.get('href'))
filename = 'output.csv'
# Write one collected href per line.
with open(filename, mode="w") as outfile:
    for s in links:
        outfile.write("%s\n" % s)
Hello community, I have a problem and I don't know how to solve it. I wrote a script to crawl web pages for images with BeautifulSoup4, but I get the error (AttributeError: 'NoneType' object has no attribute 'group').
import re
import requests
from bs4 import BeautifulSoup
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
# Every <img> that has a src attribute.
img_tags = soup.find_all('img', {"src": True})
urls = [img["src"] for img in img_tags]
for url in urls:
    # re.search returns None when the URL does not end in a .jpg/.png file
    # name; calling .group(1) on that None is the reported AttributeError.
    filename = re.search(r'([\w_-]+[.](jpg|png))$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Your regex is wrong. Use Python's internal urllib to do the heavyweight lifting instead of writing regexes if you're not familiar with them.
Use something like this (untested):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit  # URL handling instead of a regex
from os.path import basename  # last path component, used as the file name

site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
# Focus on the element whose id matches "fcx-gallery-...".
images_div = soup.find(id=re.compile(r"fcx-gallery-\w+"))
# find() returns None when nothing matches. The guard must test images_div:
# img_tags is only assigned inside the branch.
if images_div is not None:
    img_tags = images_div.find_all('img', {"data-src": True})  # images in that element
    urls = [img["data-src"] for img in img_tags]  # sources come from data-src
    for url in urls:
        filename = basename(urlsplit(url).path)  # file name from the URL path
        if not filename:
            # The URL path ends in "/", so there is no name to save under.
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL (plain concatenation would append to "?sort=new").
        response = requests.get(urljoin(site, url))
        # Open the file only after the download, so a failed request does not
        # leave an empty file behind.
        with open(filename, 'wb') as f:
            f.write(response.content)
I'm unable to save/download the images at the location. I can't figure out the problem although the code seems right.
I'm using requests library for scraping the images.
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
# Page title, used as the name of the target folder.
title = fromstring(r.content).findtext('.//title')
#print(title)
newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title
for link in soup.find_all('img'):
    image = link.get('src')
    if 'http' in image:
        print(image)
        # Last path component of the URL is used as the file name.
        imageName = os.path.split(image)[1]
        print(imageName)
        r2 = requests.get(image)
        if not os.path.exists(newPath):
            os.makedirs(newPath)
        # NOTE(review): the file is opened under the bare imageName, so it is
        # written to the current working directory, not into newPath.
        with open(imageName, "wb") as f:
            f.write(r2.content)
Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") in a try: or while: statement to make sure that the website you are scraping is returning a 200 response, it could be that the website is timing out or not serving your request.
import os
from bs4 import BeautifulSoup
import urllib
import requests
import urlparse
from lxml.html import fromstring
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
    image = link.get('src')
    # link.get returns None for <img> tags without a src attribute; skip them
    # rather than handing None to urlparse.
    if not image:
        continue
    # Only URLs with a network location (i.e. absolute URLs) are downloaded.
    if urlparse.urlparse(image).netloc:
        print(image)
        # Everything after the last "/" is used as the local file name.
        imageName = image[image.rfind("/")+1:]
        print(imageName)
        urllib.urlretrieve(image, imageName)
I'm trying to write a Python script to download images from any website. It is working, but inconsistently. Specifically, find_all("img") is not doing so for the second url. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
def url_to_image(url, filename):
    """Download the image at *url* and save it as *filename*.

    The response body is wrapped in a BytesIO buffer, opened with
    PIL's Image.open and written out with Image.save.
    """
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    image = Image.open(BytesIO(response.content))
    image.save(filename)
# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
    # NOTE(review): indentation reconstructed - assumed to print every tag.
    print(str(img))
# Files are saved as NatGeoPix/0.jpg, NatGeoPix/1.jpg, ...
jpeg_no = 00
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
The images are rendered with JavaScript on the page that is failing.
First render the page with dryscrape
(If you don't want to use dryscrape see Web-scraping JavaScript page with Python )
e.g.
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
def url_to_image(url, filename):
    """Download the image at *url* and save it as *filename*.

    The response body is wrapped in a BytesIO buffer, opened with
    PIL's Image.open and written out with Image.save.
    """
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    image = Image.open(BytesIO(response.content))
    image.save(filename)
# open page in a dryscrape session and parse the resulting body with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    # .get returns None for an <img> without src, where img["src"] would
    # raise KeyError.
    src = img.get("src")
    if src and src.endswith("jpg"):
        print("endswith jpg")
        urls.append(str(src))
    print(str(img))
# Files are saved as NatGeoPix/0.jpg, NatGeoPix/1.jpg, ...
for jpeg_no, url in enumerate(urls):
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
But I would also check that you have an absolute URL not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin
def url_to_image(url, filename):
    """Download the image at *url* and save it as *filename*.

    The response body is wrapped in a BytesIO buffer, opened with
    PIL's Image.open and written out with Image.save.
    """
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    image = Image.open(BytesIO(response.content))
    image.save(filename)
# open page in a dryscrape session and parse the resulting body with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    # .get returns None for an <img> without src, where img["src"] would
    # raise KeyError.
    src = img.get("src")
    if src and src.endswith("jpg"):
        print("endswith jpg")
        urls.append(str(src))
    print(str(img))
# Files are saved as NatGeoPix/0.jpg, NatGeoPix/1.jpg, ...
for jpeg_no, url in enumerate(urls):
    # urljoin returns an already-absolute URL unchanged and resolves a
    # relative one against base, so no startswith('http') branch is needed.
    absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
The following works when I paste it on the browser:
http://www.somesite.com/details.pl?urn=2344
But when I try reading the URL with Python nothing happens:
link = 'http://www.somesite.com/details.pl?urn=2344'
# Open the URL and print only the first line of the response.
response = urllib.urlopen(link)
first_line = response.readline()
print(first_line)
Do I need to encode the URL, or is there something I'm not seeing?
To answer your question:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
# Open the URL and read the response with read() rather than readline().
response = urllib.urlopen(link)
body = response.read()
print(body)
You need to read(), not readline()
EDIT (2018-06-25): Since Python 3, the legacy urllib.urlopen() was replaced by urllib.request.urlopen() (see notes from https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen for details).
If you're using Python 3, see answers by Martin Thoma or i.n.n.m within this question:
https://stackoverflow.com/a/28040508/158111 (Python 2/3 compat)
https://stackoverflow.com/a/45886824/158111 (Python 3)
Or, just get this library here: http://docs.python-requests.org/en/latest/ and seriously use it :)
import requests
link = "http://www.somesite.com/details.pl?urn=2344"
# Fetch the page with requests and print its decoded text.
response = requests.get(link)
print(response.text)
For python3 users, to save time, use the following code,
from urllib.request import urlopen
link = "https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html"
# The with-block closes the HTTP response once the body has been read.
with urlopen(link) as f:
    myfile = f.read()
# read() returns bytes here; no decoding is applied before printing.
print(myfile)
I know there are different threads for error: Name Error: urlopen is not defined, but thought this might save time.
None of these answers are very good for Python 3 (tested on latest version at the time of this post).
This is how you do it...
# urllib.error is imported explicitly instead of relying on urllib.request
# having loaded it as a side effect.
import urllib.error
import urllib.request

try:
    # The with-block closes the response after the body is read and decoded.
    with urllib.request.urlopen('http://www.python.org/') as f:
        print(f.read().decode('utf-8'))
except urllib.error.URLError as e:
    # Report why the request failed instead of letting the traceback surface.
    print(e.reason)
The above is for contents that return 'utf-8'. Remove .decode('utf-8') if you want python to "guess the appropriate encoding."
Documentation:
https://docs.python.org/3/library/urllib.request.html#module-urllib.request
A solution that works with both Python 2.X and Python 3.X makes use of the Python 2 and 3 compatibility library six:
from six.moves.urllib.request import urlopen
link = "http://www.somesite.com/details.pl?urn=2344"
# urlopen comes from six.moves, so the same call is used on Python 2 and 3.
page = urlopen(link)
content = page.read()
print(content)
We can read website html content as below :
from urllib.request import urlopen
# The with-block closes the HTTP response once the body has been read.
with urlopen('http://google.com/') as response:
    html = response.read()
print(html)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Works on python 3 and python 2.
# when server knows where the request is coming from.
import sys
from contextlib import closing

# Pick the urlopen that exists on the running major version.
if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    from urllib import urlopen

# closing() calls close() on exit on both versions; the object returned by
# Python 2's urllib.urlopen is not itself a context manager.
with closing(urlopen('https://www.facebook.com/')) as url:
    data = url.read()

# print() with a single argument behaves the same on Python 2 and Python 3.
print(data)
# When the server does not know where the request is coming from.
# Works on python 3.
import urllib.request
user_agent = \
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

url = 'https://www.facebook.com/'
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': user_agent}

request = urllib.request.Request(url, None, headers)
# The with-block closes the response once the body has been read.
with urllib.request.urlopen(request) as response:
    data = response.read()
# print is a function on Python 3; the bare `print data` statement is a
# SyntaxError there.
print(data)
from urllib.request import urlopen
# if has Chinese, apply decode()
# The with-block closes the HTTP response; the raw bytes are decoded as UTF-8.
with urlopen("https://blog.csdn.net/qq_39591494/article/details/83934260") as response:
    html = response.read().decode('utf-8')
print(html)
from urllib.request import urlopen
from bs4 import BeautifulSoup
link = "https://www.timeshighereducation.com/hub/sinorbis"
# The soup is built from the open response inside the with-block, which
# closes the connection afterwards.
with urlopen(link) as f:
    soup = BeautifulSoup(f, 'html.parser')
# get the text content of the webpage
text = soup.get_text()
print(text)
using BeautifulSoup's HTML parser we can extract the content of the webpage.
I used the following code:
import urllib
def read_text():
    """Fetch the movie quotes text file over HTTP and print its contents.

    Uses urllib.urlopen, which exists only in Python 2's urllib module.
    """
    quotes = urllib.urlopen("https://s3.amazonaws.com/udacity-hosted-downloads/ud036/movie_quotes.txt")
    try:
        contents_file = quotes.read()
    finally:
        # Close the connection even if reading fails.
        quotes.close()
    print(contents_file)
read_text()
# retrieving data from url
# only for python 3
import urllib.request
def main():
    """Fetch http://docs.python.org and print its status code and body."""
    url = "http://docs.python.org"
    # retrieving data from URL; the with-block closes the response afterwards
    with urllib.request.urlopen(url) as webUrl:
        print("Result code: " + str(webUrl.getcode()))
        # print data from URL
        print("Returned data: -----------------")
        data = webUrl.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()
The URL should be a string:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
# The URL is passed as a string; only the first line of the response is printed.
response = urllib.urlopen(link)
first_line = response.readline()
print(first_line)