Scraping images using requests in Python

I'm unable to save/download the images to the target location. I can't figure out the problem, although the code seems right.
I'm using the requests library to scrape the images.
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
title = fromstring(r.content).findtext('.//title')
#print(title)
newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title
for link in soup.find_all('img'):
    image = link.get('src')
    if 'http' in image:
        print(image)
        imageName = os.path.split(image)[1]
        print(imageName)
        r2 = requests.get(image)
        if not os.path.exists(newPath):
            os.makedirs(newPath)
        with open(imageName, "wb") as f:
            f.write(r2.content)

Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") in a try: block (or a retry loop) to make sure that the website you are scraping is returning a 200 response; it could be that the website is timing out or not serving your request.
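For example, a minimal sketch of that guard (the timeout value is an arbitrary choice):

import requests

# raise_for_status() turns any non-200 reply into an exception, so a
# failed or timed-out request is caught instead of silently producing
# an empty page
try:
    r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci", timeout=10)
    r.raise_for_status()
except requests.exceptions.RequestException as e:
    print("Request failed:", e)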

import os
from bs4 import BeautifulSoup
import urllib.request
import requests
from urllib.parse import urlparse
from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
    image = link.get('src')
    # only keep absolute URLs (ones that carry a network location)
    if bool(urlparse(image).netloc):
        print(image)
        imageName = image[image.rfind("/")+1:]
        print(imageName)
        urllib.request.urlretrieve(image, imageName)
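Note that both snippets save each image into the current working directory, not into newPath. If the goal is to save under newPath (the folder built from the page title in the question), one option is to join the directory and the bare file name; a sketch reusing the variables above:

import os

if not os.path.exists(newPath):
    os.makedirs(newPath)
# join the target folder and the file name so the image lands
# inside newPath instead of the current working directory
urllib.request.urlretrieve(image, os.path.join(newPath, imageName))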

Related

How to break up webpage text

I assume I have to use the <br/> tags to break up this text and extract only the fixtures, but I cannot figure it out at all!
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
import pprint
import json
from urllib.request import urlopen

# sample web page
sample_web_page = 'https://bleacherreport.com/articles/10005879-epl-schedule-2021-22-official-list-of-fixtures-for-new-premier-league-season'
# call get method to request that page
page = requests.get(sample_web_page)
# with the help of BeautifulSoup and an html parser, create soup
soup = BeautifulSoup(page.content, "html.parser")
z = soup.findAll('p', {'class':''})
print(z)
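One way to split each paragraph on its <br/> tags is BeautifulSoup's get_text() with a separator, so every fixture lands on its own line (a sketch, untested against the live page):

# get_text(separator="\n") inserts a newline wherever a <br/> sat,
# so splitting on "\n" yields one fixture per line
for p in soup.findAll('p', {'class': ''}):
    for line in p.get_text(separator="\n").split("\n"):
        line = line.strip()
        if line:
            print(line)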

I would like to scrape multiple pages, but I get the result of only the last URL. Why?

Why does the result output only the last URL?
Is something wrong with my code?
import requests as uReq
from bs4 import BeautifulSoup as soup
import numpy as np

# can I use a while loop instead of for?
for page in np.arange(1,15):
    url = uReq.get('https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9'.format(page)).text

# have used a for loop, but the result is the last url
page_soup = soup(url,"html.parser")
info = page_soup.findAll("div",{"class": "row detail_row"})

# do all the urls return output in one file?
filename = "wheel.csv"
file = open(filename,"w",encoding="utf-8")
You should check the indentation of what happens after the for loop; otherwise the variable url is replaced on every iteration of the loop, hence retaining only the last one.
import requests as uReq
from bs4 import BeautifulSoup as soup
import numpy as np

for page in np.arange(1,15):
    url = uReq.get('https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9'.format(page)).text
    # this should be done N times (where N is the range param)
    page_soup = soup(url,"html.parser")
    info = page_soup.findAll("div",{"class": "row detail_row"})
    # append the results to the csv file
    filename = "wheel.csv"
    file = open(filename,"a",encoding="utf-8")
    ...  # code for writing in the csv file
    file.close()
Then you will find everything in your file. Be aware that you should also close the file so that it is saved.
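Alternatively, a with block closes the file automatically; a sketch of the same loop, with base_url standing in for the long page-numbered URL above:

import requests as uReq
from bs4 import BeautifulSoup as soup
import numpy as np

base_url = 'https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=...'  # the full query string from above

# opening the file once, outside the loop, avoids reopening it on
# every iteration, and the with block closes it when the loop ends
with open("wheel.csv", "w", encoding="utf-8") as file:
    for page in np.arange(1, 15):
        url = uReq.get(base_url.format(page)).text
        page_soup = soup(url, "html.parser")
        info = page_soup.findAll("div", {"class": "row detail_row"})
        ...  # code for writing in the csv file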
Try this!!
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

urls = ['https://www.myanmarbusiness-directory.com/en/categories-index/car-wheels-tyres-tubes-dealers/page{}.html?city=%E1%80%99%E1%80%9B%E1%80%99%E1%80%B9%E1%80%B8%E1%80%80%E1%80%AF%E1%80%94%E1%80%B9%E1%80%B8%E1%81%BF%E1%80%99%E1%80%AD%E1%80%B3%E1%82%95%E1%80%94%E1%80%9A%E1%80%B9']
links = []
for url in urls:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html_page = urlopen(req).read()
    soup = BeautifulSoup(html_page, features="html.parser")
    for link in soup.select_one('ol.list_products').findAll('a', attrs={'href': re.compile(r"^([a-zA-Z0-9\-])+$")}):
        links.append(link.get('href'))

filename = 'output.csv'
with open(filename, mode="w") as outfile:
    for s in links:
        outfile.write("%s\n" % s)

AttributeError: 'NoneType' object has no attribute 'group' with BeautifulSoup4

Hello community, I have a problem and I don't know how to solve it. I wrote a script to crawl webpages for images with BeautifulSoup4, but I get the error AttributeError: 'NoneType' object has no attribute 'group'.
import re
import requests
from bs4 import BeautifulSoup

site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img', {"src": True})
urls = [img["src"] for img in img_tags]
for url in urls:
    filename = re.search(r'([\w_-]+[.](jpg|png))$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Your regex is wrong. Use Python's built-in urllib to do the heavy lifting instead of writing regexes if you're not familiar with them.
Use something like this (untested):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit  # import this additional library
from os.path import basename  # import this additional library

site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
images_div = soup.find(id=re.compile(r"fcx-gallery-\w+"))  # focus on the div containing the images
if images_div:  # test if the div was found
    img_tags = images_div.find_all('img', {"data-src": True})  # get all the images in that div
    urls = [img["data-src"] for img in img_tags]  # grab sources from data-src
    for url in urls:
        filename = basename(urlsplit(url).path)  # use this instead of a regex
        with open(filename, 'wb') as f:  # filename is now a string
            if 'http' not in url:
                # sometimes an image source can be relative
                # if it is provide the base url which also happens
                # to be the site variable atm.
                url = '{}{}'.format(site, url)
            response = requests.get(url)
            f.write(response.content)
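For reference, here is what the urlsplit/basename pair produces on a hypothetical image URL:

from urllib.parse import urlsplit
from os.path import basename

url = "https://example.com/images/photo-123.jpg?size=large"  # hypothetical URL
print(urlsplit(url).path)             # /images/photo-123.jpg
print(basename(urlsplit(url).path))   # photo-123.jpg

Unlike the regex, this also copes with query strings after the file name.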

BeautifulSoup find_all("img") not working for all sites

I'm trying to write a Python script to download images from any website. It works, but inconsistently. Specifically, find_all("img") is not finding the images for the second URL. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
The images on the failing page are rendered with JavaScript, so the HTML that requests fetches does not contain them.
First render the page with dryscrape
(if you don't want to use dryscrape, see Web-scraping JavaScript page with Python),
e.g.
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# render the page in a dryscrape session, then parse the body with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
But I would also check that you have an absolute URL, not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# render the page in a dryscrape session, then parse the body with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    # resolve relative image sources against the page URL
    if url.startswith('http'):
        absolute = url
    else:
        absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
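If dryscrape won't install (it is no longer maintained), the same rendering step can be done with Selenium; a sketch, assuming a chromedriver binary is available on your PATH:

from selenium import webdriver
from bs4 import BeautifulSoup

# let a real browser execute the page's JavaScript, then hand the
# rendered HTML to BeautifulSoup exactly as with dryscrape
driver = webdriver.Chrome()
driver.get("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()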

Downloaded videos with urllib and bs4 do not play

I have a folder online containing video files, and I need to download those videos to my local system. To do that I am using the bs4 and urllib modules, but the downloaded files are not playable.
Please have a look at my code.
from bs4 import BeautifulSoup
import urllib2
import random
import urllib
from urllib2 import urlopen

def download(url):
    response = urllib.urlopen("http://ssdolutions/addadsfasdfulsdfaatadfae")
    doc = response.read()
    soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        x = (link.get('href'))
        name = random.randrange(1,10)
        full_name = str(name) + ".mp4"
        urllib.urlretrieve(url, full_name)

download("http://ssdolutions/addadsfasdfulsdfaatadfae")
Pass each link URL (stored in the x variable) to the urlretrieve call:
from bs4 import BeautifulSoup
import urllib2
import random
import urllib
import os
from urllib2 import urlopen

def download(url):
    response = urllib.urlopen("http://ssdolutions/addadsfasdfulsdfaatadfae")
    doc = response.read()
    soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        url = (link.get('href'))
        # Extract filename from link URL
        filename = os.path.basename(url)
        file_data = os.path.splitext(filename)
        if len(file_data) > 1:
            file_ext = file_data[1]
            # this will allow you to download links with *.mp4 extension only
            if file_ext == ".mp4":
                urllib.urlretrieve(url, filename)

download("http://ssdolutions/addadsfasdfulsdfaatadfae")
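As a side note, video files can be large, and a streamed download avoids holding the whole file in memory. A Python 3 sketch with requests, as an alternative to urlretrieve:

import requests

def download_file(url, filename):
    # stream=True fetches the body in chunks instead of all at once,
    # so a large video never has to fit in memory
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)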
