Downloading images with BeautifulSoup - Python

I am using BeautifulSoup to extract pictures, which works well for normal pages.
Now I want to extract the picture of the Chromebook from a page like this:
https://twitter.com/banprada/statuses/829102430017187841
The page apparently contains a link to another page with the image. Here is my code for downloading an image from the mentioned link, but I am only getting the image of the person who posted the link.
import urllib.request
import os
from bs4 import BeautifulSoup

URL = "http://twitter.com/banprada/statuses/829102430017187841"
list_dir = "D:\\"
default_dir = os.path.join(list_dir, "Pictures_neu")

opener = urllib.request.build_opener()
urllib.request.install_opener(opener)

soup = BeautifulSoup(urllib.request.urlopen(URL).read(), "html.parser")
imgs = soup.findAll("img", {"alt": True, "src": True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename, "wb")
    f.write(img_data.read())
    f.close()
Is there a way to download the image somehow?
Many thanks and regards,
Andi

This is how you can get only the mentioned image using Selenium + requests:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.PhantomJS()
driver.get(link)
wait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[starts-with(@id, 'xdm_default')]")))
image_src = driver.find_element_by_tag_name('img').get_attribute('src')
response = requests.get(image_src).content
with open('C:\\Users\\You\\Desktop\\Image.jpeg', 'wb') as f:
    f.write(response)
If you want to get all the images from all iframes on the page (excluding images in the initial page source, which you can get with your own code):
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests
import time
link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.Chrome()
driver.get(link)
time.sleep(5) # To wait until all iframes completely rendered. Might be increased
iframe_counter = 0
while True:
    try:
        driver.switch_to_frame(iframe_counter)
        pictures = driver.find_elements_by_xpath('//img[@src and @alt]')
        if len(pictures) > 0:
            for pic in pictures:
                response = requests.get(pic.get_attribute('src')).content
                with open('C:\\Users\\You\\Desktop\\Images\\%s.jpeg' % (str(iframe_counter) + str(pictures.index(pic))), 'wb') as f:
                    f.write(response)
        driver.switch_to_default_content()
        iframe_counter += 1
    except WebDriverException:
        break
Note that you can use any webdriver.
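PhantomJS has since been deprecated, so as a hedged alternative, here is roughly the same single-image flow sketched with headless Chrome and the current Selenium 4 locator API. The iframe XPath is the same assumption as above and may need adjusting if Twitter's embed markup has changed.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window
driver = webdriver.Chrome(options=options)
driver.get('https://twitter.com/banprada/statuses/829102430017187841')

# Same iframe assumption as the PhantomJS version above
WebDriverWait(driver, 10).until(
    EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[starts-with(@id, 'xdm_default')]")))
image_src = driver.find_element(By.TAG_NAME, 'img').get_attribute('src')

with open('Image.jpeg', 'wb') as f:
    f.write(requests.get(image_src).content)
driver.quit()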

Related

Python Selenium: problem with downloading images from a site

I have a question. I'm building a portfolio app (scraping + Python) and I can't figure out two things.
First: how do I download the photo thumbnails from this page?
https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944
I had code that downloaded them via this XPath:
//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[{counter}]/a/div/div/div[1]/div[1]/div
It was working fine, but for the last two days I have been getting
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element...
and I'm pulling my hair out. I'm just learning scraping, so maybe my code isn't the best solution (maybe JS changes something dynamically?). How do I get these thumbnails?
Second question: which DIV should the scraper click to enter a specific car listing? I'm lost here too, because you can click both the price and the photo and either one enters the listing (example code: driver.find_element(By.CLASS_NAME, 'css-8wsg1m').click()).
Thank you very much for your help.
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # Cookies
img = driver.find_element(By.CLASS_NAME, 'css-gl6djm')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)

name = r'/home/.../Pulpit/GitHub_Public/Selenium_Porsche/work_dir/Image.jpg'
url = r'https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944'
downloadImage(url, name)
The easiest way to achieve this is to create a downloadImage() function which takes the image URL as input and downloads the image to your machine. You can easily get the image URL with the help of Selenium.
import requests

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

img = driver.find_element(By.CLASS_NAME, 'CLASS_NAME_HERE')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
Your XPath is wrong. Here's the final code:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()  # Cookies
img = driver.find_element(
    By.XPATH, '//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[2]/a/div/div/div[1]/div[1]/div/img')
imgURL = img.get_attribute('src')
print(imgURL)
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)

Get image thumbnail URLs from a webpage using Python Selenium

I have written this simple script to retrieve the thumbnail URLs after performing a YouTube search for "programming".
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

url = "https://www.youtube.com/results?search_query=programming"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get(url)
time.sleep(10)

image_thumbnails = set()
thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope ytd-thumbnail no-transition")
while len(image_thumbnails) < 10:
    thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope yt-img-shadow")
    for img in thumbnails:
        image_thumbnails.add(img.get_attribute('src'))
        print(img)
    time.sleep(10)
browser.close()
However, the output I get is the WebElement objects themselves, not the URLs:
[<selenium.webdriver.remote.webelement.WebElement (session="b9a60c0fe036ab4d592094d611ed7da0", element="cfdf2ad0-41b5-47a6-af7d-8bb00a80175f")>, ...]
You are printing the element used in the for loop. To solve this, just print the value you are adding, e.g.:
print(img.get_attribute('src'))
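Putting it together, a minimal sketch of the corrected loop. Note that By.CLASS_NAME accepts only a single class name, so a CSS selector built from the same classes is assumed here instead:

thumbnails = browser.find_elements(By.CSS_SELECTOR, "img.style-scope.yt-img-shadow")
for img in thumbnails:
    src = img.get_attribute('src')
    if src:  # some thumbnails lazy-load and have no src yet
        image_thumbnails.add(src)
        print(src)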

How to wait for a page to fully load using requests_html

While accessing this link https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2 with requests_html, I need to wait some time before the page actually loads. Is that possible with requests_html?
My code:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree

s = HTMLSession()
response = s.get('https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
response.html.render()
soup = BeautifulSoup(response.content, "html.parser")
dom = etree.HTML(str(soup))
item = dom.xpath('//a[@class="rs_product_description d-block"]/text()')[0]
print(item)
It looks like the data you are looking for can be fetched using HTTP GET to
https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A2%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D
The call returns JSON, and you can use that directly with zero scraping code.
Copy/paste the URL into the browser to see the data.
You can specify the page number in the URL; decoded, the searchVO parameter looks like this:
searchVO={"selectedCategory":"12301_1809051","selectedStore":"0","selectedSort":1,"selectedFilters":{},"storeId":15108,"pageNumber":2,"pageSize":48,"totalCount":112,"searchTypes":["PINNING"],"isFamilyPage":true,"appliedSeoFilters":false,"snbAudience":"","zipcode":""}
Working code below:
import requests
import pprint

page_num = 2
url = f'https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A{page_num}%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D'

r = requests.get(url)
if r.status_code == 200:
    pprint.pprint(r.json())
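Hand-editing the percent-encoded string is error-prone; here is a sketch of the same request built from the decoded searchVO payload shown above, letting requests do the encoding:

import json
import pprint
import requests

page_num = 2
search_vo = {
    "selectedCategory": "12301_1809051",
    "selectedStore": "0",
    "selectedSort": 1,
    "selectedFilters": {},
    "storeId": 15108,
    "pageNumber": page_num,
    "pageSize": 48,
    "totalCount": 112,
    "searchTypes": ["PINNING"],
    "isFamilyPage": True,
    "appliedSeoFilters": False,
    "snbAudience": "",
    "zipcode": "",
}
# Compact separators reproduce the original encoded payload exactly
r = requests.get(
    'https://prod-catalog-product-api.dickssportinggoods.com/v2/search',
    params={'searchVO': json.dumps(search_vo, separators=(',', ':'))},
)
if r.status_code == 200:
    pprint.pprint(r.json())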
You can use Selenium as well, in headless mode.
Selenium can wait until elements are found, using explicit waits.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=driver_path, options=options)  # driver_path: path to your chromedriver
driver.get("URL here")
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@class='rs_product_description d-block']")))
PS: You'd have to download chromedriver from here
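Once the wait succeeds, the product descriptions can be read from the same locator; a short continuation sketch:

# Read the waited-for product links (same XPath as the wait above)
products = driver.find_elements(By.XPATH, "//a[@class='rs_product_description d-block']")
for product in products:
    print(product.text)
driver.quit()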

Google Maps: some XPath selectors return data, some do not (Selenium, Python)

I was trying to scrape Google Maps. The selectors for name, address, title, and website return data fine, but phone and hours return nothing, even though their XPath looks correct to me. I am not sure what the issue is.
Here is the LINK
Hoping for some answers.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import csv
from tqdm import tqdm
import time

driver = webdriver.Firefox()
linksFile = open("links.txt", 'r')
allLinks = linksFile.readlines()
for link in tqdm(allLinks):
    try:
        driver.get(link)
    except Exception:
        print('Something went wrong with the URL: ')
    # time.sleep(15)
    while True:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]'))
        )
        results = driver.find_elements_by_xpath('//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]')
        for result in results:
            # writing to the CSV file
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            business = driver.find_element_by_xpath('//div[@role="heading"]/div')
            business.click()
            # waiting for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="immersive-container"]'))
            )
            # parsing the response with the Scrapy selector
            response = Selector(text=driver.page_source)
            name = response.xpath('//h2[@data-attrid="title"]/span/text()').get()
            title = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/parent::div/parent::div/following-sibling::div/div/span/span/text()').get()
            address = response.xpath('//a[contains(text(), "Address")]/parent::span/following-sibling::span/text()').get()
            website = response.xpath('(//a[contains(text(), "Website")])/@href').get()
            phone = response.xpath('//a[contains(text(), "Phone")]/parent::span/following-sibling::span/a/span/text()').get()
            hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//btext()').get()
            total_reviews = response.xpath('(//span[contains(text(), "Google reviews")])[1]/text()').get()
            total_rating = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/span/text()').get()
            input('Check: ')
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            vals = [name, title, address, website, phone, hours, total_reviews, total_rating]
            writer.writerow(vals)
            outFile.close()
Can you use the JavaScript outerHTML instead of page_source?
response = Selector(text=driver.execute_script("return document.documentElement.outerHTML"))
Also, there is an issue in the XPath for hours:
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()
Try the Google Maps link and not Google search: https://www.google.com/maps/place/Leduc+Plumbing+and+Heating/@53.274672,-113.5486679,17z/data=!3m1!4b1!4m5!3m4!1s0x539ff9a5d31a87c9:0xf494d91aafd55e55!8m2!3d53.2746688!4d-113.5464739
It should be more stable.
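For completeness, a small sketch combining the two suggestions above: parse an outerHTML snapshot with the Scrapy Selector and apply the corrected hours XPath:

# Snapshot the live DOM and parse it with Scrapy's Selector
html = driver.execute_script("return document.documentElement.outerHTML")
response = Selector(text=html)
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()
print(hours)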

bs4: extract from website and save to text file

I am now learning how to extract data from websites and have managed to get a lot of information. However, on my next website I am failing for some unknown reason: nothing is saved to the text file, nor do I get any output from print. Here is my code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests

url = 'https://www.jaffari.org/'
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()
soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.encode('utf-8').strip()
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)
print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, therefore you need a browser tool like Selenium to load the page, and then BeautifulSoup to get the data.
You need to download a compatible ChromeDriver from this link and pass the chromedriver path as I have done below.
Code to fetch the names and prayer times and save them to a text file:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")
# Path of the chrome driver
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")
# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])
print('Done')
The same code, updated to save each prayer time to a separate file named after the prayer:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")
# Path of the chrome driver
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")
# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall(r'(\d{1,2}:?\d{1,2}\W[AP]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    # One file per prayer, named after it
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])
print('Done')
The name variable needs to be a string rather than a bytes object. Try with
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)
print(name.decode())
Hope it helps.
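Alternatively, since name is already a bytes object, you could skip JSON entirely and write it in binary mode; a minimal sketch:

# Write the raw bytes directly; no decoding or JSON needed
with open('/home/pi/test.txt', 'wb') as outfile:
    outfile.write(name)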
