I have a question: I'm building myself a portfolio app (scraping + Python) and I can't figure out two things.
How do I download the photo thumbnails?
https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944
I had code that downloaded them using this XPath:
//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[{counter}]/a/div/div/div[1]/div[1]/div
It was working fine, but for the past two days I've been getting
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element...
and I'm pulling my hair out. As for the code, I'm just learning scraping, so maybe it's not the best solution (maybe JS changes something dynamically?). How do I get these thumbnails?
Second question: which div do I click to enter a specific car listing? I'm lost here too, because you can click both the price and the photo, and either one opens the listing (example code: driver.find_element(By.CLASS_NAME, 'css-8wsg1m').click()).
Thank you very much for your help.
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)
option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click() # Cookies
img=driver.find_element(By.CLASS_NAME, 'css-gl6djm')
imgURL=img.get_attribute('src')
savedImageName='Image.jpg'
downloadImage(imgURL,savedImageName)
name = r'/home/.../Pulpit/GitHub_Public/Selenium_Porsche/work_dir/Image.jpg'
url = r'https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944'
downloadImage(url, name)
The easiest way to achieve this is by creating a downloadImage() function which takes the image URL as input and downloads the image to your machine. You can easily get the image URL with the help of Selenium.
import requests

def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)

img = driver.find_element(By.CLASS_NAME, 'CLASS_NAME_HERE')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
Your XPath is wrong. Here's the final code:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)
option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click() # Cookies
img = driver.find_element(
    By.XPATH, '//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[2]/a/div/div/div[1]/div[1]/div/img')
imgURL = img.get_attribute('src')
print(imgURL)
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
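Since the OLX results grid is rendered dynamically by JavaScript, it is also safer to wait for the element explicitly instead of locating it right after the page loads. A minimal sketch; the loose XPath here is only illustrative, not the exact path:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for a listing thumbnail to be present before reading its src
img = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]//img'))
)
imgURL = img.get_attribute('src')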
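As for the second question: rather than clicking a generated class name like css-8wsg1m (these change whenever the site redeploys), it is usually more stable to collect the href of each result card's link and navigate to it directly. A minimal sketch, where the data-cy selector is an assumption about the current OLX markup and may have changed:

# 'div[data-cy="l-card"] a' is an assumed selector for the result cards;
# collecting hrefs avoids deciding whether to click the photo or the price
cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-cy="l-card"] a')
links = [card.get_attribute('href') for card in cards]
driver.get(links[0])  # open the first listing directly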
I am currently working on a scraper for aniworld.to.
My goal is to enter an anime name and get all of the episodes downloaded.
I have everything working except one thing...
The website has a Watch button. That button redirects you to https://aniworld.to/redirect/SOMETHING, and that site has a captcha, which means the link is not in the HTML...
Is there a way to bypass this or get the link in Python? Or a way to display the captcha so I can solve it?
The captcha only appears once in a blue moon anyway.
The only thing I need from that page is the redirect link. It looks like this:
https://vidoza.net/embed-something.html
My very, very WIP code is here if it helps: https://github.com/wolfswolke/aniworld_scraper
Mitchdu showed me how to do it.
If anyone else needs help here is my code: https://github.com/wolfswolke/aniworld_scraper/blob/main/src/logic/captcha.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from threading import Thread
import os

def open_captcha_window(full_url):
    working_dir = os.getcwd()
    path_to_ublock = r'{}\extensions\ublock'.format(working_dir)
    options = webdriver.ChromeOptions()
    options.add_argument("app=" + full_url)
    options.add_argument("window-size=423,705")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    if os.path.exists(path_to_ublock):
        options.add_argument('load-extension=' + path_to_ublock)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(full_url)
    # block until the captcha is solved and the page redirects away from full_url
    wait = WebDriverWait(driver, 100, 0.3)
    wait.until(lambda redirect: redirect.current_url != full_url)
    new_page = driver.current_url
    Thread(target=threaded_driver_close, args=(driver,)).start()
    return new_page

def threaded_driver_close(driver):
    driver.close()
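Usage is then a single call; the redirect URL below is just the placeholder pattern from the question:

# opens a small Chrome app window, lets you solve the captcha manually,
# and returns the URL the page redirected to (e.g. https://vidoza.net/embed-something.html)
target = open_captcha_window("https://aniworld.to/redirect/SOMETHING")
print(target)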
I have written this simple script to retrieve the thumbnail URLs after performing a YouTube search for "programming".
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
url = "https://www.youtube.com/results?search_query=programming"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get(url)
time.sleep(10)
image_thumbnails = set()
thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope ytd-thumbnail no-transition")
while len(image_thumbnails) < 10:
    thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope yt-img-shadow")
    for img in thumbnails:
        image_thumbnails.add(img.get_attribute('src'))
        print(img)
    time.sleep(10)

browser.close()
However, the output I get is WebElement objects and not the URLs:
[<selenium.webdriver.remote.webelement.WebElement (session="b9a60c0fe036ab4d592094d611ed7da0", element="cfdf2ad0-41b5-47a6-af7d-8bb00a80175f")>, ...]
You are printing the element used in the for loop. To solve this, just print the value you are adding, e.g.:
print(img.get_attribute('src'))
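A minimal corrected version of the whole loop. The compound class string is also swapped for a CSS selector, since By.CLASS_NAME is meant for a single class name; the img.style-scope.yt-img-shadow selector is taken from the classes in the question and may have changed:

while len(image_thumbnails) < 10:
    # a CSS selector expresses multiple classes explicitly
    thumbnails = browser.find_elements(By.CSS_SELECTOR, "img.style-scope.yt-img-shadow")
    for img in thumbnails:
        src = img.get_attribute('src')
        if src:  # lazily loaded thumbnails may not have a src yet
            image_thumbnails.add(src)
            print(src)  # print the URL, not the WebElement
    time.sleep(10)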
I was trying to scrape Google Maps. The phone and hours variables are not returning any data; the other variables work fine and return data. The XPath is correct, so I am not sure what the issue is.
Here is the LINK
The other selectors, like name, address, title, and website, return the data fine, but phone and hours return nothing.
Hoping for some answers.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import csv
from tqdm import tqdm
import time

driver = webdriver.Firefox()
linksFile = open("links.txt", 'r')
allLinks = linksFile.readlines()

for link in tqdm(allLinks):
    try:
        driver.get(link)
    except Exception:
        print('Something went wrong with the URL: ')
    # time.sleep(15)
    while True:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]'))
        )
        results = driver.find_elements_by_xpath('//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]')
        for result in results:
            # writing to the CSV file
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            business = driver.find_element_by_xpath('//div[@role="heading"]/div')
            business.click()
            # waiting for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="immersive-container"]'))
            )
            # parsing the response with the Scrapy selector
            response = Selector(text=driver.page_source)
            name = response.xpath('//h2[@data-attrid="title"]/span/text()').get()
            title = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/parent::div/parent::div/following-sibling::div/div/span/span/text()').get()
            address = response.xpath('//a[contains(text(), "Address")]/parent::span/following-sibling::span/text()').get()
            website = response.xpath('(//a[contains(text(), "Website")])/@href').get()
            phone = response.xpath('//a[contains(text(), "Phone")]/parent::span/following-sibling::span/a/span/text()').get()
            hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//btext()').get()
            total_reviews = response.xpath('(//span[contains(text(), "Google reviews")])[1]/text()').get()
            total_rating = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/span/text()').get()
            input('Check: ')
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            vals = [name, title, address, website, phone, hours, total_reviews, total_rating]
            writer.writerow(vals)
            outFile.close()
Can you use the JavaScript outerHTML instead of page_source?
response = Selector(text=driver.execute_script("return document.documentElement.outerHTML"))
There is also an issue in the XPath for hours (//btext() should be //b/text()):
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()
Try the Google Maps link and not Google search: https://www.google.com/maps/place/Leduc+Plumbing+and+Heating/@53.274672,-113.5486679,17z/data=!3m1!4b1!4m5!3m4!1s0x539ff9a5d31a87c9:0xf494d91aafd55e55!8m2!3d53.2746688!4d-113.5464739
It should be more stable.
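A minimal sketch combining both fixes, assuming the rest of the scraping loop from the question stays the same:

# build the Scrapy selector from the live DOM instead of driver.page_source
html = driver.execute_script("return document.documentElement.outerHTML")
response = Selector(text=html)

# corrected hours XPath: //b/text() instead of the broken //btext()
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()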
I am learning how to extract data from websites and have managed to get a lot of information. However, on my next website I am failing for some unknown reason: nothing is saved to the text file, nor do I get any output from print. Here is my piece of code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests
url = 'https://www.jaffari.org/'
request = urllib.request.Request(url,headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()
soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class":"textwidget"})
name = table.text.encode('utf-8').strip()
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)

print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, therefore you need to use a browser tool like Selenium to load the page and then use Beautiful Soup to get the data.
You need to download a compatible ChromeDriver from this link and pass the chromedriver path as I have done below.
Code here to fetch the names and prayer times and save them in a text file.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")
# path of the chrome driver
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")
# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall(r'(\d{1,2}:\d{2}\s[AP]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])

print('Done')
Updated code that saves each prayer time to a separate file named after the prayer.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")
# path of the chrome driver
driver = webdriver.Chrome(executable_path=r"D:\Software\chromedriver.exe", chrome_options=options)
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")
# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall(r'(\d{1,2}:\d{2}\s[AP]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    # one file per prayer name
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])

print('Done')
The name variable needs to be a string rather than a bytes object. Try with:
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)

print(name.decode())
Hope it helps.
I am using BeautifulSoup for extracting pictures, which works well for normal pages.
Now I want to extract the picture of the Chromebook from a page like this:
https://twitter.com/banprada/statuses/829102430017187841
The page apparently contains a link to another page with the image. Here is my code for downloading an image from the mentioned link, but I am only getting the image of the person who posted it.
import urllib.request
import os
from bs4 import BeautifulSoup

URL = "http://twitter.com/banprada/statuses/829102430017187841"
list_dir = "D:\\"
default_dir = os.path.join(list_dir, "Pictures_neu")
opener = urllib.request.build_opener()
urllib.request.install_opener(opener)
soup = BeautifulSoup(urllib.request.urlopen(URL).read())
imgs = soup.findAll("img", {"alt": True, "src": True})

for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename, "wb")
    f.write(img_data.read())
    f.close()
Is there a way to download that image somehow?
Many thanks and regards,
Andi
This is how you can get only the mentioned image using Selenium + requests:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.PhantomJS()
driver.get(link)
wait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[starts-with(@id, 'xdm_default')]")))
image_src = driver.find_element_by_tag_name('img').get_attribute('src')
response = requests.get(image_src).content
with open('C:\\Users\\You\\Desktop\\Image.jpeg', 'wb') as f:
    f.write(response)
If you want to get all the images from all iframes on the page (excluding images in the initial page source, which you can get with your code):
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests
import time

link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.Chrome()
driver.get(link)
time.sleep(5)  # To wait until all iframes are completely rendered. Might need to be increased

iframe_counter = 0
while True:
    try:
        driver.switch_to.frame(iframe_counter)
        pictures = driver.find_elements_by_xpath('//img[@src and @alt]')
        if len(pictures) > 0:
            for pic in pictures:
                response = requests.get(pic.get_attribute('src')).content
                with open('C:\\Users\\You\\Desktop\\Images\\%s.jpeg' % (str(iframe_counter) + str(pictures.index(pic))), 'wb') as f:
                    f.write(response)
        driver.switch_to.default_content()
        iframe_counter += 1
    except WebDriverException:
        break
Note that you can use any webdriver.
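For example, swapping in Firefox only changes the construction line; everything else stays the same:

from selenium import webdriver

driver = webdriver.Firefox()  # geckodriver must be on PATH; any other WebDriver works the same way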