How to download a file via BeautifulSoup on an aspx page? - python

I want to download the PDF files on a website using BeautifulSoup and Selenium.
I've written the code up to this point, but it's incomplete, because I can't find the link to download the PDF files.
#!/usr/bin/python
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

try:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome("/usr/bin/chromedriver", options=options)
except WebDriverException as e:
    print(e)
else:
    def not_relative_uri(href):
        return re.compile('^https://').search(href) is not None

    driver.get("https://xxxxxx")
    # print(driver.page_source.encode('utf-8'))
    my_folder = "/home/python/"
    soup_res = BeautifulSoup(driver.page_source, 'html.parser')
    tr = (soup_res.find("div", {"id": "pageWrapper"})
          .find("div", {"class": "EGZDefault-List"})
          .find("div", {"class": "EGZDefault-List-Info-List"})
          .find("table", {"class": "gridview"})
          .find("tbody")
          .find_all('tr')[1:21])
I hope someone can help me.
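For reference, if the grid rows did expose direct PDF hrefs, the download could be finished with requests. This is only a hedged sketch built on the tr, not_relative_uri, and my_folder names from the snippet above; it assumes each row contains an absolute anchor whose href ends in .pdf, which ASPX gridviews often do not provide (they frequently render postback buttons instead, as the answer below works around):
import requests
for row in tr:
    a = row.find("a", href=True)  # only anchors that actually carry an href
    if a and not_relative_uri(a["href"]) and a["href"].lower().endswith(".pdf"):
        pdf = requests.get(a["href"])
        with open(my_folder + a["href"].split("/")[-1], "wb") as f:
            f.write(pdf.content)  # save the PDF into my_folder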

With Selenium you can do it as follows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

options = webdriver.ChromeOptions()
driver = webdriver.Chrome("/usr/bin/chromedriver", options=options)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
driver.get("https://xxxxxx")
# Wait until the image buttons inside the grid are rendered
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table.gridview input[type='image']")))
time.sleep(2)
images = driver.find_elements_by_css_selector("table.gridview input[type='image']")
for image in images:
    # Hover over each button, then click it to trigger the download
    actions.move_to_element(image).perform()
    time.sleep(0.5)
    image.click()
    time.sleep(5)
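Note that clicking only starts the download into Chrome's default directory, and headless Chrome may silently refuse downloads. A hedged sketch of pointing downloads at a known folder before creating the driver (the folder path is a placeholder):
download_dir = "/home/python/"  # placeholder path
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {"download.default_directory": download_dir})
driver = webdriver.Chrome("/usr/bin/chromedriver", options=options)
# Older headless Chrome builds also need downloads enabled via CDP:
driver.execute_cdp_cmd("Page.setDownloadBehavior", {"behavior": "allow", "downloadPath": download_dir})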

Related

Selenium can't get a page with a punycode URL

Here's how I'm getting the page content:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager

user_agent = "Mozilla/5.0"  # placeholder; set your real user agent string here

options = Options()
options.add_argument('--user-agent="' + user_agent + '"')
options.add_argument("--start-maximized")
options.add_argument("--headless")
driver = webdriver.Firefox(
    executable_path=GeckoDriverManager().install(),
    options=options,
)
driver.set_page_load_timeout(30)
try:
    driver.get('https://xn--e1aicoccdeejjbbl0l.xn--p1ai/uslugi/stroitelstvo/price/')
    WebDriverWait(driver, 40).until(ec.presence_of_element_located((By.TAG_NAME, "html")))
except Exception as e:
    error = True
    print(e)
This is the output I have:
Message: Reached error page: about:neterror?e=dnsNotFound&u=https%3A//xn--e1aicoccdeejjbbl0l.xn--p1ai/uslugi/stroitelstvo/price/&c=UTF-8&d=We%20can%E2%80%99t%20connect%20to%20the%20server%20at%20xn--e1aicoccdeejjbbl0l.xn--p1ai.
When I try to get content from a regular Latin-script URL, everything works fine. The problem occurs when I use Cyrillic or punycode URLs.
What can I do about it?
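One possibility, offered as a hedged sketch rather than a confirmed fix: dnsNotFound means name resolution failed, so it is worth confirming that the punycode host resolves at all from the same machine, and normalizing any Cyrillic URL to its punycode form before handing it to Selenium. Python's built-in idna codec can do the conversion (the Cyrillic domain below is a stand-in example, not the asker's real one):
import socket
from urllib.parse import urlsplit, urlunsplit

def to_punycode(url):
    # IDNA-encode only the hostname; scheme, path and query stay as-is
    parts = urlsplit(url)
    host = parts.hostname.encode('idna').decode('ascii')
    netloc = host if parts.port is None else "%s:%d" % (host, parts.port)
    return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))

print(to_punycode('https://пример.рф/uslugi/'))  # https://xn--e1afmkfd.xn--p1ai/uslugi/
socket.gethostbyname('xn--e1aicoccdeejjbbl0l.xn--p1ai')  # raises if DNS really cannot resolve it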

Clear CAPTCHA Success From HTML

I'm trying to scrape some site data and have manually cleared the CAPTCHA I'm triggering; however, I keep getting the CAPTCHA success page after I close and reopen my session:
Code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("user-data-dir=C:\\environments\\selenium")
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://ca.indeed.com/")
search_company = driver.find_element(By.XPATH, "//*[@id='text-input-what']")
search_company.send_keys(Keys.CONTROL + "a")
search_company.send_keys(Keys.DELETE)
search_company.send_keys("Sales")
search_loc = driver.find_element(By.XPATH, "//*[@id='text-input-where']")
search_loc.send_keys(Keys.CONTROL + "a")
search_loc.send_keys(Keys.DELETE)
search_loc.send_keys("Quebec")
click_search = driver.find_element(By.XPATH, "//*[@id='jobsearch']/button")
click_search.click()
After running this block, I run:
page = driver.current_url
html = requests.get(page, verify=False)  # note: a fresh requests session, without the browser's cookies
soup = BeautifulSoup(html.content, 'html.parser', from_encoding='utf-8')
soup
And I can't get past this HTML, so I have nothing to scrape:
hCaptcha solve page
How do I stop getting the CAPTCHA success page back and return to the page I'm trying to scrape? I've added my user-data directory to try to retain the cookies, but I'm at a loss on how to proceed.
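One likely contributor, offered as a hedged sketch rather than a confirmed fix: requests.get() above opens a brand-new HTTP session that carries none of the browser's cookies, so the site sees an unverified client. Either parse driver.page_source directly, or copy Selenium's cookies into a requests.Session (this assumes the driver from the block above is still open):
import requests
from bs4 import BeautifulSoup

session = requests.Session()
for cookie in driver.get_cookies():
    # Reuse the cookies that already passed the CAPTCHA in the browser
    session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))

html = session.get(driver.current_url)
soup = BeautifulSoup(html.content, 'html.parser')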

Getting the element in the page source but unable to locate it using XPath

I am trying to find the element using XPath, but Selenium is unable to locate it, even though the element is present when I get the page source through Selenium. I have also checked that the element is not inside an iframe.
Here is my code:
from requests_html import HTMLSession
from fake_useragent import UserAgent
from shutil import which
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

ua = UserAgent()
s = HTMLSession()
url = 'https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com/search/results?query=124405'
try:
    user_agent = str(ua.chrome)
except Exception:
    user_agent = 'Mozilla/5.0'  # fallback if fake_useragent fails
headers = {'User-Agent': user_agent}
response = s.get(url, headers=headers)
print(response)

link = response.html.xpath('//a[@class="image logClick containerFix"]/@href')
if link:
    p_url = "https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com" + link[0]
    chrome_path = which('chromedriver')
    driver = webdriver.Chrome(executable_path=chrome_path)
    driver.maximize_window()
    driver.get(p_url)
    time.sleep(20)
    with open('data.html', 'w') as file:
        file.write(str(driver.page_source))
    print(driver.page_source)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located((By.XPATH, '(//h3[@class="description"])[2]')))
    na = driver.find_element_by_xpath('(//h3[@class="description"])[2]')
    print(na.text)
Hoping to get the solution. Thanks
If there are multiple matching nodes, Selenium will always fetch the first one when we use find_element rather than find_elements; the same applies to WebDriverWait.
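For illustration, a minimal sketch of that difference (assuming driver is already on the product page and the By import listed further down):
first = driver.find_element(By.XPATH, "//h3[@class='description']")    # first matching node only
all_h3 = driver.find_elements(By.XPATH, "//h3[@class='description']")  # every matching node, as a list
second = all_h3[1]  # same node as the XPath (//h3[@class='description'])[2]
Here is the full working flow: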
driver = webdriver.Chrome(driver_path)  # driver_path: path to your chromedriver binary
driver.maximize_window()
driver.implicitly_wait(30)
wait = WebDriverWait(driver, 30)
driver.get("https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com/search/results?query=124405")
product = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='results']/descendant::a")))
product.click()
heading = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//h3[@class='description']")))
print(heading.text)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
14K Yellow 9x7 mm Oval Engagement Ring Mounting
If you want the XPath of the link to the ring, here it is:
link = response.html.xpath('//*[@id="results"]//a[1]')

Value of CSS Property Selenium returning None for all images

I'm trying to scrape all of the images on this site. However, when I run my script and try to get the CSS property 'background-image' to extract the URL from each web element, the result prints out "None". I have no idea why it returns None, since when I print out the web element the attribute does exist. Any help would be greatly appreciated!
import re
import time
from selenium import webdriver

def parse_style_attribute(style_string):
    # Pull the URL out of an inline style such as: background-image: url("...");
    if 'background-image' in style_string:
        style_string = style_string.split(' url("')[1].replace('");', '')
        return style_string
    return None

# Set up the browser window for the site to be scraped
options = webdriver.ChromeOptions()
options.headless = False
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3", options=options)
driver.maximize_window()
driver.get("https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
time.sleep(1)
driver.implicitly_wait(12)
time.sleep(1)

# List all of the image tiles on the page
images = driver.find_elements_by_xpath('//*[@class="media-viewer-tile-gallery-v2-TileGallery__entryInner--JaADY "]')
image_url = []
for i in range(len(images)):
    image_url.append(images[i].value_of_css_property("background-image"))

print("Total Number of images: ", len(images))
firstimage = images[0].get_attribute("innerHTML")
print(firstimage)
for i in range(len(image_url)):
    print(image_url[i])
Try this. It works for me.
# attach your browser-option setup code from above here
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver.get(
    "https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
images = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
image_url = []
for index, image in enumerate(images):
    image_url.append(image.value_of_css_property("background-image"))
print(image_url)
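value_of_css_property("background-image") comes back as a CSS value such as url("https://..."), not a bare URL. A small hedged sketch of stripping it down to the URL itself (the asker's parse_style_attribute helper above does much the same for inline style strings):
import re

def css_background_url(css_value):
    # Extract https://... from a CSS value like: url("https://example.com/img.jpg")
    match = re.search(r'url\("?([^")]+)"?\)', css_value)
    return match.group(1) if match else None

urls = [css_background_url(v) for v in image_url if v and v != 'none']
print(urls)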

Not able to click the radio button using selenium webdriver in python

I'm having a problem clicking the radio button for Registered Projects on this site. It is not getting clicked by my Selenium WebDriver code.
from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'lxml')
link = driver.find_element_by_link_text("Search Project Details")
link.click()
driver.find_element_by_id("Promoter").click()
Use WebDriverWait and the JavaScript executor to click on the Registered Project radio button.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
WebDriverWait: an explicit wait is code you define to wait for a certain condition to occur before proceeding further in the code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

links = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "search-pro-details")))
# Click on the Search Project Details link
links.find_element_by_link_text("Search Project Details").click()
promoter_radio_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
# Select the radio button by sending SPACE to it
promoter_radio_button.send_keys(Keys.SPACE)
Try using this:
import time

driver.get('https://maharerait.mahaonline.gov.in')
link = driver.find_element_by_link_text("Search Project Details")
link.click()
time.sleep(2)
radio_btn = driver.find_element_by_id("Promoter")
radio_btn.click()
time.sleep(5)
driver.close()
