Extracting a CSV download link from a webpage using Python

Extracting a CSV download link from a webpage using Python - python

I want to extract the CSV download URL from website - https://www.nseindia.com/option-chain
enter image description here
Code I used till now
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
# Start Chrome using the driver binary resolved by webdriver_manager.
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
driver.get("https://www.nseindia.com/option-chain")

# Block (up to 20 s) until the underlying-value element is visible. The wait
# returns the element itself, so it does not have to be located a second time.
underlying = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.ID, "equity_underlyingVal"))
)

# Drop the "NIFTY " label and the thousands separators from the element text.
nifty = underlying.text.replace('NIFTY ', '').replace(',', '')
time_stamp = driver.find_element(By.ID, "equity_timeStamp").text
I need the CSV link so that I can load it into a pandas DataFrame. I don't want to use Selenium, or, if Selenium is required, I need it to run headless. Let me know if anyone has a better idea for extracting the data directly into a pandas DataFrame.

You can extract the download link contained in that element with Selenium as follows:
link = driver.find_element(By.CSS_SELECTOR, '#downloadOCTable').get_attribute("href")

As the download link is not present in the href attribute, the best approach is to download the csv file.
Interacting in headless mode can cause problems if the window-size argument is not specified, and a workaround to download files in headless mode is to specify the download path using the driver.command_executor method.
Code snippet to download csv in headless mode-
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
# Imported here because the snippet's import list does not include Service.
from selenium.webdriver.chrome.service import Service

options = Options()
# add necessary arguments
options.add_argument("user-agent= Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")
# An explicit window size is set because headless interaction can misbehave without one.
options.add_argument("--window-size=1920,1080")
options.add_argument("--headless")
# Pass the driver path through a Service object instead of positionally.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Register a custom "send_command" endpoint on the command executor so that a
# DevTools command can be posted to the running session.
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
# set download path (set to current working directory in this example)
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': os.getcwd()}}
command_result = driver.execute("send_command", params)

driver.get("https://www.nseindia.com/option-chain")
# wait for table details to appear
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, "equity_optionChainTable")))
# find and click on download csv button
download_button = driver.find_element(By.ID, "downloadOCTable")
download_button.click()

Related

upload files to google without using APIs with Selenium in headerless mode?

I currently have this code
from undetected_chromedriver import Chrome
from undetected_chromedriver import ChromeOptions
import os
from time import sleep
from selenium.webdriver import ActionChains
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# By is used below but is missing from the snippet's import list.
from selenium.webdriver.common.by import By

opts = ChromeOptions()
# Reuse a persistent browser profile stored under ./driver/profile.
opts.add_argument(f'--user-data-dir={os.getcwd()}/driver/profile')
opts.add_argument(f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
driver = Chrome(executable_path=f'{os.getcwd()}/driver/chromedriver.exe', options=opts)
sleep(2)
driver.get('https://drive.google.com/drive/my-drive')
sleep(5)
actionChains = ActionChains(driver)
sleep(2)
# Open the "New" menu, then wait for and click its "File upload" entry.
actionChains.click(driver.find_element(By.XPATH, '//*[@guidedhelpid="new_menu_button"]')).perform()
sleep(1)
WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, '//*[@data-tooltip="File upload"]')))
actionChains.click(driver.find_element(By.XPATH, '//*[@data-tooltip="File upload"]')).perform()
# This runs at module level, so the driver is referenced directly (there is no `self` here).
driver.find_element(By.XPATH, '//input[@type="file"]').send_keys(f'{os.getcwd()}/downloads/english.srt')
If I have the window open, it works without a problem and the file is uploaded. The problem appears when I run in headless mode: the "File upload" element is not detected. I have also tried a drag-and-drop approach without any luck. I have been trying for several days and I'm about to throw in the towel.
option with drag and drop
JS_DROP_FILES = "var k=arguments,d=k[0],g=k[1],c=k[2],m=d.ownerDocument||document;for(var e=0;;){var f=d.getBoundingClientRect(),b=f.left+(g||(f.width/2)),a=f.top+(c||(f.height/2)),h=m.elementFromPoint(b,a);if(h&&d.contains(h)){break}if(++e>1){var j=new Error('Element not interactable');j.code=15;throw j}d.scrollIntoView({behavior:'instant',block:'center',inline:'center'})}var l=m.createElement('INPUT');l.setAttribute('type','file');l.setAttribute('multiple','');l.setAttribute('style','position:fixed;z-index:2147483647;left:0;top:0;');l.onchange=function(q){l.parentElement.removeChild(l);q.stopPropagation();var r={constructor:DataTransfer,effectAllowed:'all',dropEffect:'none',types:['Files'],files:l.files,setData:function u(){},getData:function o(){},clearData:function s(){},setDragImage:function i(){}};if(window.DataTransferItemList){r.items=Object.setPrototypeOf(Array.prototype.map.call(l.files,function(x){return{constructor:DataTransferItem,kind:'file',type:x.type,getAsFile:function v(){return x},getAsString:function y(A){var z=new FileReader();z.onload=function(B){A(B.target.result)};z.readAsText(x)},webkitGetAsEntry:function w(){return{constructor:FileSystemFileEntry,name:x.name,fullPath:'/'+x.name,isFile:true,isDirectory:false,file:function z(A){A(x)}}}}}),{constructor:DataTransferItemList,add:function t(){},clear:function p(){},remove:function n(){}})}['dragenter','dragover','drop'].forEach(function(v){var w=m.createEvent('DragEvent');w.initMouseEvent(v,true,true,m.defaultView,0,0,0,b,a,false,false,false,false,0,null);Object.setPrototypeOf(w,null);w.dataTransfer=r;Object.setPrototypeOf(w,DragEvent.prototype);h.dispatchEvent(w)})};m.documentElement.appendChild(l);l.getBoundingClientRect();return l"
def drop_files(element, files, offsetX=0, offsetY=0):
    """Simulate dropping one or more local files onto ``element``.

    Runs the ``JS_DROP_FILES`` script against ``element`` to obtain a
    temporary file input, then sends the file paths to that input.

    Args:
        element: Target web element; its ``parent`` attribute is used as the driver.
        files: A single path or a list of paths to existing files.
        offsetX: Horizontal offset forwarded to the script.
        offsetY: Vertical offset forwarded to the script.

    Raises:
        FileNotFoundError: If any given path is not an existing file.
    """
    driver = element.parent
    # Treated as local when the driver is not remote or points at 127.0.0.1.
    is_local = not driver._is_remote or '127.0.0.1' in driver.command_executor._url
    paths = []
    # ensure files are present, and upload to the remote server if session is remote
    for path in (files if isinstance(files, list) else [files]):
        if not os.path.isfile(path):
            raise FileNotFoundError(path)
        paths.append(path if is_local else element._upload(path))
    # Multiple paths are handed to the file input as one newline-separated value.
    value = '\n'.join(paths)
    elm_input = driver.execute_script(JS_DROP_FILES, element, offsetX, offsetY)
    elm_input._execute('sendKeysToElement', {'value': [value], 'text': value})
# By is used below but is missing from the snippet's import list.
from selenium.webdriver.common.by import By

# Attach the helper to WebElement so it can be called as a method on any element.
WebElement.drop_files = drop_files
dropzone = driver.find_element(By.XPATH, '//c-wiz[@data-region-root]')
dropzone.drop_files(f'{os.getcwd()}\\downloads\\english.srt')

How to click a label without id with Selenium

I'm testing with this site here any https://www.nike.com.br/cosmic-unity-153-169-211-324680
A few seconds after the page loads you must select a size, and I can't get Selenium to select the size automatically. Can someone help me?
When the size selector appears (I'm in Brazil, and I select sneaker size 40), inspecting the "40" shows that it is a label, and this label has no id. The label is the following HTML snippet:
<label for="tamanho__id40">40</label>
How could I click on this label in Selenium?
I currently have this code:
import datetime
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

option = Options()
# NOTE(review): presumably content-setting value 2 blocks image loading - confirm against Chrome prefs.
prefs = {'profile.default_content_setting_values': {'images': 2}}
option.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=option)
# Navigate to url
driver.get("https://www.nike.com.br/cosmic-unity-153-169-211-324680")
What would I have to add to be able to click on this label that has no id?

1 You need to accept cookies
2 Use Selenium's explicit waits. To use them you will need to import:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
3 Use reliable locators. I propose using this XPath locator for shoe size 40: //label[@for="tamanho__id40"]
4 I added some chrome_options for dealing with this site.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
chrome_options = webdriver.ChromeOptions()
# Options intended to make the automated browser look less like automation.
chrome_options.add_argument("--disable-blink-features")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver', options=chrome_options)
driver.get("https://www.nike.com.br/cosmic-unity-153-169-211-324680")

wait = WebDriverWait(driver, 15)
# Accept the cookie banner first, then click the size label once it is clickable.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.cc-allow'))).click()
# The label has no id of its own, so it is located by its `for` attribute.
wait.until(EC.element_to_be_clickable((By.XPATH, '//label[@for="tamanho__id40"]'))).click()

There is an error in registering on the site using webdriver Selenium Python

I am trying to register a new account for this site. However, I cannot register there because of an error or block for ChromeDriver (selenium Python).
I am using this code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import random
import string
from time import sleep
from selenium.webdriver.common.keys import Keys
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
options = webdriver.ChromeOptions()
options.add_argument(f'user-agent={user_agent}')
options.add_argument('disable-infobars')
options.add_argument('--profile-directory=Default')
options.add_argument("--incognito")
options.add_argument("--disable-plugins-discovery")
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors", "safebrowsing-disable-download-protection", "safebrowsing-disable-auto-update", "disable-client-side-phishing-detection"])
options.add_argument('--disable-extensions')
options.add_argument("start-maximized")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'chromedriver1.exe')
driver.get('https://www.nordstrom.com/signin?cm_sp=SI_SP_A-_-SI_SP_B-_-SI_SP_C&origin=tab&ReturnURL=https%3A%2F%2Fwww.nordstrom.com%2F')
def email(stringLength=8):
    """Return a random string of ``stringLength`` lowercase ASCII letters.

    Args:
        stringLength: Number of characters to generate (default 8).

    Returns:
        The generated string; empty when ``stringLength`` is 0.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(stringLength))
# Build a random address. This rebinds the name `email` from the helper
# function to the generated string, so the helper cannot be called again
# after this line.
email = email(6) + "@gmail.com"
sleep(5)
# email
driver.find_element(By.NAME, "email").send_keys(email)
sleep(5)
# next
driver.find_element(By.ID, 'account-check-next-button').send_keys(Keys.ENTER)
I think the website is blocking WebDriver. When I use Chrome in my computer, I don't encounter any problems, but using ChromeDriver, this is the issue I receive.

Open form use:
driver.get('https://www.nordstrom.com/signin')
Not
driver.get('https://www.nordstrom.com/signin?cm_sp=SI_SP_A-_-SI_SP_B-_-SI_SP_C&origin=tab&ReturnURL=https%3A%2F%2Fwww.nordstrom.com%2F')
Try to use explicit wait before click:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
wait = WebDriverWait(driver, 30)
# The wait returns the element once it is clickable, so no second lookup is needed.
btn = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, 'button[alt="next button"]')))
btn.click()
It must work because I reproduced your error. The problem is that:
You should use click() for clicking, not send_keys(Keys.ENTER). In your case Enter is pressed before the email is completely typed, just before the @ symbol.
Your time.sleep(5) is not enough. Use an explicit wait. Or, in the worst case, increase your sleep (if you don't care about the speed).
Read here how to use Selenium's wait instead on time.sleep()
Update:
Clear all cookies, cache, and application data for this site and try again, manually at first. It looks like the site blocks users after some unsuccessful attempts, even with valid emails.
UPDATE:
Unfortunately, Nordstrom is blocking automated requests...
Nordstrom tracks all customer actions, so it is unlikely to be usable for testing. I would suggest trying other sites to save your time.

won't execute for loop in python selenium headless mode

Without headless mode, the code below works fine. But in headless mode, why won't the for loop execute?
Here is my code:-
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Imported here because the snippet's import list does not include these names.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--disable-notifications")
options.add_argument('headless')
# Driver path goes through Service and options through `options=`
# (the `chrome_options=` keyword is deprecated).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
url = "https://www.justdial.com/Delhi/S-K-Premium-Par-Hari-Nagar/011PXX11-XX11-131128122154-B8G6_BZDET"
driver.get(url)

try:
    pop_up = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="best_deal_detail_div"]/section/span')))
    pop_up.click()  # For disable pop-up
except TimeoutException:
    # No pop-up became clickable within 30 s; carry on without dismissing it.
    pass

# Keep clicking "Load More Reviews.." until the button stops appearing.
# Click failures are retried, but only a bounded number of times in a row,
# so a permanently failing click cannot spin this loop forever.
MAX_CLICK_FAILURES = 5
click_failures = 0
while True:
    try:
        element = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//span[text()='Load More Reviews..']")))
        element.click()
        click_failures = 0
    except TimeoutException:
        break
    except WebDriverException:
        click_failures += 1
        if click_failures >= MAX_CLICK_FAILURES:
            break

soup = BeautifulSoup(driver.page_source, 'lxml')
services = soup.find_all('span', {'class': "rName lng_commn"})
for service in services:
    print(service.text)
I want to run this code with selenium headless. Please help.

Some websites behave differently when they see your "headless" user-agent.
Try changing your user-agent to Chrome and see if it works.
options.add_argument("""user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)""")

Access denied to website using webdriver with Selenium

I have a problem even opening a website using "webdriver Chrome". Just trying to open the website ends with an "Access denied" message, and I don't know why.
Below is my code:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
class PriceCheckPhoenix:
    """Open the Phoenix Contact portal in Chrome and follow its login link.

    Instantiating the class immediately starts a browser session via
    ``create_session``.
    """

    def __init__(self):
        # Landing page from which the login link is read.
        self.url_login = "https://www.phoenixcontact.com/online/portal/pl?1dmy&urile=wcm%3apath%3a/plpl/web/home"
        self.create_session()

    def create_session(self):
        """Load the landing page, read the login link's href and navigate to it."""
        # Run browser with webdriver
        driver = webdriver.Chrome(executable_path="D:/chromedriver_v84.exe")
        driver.get(self.url_login)
        time.sleep(2)
        # Find link to sub-website with login
        link = driver.find_element(By.XPATH, '//*[@id="pxc-funcnav"]/div[3]/ul/li[1]/a').get_attribute("href")
        driver.get(link)
        # Long pause left in place from the original (keeps the script alive for 100 s).
        time.sleep(100)
Description to code:
#1 I create browser chrome session
#2 Loading first website from self.url_login
#3 Is loaded
#4 I need to find a link behind the active text on the website to log in
#5 I found it and try to open this, but the response after getting a link is:
Access Denied
You don't have permission to access
"http://www.phoenixcontact.com/online/portal/pl/pxc/offcontext/login/!ut/p/z1/tZJNa4NAEIZ_Sw45yszuuro9WkO1xqY2EqN7EbXGWPzYFDGlv74Gcio0oYTMZRgY3mcYHpAQg-yysa6yoe67rJnmRBqpu4zownzixDEYx2cWmIYTeYgrHSKQIFVRv0MieJZTZEITglFNLwTXRPaw03RGC6Qm10nOTttFN6hhD4lqVDPHY5nPcd-3JSQTy0ypQ5C4Onl5XUcmvgXCttzNWo-WCNuxLo-w6frPdjot_CfZxWsEciPhSjy7a7xN7xt_63M8kJdNmlSrPw4HaU2G9N1Qfg0Q_1Zke4JeiPHIeQH_KAshVE0a-GkQ24EPqm0F41WbLh5XWuKN3-fm78KgsmazH7dw0Ts!/dz/d5/L0lJSklKQ2dwUkEhIS9JRGpBQUF4QUFFUkNwcVlxLzRObEdRb1lwTWhUalVFZyEvWjZfR0FMNjE0ODI4RzNEQzBJMklPMlA2OTFHMDMvWjdfR0FMNjE0ODI4RzNEQzBJMklPMlA2OTFHSTcvdGFyZ2V0Vmlldy9sb2dpbg!!/" on this server.
Reference #18.d58655f.1597921471.5b29112
Does anyone know what is wrong here? :( When I load the website from the link in a normal Chrome browser, everything is fine :/
Thank you all for any help.

Please try the below code and let me know if it works for you :-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
options = Options()
# Present a regular desktop browser user agent to the site.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
action = ActionChains(driver)
driver.get("https://www.phoenixcontact.com/online/portal/pl?1dmy&urile=wcm%3apath%3a/plpl/web/home")
# Wait for the login anchor to become clickable, then click it via a mouse move.
Login_Btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@class='pxc-fn-login']/a")))
action.move_to_element(Login_Btn).click().perform()
Note - Please make the changes in your code accordingly.

A Google search brought me here. After trying several options, undetected-chromedriver with a very simple script and no extra options worked for me.
import undetected_chromedriver as uc
driver = uc.Chrome()
driver.get(<url here>)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

extracting csv download link from an webpage using python - python

You can extract the downloading link contained in that element with Selenium as following: link = driver.find_element(By.CSS_SELECTOR, '#downloadOCTable').get_attribute("href")

Related

upload files to google without using APIs with Selenium in headerless mode?

How to click a label without id with Selenium

There is an error in registering on the site using webdriver Selenium Python

won't execute for loop in python selenium headless mode

Access denied to website using webdriver with Selenium

Categories

Resources