How to upload files to Google Drive without using APIs, with Selenium in headless mode? - Python

I currently have this code
from undetected_chromedriver import Chrome
from undetected_chromedriver import ChromeOptions
import os
from time import sleep
from selenium.webdriver import ActionChains
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
opts = ChromeOptions()
opts.add_argument(f'--user-data-dir={os.getcwd()}/driver/profile')
opts.add_argument(f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
driver = Chrome(executable_path=f'{os.getcwd()}/driver/chromedriver.exe', options=opts)
sleep(2)
driver.get('https://drive.google.com/drive/my-drive')
sleep(5)
actionChains = ActionChains(driver)
sleep(2)
actionChains.click(driver.find_element(By.XPATH, '//*[@guidedhelpid="new_menu_button"]')).perform()
sleep(1)
WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, '//*[@data-tooltip="File upload"]')))
actionChains.click(driver.find_element(By.XPATH, '//*[@data-tooltip="File upload"]')).perform()
driver.find_element(By.XPATH, '//input[@type="file"]').send_keys(f'{os.getcwd()}/downloads/english.srt')
If I run it with a visible window, it works and uploads the file without a problem. The problem is headless mode: the "File upload" element is not detected. I have also tried a drag-and-drop approach without any luck, and after several days of trying I am about to throw in the towel.
The drag-and-drop attempt:
JS_DROP_FILES = "var k=arguments,d=k[0],g=k[1],c=k[2],m=d.ownerDocument||document;for(var e=0;;){var f=d.getBoundingClientRect(),b=f.left+(g||(f.width/2)),a=f.top+(c||(f.height/2)),h=m.elementFromPoint(b,a);if(h&&d.contains(h)){break}if(++e>1){var j=new Error('Element not interactable');j.code=15;throw j}d.scrollIntoView({behavior:'instant',block:'center',inline:'center'})}var l=m.createElement('INPUT');l.setAttribute('type','file');l.setAttribute('multiple','');l.setAttribute('style','position:fixed;z-index:2147483647;left:0;top:0;');l.onchange=function(q){l.parentElement.removeChild(l);q.stopPropagation();var r={constructor:DataTransfer,effectAllowed:'all',dropEffect:'none',types:['Files'],files:l.files,setData:function u(){},getData:function o(){},clearData:function s(){},setDragImage:function i(){}};if(window.DataTransferItemList){r.items=Object.setPrototypeOf(Array.prototype.map.call(l.files,function(x){return{constructor:DataTransferItem,kind:'file',type:x.type,getAsFile:function v(){return x},getAsString:function y(A){var z=new FileReader();z.onload=function(B){A(B.target.result)};z.readAsText(x)},webkitGetAsEntry:function w(){return{constructor:FileSystemFileEntry,name:x.name,fullPath:'/'+x.name,isFile:true,isDirectory:false,file:function z(A){A(x)}}}}}),{constructor:DataTransferItemList,add:function t(){},clear:function p(){},remove:function n(){}})}['dragenter','dragover','drop'].forEach(function(v){var w=m.createEvent('DragEvent');w.initMouseEvent(v,true,true,m.defaultView,0,0,0,b,a,false,false,false,false,0,null);Object.setPrototypeOf(w,null);w.dataTransfer=r;Object.setPrototypeOf(w,DragEvent.prototype);h.dispatchEvent(w)})};m.documentElement.appendChild(l);l.getBoundingClientRect();return l"
def drop_files(element, files, offsetX=0, offsetY=0):
    driver = element.parent
    isLocal = not driver._is_remote or '127.0.0.1' in driver.command_executor._url
    paths = []
    # ensure files are present, and upload to the remote server if the session is remote
    for file in (files if isinstance(files, list) else [files]):
        if not os.path.isfile(file):
            raise FileNotFoundError(file)
        paths.append(file if isLocal else element._upload(file))
    value = '\n'.join(paths)
    elm_input = driver.execute_script(JS_DROP_FILES, element, offsetX, offsetY)
    elm_input._execute('sendKeysToElement', {'value': [value], 'text': value})

WebElement.drop_files = drop_files
dropzone = driver.find_element(By.XPATH, '//c-wiz[@data-region-root]')
dropzone.drop_files(f'{os.getcwd()}\\downloads\\english.srt')
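For the headless case itself, here is a minimal sketch of a variant worth trying: run the newer headless mode with an explicit window size, wait for element presence instead of visibility, and click through JavaScript so the visibility check is skipped. The XPaths are the ones from the question and may not match Drive's current markup, so treat this as an untested assumption rather than a confirmed fix.
from undetected_chromedriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

opts = ChromeOptions()
opts.add_argument(f'--user-data-dir={os.getcwd()}/driver/profile')
opts.add_argument('--headless=new')           # newer headless mode renders closer to a real window
opts.add_argument('--window-size=1920,1080')  # without a size, many elements never become "visible"
driver = Chrome(executable_path=f'{os.getcwd()}/driver/chromedriver.exe', options=opts)
wait = WebDriverWait(driver, 20)
driver.get('https://drive.google.com/drive/my-drive')

# wait for presence (not visibility) and click via JavaScript
new_button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@guidedhelpid="new_menu_button"]')))
driver.execute_script('arguments[0].click();', new_button)
upload_item = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@data-tooltip="File upload"]')))
driver.execute_script('arguments[0].click();', upload_item)

# the hidden <input type="file"> accepts the path directly
file_input = wait.until(EC.presence_of_element_located((By.XPATH, '//input[@type="file"]')))
file_input.send_keys(f'{os.getcwd()}/downloads/english.srt')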

Related

Extracting a CSV download link from a webpage using Python

I want to extract the CSV download URL from this website: https://www.nseindia.com/option-chain
Code I have used so far:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
driver.get("https://www.nseindia.com/option-chain")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, "equity_underlyingVal")))
nifty = driver.find_element(By.XPATH, '//*[@id="equity_underlyingVal"]').text.replace('NIFTY ', '').replace(',', '')
time_stamp = driver.find_element(By.XPATH, '//*[@id="equity_timeStamp"]').text
I need the CSV to be loaded into a pandas DataFrame. I don't want to use Selenium, or if I do use Selenium, it needs to run headless. Let me know if anyone has a better idea for extracting the data directly into a pandas DataFrame.
You can try to extract the download link contained in that element with Selenium as follows:
link = driver.find_element(By.CSS_SELECTOR, '#downloadOCTable').get_attribute("href")
However, the download link is not actually present in the href attribute, so the best approach is to download the CSV file itself.
Interacting in headless mode can cause problems if the window-size argument is not specified. A workaround for downloading files in headless mode is to set the download path by sending a Page.setDownloadBehavior command through driver.command_executor.
Code snippet to download the CSV in headless mode:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
options = Options()
#add necessary arguments
options.add_argument("user-agent= Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")
options.add_argument("--window-size=1920,1080")
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
#set download path (set to current working directory in this example)
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow','downloadPath':os.getcwd()}}
command_result = driver.execute("send_command", params)
driver.get("https://www.nseindia.com/option-chain")
#wait for table details to appear
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="equity_optionChainTable"]')))
#find and click on download csv button
download_button = driver.find_element(By.XPATH, '//*[@id="downloadOCTable"]')
download_button.click()
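Since the end goal is a pandas DataFrame, a small follow-up sketch could poll for the downloaded file and load it. This assumes the download lands in the working directory set above and keeps a .csv extension; the glob pattern and timeout are assumptions, not something NSE guarantees.
import glob
import os
import time
import pandas as pd

# wait up to 30 seconds for a .csv file to appear in the download directory
csv_path = None
deadline = time.time() + 30
while csv_path is None and time.time() < deadline:
    candidates = glob.glob(os.path.join(os.getcwd(), "*.csv"))
    csv_path = candidates[0] if candidates else None
    time.sleep(1)

if csv_path is not None:
    df = pd.read_csv(csv_path)   # load the option-chain CSV into a DataFrame
    print(df.head())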

Can you avoid getting detected as a robot when using Selenium in Python?

I've been trying to scrape this website for 2 days now. I'm completely stuck. The problem is that it detects me as a bot.
I have a list of URLs that I need to crawl, and in the results folder every file says that "Access to this page has been denied... To continue, please prove you are not a robot", etc.
Below is my current code:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
CHROMEDRIVER_PATH = './chromedriver'
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
ua = UserAgent()
userAgent = ua.random
chrome_options.add_argument('user-agent=' + userAgent)
LOGIN_PAGE = "https://www.seekingalpha.com/login"
ACCOUNT = "Account"
PASSWORD = "Password"
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)
driver.get("https://www.seekingalpha.com/login")
time.sleep(1)
wait.until(EC.element_to_be_clickable((By.NAME, "email"))).send_keys(ACCOUNT)
wait.until(EC.element_to_be_clickable((By.ID, "signInPasswordField"))).send_keys(PASSWORD)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Sign in']"))).click()
time.sleep(1)
with open("links.txt", "r") as inArticle:
    articles = inArticle.read().splitlines()
for article in articles:
    outName = article.split("/")[-1]
    outName = outName.split("-")[0]
    driver.get(article)
    time.sleep(1)
    html_source = driver.page_source
    out_text = str(html_source).encode("utf8")
    with open("./results/"+outName, "w") as outFile:
        outFile.write(out_text)
driver.quit()
Is there a better way to do this? and is there a way to pass this bot check?
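No accepted fix is shown here, but one tweak that is often suggested, sketched below under the assumption that the block keys off navigator.webdriver, is to inject the override through Chrome DevTools before any page script runs, instead of calling execute_script after the page has already loaded. This is not guaranteed to pass the site's check.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(executable_path="./chromedriver", chrome_options=chrome_options)

# run the override before any page script executes, not after driver.get()
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)
driver.execute_cdp_cmd(
    "Network.setUserAgentOverride",
    {"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"},
)
driver.get("https://www.seekingalpha.com/login")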

How to use the Selenium performance log to get the traffic of all pages opened in new tabs?

I have looked at a similar question, but the answer does not fit my case.
I am trying to get the redirect chain from the Selenium performance log after clicking on all the ad links in an original page. I found that if a link opens in the original tab, the performance log gathers all the traffic information, from which I can extract the redirectResponse URL.
However, to make sure that the original page does not change when clicking a link, I send a Ctrl+click so that the new page opens in another tab. Now I can only get traffic information for the original page.
Is there a solution to this? Can I change some parameters of the performance log so that I can capture every page the driver opens in other tabs?
For reference, here is my testing code:
import json
import pprint
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#import action chains
from selenium.webdriver.common.action_chains import ActionChains
#import Keys
from selenium.webdriver.common.keys import Keys
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
options = webdriver.ChromeOptions()
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36')
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
    options=options,
)
script = "Object.defineProperties(navigator, {webdriver:{get:()=>undefined}})"
driver.execute_script(script)
def process_browser_logs_for_network_events(logs):
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        if (
            "Network.response" in log["method"]
            or "Network.request" in log["method"]
            or "Network.webSocket" in log["method"]
        ):
            yield log
driver.get("https://stackoverflow.com/questions/35592602/how-can-i-get-a-intermediate-url-from-a-redirect-chain-from-selenium-using-pytho")
element = driver.find_element_by_xpath("//a[@class = 'ws-nowrap s-btn s-btn__primary']")
action = ActionChains(driver)
action.move_to_element(element).key_down(Keys.CONTROL)\
    .click(element).key_up(Keys.CONTROL).perform()
#action.move_to_element(element).click(element).perform()
#print(driver.current_url)
windows = driver.window_handles
driver.switch_to.window(windows[-1])
time.sleep(5)
logs = driver.get_log("performance")
events = process_browser_logs_for_network_events(logs)
with open("log_entries1.txt", "wt") as out:
for event in events:
pprint.pprint(event, stream=out)
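For the redirectResponse extraction mentioned above, a small helper sketch, reusing the logs variable from the snippet, can pull redirect pairs out of Network.requestWillBeSent events, where the DevTools protocol attaches a redirectResponse field when a request is the follow-up to a redirect:
import json

def extract_redirect_chain(logs):
    # yield (redirected_from, redirected_to) pairs recorded in the performance log
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        if log["method"] == "Network.requestWillBeSent":
            params = log["params"]
            if "redirectResponse" in params:  # only present when this request follows a redirect
                yield params["redirectResponse"]["url"], params["request"]["url"]

for redirected_from, redirected_to in extract_redirect_chain(logs):
    print(redirected_from, "->", redirected_to)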

There is an error when registering on a site using Selenium WebDriver in Python

I am trying to register a new account on this site. However, I cannot register because of an error or a block against ChromeDriver (Selenium, Python).
I am using this code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import random
import string
from time import sleep
from selenium.webdriver.common.keys import Keys
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
options = webdriver.ChromeOptions()
options.add_argument(f'user-agent={user_agent}')
options.add_argument('disable-infobars')
options.add_argument('--profile-directory=Default')
options.add_argument("--incognito")
options.add_argument("--disable-plugins-discovery")
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors", "safebrowsing-disable-download-protection", "safebrowsing-disable-auto-update", "disable-client-side-phishing-detection"])
options.add_argument('--disable-extensions')
options.add_argument("start-maximized")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'chromedriver1.exe')
driver.get('https://www.nordstrom.com/signin?cm_sp=SI_SP_A-_-SI_SP_B-_-SI_SP_C&origin=tab&ReturnURL=https%3A%2F%2Fwww.nordstrom.com%2F')
def email(stringLength=8):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(stringLength))

email = email(6) + "@gmail.com"
sleep(5)
# email
driver.find_element_by_name("email").send_keys(email)
sleep(5)
# next
driver.find_element_by_id('account-check-next-button').send_keys(Keys.ENTER)
I think the website is blocking WebDriver. When I use Chrome on my own computer I don't encounter any problems, but with ChromeDriver this is the issue I get.
Open the form using:
driver.get('https://www.nordstrom.com/signin')
not
driver.get('https://www.nordstrom.com/signin?cm_sp=SI_SP_A-_-SI_SP_B-_-SI_SP_C&origin=tab&ReturnURL=https%3A%2F%2Fwww.nordstrom.com%2F')
Try to use an explicit wait before clicking:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
wait = WebDriverWait(driver, 30)
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, 'button[alt="next button"]')))
btn = driver.find_element_by_css_selector('button[alt="next button"]')
btn.click()
This should work; I reproduced your error. The problem is that:
You should use click() for clicking, not send_keys(Keys.ENTER). In your case you press Enter before the email is completely typed in, just before the @ symbol.
Your time.sleep(5) is not enough. Use an explicit wait, or in the worst case increase your sleep (if you don't care about speed).
Read up on how to use Selenium's waits instead of time.sleep().
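Putting the two points together, a condensed sketch of the flow, reusing the driver and email variables and the locators from above:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 30)
# wait for the email field, type the address, then wait for the button and click()
email_field = wait.until(EC.element_to_be_clickable((By.NAME, "email")))
email_field.send_keys(email)
next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[alt="next button"]')))
next_button.click()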
Update:
Clear all cookies, cache, and application data for this site and try again, manually first. It looks like the site blocks users after a number of unsuccessful attempts, even with valid emails.
UPDATE:
Unfortunately, Nordstrom is blocking automated requests...
Nordstrom tracks all customer actions, so it is unlikely this site can be used for automated testing. I would suggest trying other sites to save your time.

How to restart Selenium browser after quit?

I'm trying to quit and then restart a new browser session with Selenium when encountering a captcha, and I'm not sure yet why the code below isn't working.
It quits the existing driver, but after recursion browser.get() results in this error: ConnectionRefusedError: [Errno 61] Connection refused
Thanks in advance for any advice. I've included only the most relevant parts of the code below:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
path_to_chromedriver = '/Users/Myname/Desktop/a/chromedriver 2'
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
#options.add_argument('disable-infobars')
#options.add_argument('--disable-notifications')
options.add_argument('--disable-extensions')
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
def get_page_info(url, browser=webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)):
    browser.get(url)
    try:
        body = browser.find_element_by_tag_name('body')
        if "been denied because we believe" in body.text:
            print("going to new session...")
            browser.quit()
            human(4, 6)  # time delay
            return get_page_info(url)
    except NoSuchElementException:
        pass  # only the most relevant parts are included here
Edit: I normally wouldn't use this tactic to get around a captcha, but in my use case this makes sense.
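A minimal sketch of the restart itself, assuming the ConnectionRefusedError comes from reusing the driver that was created once in the default argument: build a fresh driver inside the function instead. The names options, path_to_chromedriver and human are the ones from the question.
def get_page_info(url, browser=None):
    # create a new driver per call instead of a default argument that is
    # evaluated once and reused even after quit()
    if browser is None:
        browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)
    browser.get(url)
    body = browser.find_element_by_tag_name('body')
    if "been denied because we believe" in body.text:
        print("going to new session...")
        browser.quit()
        human(4, 6)  # time delay helper from the question
        return get_page_info(url)  # recurse with a brand-new driver
    return browser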
Try using the driver.delete_all_cookies() method instead of closing the browser and reopening it.
Edit: maybe the site blocks your IP address. I suggest you use Tor to change the IP automatically; here is an example:
import os
import time
os.system("killall tor")
os.system("tor &")
time.sleep(5)
#init driver
fp = webdriver.FirefoxProfile()
fp.set_preference("network.proxy.type", 1)
fp.set_preference("network.proxy.socks", "127.0.0.1")
fp.set_preference("network.proxy.socks_port", int("9050"))
fp.update_preferences()
browser = webdriver.Firefox(firefox_profile=fp)
browser.get(...)
...
...
if captcha:
    os.system("killall tor")
    os.system("tor &")
    time.sleep(5)
    browser.get(...)
    # this will change your IP address
    # You can also configure tor to change the IP every 10 seconds by changing the torrc file
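Rather than killing and restarting the tor process, you can also ask the running daemon for a new circuit over its control port. A sketch using the stem library, assuming ControlPort 9051 (and password or cookie authentication) is enabled in torrc:
from stem import Signal
from stem.control import Controller
import time

def renew_tor_ip(control_port=9051, password=None):
    # request a new circuit, which usually means a new exit IP
    with Controller.from_port(port=control_port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)
    time.sleep(5)  # give tor a moment to build the new circuit

# call renew_tor_ip() when a captcha is detected, then retry browser.get(...)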
