selenium python iframe get https requests

I'm using Selenium with Python, and I want to capture all HTTPS requests made from an iframe element. Here I get the iframe element, and after I select a row from the table and press a button, an HTTP POST request starts.
Part of my code:
Define the Chrome driver:
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--auto-open-devtools-for-tabs')
chrome_options.add_argument('--log-level=2')
# let cross-origin iframes share the renderer, so their traffic shows up
chrome_options.add_argument('--disable-features=IsolateOrigins,site-per-process')
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL", 'browser': 'ALL', 'server': 'ALL'}  # chromedriver 75+
capabilities["goog:chromeOptions"] = {"w3c": False, "args": ["--headless", "--window-size=1280,800"]}  # chromedriver 75+
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True
capabilities['PageLoadStrategy'] = None
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options, desired_capabilities=capabilities)
driver.get(os.environ['URL'])
Get the iframe element and click on the table row:
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//*[@id="myiframe"]')))
row_selector = '//*[@id="root"]/div/div/div/div[2]/div[2]/div/div/div/div/table/tbody/tr[1]/th[3]'
row_selector_clickable = '//*[@id="root"]/div/div/div/div[2]/div[2]/div/div/div/div/table/tbody/tr[1]/th[3]/div/div/div[2]/div/button'
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, row_selector)))
actions = ActionChains(driver)
row_element = driver.find_element(By.XPATH, row_selector)
row_clickable = driver.find_element(By.XPATH, row_selector_clickable)
actions.move_to_element(row_element)
actions.click(row_clickable)
actions.perform()
Then here I get all the HTTP POST requests and write them to a file:
import json
import datetime

logs = driver.get_log("performance")

def process_browser_logs_for_network_events(logs):
    """
    Yield only log entries whose method starts with "Network.response",
    "Network.request", or "Network.webSocket", since we're interested
    in the network events specifically.
    """
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        if log["method"].startswith(("Network.response", "Network.request", "Network.webSocket")):
            yield log

events = process_browser_logs_for_network_events(logs)
li = []
with open(f"log_entries-{datetime.datetime.now()}.txt", "wt") as out:
    for event in events:
        print(event)
        request = event.get('params', {}).get('request', {})
        if request.get('method', '') == 'POST':
            li.append(event)
    out.write(json.dumps(li))
But the issue is that it shows me requests from the first page, I guess, even though I switch to the iframe and it selects the right elements from the iframe.
The flow is this:
I log in to the website and get redirected to the main page; then I click a button that opens a new tab, and there I have the iframe. I switch to the iframe and press a row in the table, which starts an HTTP request that takes 5-10 seconds (pending status during that time). When it succeeds, it redirects to the Gmail website and the HTTP request disappears because of the redirect, so I tried enabling "preserve log", but still nothing.
I can't share the actual HTTPS requests because they're from my job, but what I'm seeing is requests from the first page and not from the current iframe.

OK, I'll try to be as clear as I can: the Selenium setup below is Linux/Selenium/Chrome; you can adapt it to your own, just observe the imports and the code after the browser/driver is defined.
For intercepting the browser's requests I used selenium-wire: https://pypi.org/project/selenium-wire/
If you prefer, you can use native Selenium request interception.
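For reference, here's a minimal sketch of the native route using Chrome's performance log (no selenium-wire), assuming Selenium 4 and a Chromium-based driver; the example URL is just a placeholder:
import json
from selenium import webdriver

options = webdriver.ChromeOptions()
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
driver = webdriver.Chrome(options=options)
driver.get("https://example.com")  # placeholder URL

# each performance-log entry wraps a CDP event as a JSON string
for entry in driver.get_log("performance"):
    message = json.loads(entry["message"])["message"]
    if message["method"] == "Network.requestWillBeSent":
        print(message["params"]["request"]["method"], message["params"]["request"]["url"])
driver.quit()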
I looked around for an example website containing an iframe you can interact with, i.e. click a button (the OP should have done the legwork and provided such an example, but anyway).
Code:
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as t
from datetime import datetime
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://fasecolda.com/ramos/automoviles/historial-de-accidentes-de-vehiculos-asegurados/'
browser.get(url)
for x in browser.requests:
    print('URL:', x.url)
    print('ORIGIN:', x.headers['origin'])
    print('HOST:', x.headers['Host'])
    print('SEC-FETCH-DEST:', x.headers['sec-fetch-dest'])
    print('TIMESTAMP:', x.date)
    print('______________________________________')
WebDriverWait(browser, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//*[@title='Términos']")))
t.sleep(3)
print('switched to iframe')
button = WebDriverWait(browser,5).until(EC.element_to_be_clickable((By.XPATH, '//*[text()="Acepto los Términos y Condiciones"]')))
print('located the button, bringing it into view')
button.location_once_scrolled_into_view
print('now waiting 30 seconds, for clear separation of requests')
t.sleep(30)
print('printing last request again:')
print('URL:', browser.last_request.url)
print('ORIGIN:', browser.last_request.headers['origin'])
print('HOST:', browser.last_request.headers['Host'])
print('SEC-FETCH-DEST:', browser.last_request.headers['sec-fetch-dest'])
print('TIMESTAMP:', browser.last_request.date)
last_date = browser.last_request.date
print('clicked the button at', datetime.now())
button.click()
print('waiting another 30 seconds')
t.sleep(30)
print('and now printing the requests again (the ones after interacting with iframe)')
for x in browser.requests:
    if x.date > last_date:
        print('URL:', x.url)
        print('ORIGIN:', x.headers['origin'])
        print('HOST:', x.headers['Host'])
        print('SEC-FETCH-DEST:', x.headers['sec-fetch-dest'])
        print('TIMESTAMP:', x.date)
        print('______________________________________')
As you can see, it's pretty straightforward:
go to the website
print the requests made (url, origin, host, sec-fetch-dest and timestamp)
locate the iframe and switch to it
locate the button you want to click, and bring it into view
wait 30 seconds, for any eventual requests made by JS in the page
after 30 seconds, print the last request made (url, origin, host, sec-fetch-dest and timestamp), saving its timestamp into a variable so subsequent requests can be filtered
click the button and register the timestamp of the click
wait another 30 seconds, just to make sure all requests were performed
print the requests made after the saved timestamp (see the helper sketch below)
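If you want to reuse that pattern, here's a small helper sketch over selenium-wire's request list; the settle time and the zero-argument action callable are my assumptions:
import time

def requests_after(browser, action, settle_seconds=30):
    """Run `action` and return only the requests recorded after it started."""
    marker = browser.last_request.date if browser.requests else None
    action()
    time.sleep(settle_seconds)  # give the page time to finish its requests
    return [r for r in browser.requests if marker is None or r.date > marker]

# usage, mirroring the flow above:
# new_requests = requests_after(browser, button.click)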
The result in terminal:
[...]
______________________________________
URL: https://fonts.gstatic.com/s/roboto/v30/KFOlCnqEu92Fr1MmEU9fBBc4.woff2
ORIGIN: https://siniestroshava.com.co
HOST: None
SEC-FETCH-DEST: font
TIMESTAMP: 2022-10-08 21:44:44.794670
______________________________________
switched to iframe
located the button, bringing it into view
now waiting 30 seconds, for clear separation of requests
printing last request again:
URL: https://optimizationguide-pa.googleapis.com/v1:GetModels?key=AIzaSyCkfPOPZXDKNn8hhgu3JrA62wIgC93d44k
ORIGIN: None
HOST: None
SEC-FETCH-DEST: empty
TIMESTAMP: 2022-10-08 21:44:57.413952
clicked the button at 2022-10-08 21:45:19.036690
waiting another 30 seconds
and now printing the requests again (the ones after interacting with iframe)
URL: https://siniestroshava.com.co/hava/Seguridad/SolicitarCorreo
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: iframe
TIMESTAMP: 2022-10-08 21:45:19.209288
______________________________________
URL: https://siniestroshava.com.co/hava/css/hava/estiloslocales.css
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.633076
______________________________________
URL: https://siniestroshava.com.co/hava/css/vendor.css?v=U1BT8Ls9ntdpDS12L5xpMjmSP3Eitncl_SyDnU5LLHk
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.645382
______________________________________
URL: https://siniestroshava.com.co/hava/css/devextreme/dx.material.hava.css
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.646197
______________________________________
[...]
______________________________________
As you can see, the main website is https://fasecolda.com, and the iframe src is https://siniestroshava.com.co/. You can clearly observe all the requests made since loading the original page (I didn't post them all, too many). You can see the last request made before interacting with the iframe, the timestamp of interacting with it, and the subsequent requests: the first one made has SEC-FETCH-DEST: iframe, obviously the request made by the iframe due to us clicking the button. Host and origin are also relevant header keys, when present.
This is a method to isolate the requests made from the iframe, as opposed to the ones made from the main page.
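If you'd rather stay with the performance-log approach from the question, here's a hedged sketch of the same isolation idea: filter Network.requestWillBeSent events down to the iframe's origin (this assumes, as is the case here, that the iframe src host differs from the main page's):
import json
from urllib.parse import urlparse

IFRAME_HOST = "siniestroshava.com.co"  # taken from the iframe src above

def iframe_requests(logs):
    # keep only requests whose URL points at the iframe's host
    for entry in logs:
        message = json.loads(entry["message"])["message"]
        if message["method"] == "Network.requestWillBeSent":
            request = message["params"]["request"]
            if urlparse(request["url"]).hostname == IFRAME_HOST:
                yield request

# usage:
# for request in iframe_requests(driver.get_log("performance")):
#     print(request["method"], request["url"])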
I believe this should answer your question as asked.

Related

Python Selenium scrape data when button "Load More" doesn't change URL

I am using the following code to attempt to keep clicking a "Load More" button until all page results are shown on the website:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

def startWebDriver():
    global driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options)

startWebDriver()
driver.get("https://together.bunq.com/all")
time.sleep(4)
while True:
    try:
        wait = WebDriverWait(driver, 10, 10)
        element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[title='Load More']")))
        element.click()
        print("Loading more page results")
    except TimeoutException:
        print("All page results displayed")
        break
However, since the button click does not change the URL, no new data is loaded into chromedriver and the while loop will break on the second iteration.
Selenium is overkill for this. You only need requests. Logging one's network traffic reveals that at some point JavaScript makes an XHR HTTP GET request to a REST API endpoint, the response of which is JSON and contains all the information you're likely to want to scrape.
One of the query-string parameters for that endpoint URL is page[offset], which is used to offset the query results for pagination (in this case, the "load more" button). A value of 0 corresponds to no offset, or "start at the beginning". Increment this value to suit your needs; a loop would probably be a good place to do this (see the pagination sketch after the output below).
Simply imitate that XHR HTTP GET request - copy the API endpoint URL and query-string parameters and request headers, then parse the JSON response:
def get_discussions():
    import requests

    url = "https://together.bunq.com/api/discussions"
    params = {
        "include": "user,lastPostedUser,tags,firstPost",
        "page[offset]": 0
    }
    headers = {
        "user-agent": "Mozilla/5.0"
    }

    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()

    yield from response.json()["data"]

def main():
    for discussion in get_discussions():
        print(discussion["attributes"]["title"])
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
⚡️What’s new in App Update 18.8.0
Local Currencies Accounts Fees
Local Currencies 💸
Spanish IBANs Service Coverage 🇪🇸
bunq Update 18 FAQ 📚
Phishing and Spoofing - The new ways of scamming, explained 👀
Easily rent a car with your bunq credit card 🚗
Giveaway - Hallo Deutschland! 🇩🇪
Giveaway - Hello Germany! 🇩🇪
True Name: Feedback 💬
True Name 🌈
What plans are available?
Everything about refunds 💸
Identity verification explained! 🤳
When will I receive my payment?
Together Community Guidelines 📣
What is my Tax Identification Number (TIN)?
How do I export a bank statement?
How do I change my contact info?
Large cash withdrael
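If you need more than the first page, here's a hedged sketch of paginating via page[offset]; the stopping condition (an empty "data" array past the last page) and the page size of 20 are my assumptions:
def get_all_discussions(page_size=20):
    import requests

    url = "https://together.bunq.com/api/discussions"
    headers = {"user-agent": "Mozilla/5.0"}
    offset = 0
    while True:
        params = {"include": "user,lastPostedUser,tags,firstPost",
                  "page[offset]": offset}
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()["data"]
        if not data:  # assumption: an empty page means we've reached the end
            break
        yield from data
        offset += page_size  # assumption: the offset advances by the page size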
If this is a new concept for you, I would suggest you look up tutorials on how to use your browser's developer tools (Google Chrome's Devtools, for example), how to log your browser's network traffic, REST APIs, HTTP, etc.

Web page behaves differently if I type text in input box than I use webdriver to send Keys in selenium

I am new to Selenium and trying to automate a WordPress social media schedule plugin. The problem is there is a link text box, as shown below.
Now if I type a URL in this box and click continue, I get the next page like this:
But when I try to automate this step using this code:
mydriver = webdriver.Chrome(Path)
cookies = get_cookies_values("cookies.csv")
mydriver.get(url)
for i in cookies:
    mydriver.add_cookie(i)
mydriver.get(url)
link_element = WebDriverWait(mydriver, 10).until(
    EC.presence_of_element_located((By.ID, "b2s-curation-input-url"))
)
link_element.send_keys(link)
mydriver.find_element_by_xpath('//*[@id="b2s-curation-post-form"]/div[2]/div[1]/div/div/div[2]/button').click()
Now, if I run the above code, it gets the URL and also loads my cookies, but when it clicks on the continue button after sending the keys, I get a page with an extra field named "title", as shown in the picture below. I don't want this extra title box.
I would like to know what is causing this issue. I am using Python 3.8 with Chrome web driver version 92.0.4515.131.
Thank you
One possibility is to try entering the URL character-by-character, as opposed to sending the entire string. The former is closer to the process of manually typing in the link, and assuming the site has a Javascript listener to catch the input, a character-by-character input process will be intercepted differently than a copy-paste of the entire URL:
mydriver = webdriver.Chrome(Path)
cookies = get_cookies_values("cookies.csv")
mydriver.get(url)
for i in cookies:
    mydriver.add_cookie(i)
mydriver.get(url)
link_element = WebDriverWait(mydriver, 10).until(
    EC.presence_of_element_located((By.ID, "b2s-curation-input-url"))
)
for c in link:  # send each character to the field
    link_element.send_keys(c)
    # import time; time.sleep(0.3)  # you could also add a short delay between each entry
mydriver.find_element_by_xpath('//*[@id="b2s-curation-post-form"]/div[2]/div[1]/div/div/div[2]/button').click()

Issue when scraping data on McMaster-Carr website

I'm writing a crawler for McMaster-Carr. Take the page https://www.mcmaster.com/98173A200, for example: if I open it directly in a browser, I can view all the product data.
Because the data is in dynamically loaded content, I'm using Selenium + bs4.
if __name__ == "__main__":
    url = "https://www.mcmaster.com/98173A200"
    options = webdriver.ChromeOptions()
    options.add_argument("--enable-javascript")
    driver = webdriver.Chrome("C:/chromedriver/chromedriver.exe", options=options)
    driver.set_page_load_timeout(20)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    delay = 20
    try:
        email_input = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.ID, 'MainContent')))
    except TimeoutException:
        print("Timeout loading DOM!")
    print(soup)
However, if I run the code I get a login dialog, which I wouldn't get if I opened this page directly in a browser, as I mentioned.
I also tried logging in with the code below:
try:
    email_input = WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.ID, 'Email')))
    print("Page is ready!!")
    input("Press Enter to continue...")
except TimeoutException:
    print("Loading took too much time!")
email_input.send_keys(email)
password_input = driver.find_element_by_id('Password')
password_input.send_keys(password)
login_button = driver.find_element_by_class_name("FormButton_primaryButton__1kNXY")
login_button.click()
Then it shows access restricted.
I compared the request headers in the page opened by Selenium and the page in my browser, and I couldn't find anything wrong. I also tried other webdrivers like PhantomJS and Firefox, and got the same result.
I also tried using a random user agent with the code below:
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
user_agent_rotator = UserAgent(software_names=software_names,
                               operating_systems=operating_systems,
                               limit=100)
user_agent = user_agent_rotator.get_random_user_agent()
chrome_options = Options()
chrome_options.add_argument('user-agent=' + user_agent)
Still same result.
The developer tools in the page opened by Selenium showed a bunch of errors. I guess the tokenauthorization one is the key to this issue, but I don't know what I should do with it.
Any help would be appreciated!
The reason you saw a login window is that you were accessing McMaster-Carr via a chrome driver. When the server recognizes your behaviour, it requires you to sign in.
A typical login won't work if you haven't been authenticated by McMaster (you need to sign an NDA).
You should look into the McMaster API. With the API, you can access the database directly. However, you need to sign an NDA with McMaster-Carr before obtaining access to the API: https://www.mcmaster.com/help/api/

Web scrape with multiple inputs and collect Total Margin required

I have a web link:
url = "zerodha.com/margin-calculator/SPAN"
Here are the input parameters, with sample values for reference, mentioned below:
Exchange - NFO
Product - Options
Symbol - DHFL 27-JUN-19
Option Type - Calls
Strike Price - 120
Net Qty appears automatically as 1500,
and use the SELL button, then click the ADD button.
I want to collect the Total Margin required (in the above case it's Rs 49,308), which appears at the right end.
You can just use requests. If you observe your network traffic, you can see that it makes a POST request with the selected payload. This is how I would do it:
from requests import Session

BASE_URL = 'https://zerodha.com/margin-calculator/SPAN'
payload = {'action': 'calculate',
           'exchange[]': 'NFO',
           'product[]': 'FUT',
           'scrip[]': 'DHFL19AUG',
           'option_type[]': 'CE',
           'strike_price[]': 120,
           'qty[]': 4000,
           'trade[]': 'sell'
           }

session = Session()
res = session.post(BASE_URL, data=payload)
data = res.json()
print(data)
I got the URL and payload from observing the network traffic. This is what you will get as data, in JSON form (screenshot omitted: results in Chrome and Python).
Just observe how Chrome or Firefox sends and receives the data, and reverse engineer it with your requests.
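For the question's actual inputs (Options, DHFL 27-JUN-19, Calls, strike 120, qty 1500, sell), here's a hedged sketch reusing the same session; the 'OPT' product code and 'DHFL19JUN' symbol are borrowed from the Selenium answer below, and the rest of the field names come from the payload above:
import json

payload = {'action': 'calculate',
           'exchange[]': 'NFO',
           'product[]': 'OPT',       # Options, as in the Selenium answer below
           'scrip[]': 'DHFL19JUN',   # symbol code, as in the Selenium answer below
           'option_type[]': 'CE',    # Calls
           'strike_price[]': 120,
           'qty[]': 1500,
           'trade[]': 'sell'}
res = session.post(BASE_URL, data=payload)
print(json.dumps(res.json(), indent=2))  # pretty-print to locate the total margin field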
The website renders the table data dynamically. You should try the Selenium automation library; it allows you to scrape dynamically rendered (JS or AJAX) page data.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get("https://zerodha.com/margin-calculator/SPAN")
# select exchange option of NFO
exchange = driver.find_element_by_name('exchange[]')
exchange.send_keys("NFO")
# select product option of OPT (Options)
product = driver.find_element_by_name('product[]')
product.send_keys("OPT")
# select symbol by option value
symbol = Select(driver.find_element_by_name("scrip[]"))
symbol.select_by_value("DHFL19JUN")
# select option type of CE (Calls)
optionType = driver.find_element_by_name('option_type[]')
optionType.send_keys("CE")
#add Strike price
strikePrice = driver.find_element_by_name('strike_price[]')
strikePrice.clear()
strikePrice.send_keys("120")
# add Net quantity
netQty = driver.find_element_by_name('qty[]')
netQty.clear()
netQty.send_keys("1500")
# select sell radio button
driver.find_elements_by_css_selector("input[name='trade[]'][value='sell']")[0].click()
#submit form
submit = driver.find_element_by_css_selector("input[type='submit'][value='Add']")
submit.click()
time.sleep(2)
# scrape margin
margin = driver.find_element_by_css_selector(".val.total")
print(margin.text)
where '/usr/bin/chromedriver' is the Selenium web driver path.
Download selenium web driver for chrome browser:
http://chromedriver.chromium.org/downloads
Install web driver for chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
Selenium tutorial:
https://selenium-python.readthedocs.io/
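As a side note, instead of the fixed time.sleep(2) before scraping the margin, you could wait explicitly for the element; here's a sketch, assuming the .val.total selector from the code above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the margin element to become visible, then read it
margin = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, ".val.total")))
print(margin.text)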

web scraping a site without direct access

Any help is appreciated in advance.
The deal is, I have been trying to scrape data from this website (https://www.mptax.mp.gov.in/mpvatweb/leftMenu.do), but direct access to it is not possible. Instead of the data I need, I get "invalid access". To access the website I must go to (https://www.mptax.mp.gov.in/mpvatweb/index.jsp) and then click on 'dealer search' from the dropdown menu while hovering over 'dealer information'.
I am looking for a solution in Python.
Here's something I tried; I have just started web scraping:
import requests
from bs4 import BeautifulSoup

with requests.session() as request:
    MAIN = "https://www.mptax.mp.gov.in/mpvatweb/leftMenu.do"
    INITIAL = "https://www.mptax.mp.gov.in/mpvatweb/"
    page = request.get(INITIAL)
    jsession = page.cookies["JSESSIONID"]
    print(jsession)
    print(page.headers)
    result = request.post(INITIAL, headers={"Cookie": "JSESSIONID=" + jsession + "; zoomType=0", "Referer": INITIAL})
    page1 = request.get(MAIN, headers={"Referer": INITIAL})
    soup = BeautifulSoup(page1.content, 'html.parser')
    data = soup.find_all("tr", class_="whitepapartd1")
    print(data)
The deal is, I want to scrape data about firms based on their firm name.
Thanks for telling me a way, @Arnav and @Arman. So here's the final code:
from selenium import webdriver #to work with website
from bs4 import BeautifulSoup #to scrap data
from selenium.webdriver.common.action_chains import ActionChains #to initiate hovering
from selenium.webdriver.common.keys import Keys #to input value
PROXY = "10.3.100.207:8080" # IP:PORT or HOST:PORT
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)
#ask for input
company_name=input("tell the company name")
#import website
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get("https://www.mptax.mp.gov.in/mpvatweb/")
#perform hovering to show hovering
element_to_hover_over = browser.find_element_by_css_selector("#mainsection > form:nth-child(2) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(3) > a:nth-child(1)")
hover = ActionChains(browser).move_to_element(element_to_hover_over)
hover.perform()
#click on dealer search from dropdown menu
browser.find_element_by_css_selector("#dropmenudiv > a:nth-child(1)").click()
#we are now on the leftmenu page
#click on radio button
browser.find_element_by_css_selector("#byName").click()
#input company name
inputElement = browser.find_element_by_css_selector("#showNameField > td:nth-child(2) > input:nth-child(1)")
inputElement.send_keys(company_name)
#submit form
inputElement.submit()
#now we are on dealerssearch page
#scrap data
soup=BeautifulSoup(browser.page_source,"lxml")
#get the list of values we need
list=soup.find_all('td',class_="tdBlackBorder")
#check length of 'list' and on that basis decide what to print
if len(list) != 0:
    # company name at index 9
    # TIN no. at index 10
    # registration status at index 11
    # circle name at index 15
    # store the values
    name = list[9].get_text()
    tin = list[10].get_text()
    status = list[11].get_text()
    circle = list[15].get_text()
    # make dictionary
    Company_Details = {"TIN": tin, "Firm name": name, "Circle_Name": circle, "Registration_Status": status}
    print(Company_Details)
else:
    Company_Details = {"VAT RC No": "Not found in database"}
    print(Company_Details)
#close the chrome
browser.stop_client()
browser.close()
browser.quit()
Would you mind using a browser?
You can use a browser and access the link at the XPath (//*[@id="dropmenudiv"]/a[1]).
You might have to download chromedriver and put it in the mentioned directory if you haven't used chromedriver before. You can also use Selenium + PhantomJS if you want to do headless browsing (without the browser opening up each time).
from selenium import webdriver

xpath = '//*[@id="dropmenudiv"]/a[1]'
browser = webdriver.Chrome('/usr/local/bin/chromedriver')
browser.set_window_size(1120, 550)
browser.get('https://www.mptax.mp.gov.in/mpvatweb')
link = browser.find_element_by_xpath(xpath)
link.click()
url = browser.current_url
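PhantomJS is no longer maintained, so for headless browsing here's a hedged sketch using headless Chrome instead, reusing the same chromedriver path:
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without opening a window
browser = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=chrome_options)
browser.get('https://www.mptax.mp.gov.in/mpvatweb')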
