Injecting jQuery with Selenium in Python

I am trying to highlight specific (dynamically chosen) words on a page. So far I have had some success, but the same code stops working when I open a JSON file path. Stack: Python and Selenium.
def launchBrowserAndEmbed():
    driver = driverInitialization()
    driver.get('https://worldwide.espacenet.com/3.2/rest-services/legal/publication/EP/0546789/A2.json')
    driver.execute_script("""var jquery_script = document.createElement('script');
        jquery_script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js';
        jquery_script.onload = function(){inst = new Mark(document.querySelector("body")); var isMarked = inst.mark("COMMUNICATION"); console.log(isMarked)};
        document.getElementsByTagName('head')[0].appendChild(jquery_script);"""
    )
    time.sleep(5)
    #driver.execute_script('inst = new Mark(document.querySelector("body")); var isMarked = inst.mark("COMMUNICATION"); console.log(isMarked)')
    S = lambda X: driver.execute_script('return document.body.parentNode.scroll' + X)
    driver.set_window_size(S('Width'), S('Height'))
    driver.find_element_by_tag_name('body').screenshot('./screenshots/take_' + time.strftime("%Y%m%d-%H%M%S") + '.png')
    driver.quit()
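As a side note, the fixed time.sleep(5) could be swapped for an explicit wait that polls until the injected script has loaded. A minimal, unverified sketch, assuming the same driver object and that mark.js exposes a global Mark constructor once it has loaded:

from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 seconds until the injected mark.js script has defined the
# global Mark constructor; this would replace the time.sleep(5) call above.
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script("return typeof Mark !== 'undefined';")
)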

Related

Find the xpath with get_attribute() in python selenium

This is a somewhat backwards approach to web scraping: I need to locate the XPath of a web element AFTER I have already found it with a text()= identifier.
Because the XPath values differ depending on what information shows up, I need to use predictable labels inside the row to locate the span text next to the found element. A simple and reliable way I found is to locate the keyword label and then increase the td index by one inside the XPath.
def x_label(self, contains):
    mls_data_xpath = f"//span[text()='{contains}']"
    string = self.driver.find_element_by_xpath(mls_data_xpath).get_attribute("xpath")
    digits = string.split("td[")[1]
    num = int(re.findall(r'(\d+)', digits)[0]) + 1
    labeled_data = f'{string.split("td[")[0]}td[{num}]/span'
    print(labeled_data)
    labeled_text = self.driver.find_element_by_xpath(labeled_data).text
    return labeled_text
I cannot find much information on .get_attribute() and .get_property(), so I am hoping there is something like .get_attribute("xpath"), but I haven't been able to find it.
Basically, I am taking in a string like "ApprxTotalLivArea" that I can rely on and then increasing the integer after td[ by 1 to find the span data from the cell next door. I am hoping there is something like a get_attribute("xpath") that can return the XPath string of the element I find through my text()='{contains}' search.
The Remote WebElement does include the following methods:
get_attribute()
get_dom_attribute()
get_property()
But xpath isn't a valid attribute or property of a WebElement, so get_attribute("xpath") will always return None.
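To make the distinction between the three accessors concrete, here is a minimal sketch (recent Selenium 4; the URL and the element picked are only illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.example.com")            # illustrative URL
link = driver.find_element(By.TAG_NAME, "a")

print(link.get_attribute("href"))        # resolved value (e.g. absolute URL)
print(link.get_dom_attribute("href"))    # literal value as written in the HTML
print(link.get_property("href"))         # current DOM property value
print(link.get_attribute("xpath"))       # None - "xpath" is not a real attribute
driver.quit()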
I was able to find a Python version of the execute-script approach from this post, which was based on a JavaScript answer in another forum. I had to make a lot of .replace() calls on the string this function creates, but I was able to universally find the label string I need and increment the td/span XPath by +1 to find and retrieve the neighbouring column data, regardless of differences in XPath values between page listings.
def x_label(self, contains):
    label_contains = f"//span[contains(text(), '{contains}')]"
    print(label_contains)
    labeled_element = self.driver.find_element_by_xpath(label_contains)
    print(labeled_element)
    element_label = labeled_element.text
    print(element_label)
    self.driver.execute_script("""
        window.getPathTo = function (element) {
            if (element.id !== '')
                return 'id("' + element.id + '")';
            if (element === document.body)
                return element.tagName;

            var ix = 0;
            var siblings = element.parentNode.childNodes;
            for (var i = 0; i < siblings.length; i++) {
                var sibling = siblings[i];
                if (sibling === element)
                    return window.getPathTo(element.parentNode) + '/' + element.tagName + '[' + (ix + 1) + ']';
                if (sibling.nodeType === 1 && sibling.tagName === element.tagName)
                    ix++;
            }
        }
    """)
    generated_xpath = self.driver.execute_script("return window.getPathTo(arguments[0]);", labeled_element)
    generated_xpath = f'//*[@{generated_xpath}'.lower().replace('tbody[1]', 'tbody')
    print(f'generated_xpath = {generated_xpath}')
    expected_path = r'//*[@id="wrapperTable"]/tbody/tr/td/table/tbody/tr[26]/td[6]/span'
    generated_xpath = generated_xpath.replace('[@id("wrappertable")', '[@id="wrapperTable"]').replace('tr[1]', 'tr')
    clean_path = generated_xpath.replace('td[1]', 'td').replace('table[1]', 'table').replace('span[1]', 'span')
    print(f'clean_path = {clean_path}')
    print(f'expected_path = {expected_path}')
    digits = generated_xpath.split("]/td[")[1]
    print(digits)
    num = int(re.findall(r'(\d+)', digits)[0]) + 1
    print(f'Number = {num}')
    labeled_data = f'{clean_path.split("td[")[0]}td[{num}]/span'
    print(f'labeled_data = {labeled_data}')
    print(f'expected_path = {expected_path}')
    if labeled_data == expected_path:
        print('Congrats')
    else:
        print('Rats')
    labeled_text = self.driver.find_element_by_xpath(labeled_data).text
    print(labeled_text)
    return labeled_text
This function iteratively gets the parent element until it hits the html element at the top:
from selenium import webdriver
from selenium.webdriver.common.by import By

def get_xpath(elm):
    e = elm
    xpath = elm.tag_name
    while e.tag_name != "html":
        e = e.find_element(By.XPATH, "..")
        neighbours = e.find_elements(By.XPATH, "../" + e.tag_name)
        level = e.tag_name
        if len(neighbours) > 1:
            level += "[" + str(neighbours.index(e) + 1) + "]"
        xpath = level + "/" + xpath
    return "/" + xpath

driver = webdriver.Chrome()
driver.get("https://www.stackoverflow.com")
login = driver.find_element(By.XPATH, "//a[text() ='Log in']")
xpath = get_xpath(login)
print(xpath)
assert login == driver.find_element(By.XPATH, xpath)
Hope this helps!

Full Page Screenshot with Selenium python in IE

I am trying to take a full page screenshot on Internet Explorer using Selenium.
Looking through the Options.py code from selenium/webdriver/ie I found these lines:
class Options(object):
    KEY = 'se:ieOptions'
    SWITCHES = 'ie.browserCommandLineSwitches'
    BROWSER_ATTACH_TIMEOUT = 'browserAttachTimeout'
    ELEMENT_SCROLL_BEHAVIOR = 'elementScrollBehavior'
    ENSURE_CLEAN_SESSION = 'ie.ensureCleanSession'
    FILE_UPLOAD_DIALOG_TIMEOUT = 'ie.fileUploadDialogTimeout'
    FORCE_CREATE_PROCESS_API = 'ie.forceCreateProcessApi'
    FORCE_SHELL_WINDOWS_API = 'ie.forceShellWindowsApi'
    FULL_PAGE_SCREENSHOT = 'ie.enableFullPageScreenshot'   # <-- this one
    IGNORE_PROTECTED_MODE_SETTINGS = 'ignoreProtectedModeSettings'
    IGNORE_ZOOM_LEVEL = 'ignoreZoomSetting'
    INITIAL_BROWSER_URL = 'initialBrowserUrl'
    NATIVE_EVENTS = 'nativeEvents'
    PERSISTENT_HOVER = 'enablePersistentHover'
    REQUIRE_WINDOW_FOCUS = 'requireWindowFocus'
    USE_PER_PROCESS_PROXY = 'ie.usePerProcessProxy'
    VALIDATE_COOKIE_DOCUMENT_TYPE = 'ie.validateCookieDocumentType'

    ...

    @property
    def full_page_screenshot(self):
        """ Returns the options Full Page Screenshot value """
        return self._options.get(self.FULL_PAGE_SCREENSHOT)

    @full_page_screenshot.setter
    def full_page_screenshot(self, value):
        """
        Sets the options Full Page Screenshot value

        :Args:
         - value: boolean value
        """
        self._options[self.FULL_PAGE_SCREENSHOT] = value
However, I cannot for the life of me figure out how to use these. Any help would be appreciated, and any other tips for taking a full page screenshot in IE would also be helpful.
Thank you.
I use maximize_window() to maximize the browser window, then use save_screenshot() to take the screenshot. You can refer to the code sample below; it works well in IE 11:
from selenium import webdriver
import time
url = "https://www.google.com/"
driver = webdriver.Ie(executable_path='IEDriverServer.exe')
driver.maximize_window()
driver.get(url)
time.sleep(3)
driver.save_screenshot("C:\\your\\path\\filename.png")
Please note that you need to change the path to your own.
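If you still want to try the FULL_PAGE_SCREENSHOT option from the question, a hedged sketch of wiring it up would look like the following (I have not verified how IEDriverServer actually honours this capability):

from selenium import webdriver
from selenium.webdriver.ie.options import Options

options = Options()
options.full_page_screenshot = True   # sets the 'ie.enableFullPageScreenshot' capability

# Pass the options object when creating the IE driver.
driver = webdriver.Ie(executable_path='IEDriverServer.exe', options=options)
driver.get("https://www.google.com/")
driver.save_screenshot("C:\\your\\path\\filename.png")
driver.quit()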

Parsing html from google with python

I have problems parsing the HTML source from Google with Python.
def events():
    location = sys.argv[1]
    url = "https://www.google.com/search?client=firefox-b-d&q=evenementen+" + location
    event = requests.get(url=url)
    print("De zoekterm is leeg, probeer het opnieuw")  # "The search term is empty, try again"
    soup_events = BeautifulSoup(event.text, 'html.parser')
    events_google = soup_events.find_all('<a class="rl_item rl_item_base" tabindex="0" href="/search?client=firefox-b-d')
    print(type(events_google))
    print(len(events_google))
    print(events_google)
I tried:
events_google = soup_events.find_all('div', 'BNeawe tAd8D AP7Wnd')
which worked, but when I try any other value the result is 0. Yet all the values that I want to print out do show up when I print event.text, so I'm not sure what I'm doing wrong.
When I run the code above I get the following response:
<class 'bs4.element.ResultSet'>
0
[]
I'm actually looking to find the events shown in a Google search.
Ahhh, I think your issue is that
events_google = soup_events.find_all('<a class="rl_item rl_item_base" tabindex="0" href="/search?client=firefox-b-d')
should be
events_google = soup_events.find_all("a", {"class": ["rl_item", "rl_item_base"]})

How can I scrape information from a web page?

I am new to programming and need some help with my web-crawler.
At the moment, my code opens every web page in the list, but I want to extract information from each page as it loads. This is what I have:
from selenium import webdriver
import csv

driver = webdriver.Firefox()
links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []
for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
for i in first_two_links:
    driver.get(i)
This loops through the first two pages but scrapes no info, so I tried adding to the for loop as follows:
odds = []
for i in first_two_links:
    driver.get(i)
    driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)
However, this runs into an error.
Any help much appreciated.
You are not actually appending anything! You need to assign the result of
driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
to a variable and then append that variable to the list!
from selenium import webdriver
import csv

driver = webdriver.Firefox()

links_code: list = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two: list = links_code[0:2]
first_two_links: list = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)

for i in first_two_links:
    driver.get(i)

odds: list = []
for i in first_two_links:
    driver.get(i)
    o = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(o)
First, after you start the driver you need to go to a website...
Second, in the second for loop you are trying to append the wrong object: use i, not odd, or make odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
If you can provide the URL or the HTML we can help more!
Try this (I have used Google as an example; you will need to change the code...):
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.google.com")

links_code = driver.find_elements_by_xpath('//a')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
    print(link)

odds = []
for i in first_two_links:
    driver.get(i)
    odd = driver.page_source
    print(odd)
    # driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)

StaleElementReferenceException selenium webdriver python

I'm writing a crawler using Selenium, Python and PhantomJS to use Google's reverse image search. So far I've successfully been able to upload an image and crawl the search results on the first page. However, when I try to click on the search results navigation, I'm getting a StaleElementReferenceError. I have read about it in many posts but still I could not implement the solution. Here is the code that breaks:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
count = 0
for elem in ele5:
    if count <= 2:
        print str(elem.get_attribute("href"))
        elem.click()
        browser.implicitly_wait(20)
        ele6 = browser.find_elements_by_class_name("rc")
        for result in ele6:
            f = result.find_elements_by_class_name("r")
            for line in f:
                link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
                links.append(link)
                parsed_uri = urlparse(link)
                domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
        count += 1
The code breaks at print str(elem.get_attribute("href")). How can I solve this?
Thanks in advance.
Clicking a link causes the browser to go to another page, which makes the references to elements on the old page (ele5, elem) invalid.
Modify the code so that it does not reference invalidated elements.
For example, you can collect the URLs before you visit the other pages:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
urls = [elem.get_attribute('href') for elem in ele5]  # <-----

browser.implicitly_wait(20)

for url in urls[:2]:  # <------
    print url
    browser.get(url)  # <------ used `browser.get` instead of `click`;
                      #         using `element.click` will cause the error.
    ele6 = browser.find_elements_by_class_name("rc")
    for result in ele6:
        f = result.find_elements_by_class_name("r")
        for line in f:
            link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
            links.append(link)
            parsed_uri = urlparse(link)
            domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
