I have this code and I want to iterate through all the divs inside a div. This is my first time writing in Python and my first time writing a bot, so I am writing it with Selenium, the first library I came across.
all_urls_div = browser.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
j = 0
for i in range(len(all_urls_div)):
    chat = all_urls_div.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
    chat[j].click()
    lastMessageBlock = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div")
    j = j + 1
    act = ActionChains(browser)
    act.move_to_element(lastMessageBlock).perform()
    threeDots = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/button[1]").click()
    deleteLastMessage = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div/div[2]/div[4]/button").click()
    browser.find_element_by_xpath("/html/body/div[6]/div/div/div/div[2]/button[1]").click()
    textArea = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[2]/textarea")
    textArea.clear()
    browser.execute_script("arguments[0].value='" + message + "'", textArea)
    textArea.send_keys(Keys.SPACE)
    sendButton = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[3]/button").click()
    browser.execute_script("arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
                           general_path_block)
I tried to write a loop, but after the first iteration an exception pops up. What should I do?
File "D:\PROJECTS\InstaBotByHushchadi\main.py", line 356, in resend_message_in_direct
chat = all_urls_div.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
AttributeError: 'list' object has no attribute 'find_elements_by_class_name'
This exception
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
is because you have performed
chat.click()
for the first element, and that may have redirected you to some other page/link, causing the other web elements to become stale.
Fix:
Redefine the list again on each iteration:
all_urls_div = general_path_chats_grid.find_elements_by_tag_name("div")
j = 0
for i in range(len(all_urls_div)):
    chat = general_path_chats_grid.find_elements_by_tag_name("div")
    chat[j].click()
    lastMessageBlock = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div")
    j = j + 1
    act = ActionChains(browser)
    act.move_to_element(lastMessageBlock).perform()
    threeDots = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/button[1]").click()
    deleteLastMessage = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div/div[2]/div[4]/button").click()
    browser.find_element_by_xpath("/html/body/div[6]/div/div/div/div[2]/button[1]").click()
    textArea = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[2]/textarea")
    textArea.clear()
    browser.execute_script("arguments[0].value='" + message + "'", textArea)
    textArea.send_keys(Keys.SPACE)
    sendButton = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[3]/button").click()
    browser.execute_script("arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
                           general_path_block)
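The same pattern can be written a bit more compactly by re-locating the list at the top of every pass and indexing it with the loop variable directly. A minimal sketch, assuming general_path_chats_grid and the per-chat delete/resend steps from the code above:
chats = general_path_chats_grid.find_elements_by_tag_name("div")
for j in range(len(chats)):
    # re-locate on every iteration so the element references are never stale
    chats = general_path_chats_grid.find_elements_by_tag_name("div")
    chats[j].click()
    # ... the delete / resend steps from the snippet above go here ...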
I am running Selenium code on the DNCA website to scrape some of the document links. I am trying to get the links for each value in the drop-down for each section shown on this page. My code works fine, but when I run the same code with headless = True, I get the following error:
ElementClickInterceptedException: element click intercepted: Element <li data-original-index="0">...</li> is not clickable at point (226, 250). Other element would receive the click: <div class="col-md-12">...</div>
(Session info: headless chrome=104.0.5112.81)
Code:
def get_active_row(active_tab, fund_id):
    active_row = active_tab.find_elements(By.XPATH, ".//tr[@style='' or @style='display: table-row;'][@fund-id = '{}']".format(fund_id))
    try:
        assert len(active_row) == 1
        active_row = active_row[0]
        return active_row
    except AssertionError as asserr:
        print(asserr, ' -- More than one active row for the fund id: ', fund_id)
        sys.exit(1)
    except Exception as err:
        print(err, ' -- fund id:', fund_id)
        sys.exit(1)

def scrap(driver):
    tab_list = driver.find_element(By.XPATH, "//ul[contains(@role, 'tablist')]")
    tab_list_names = tab_list.find_elements(By.XPATH, './/li')
    data_list = []
    for loc, tab_name in enumerate(tab_list_names):
        if loc < 20:
            tab_name.click()
            html = driver.page_source
            soup = BeautifulSoup(html)
            bs_active_tab = soup.find('div', {'class': 'tab-pane table-datas active'})
            bs_headers = bs_active_tab.find('thead')
            headers = [i.text for i in bs_headers.find_all('td')]
            active_tab = driver.find_element(By.XPATH, "//div[contains(@class, 'tab-pane table-datas active')]")
            unique_fund_ids = [i_fund.get_attribute('fund-id') for i_fund in active_tab.find_elements(By.XPATH, ".//tr[@style]") if i_fund.get_attribute('fund-id') != '-']
            lookup = set()
            unique_fund_ids = [x for x in unique_fund_ids if x not in lookup and lookup.add(x) is None]
            for fund_id in unique_fund_ids:  # Iterate over each fund
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_list = [i.text for i in active_row.find_elements(By.XPATH, './/li')]
                for pos, isin_val in enumerate(isin_list):
                    isin_selected = active_row.find_elements(By.XPATH, './/li')[pos]
                    isin_selected.click()
                    active_row = get_active_row(active_tab, fund_id)
                    fund_name = ''
                    for pos_inner, td in enumerate(active_row.find_elements(By.XPATH, ".//td")):
                        a_tag = td.find_elements(By.XPATH, ".//a")
                        if len(a_tag) == 1:
                            a_tag = a_tag[0]
                            if pos_inner == 0:
                                fund_name = a_tag.text
                            link = a_tag.get_attribute('href')
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], link])
                        else:
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], ''])
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_selected_to_close = active_row.find_elements(By.XPATH, './/li')[0]
                isin_selected_to_close.click()
            tlg_tr_tab = active_tab.find_element(By.XPATH, ".//tr[@fund-id='-']")
            for tlg_pos_inner, tlg_td in enumerate(tlg_tr_tab.find_elements(By.XPATH, ".//td")):
                tlg_a_tag = tlg_td.find_elements(By.XPATH, ".//a")
                if len(tlg_a_tag) == 1:
                    tlg_a_tag = tlg_a_tag[0]
                    tlg_link = tlg_a_tag.get_attribute('href')  # Get document link
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], tlg_link])
                else:
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], ''])
    dataset_links = pd.DataFrame(data_list, columns=['Tab', 'Fund Name', 'ISIN', 'Type', 'Link'])
    driver.quit()
Can someone please explain to me why it works fine with headless = False but not with headless = True?
In headless mode the default screen size is very small, significantly smaller than the screen size in regular mode.
So, to overcome this problem, you need to set the window size.
It can be done in the following ways:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
Or just
driver.set_window_size(1920, 1080)
Both approaches should work.
I prefer the first way :)
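If you want to double-check which size the headless browser actually got, a quick sanity check (assuming driver has been created as above) is:
print(driver.get_window_size())  # should report something like {'width': 1920, 'height': 1080}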
I am learning python-docx and I am using this solution to add page numbers, as given on Stack Overflow by @Utkarsh Dalal:
# Imports needed to run the snippet (assuming python-docx is installed);
# the referenced solution uses qn from docx.oxml.ns, aliased here to match nsqn below.
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn as nsqn

def create_element(name):
    return OxmlElement(name)

def create_attribute(element, name, value):
    element.set(nsqn(name), value)

def add_page_number(paragraph):
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    page_run = paragraph.add_run()
    t1 = create_element('w:t')
    create_attribute(t1, 'xml:space', 'preserve')
    t1.text = 'Page '
    page_run._r.append(t1)

    page_num_run = paragraph.add_run()
    fldChar1 = create_element('w:fldChar')
    create_attribute(fldChar1, 'w:fldCharType', 'begin')
    instrText = create_element('w:instrText')
    create_attribute(instrText, 'xml:space', 'preserve')
    instrText.text = "PAGE"
    fldChar2 = create_element('w:fldChar')
    create_attribute(fldChar2, 'w:fldCharType', 'end')
    page_num_run._r.append(fldChar1)
    page_num_run._r.append(instrText)
    page_num_run._r.append(fldChar2)

    of_run = paragraph.add_run()
    t2 = create_element('w:t')
    create_attribute(t2, 'xml:space', 'preserve')
    t2.text = ' of '
    of_run._r.append(t2)

    fldChar3 = create_element('w:fldChar')
    create_attribute(fldChar3, 'w:fldCharType', 'begin')
    instrText2 = create_element('w:instrText')
    create_attribute(instrText2, 'xml:space', 'preserve')
    instrText2.text = "NUMPAGES"
    fldChar4 = create_element('w:fldChar')
    create_attribute(fldChar4, 'w:fldCharType', 'end')
    num_pages_run = paragraph.add_run()
    num_pages_run._r.append(fldChar3)
    num_pages_run._r.append(instrText2)
    num_pages_run._r.append(fldChar4)

    for pg in num_pages_run.element:
        print(pg.text)

doc = Document()
add_page_number(doc.sections[0].footer.paragraphs[0])
doc.save("your_doc.docx")
I receive the page number in the format "Page x of y".
But when I try to access the value of 'y', the total number of pages, I cannot do it. I have tried reading the text attribute of num_pages_run via pg.text, but all I get is NUMPAGES as output instead of the number of pages.
I am looking for this feature because I would like to perform some actions whenever a new page is added to the document.
Is there a way to get the total number of pages from python-docx, or any other alternative?
I am currently using Selenium to pull the product information from Schneider Electric, and this is the error I am currently receiving:
selenium.common.exceptions.NoSuchElementException: Message:
no such element: Unable to locate element:
{"method":"xpath","selector":"/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li[1]/product-card/article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3"}
Currently, the website I am trying to pull this information from is this URL:
https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=0&Nrpp=12
The XPath is for the description of their products, which according to my inspection and findings is this:
/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li[1]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3
Any ideas??
Current Code:
def page_function():
    driver.get('https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=12&Nrpp=12')
    driver.maximize_window()

    # gets the amount of items in the search bar
    print("Number of products:", 69)

    # for loop to read the product name and descriptions
    # product = driver.find_element(By.CSS_SELECTOR, ".search-item")
    # product = product.text
    # print(product)
    pr = "]/product-card//article/div/div[2]/div[1]/pes-product-price/p/span[1]"
    nam = "]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[1]/a"
    des = "]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3"
    # des_path = "#search-items > .search-item .details > a > .row.pt-5.pb-sm-5 > .multilines-3.text-truncate-multilines.xs-single-col-8.col-12 > .font-weight-bold.text-dark"

    follow_loop = range(1, 70)
    for x in follow_loop:
        y = x
        if (x > 61):
            y = x - 60
        elif (x > 49):
            y = x - 48
        elif (x > 37):
            y = x - 36
        elif (x > 25):
            y = x - 24
        elif (x > 13):
            y = x - 12
        else:
            print("")
        if ((x % 13) == 0):
            driver.delete_all_cookies()
            next_arrow = driver.find_element(By.CLASS_NAME, "page-links__arrow page-links__arrow--next js-page-link js-page-link-next")
            driver.execute_script("arguments[0].click();", next_arrow)
        xpath = "/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li["
        xpath += str(y)
        xpath += des
        driver.implicitly_wait(5)
        description.append(driver.find_element(By.XPATH, xpath))
        xpath2 = xpath.replace(des, '')
        xpath2 += pr
        unit_price.append(driver.find_element(By.XPATH, xpath2).text)
        xpath3 = xpath2.replace(pr, '')
        xpath3 += nam
        name.append(driver.find_element(By.XPATH, xpath3).text)
The product description is within a #shadow-root (open)
Solution
To extract the desired text you need to use shadowRoot.querySelector(), and you can use the following locator strategy:
driver.get("https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=0&Nrpp=12")
time.sleep(5)
description = driver.execute_script('''return document.querySelector("product-cards-wrapper.hydrated").shadowRoot.querySelector("product-card.hydrated").shadowRoot.querySelector("product-card-main-info.hydrated").shadowRoot.querySelector("pes-router-link.description.hydrated a > h3")''')
print(description.text)
Console Output:
Active harmonic filter - 60 A 380..480 V AC - IP00 enclosure
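If you need the description of every product card on the page rather than just the first one, the same shadow-root chain can be applied per card. A sketch, assuming every card exposes the same structure as the first one (the selectors are the ones from the snippet above):
descriptions = driver.execute_script('''
    return Array.from(
        document.querySelector("product-cards-wrapper.hydrated").shadowRoot
            .querySelectorAll("product-card.hydrated")
    ).map(card => card.shadowRoot
        .querySelector("product-card-main-info.hydrated").shadowRoot
        .querySelector("pes-router-link.description.hydrated a > h3")
        .textContent.trim());
''')
for d in descriptions:
    print(d)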
References
You can find a couple of relevant detailed discussions in:
How to locate the First name field within shadow-root (open) within the website https://www.virustotal.com using Selenium and Python
How to get past a cookie agreement page using Python and Selenium?
Unable to locate the Sign In element within #shadow-root (open) using Selenium and Python
I've tried to crawl a page with many "article" elements, but the loop returns only the first element every time (as many times as there are article elements).
for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    product['link'] = offer.xpath("//a[@class='cept-dealBtn boxAlign-jc--all-c space--h-3 width--all-12 btn btn--mode-primary']/@href").get()
    product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title']/a[@class='thread-title--list']/text()").get()
    #product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/div[@class='overflow--fade']/a[@class='cept-merchant-link text--color-greyShade size--all-s']/span[@class='cept-merchant-link-term overflow--wrap-off']/span[@class='text--b text--color-brandPrimary cept-merchant-name']/text()").get()
    product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    #product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product
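A hedged note (not from the original post): inside a per-element loop, an XPath that starts with // searches the whole document, so every iteration returns the first match on the page; prefixing the expressions with .// scopes them to the current article. A minimal sketch of the same loop with relative paths, keeping the original class names:
for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    # ".//" keeps each query scoped to this <article> instead of the whole page
    product['link'] = offer.xpath(".//a[@class='cept-dealBtn boxAlign-jc--all-c space--h-3 width--all-12 btn btn--mode-primary']/@href").get()
    product['merchant'] = offer.xpath(".//div[@class='threadGrid']/div[@class='threadGrid-title']/a[@class='thread-title--list']/text()").get()
    product['offer'] = offer.xpath(".//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product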
I need to insert all HTML tags and their attributes into a database.
el.driver.get(url_page)
txthtml = el.driver.page_source
soup = BeautifulSoup(txthtml, "html.parser")
body = soup.find('html')
html_parse(body, el, url_page_id, 0, 0, 0, url_page)

def html_parse(html, el, url_page_id, level, i, parent_id, url_page):
    txt = ""
    if len(html.text) > 0:
        txt = html.text.replace("\n", "").replace("\t", "").replace("\r", "")
    ta = tag_list()
    ta.p_id = el.id
    ta.page_id = url_page_id
    ta.level = level
    ta.number = i
    ta.txt = txt
    ta.name = html.name
    ta.parent_id = parent_id
    ta.html = str(html)
    ta.save()
    insert_attr(html, el.id, url_page_id, ta.id, url_page)
    children = list(html.children)
    j = 0
    for child in children:
        if child.name is None:
            continue
        j = j + 1
        html_parse(child, el, url_page_id, level + 1, j, ta.id, url_page)
I have a recursive function html_parse, where:
html - current html object
el - driver class
url_page_id - id of the page
level - level in the DOM
i - child number
parent_id - id of the parent
url_page - current URL
tag_list - inserts the current tag
insert_attr - inserts the tag's attributes into the database
Each html_parse call runs fast, but parsing a full page takes about 4-5 minutes for a big HTML page.
How can I speed up the code?