Send key function not working properly in Selenium Python

Take a look at this site : https://www.arabam.com/ilan/sahibinden-satilik-mercedes-benz-cla-180-d-style/sahibinden-boyasiz-hasarsiz-cam-tavan-temiz-arac/14229201
I press the End key so that it goes to the end of the page, and then it presses the Up key one step at a time until it finds this element (screenshot omitted):
It was working just fine, but it doesn't seem to be working anymore.
options.add_argument('window-size=1200x600')
prefs = {"profile.default_content_setting_values.geolocation": 2,
         "profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
d = webdriver.Chrome(chrome_options=options, executable_path='./chromedriver')
d.get(features["ad_url"])
# Use send_keys(Keys.END) to scroll to the bottom of the page
d.find_element_by_tag_name('body').send_keys(Keys.END)
while True:
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)
    e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
    if e.text:
        break
Here is the full code to try:
import json
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from datetime import datetime
from selenium.webdriver.common.keys import Keys
import pickle
import time

class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml', 'https://www.arabam.com/sitemap/otomobil_2.xml',
                    'https://www.arabam.com/sitemap/otomobil_3.xml', 'https://www.arabam.com/sitemap/otomobil_4.xml',
                    'https://www.arabam.com/sitemap/otomobil_5.xml', 'https://www.arabam.com/sitemap/otomobil_6.xml',
                    'https://www.arabam.com/sitemap/otomobil_7.xml', 'https://www.arabam.com/sitemap/otomobil_8.xml',
                    'https://www.arabam.com/sitemap/otomobil_9.xml', 'https://www.arabam.com/sitemap/otomobil_10.xml',
                    'https://www.arabam.com/sitemap/otomobil_11.xml', 'https://www.arabam.com/sitemap/otomobil_12.xml',
                    'https://www.arabam.com/sitemap/otomobil_13.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': "arabam_" + str(datetime.today().strftime('%d%m%y')) + '.csv'}

    def parse(self, response):
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            link = td.xpath("@href").extract()
            year = td.xpath("text()").extract()
            self.crawled.append(link[0])
            self.new_links += 1
            if int(year[0]) > 2010:
                url = "https://www.arabam.com/" + link[0]
                yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        features = {}
        options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        options.add_argument('window-size=1200x600')
        prefs = {"profile.default_content_setting_values.geolocation": 2,
                 "profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        d = webdriver.Chrome(chrome_options=options, executable_path='./chromedriver')
        d.get(features["ad_url"])
        # Use send_keys(Keys.END) to scroll to the bottom of the page
        d.find_element_by_tag_name('body').send_keys(Keys.END)
        while True:
            d.find_element_by_tag_name('body').send_keys(Keys.UP)
            time.sleep(1)
            e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
            if e.text:
                break
        overview1 = e.text.split("\n")
        yield features

process = CrawlerProcess({
})
process.crawl(Myspider)
process.start()  # the script will block here until the crawling is finished
Edit:
I commented out the div lookup and ran the code, and it turns out the keys are being sent. The problem is with finding the specific div. I tried putting a try/except around it, but that doesn't seem to be working either.
while True:
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)
    try:
        e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div")
        if e.text:
            break
    except:
        pass
Edit:
This is what I did to scroll up, but unfortunately it doesn't work in most cases:
for i in range(0, 37):
    d.find_element_by_tag_name('body').send_keys(Keys.UP)
    time.sleep(1)
e = d.find_element_by_xpath("/html/body/div[3]/div[6]/div[3]/div/div[1]/div[3]/div/div[3]/div[2]/div")
overview1 = e.text.split("\n")
Edit:
Tried this. It scrolls into view but doesn't get the element:
e = d.find_element_by_xpath("//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")
actions = ActionChains(d)
actions.move_to_element(e).perform()
wait = WebDriverWait(d, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")))
overview1 = e.text.split("\n")
Edit:
Screenshot of the HTML (image omitted).

Adding this as an answer, as it is a bit lengthy for a comment.
First, you need to wait for the element to appear, and only then find it and extract the values. In your code, the element lookup is done before the visibility check.
Another thing you can try is to scroll to the specific element before extracting values. This particular table seems to load its values only when it is in the viewport.
wait = WebDriverWait(d, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")))
e = d.find_element_by_xpath("//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")
# Scroll to the element
actions = ActionChains(d)
actions.move_to_element(e).perform()
d.execute_script("arguments[0].scrollIntoView(true);", e)
# Check what the actual text value is that you get
print(e.text)
print(e.text.split("\n"))
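If key-based scrolling itself ever stops working, a JavaScript scroll is a common alternative to send_keys; a minimal sketch, assuming d is the driver from the snippet above:

# Jump straight to the bottom of the page without send_keys
d.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Or bring the target element itself into the viewport before reading it
e = d.find_element_by_xpath("//div[@id='js-hook-appendable-technicalPropertiesWrapper' and @class='cf']")
d.execute_script("arguments[0].scrollIntoView(true);", e)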


How to run 'implicitly_wait()' in a 'for loop' with respect to Web Scraping using Python?

Actually, I want to scrape the title and product description for all the products from all the pages, and then save them into a .csv file.
URL: https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried.
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []
myDict = {'body-art': 3024}

browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)

for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            requests.get(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            # Since it's a huge amount of data, I have taken 2 products per page;
            # otherwise it would be range(0, 30) to cover all products on a page.
            for i in range(0, 2):
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    title_data = browser.find_elements(
                        By.CLASS_NAME, 'css-1gc4x7i').text
                    print(title_data)
                    for t in title_data:
                        title_list.append(t)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1

title_list = [i.split('.css', 1)[0] for i in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")

data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
            "Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think, due to the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the .csv file is created, but it's blank. Maybe I am wrong. Please help me with this. Do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets the timeout for how long the driver will keep polling the DOM to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, in case you need to put a pause in your code, you should use time.sleep, although this is not recommended.
Also, it is much preferable to use Expected Conditions explicit waits rather than browser.implicitly_wait, since browser.implicitly_wait only waits for element presence, i.e. it continues as soon as the element appears in the DOM, while the element may not be completely rendered yet.
In order to wait for the element to be completely rendered and contain its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
Where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.

I am very new to scraping, so please bear with me; this is my first project. I am trying to scrape a site using Selenium.

"problem lines"
for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
Website I'm scraping: https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
(label image omitted)
I was not able to print the radio button's label for the checked button. I don't know what the mistake is or where I made it; could anyone help with this? It would be helpful for me to learn. The "change tariff" links are handled in the code given below:
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

class telekommobiles:
    def __init__(self):
        self.url = "https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country = 'DE'
        self.currency = 'GBP'
        self.VAT = 'Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice = 'N/A'
        self.color = 'N/A'

    def telekom(self):
        #try:
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
        print("cookies accepted")
        links_prod_check = []
        prod_models = []
        prod_manufacturer = []
        prod_memorys = []
        product_colors = []
        product_price_monthly_payments = []
        product_price_one_time_payments = []
        product_links = []
        containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
        i = 1
        for container in containers:
            p_links = container.find_element_by_tag_name('a').get_attribute('href')
            i = i + 1
            product_links.append(p_links)
            #print(p_links)
        for links in product_links:
            driver.get(links)
            #time.sleep(5)
            #print(driver.current_url)
            #links_prod_check.append(driver.current_url)
            coloroptions = WebDriverWait(driver, 30).until(
                EC.presence_of_all_elements_located((By.XPATH, "//li[@data-qa='list_ColorVariant']")))
            #print(coloroptions)
            for i in range(len(coloroptions)):
                coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                coloroption[i].click()
                #print(coloroption[i])
                time.sleep(3)
                memoryoptions = WebDriverWait(driver, 30).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                for i in range(len(memoryoptions)):
                    memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                    try:
                        memoryoption[i].click()
                    except:
                        pass
                    time.sleep(5)
                    change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                    time.sleep(3)
                    # looping for each section
                    section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                    #print(len(section_loops))
                    for section_loop in section_loops:
                        #print(section_loop)
                        time.sleep(5)
                        # Headings
                        heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
                        print(heading_1)
                        # looping for each separate box
                        each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
                        #print(len(each_box_subcontainers))
                        for subcontainer in each_box_subcontainers:
                            #print(subcontainer)
                            looping_for_tariff = WebDriverWait(driver, 10).until(
                                EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                            #print(looping_for_tariff)
                            for i in range(len(looping_for_tariff)):
                                #print(i)
                                try:
                                    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                    for_tariff_loop[i].click()
                                    time.sleep(3)
                                except:
                                    pass
                                for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
                                print(radio_label_list)
                                time.sleep(1)
                            change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()

telekom_de = telekommobiles()
telekom_de.telekom()
You are trying to find an element within an element. By finding radio_label_list relative to for_tariff_loop[i], the effective XPath for radio_label_list becomes:
//span[@class='phx-radio__element']//span[@class='phx-radio__label']
which does not exist in the DOM.
I tried the last part of the code and was able to print the memory size as below. Do try and confirm:
I replaced the CSS selector for radio_label_list with the XPath ./following-sibling::span.
looping_for_tariff = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
    # print(i)
    try:
        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
        for_tariff_loop[i].click()
        time.sleep(3)
    except:
        pass
    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
    print(radio_label_list)
    time.sleep(1)
As per the comments, check this code:
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH,"//ul[contains(#class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
print("----------------------------------------------------------------------------------------------------------")
options = driver.find_elements_by_class_name("phx-tariff-box__section")
datas = options[i].find_element_by_xpath(".//div[contains(#class,'phx-tariff-box__volume')]").get_attribute("innerText")
print("data: {}".format(datas))
len_types = len(options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label"))
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
if len(types) == 0:
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(price)
else:
for j in range(len_types):
types[j].click()
time.sleep(2)
options = driver.find_elements_by_class_name("phx-tariff-box__section")
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
try:
types[j].find_element_by_xpath("./input[#checked]")
type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(f"{type}: {price}")
except:
pass
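Note that options and types are re-located after every click: clicking a radio button re-renders the tariff box, and holding on to the old references would raise stale element errors.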

Getting this error when trying to iterate using Selenium: "stale element reference: element is not attached to the page document"

I wrote a Python script. First, it visits this website, then clicks on the arrow on the right side and goes to a new web page to collect some data. Finally, it goes back to the previous page and does the same thing with the next item.
Web page: https://register.fca.org.uk/s/search?q=capital&type=Companies
This is the code.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
import time

url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
service = Service('link to chrome driver')
service.start()
driver = webdriver.Remote(service.service_url)
driver.get(url)
time.sleep(12)
divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
for d in divs:
    RN = ''
    companyName = ''
    companyName = d.find_element_by_tag_name('h2').text
    RNData = d.find_element_by_xpath('.//div[@class="result-card_figure-offset"]').text
    RN = RNData.split(':')[1].strip()
    d.click()
    time.sleep(12)
    phoneNumber = ''
    phoneNumberData = driver.find_elements_by_xpath('//*[@id="who-is-this-details-content"]/div[1]/div[2]/div[2]/div/div/div[2]')
    phoneNumber = phoneNumberData[0].text.split('\n')[1]
    print(RN)
    print(companyName)
    print(phoneNumber)
    driver.execute_script("history.back();")
It gives me this error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
How can I solve this problem?
Here's a quick and dirty way to avoid that error; change your code like this:
url = 'https://register.fca.org.uk/s/search?q=capital&type=Companies'
driver.get(url)
time.sleep(12)
divs = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
for i in range(len(divs)):
    time.sleep(4)
    d = driver.find_elements_by_xpath('//div[@class="result-card_main"]')
    RN = ''
    companyName = ''
    companyName = d[i].find_element_by_tag_name('h2').text
    RNData = d[i].find_element_by_xpath('.//div[@class="result-card_figure-offset"]').text
    RN = RNData.split(':')[1].strip()
    d[i].click()
    time.sleep(12)
    phoneNumber = ''
    phoneNumberData = driver.find_elements_by_xpath('//*[@id="who-is-this-details-content"]/div[1]/div[2]/div[2]/div/div/div[2]')
    phoneNumber = phoneNumberData[0].text.split('\n')[1]
    print(RN)
    print(companyName)
    print(phoneNumber)
    driver.execute_script("window.history.go(-1)")

How to stop a Selenium scraper from redirecting to another internal weblink of the scraped website?

I was wondering if anyone knows of a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
and keeps clicking on the "show more" button until there is none left (at the end of the page). By then it should have collected the links of all the products listed on the page it scrolled through, and it then visits each one respectively.
What happens instead: it successfully clicks "show more" until the end of the page, but then it visits a weird promotion page of the same website instead of following each of the gathered links and scraping further data points from each of those newly opened pages.
In a nutshell, I would incredibly appreciate it if someone could explain how to avoid this automated redirection! And this is the code, in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)

links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []

wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW@2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)

while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')

try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass

try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass

try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is:{}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass

df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirects happen for two reasons: in your case, either some JavaScript code is executed when clicking the load more button for the last time, or an HTTP 3xx code is received, which is the least likely in your case.
So you need to identify when this JavaScript code is executed, send an ESC key before it loads, and then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button. Each time it is clicked, add an if statement to verify the link of the page you are on: if it is that of the promotion page, execute the rest of your code; otherwise, click load more.
page_is_same = True
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    current_link = verify_current_page_link()
    if current_link != link_of_scraped_page:
        page_is_same = False
# rest of the code here
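A rough Python version of that idea, using the driver calls from your script (the link XPath and the listing-URL check are assumptions to adapt to the actual page):

listing_url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
while True:
    # Collect whatever company links are currently visible before clicking again
    for link in driver.find_elements_by_xpath("//div[@class='title']/a"):
        href = link.get_attribute('href')
        if href not in links_list:
            links_list.append(href)
    try:
        driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']").click()
        time.sleep(2)
    except exception.NoSuchElementException:
        break
    # If the click navigated away from the listing, go back and stop loading more
    if driver.current_url != listing_url:
        driver.get(listing_url)
        break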

How to make a Scrapy crawler work properly when dealing with multilevel webpage crawling

I am learning crawling skills, and I want to do the following:
log in to a specific webpage (done)
go to a page that contains the links that I need
for each link on that page, crawl its content.
The problem is that I have tested my code for a single link and it worked, but when I tried the multilevel job, it failed in a way I could not understand: it can only crawl some part of each link. I am wondering if there is some logical mistake in my code; please help. Below is the code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        #options = webdriver.ChromeOptions()
        #options.add_argument('headless')
        #options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  #chrome_options=options)
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for hand input verify code
        time.sleep(20)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        # try first page first
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit'):
            #new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_sub)

    # checked
    def pageScroll(self, url):
        self.log('I am scrolling ' + url)
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_sub(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            replys = []
            for i in sel.find_elements_by_xpath('.//div[@class="lzl_cnt"]'):
                user1 = i.find_element_by_xpath('.//a[@username]')
                user1 = self.driver.execute_script("return arguments[0].firstChild.textContent", user1)
                try:
                    user2 = i.find_element_by_xpath('.//span[@class="lzl_content_main"]/a[@username]')
                    user2 = self.driver.execute_script("return arguments[0].firstChild.textContent", user2)
                except:
                    user2 = name
                span = i.find_element_by_xpath('.//span[@class="lzl_content_main"]')
                reply = self.driver.execute_script('return arguments[0].lastChild.textContent;', span)
                replys.append((user1, user2, reply))
            yield {"topic": response.css(".core_title_txt::text").extract(), "name": name, "content": content, "replys": replys}

        #follow to next page
        #next_sel = self.driver.find_element_by_css_selector('#thread_theme_7 a:nth-child(3)')
        #next_url_name = next_sel.text
        #if next_sel and next_url_name == '下一页':
        #    next_url = next_sel.get_attribute('href')
        #    yield scrapy.Request(url=next_url, callback=self.parse_sub)
It seems you are using a hardcoded class for the link selector instead of a generic one, and are hence getting back just one link in
for url in self.driver.find_elements_by_css_selector('a.j_th_tit')
This class name - j_th_tit - seems to be dynamically generated and might not be the same for all anchor (a) tags.
You could try
for url in self.driver.find_elements_by_css_selector('a')
to get all the links on the page.
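If grabbing every anchor is too broad, you could filter by what the thread links look like; a small sketch (the '/p/' pattern is just an assumption about the thread URL format, adjust it to what you actually see):

for url in self.driver.find_elements_by_css_selector('a'):
    href = url.get_attribute('href')
    # Keep only anchors that point at thread pages
    if href and '/p/' in href:
        yield scrapy.Request(url=href, callback=self.parse_sub)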
