How can I implement page turning (pagination) in a parser? - python

I need to write a loop so that the parser collects data from all pages, but my version does not work. How could I implement it differently?
import time
import pandas as pd
from selenium.webdriver import Chrome
from datetime import datetime
# Scrape the product tiles of one Rozetka search-results page into a dated CSV.
# Only the page loaded by driver.get() below is read; no pagination is done here.

# Path to the local chromedriver binary, passed positionally to Chrome().
webdriver = r"C:\Users\К.Бояр (Второй)\source\repos\RozetaParcer\chromedriver.exe"
driver = Chrome(webdriver)
driver.implicitly_wait(10)

total = []
cur_date = datetime.now().strftime("%d_%m_%Y")
try:
    driver.get("https://rozetka.com.ua/search/?producer=gazer&seller=rozetka&text=Gazer")
    items = driver.find_elements_by_css_selector(".goods-tile.ng-star-inserted")
    for item in items:
        t_name = item.find_element_by_css_selector('.goods-tile__title').text
        t_price = item.find_element_by_css_selector('.goods-tile__price-value').text
        t_nal = item.find_element_by_css_selector('.goods-tile__availability').text
        # One row per tile: (date, name, price, availability).
        total.append((cur_date, t_name, t_price, t_nal))
finally:
    # Always end the browser session, even when a lookup above raises.
    driver.quit()

df = pd.DataFrame(total, columns=['Date','Name', 'Price', 'Nal'])
df.to_csv(f'Rozetka_parcer_{cur_date}.csv')

You have to locate the button with the class .pagination__direction_type_forward inside a while loop and keep going until the button becomes disabled and gray (which means you are on the last page). Inside that while loop, collect the items before you click the next-page button.
There are many ways to approach this, but in my opinion this is the easiest one. (The solution differs for every website, because sites use different technologies and have different HTML.)

Related

with open nested loop in selenium isn't working

it's my first time posting here, so please let me know if I've messed anything up. I'm having some trouble with a nested loop in selenium. I'm trying to iterate through a list of players, gather stats for each one, and add them to a dataframe. Right now each player in the list gets entered into the search bar and their page is displayed, but stats are only collected for the last player in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
# Question code: search each player from A.txt and collect their completions.
url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)
dataframe1 = []
with open('A.txt') as f:
    players = f.readlines()
    for player in players:
        search = driver.find_element(By.NAME, "search")
        search.send_keys(player)
        button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
        button.click()
        stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
        for stat in stats:
            comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
    # NOTE(review): the original indentation was lost; this placement is
    # reconstructed from the reported symptom (only the last player is
    # recorded) -- presumably these statements sat outside the loops.
    data = {
        'Player': player,
        'Completions': comps,
    }
    dataframe1.append(data)
df = pd.DataFrame(dataframe1)
print(df)
driver.close()
You have the wrong indentation on the lines where you initialize the data dict and append it to dataframe1. They must be at the same level as the block under the innermost for loop.
I modified your code, here:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
# Search each player listed in A.txt and record the completions value from
# the footer row(s) of the "passing" table, one output row per footer row.
url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
dataframe1 = []
try:
    driver.get(url)
    with open('A.txt') as f:
        # readlines() keeps the trailing "\n"; typed into the search box it
        # acts as Enter and submits the form a second time. Strip it and
        # skip blank lines.
        players = [line.strip() for line in f if line.strip()]
    for player in players:
        search = driver.find_element(By.NAME, "search")
        search.send_keys(player)
        button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
        button.click()
        stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
        for stat in stats:
            # Relative XPath: read the 6th cell of *this* row.
            comps = stat.find_element(By.XPATH, './td[6]').text
            # Built and appended inside the inner loop so every player's
            # stats are kept, not just the last one's.
            data = {
                'Player': player,
                'Completions': comps,
            }
            dataframe1.append(data)
finally:
    # End the browser session even if a lookup fails part-way through.
    driver.quit()
df = pd.DataFrame(dataframe1)
print(df)
Thank you everyone for your assistance with this issue. I eventually found out that I did not need the 'button' variable or button.click() in the script. Send keys was already hitting "return" after the string was passed to the search parameter on the page, so basically return was getting hit twice, once on the players name, and once on an empty search parameter. The default page that was returned when searching for the empty parameter did not contain the element I was attempting to find, which resulted in an empty list. Again thank you for your help with this issue.

How to run 'implicitly_wait()' in a 'for loop' with respect to Web Scraping using Python?

Actually, I want to scrape the 'title' and 'product description' for all the products and from all the pages, and then save it into the '.csv' file.
URL:- https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what, I have tried.
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
# Walk the Nykaa category listing page by page (via requests + BeautifulSoup),
# open the first products of each page in the browser, and collect the title
# and the first five description paragraphs of every product into a CSV.
title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []
# Category slug -> category id used in the listing URL.
myDict = {'body-art': 3024}
browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
# Session-wide lookup timeout; set once, it applies to every find_element call.
browser.implicitly_wait(20)
for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
        except Exception as exc:
            # Report the failure instead of swallowing it silently.
            print(f"Failed to load listing page {page_num}: {exc}")
            break
        # print(urls)
        if not urls:
            # A page without product links ends the listing.
            break
        # Only the first 2 products per page are visited to keep the run
        # short; widen the slice (about 30 per page, per the original note)
        # to cover every product on a page.
        for url in urls[:2]:
            try:
                browser.get("https://www.nykaa.com" + url)
                # find_element (singular) returns one element with .text;
                # find_elements returns a list, which has no .text.
                title = browser.find_element(By.CLASS_NAME, 'css-1gc4x7i').text
                print(title)
                browser.execute_script("document.body.style.zoom='50%'")
                browser.execute_script("document.body.style.zoom='100%'")
                # Creates "load more" button object.
                loadMore = browser.find_element(
                    By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                loadMore.click()
                details = browser.find_element(By.ID, 'content-details')
                # Paragraphs 1-5: description, expiry, country, importer, address.
                fields = [details.find_element(By.XPATH, f'./p[{n}]').text
                          for n in range(1, 6)]
            except Exception as exc:
                # Show why the product was skipped; the original bare
                # `except` hid the error that left the CSV empty.
                print(f"Failed to scrape product {url}: {exc}")
                break
            # Append only after every field was read, so all columns keep
            # the same length and the DataFrame below can be built.
            title_list.append(title)
            para_list.append(fields[0])
            expiry_list.append(fields[1])
            country_list.append(fields[2])
            importer_list.append(fields[3])
            address_list.append(fields[4])
        page_num += 1
# Drop anything from the first '.css' onward in each title.
title_list = [title.split('.css', 1)[0] for title in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, sep="\n")
data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
            "Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The Output, I am receiving is as:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, because of the implicitly_wait() function, it is not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it is blank. Maybe I am wrong. Please help me with this. Do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets the timeout: how long the driver will keep polling the DOM to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, if you need to put a pause in your code, you should use time.sleep, although this is not recommended.
Also, it is much better to use Expected Conditions explicit waits rather than browser.implicitly_wait, since browser.implicitly_wait only waits for element presence, i.e. it lets the script continue as soon as the element has appeared, while it may not be completely rendered yet.
In order to wait for an element to be completely rendered and to contain its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
Where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.

Extract information from products on Website after pressing "more items" button with Selenium

I managed to extract the names, specs, prices, and priceUnits from the products on this page: https://www.bauhaus.info/baustoffe/c/10000819.
I do, however, only manage to get the first 36 products visible on the page. How would I extract all the products on this page that appear when pressing on the button for "more items"?
For this, see the inspection of the page here:
see inspect here
Any help is very much appreciated!
This is my code:
from selenium import webdriver
import pandas as pd
import re
# Read name, specs, price and price unit from every product tile currently
# present on the Bauhaus category page and put them in a DataFrame.
browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
browser.get('https://www.bauhaus.info/baustoffe/c/10000819')
names= []
specs = []
prices = []
priceUnit = []
for li in browser.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li"):
    names.append(li.find_element_by_class_name("product-list-tile__info__name").text)
    specs.append(li.find_element_by_class_name("product-list-tile__info__attributes").text)
    # First line of the price box, with the euro sign appended.
    prices.append(li.find_element_by_class_name("price-tag__box").text.split('\n')[0] + "€")
    p = li.find_element_by_class_name("price-tag__sales-unit").text.split('\n')[0]
    # Keep only the text between the first "(" and the first ")".
    priceUnit.append(p[p.find("(")+1:p.find(")")])
df2 = pd.DataFrame()
df2['names'] = names
df2['specs'] = specs
df2['prices'] = prices
df2['priceUnit'] = priceUnit
I was able to click the "More" option repeatedly with the code below. Try to incorporate this into your code.
# Imports Required for Explicit Waits:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Repeatedly click the "show more products" button and print how many
# product tiles are on the page after each click.
import time

from selenium.common.exceptions import TimeoutException, WebDriverException

PRODUCT_TILES_XPATH = "//ul[@class='product-list-tiles row list-unstyled']/li"
# Upper bound on clicks. Set to None to keep clicking until the button is
# no longer clickable.
MAX_CLICKS = 5

driver.get("https://www.bauhaus.info/baustoffe/c/10000819")
wait = WebDriverWait(driver,30)
options = driver.find_elements_by_xpath(PRODUCT_TILES_XPATH)
print(len(options))
count = 0
try:
    while MAX_CLICKS is None or count < MAX_CLICKS:
        moreoption = wait.until(EC.element_to_be_clickable((By.XPATH,"//button[@data-message='adb-show-more-products-button']")))
        driver.execute_script("arguments[0].scrollIntoView(true);",moreoption)
        # Scroll back up by 300px after bringing the button into view
        # (presumably so nothing at the top of the viewport covers it).
        driver.execute_script("window.scrollBy(0,-300);")
        time.sleep(2)
        moreoption.click()
        count += 1
        # Give the newly requested tiles time to be added before counting.
        time.sleep(2)
        options = driver.find_elements_by_xpath(PRODUCT_TILES_XPATH)
        print(len(options))
except TimeoutException:
    # The button did not become clickable within 30s: treated as
    # "all products are listed".
    pass
except WebDriverException as exc:
    # Any other browser-side failure stops the clicking but is reported.
    print(f"Stopped clicking 'more' after {count} clicks: {exc}")
First, click the "More Products" button until it gets disabled, i.e. until all products are listed, and then use the common XPath to locate the product info.
For each page, scroll to the "more items" element and click it; see the example of a scroll-to-element implementation below.
from selenium.webdriver.common.action_chains import ActionChains
# Move the pointer onto the element whose id is "more_items".
more_items = driver.find_element_by_id("more_items")
ActionChains(driver).move_to_element(more_items).perform()

How can i scrape information from web page?

I am new to programming and need some help with my web-crawler.
At the moment, I have my code opening up every web-page in the list. However, I wish to extract information from each one it loads. This is what I have.
from selenium import webdriver
import csv
# Collect the href of the first two "in-match" links and open each in turn.
driver = webdriver.Firefox()
# NOTE(review): no page is loaded before this lookup in the visible snippet.
links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []
for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
for i in first_two_links:
    driver.get(i)
This loops through the first two pages but scrapes no info. So I tried adding to the for-loop as follows
# Question code: visit each link and try to collect the odds element.
odds = []
for i in first_two_links:
    driver.get(i)
    driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    # NOTE(review): `odd` is never assigned in the visible snippet; the
    # result of the lookup above is discarded.
    odds.append(odd)
However. This runs into an error.
Any help much appreciated.
You are not actually appending anything! You need to assign the result of
driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
to a variable,
then append it to the list!
from selenium import webdriver;
import csv;
# Open the first two "in-match" links and collect the text of the odds
# element found on each page.
driver = webdriver.Firefox()
links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
# Read the hrefs now: the link elements stop being valid once the browser
# navigates to another page.
first_two_links = [anchor.get_attribute("href") for anchor in first_two]
odds = []
for link in first_two_links:
    driver.get(link)
    o = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    # Store the text, not the element: the element reference goes stale on
    # the next driver.get().
    odds.append(o.text)
First, after you start the driver you need to go to a website...
Second, in the second for loop, you are trying to append the wrong object... use i not odd or make odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
If you can provide the URL or the HTML we can help more!
Try this (I have used Google as an example you will need to change the code...):
from selenium import webdriver
# Example: load a page, take the first two links that have an href, open
# each one and keep its page source.
driver = webdriver.Firefox()
driver.get("https://www.google.com")
links_code = driver.find_elements_by_xpath('//a')
first_two_links = []
for anchor in links_code:
    link = anchor.get_attribute("href")
    if not link:
        # Anchors without an href yield None, which driver.get() cannot open.
        continue
    first_two_links.append(link)
    print(link)
    if len(first_two_links) == 2:
        break
odds = []
for i in first_two_links:
    driver.get(i)
    odd = driver.page_source
    print(odd)
    # For the real site, replace page_source with the element lookup, e.g.:
    # odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]').text
    odds.append(odd)

Extract data from site and different pages and save to csv

I am working with selenium webdriver to search and extract data from this site https://www.idealista.com/venta-viviendas/marbella-malaga/
I want to get a table with the price of each of the houses (class item_price), the number of rooms (class item_detail) and the sq. meters (class item_detail).
I believe I have to use the driver.find_elements() method, but I don't know where to add it and how to make sure we add all prices, rooms and sq. meters in a table with three columns.
I got this code so far but it doesn't work. I see FireFox going though pages but it seems it doesn't keep and store the data in houses.csv. Can anyone help? Thanks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import csv
import time
# Page through the Idealista listing, collecting price / rooms / size text
# for each house, then write everything to houses.csv as a 3-column table.
driver = webdriver.Firefox()
houses = []
try:
    driver.get("https://www.idealista.com/venta-viviendas/marbella/las-chapas-el-rosario/")
    pages_remaining = True
    while pages_remaining:
        price = driver.find_elements_by_class_name("item-price")
        rooms = driver.find_elements_by_xpath("//*[contains(text(), 'hab.')]")
        size = driver.find_elements_by_xpath("//*[contains(text(), 'm²')]")
        # Read the text now (elements go stale after the next page loads)
        # and accumulate, so earlier pages are not overwritten.
        # NOTE(review): zip() pairs the three lists by position and assumes
        # every listing shows all three values -- verify against the page.
        houses.extend((p.text, r.text, s.text) for p, r, s in zip(price, rooms, size))
        try:
            # Checks if there are more pages with links
            next_link = driver.find_element_by_class_name("icon-arrow-right-after")
            next_link.click()
            time.sleep(10)
        except NoSuchElementException:
            # No "next" arrow: last page reached. The original assigned
            # `rows_remaining`, so this loop never ended.
            pages_remaining = False
finally:
    driver.quit()
# csv.writer needs a text-mode file in Python 3; newline='' avoids blank
# lines between rows on Windows.
with open('houses.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['price', 'rooms', 'size'])
    csvwriter.writerows(houses)
print(houses)

Categories

Resources