It's my first time posting here, so please let me know if I've messed anything up. I'm having some trouble with a nested loop in Selenium. I'm trying to iterate through a list of players, gather stats for each one, and add them to a dataframe. Right now each player in the list gets entered into the search bar and their page is displayed, but stats are only collected for the last player in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)

dataframe1 = []

with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text

data = {
    'Player': player,
    'Completions': comps,
}
dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
You have the wrong indentation for the lines where you initialize the data dict and append to dataframe1. They must be at the same level as the block under the innermost for loop.
I modified your code; here it is:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)

dataframe1 = []

with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
        data = {
            'Player': player,
            'Completions': comps,
        }
        dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
Thank you everyone for your assistance with this issue. I eventually found out that I did not need the 'button' variable or button.click() in the script. send_keys was already hitting Return after the string was passed to the search field on the page, so Return was effectively being hit twice: once on the player's name, and once on an empty search. The default page returned for the empty search did not contain the element I was attempting to find, which resulted in an empty list. Again, thank you for your help with this issue.
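For reference, a minimal sketch of that loop with the button lines removed (assuming, as described above, that the trailing newline from readlines() submits the search; player.strip() is only there to drop that newline from the stored name):
for player in players:
    search = driver.find_element(By.NAME, "search")
    # readlines() keeps the trailing "\n", which acts as Enter and submits the search,
    # so no separate click on the search button is needed
    search.send_keys(player)
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
        dataframe1.append({'Player': player.strip(), 'Completions': comps})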
Actually, I want to scrape the 'title' and 'product description' for all the products from all the pages, and then save them into a '.csv' file.
URL: https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried.
import time
import csv
from random import randint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []

myDict = {'body-art': 3024}

browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)

for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            # Only the first 2 products per page while testing, since it's a huge
            # amount of data; use range(0, 30) to cover all products on a page.
            for i in range(0, 2):
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    # find_elements returns a list of WebElements, so take .text per element
                    title_data = browser.find_elements(By.CLASS_NAME, 'css-1gc4x7i')
                    print(title_data)
                    for t in title_data:
                        title_list.append(t.text)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1

title_list = [i.split('.css', 1)[0] for i in title_list]

print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, sep="\n")

data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
            "Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, due to the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it's blank. Maybe I am wrong. Please help me with this. Do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets the timeout for how long the driver will keep polling the DOM to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, in case you need to put a pause in your code you should use time.sleep, although this is not recommended.
Also, it is much preferable to use Expected Conditions (explicit waits) rather than browser.implicitly_wait, since browser.implicitly_wait only waits for element presence, i.e. it releases the run as soon as the element appears, even though it may not be completely rendered yet.
In order to wait for an element to be completely rendered and to contain its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
Where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.
I managed to extract the names, specs, prices, and priceUnits from the products on this page: https://www.bauhaus.info/baustoffe/c/10000819.
I do, however, only manage to get the first 36 products visible on the page. How would I extract all the products on this page that appear when pressing on the button for "more items"?
Any help is very much appreciated!
This is my code:
from selenium import webdriver
import pandas as pd
import re

browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
browser.get('https://www.bauhaus.info/baustoffe/c/10000819')

names = []
specs = []
prices = []
priceUnit = []

for li in browser.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li"):
    names.append(li.find_element_by_class_name("product-list-tile__info__name").text)
    specs.append(li.find_element_by_class_name("product-list-tile__info__attributes").text)
    prices.append(li.find_element_by_class_name("price-tag__box").text.split('\n')[0] + "€")
    p = li.find_element_by_class_name("price-tag__sales-unit").text.split('\n')[0]
    priceUnit.append(p[p.find("(")+1:p.find(")")])

df2 = pd.DataFrame()
df2['names'] = names
df2['specs'] = specs
df2['prices'] = prices
df2['priceUnit'] = priceUnit
I was able to click on the "More" option continuously with the code below. Try to incorporate this into your code.
# Imports required for explicit waits:
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get("https://www.bauhaus.info/baustoffe/c/10000819")
wait = WebDriverWait(driver, 30)

options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
print(len(options))

# The `count` variable keeps track of how many times the "More" option has been clicked.
# Remove the `count` part of the code to keep clicking on the "More" option continuously.
count = 0
try:
    while True:
        if count > 5:  # Click on the "More" option only 5 times
            break
        moreoption = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-message='adb-show-more-products-button']")))
        driver.execute_script("arguments[0].scrollIntoView(true);", moreoption)
        driver.execute_script("window.scrollBy(0,-300);")
        time.sleep(2)
        moreoption.click()
        count += 1
        time.sleep(2)
        options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
        print(len(options))
except:
    pass
First, try to click the "More Products" button until it gets disabled, i.e. until all products are listed, and then use the common XPath to locate the product info.
For each page, add a scroll to the "more items" element and click it; see below an example of a scroll-to-element implementation.
from selenium.webdriver.common.action_chains import ActionChains
element = driver.find_element_by_id("more_items")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
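A rough sketch of putting that together in a loop (note that "more_items" is just a placeholder id carried over from the snippet above, not the real locator on the Bauhaus page, so adapt it to the actual button):
from selenium.webdriver.common.action_chains import ActionChains
import time

while True:
    try:
        element = driver.find_element_by_id("more_items")  # placeholder locator
    except Exception:
        break  # button no longer in the DOM: all products should be listed
    if not element.is_enabled():
        break  # button disabled: nothing more to load
    ActionChains(driver).move_to_element(element).perform()
    element.click()
    time.sleep(2)  # crude wait for the next batch of products to render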
I'm trying to loop through a list, have the code click the search button, print the result, and repeat. I get this error:
Traceback (most recent call last):
File "qtest.py", line 17, in
list = [PR311, PR311, 5, 7, 9]
NameError: name 'PR311' is not defined
This is my code:
# Imports, of course
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

# Initialize a Firefox webdriver
driver = webdriver.Firefox()

# Grab the web page
driver.get("https://mnlairport.ph/terminal-finder")

# We use .find_element_by_xpath here because we know the path
text_input = driver.find_element_by_xpath("/html/body/div[1]/div/div/div/div[2]/div/div[2]/div/form/div/input")

list = [PR311, PR3345, PR323, PR355, PR3987]

# Using for loop
for i in list:
    # Then we'll fake typing into it
    text_input.send_keys(list)
    # Now we can grab the search button and click it
    search_button = driver.find_element_by_xpath("/html/body/div[1]/div/div/div/div[2]/div/div[2]/div/form/div/button")
    search_button.click()
    # We can feed that into Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    form = soup.find('div', class_='info-box')
    for post in form:
        print(post)
UPDATED CODE: The issue now is that it doesn't loop properly.
import csv

csv_file = open('test.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['post'])

list = ["PR311", "XC827", "KD271", "5J745", "SQ916"]

# Using for loop
for i in list:
    # We use .find_element_by_xpath here because we know the path
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    # Then we'll fake typing into it
    text_input.send_keys(i)
    # Now we can grab the search button and click it
    search_button = driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]")
    search_button.click()
    # We can feed that into Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    form = soup.find_all('div', attrs={'class': 'info-box'})
    for post in form:
        print(post)
        csv_writer.writerow([post.text])
    # Clear previous inputs
    text_input.clear()

csv_file.close()

# Close the webdriver
driver.close()
I closed the loop by clearing the search bar, but it skips some items in the list or doesn't return the right value.
Are the elements in your list strings? Then replace it with this; your code tries to find variables with those names:
list = ["PR311", "PR3345", "PR323", "PR355", "PR3987"]
Also, you will have to re-locate the input element at the start or end of the loop each time. And you will run into problems with that absolute XPath definition:
for i in list:
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    # Clear previous inputs
    text_input.clear()
    text_input.send_keys(i)
    search_button = driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]")
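Putting both points together, a rough sketch of how the loop could look (the locators come from the updated code above; driver and BeautifulSoup are assumed to be set up as in the question, and the WebDriverWait on the result box is an extra assumption added here so the results have a chance to render before parsing):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# "flights" instead of "list" avoids shadowing the built-in name
flights = ["PR311", "XC827", "KD271", "5J745", "SQ916"]
wait = WebDriverWait(driver, 10)

for flight in flights:
    # Re-locate the input each iteration and clear any previous text
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    text_input.clear()
    text_input.send_keys(flight)
    driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]").click()
    # Assumption: wait for at least one result box before reading the page source
    # (a short sleep may still be needed if results from the previous search linger)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "info-box")))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for post in soup.find_all('div', attrs={'class': 'info-box'}):
        print(post.text)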
I am new to programming and need some help with my web-crawler.
At the moment, I have my code opening up every web-page in the list. However, I wish to extract information from each one it loads. This is what I have.
from selenium import webdriver
import csv

driver = webdriver.Firefox()

links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)

for i in first_two_links:
    driver.get(i)
This loops through the first two pages but scrapes no info. So I tried adding to the for-loop as follows
odds = []
for i in first_two_links:
    driver.get(i)
    driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)
However, this runs into an error.
Any help is much appreciated.
You are not actually appending anything! You need to assign a variable to
driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
and then append it to the list!
from selenium import webdriver
import csv

driver = webdriver.Firefox()

links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)

odds = []
for i in first_two_links:
    driver.get(i)
    o = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(o)
First, after you start the driver you need to go to a website...
Second, in the second for loop, you are trying to append the wrong object... use i, not odd, or make odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
If you can provide the URL or the HTML we can help more!
Try this (I have used Google as an example; you will need to change the code...):
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.google.com")

links_code = driver.find_elements_by_xpath('//a')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
    print(link)

odds = []
for i in first_two_links:
    driver.get(i)
    odd = driver.page_source
    print(odd)
    # driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)
I'm learning web scraping and working on Eat24 (Yelp's website). I'm able to scrape basic data from Yelp, but unable to do something pretty simple: append that data to a dataframe. Here is my code; I've annotated it so it should be simple to follow along.
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

# go to eat24, type in zip code 10007, choose pickup and click search
driver.get("https://new-york.eat24hours.com/restaurants/index.php")
search_area = driver.find_element_by_name("address_auto_complete")
search_area.send_keys("10007")
pickup_element = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[2]")
pickup_element.click()
search_button = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[3]/button")
search_button.click()

# scroll up and down on page to load more of 'infinity' list
for i in range(0, 3):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0,0);")
    time.sleep(1)

# find menu urls
menu_urls = [page.get_attribute('href') for page in
             driver.find_elements_by_xpath('//*[@title="View Menu"]')]

df = pd.DataFrame(columns=['name', 'menuitems'])

# collect menu items/prices/name from each URL
for url in menu_urls:
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name')
    menuitems = dict(zip(menu_items, menu_prices))
    df['name'] = name
    df['menuitems'] = menuitems

df.to_csv('test.csv', index=False)
The problem is at the end. It isn't adding menuitems + name into successive rows in the dataframe. I have tried using .loc and other functions but it got messy so I removed my attempts. Any help would be appreciated!!
Edit: The error I get is "ValueError: Length of values does not match length of index" when the for loop attempts to add a second set of menuitems/restaurant name to the dataframe
I figured out a simple solution, not sure why I didn't think of it before. I added a "row" count that goes up by 1 on each iteration, and used .loc to place data in the "row"th row
row = 0
for url in menu_urls:
    row += 1
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name').text
    menuitems = [dict(zip(menu_items, menu_prices))]
    df.loc[row, 'name'] = name
    df.loc[row, 'menuitems'] = menuitems

print(df)