How can I scrape information from a web page? - python

I am new to programming and need some help with my web crawler.
At the moment, my code opens every web page in the list, but I want to extract information from each one as it loads. This is what I have:
from selenium import webdriver
import csv

driver = webdriver.Firefox()
links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []
for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
for i in first_two_links:
    driver.get(i)
This loops through the first two pages but scrapes no info, so I tried extending the for loop as follows:
odds = []
for i in first_two_links:
    driver.get(i)
    driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)
However, this runs into an error.
Any help is much appreciated.

You are not actually appending anything! You need to assign the result of
driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
to a variable, then append that to the list:
from selenium import webdriver
import csv

driver = webdriver.Firefox()
links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []
for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)

odds = []
for i in first_two_links:
    driver.get(i)
    o = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(o)
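If the element still cannot be found because the page has not finished loading, an explicit wait is safer than locating it immediately. A minimal sketch, assuming the same class name as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

odds = []
for link in first_two_links:
    driver.get(link)
    # Wait up to 10 s for the odds span to appear instead of failing at once.
    o = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, '//span[@class="table-main__detail-odds--hasarchive"]')))
    odds.append(o.text)  # store the text rather than the live element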

First, after you start the driver you need to go to a website...
Second, in the second for loop you are appending the wrong object... either append the element you just found, or assign odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]') first.
If you can provide the URL or the HTML we can help more!
Try this (I have used Google as an example; you will need to change the code...):
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.google.com")
links_code = driver.find_elements_by_xpath('//a')
first_two = links_code[0:2]
first_two_links = []
for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
    print(link)

odds = []
for i in first_two_links:
    driver.get(i)
    odd = driver.page_source
    print(odd)
    # odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)

Related

How to scrape multiple pages from search results all at once

I am trying to scrape multiple pages of search results and print them all at once, but I get an empty list instead.
Here is the code I used:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

element_list = []
for skip in range(0, 20, 10):
    page_url = "https://jdih.esdm.go.id/index.php/web/result?tahun_terbit=2022,2021,2020,2019,2018,2017,2016,2015,2014&skip=" + str(skip)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(page_url)
    Tahun = driver.find_elements(By.CSS_SELECTOR, 'div.numb separator')
    No_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'span.result-value')
    Nama_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__title')
    Deskripsi = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__desc')
    for i in range(len(Tahun)):
        element_list.append([Tahun[i].text, No_Peraturan[i].text, Nama_Peraturan[i].text, Deskripsi[i].text])

print(element_list)
driver.close()
The code returns only an empty list.
Note: the website does not use 'page' in its pagination parameters as search results generally do; it uses 'skip' instead.
Can anyone help me with this?
The CSS selector used to find the Tahun elements is incorrect, as there are two classes assigned to that div. This results in Tahun being an empty list, and since the loop that appends text to element_list is driven by the length of Tahun, nothing gets appended.
Update the selector to the one below.
Tahun = driver.find_elements(By.CSS_SELECTOR, 'div.numb.separator')
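As a side note, pairing the four lists with zip would keep the append loop safe even if the element counts ever diverge. A small sketch of the same loop, assuming the corrected selector above:

# zip stops at the shortest list, so one missing field cannot raise an IndexError.
for tahun, no, nama, desk in zip(Tahun, No_Peraturan, Nama_Peraturan, Deskripsi):
    element_list.append([tahun.text, no.text, nama.text, desk.text])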

with open nested loop in selenium isn't working

It's my first time posting here, so please let me know if I've messed anything up. I'm having some trouble with a nested loop in Selenium. I'm trying to iterate through a list of players, gather stats for each one, and add them to a dataframe. Right now each player in the list gets entered into the search bar and their page is displayed, but stats are only collected for the last player in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)
dataframe1 = []
with open('A.txt') as f:
    players = f.readlines()
for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
data = {
    'Player': player,
    'Completions': comps,
}
dataframe1.append(data)
df = pd.DataFrame(dataframe1)
print(df)
driver.close()
You have the wrong indentation for the lines that initialize the data dict and append to dataframe1: they must be at the same level as the block under the innermost for loop.
I modified your code, here:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)
dataframe1 = []
with open('A.txt') as f:
    players = f.readlines()
for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
        data = {
            'Player': player,
            'Completions': comps,
        }
        dataframe1.append(data)
df = pd.DataFrame(dataframe1)
print(df)
driver.close()
Thank you everyone for your assistance with this issue. I eventually found out that I did not need the button variable or button.click() in the script. readlines() keeps each player's trailing newline, so send_keys was already hitting return after the string was passed to the search box; return was effectively being pressed twice, once on the player's name and once on an empty search. The default page returned for the empty search did not contain the element I was attempting to find, which resulted in an empty list. Again, thank you for your help with this issue.
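For reference, a minimal sketch of what the fixed loop could look like, assuming the same page structure as the question: stripping the newline and sending Keys.RETURN makes the single submit explicit.

from selenium.webdriver.common.keys import Keys

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player.strip())  # strip the trailing newline from readlines()
    search.send_keys(Keys.RETURN)     # one explicit submit, no button.click()
    for stat in driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr'):
        comps = stat.find_element(By.XPATH, './td[6]').text  # relative to the row
        dataframe1.append({'Player': player.strip(), 'Completions': comps})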

How to run 'implicitly_wait()' in a 'for loop' with respect to web scraping using Python?

Actually, I want to scrape the 'title' and 'product description' for all the products on all the pages, and then save them into a '.csv' file.
URL:- https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []

myDict = {'body-art': 3024}

browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)

for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            # Since it's a huge amount of data, I have taken 2 products per page;
            # otherwise it would be range(0, 30) to cover all products on a page.
            for i in range(0, 2):
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    title_data = browser.find_elements(
                        By.CLASS_NAME, 'css-1gc4x7i').text
                    print(title_data)
                    for t in title_data:
                        title_list.append(t)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(
                            By.XPATH, '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1

title_list = [i.split('.css', 1)[0] for i in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")

data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
            "Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, due to the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it's a blank file. Maybe I am wrong. Please help me with this: do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets a timeout: how long the driver will keep polling the DOM to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, in case you need to put a pause in your code you could use time.sleep, though this is not recommended.
Also, it is much preferable to use expected_conditions explicit waits rather than browser.implicitly_wait, since the implicit wait only waits for element presence, i.e. it releases the run as soon as the element appears in the DOM, which may be before it is completely rendered.
In order to wait for the element to be completely rendered and to contain its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.

Selenium skips elements of the same class during iteration

I am trying to iterate over a list of links on a website, but Selenium is not able to locate particular, seemingly random ones. In particular, I am trying to click on each of the cities and extract the number of stores using a for loop, but it always skips, say, "Alameda" among some other cities, even though I see nothing different about the HTML code.
driver = webdriver.Chrome(path)
driver.set_window_size(1120, 1000)
driver.get("https://locations.traderjoes.com/ca/")
cities = driver.find_elements_by_class_name('itemlist')
for i in range(0, len(cities)):
    print(city_list[i])
    if cities[i].is_displayed():
        cities[i].click()
        num = len(driver.find_elements_by_class_name('address-left'))
        num_stores_by_city.append(num)
        driver.find_element_by_xpath('//*[@id="content"]/a[2]').click()
    else:
        time.sleep(3)
        cities[i].click()
        num = len(driver.find_elements_by_class_name('address-left'))
        num_stores_by_city.append(num)
        driver.find_element_by_xpath('//*[@id="content"]/a[2]').click()
This will determine the cities and then loop through each one, gathering the number of stores and adding the information to a dictionary-type object:
driver = webdriver.Chrome(path)
url = 'https://locations.traderjoes.com/ca/'
driver.get(url)
city_list = {}
city_index = 0
processing_cities = True
while processing_cities:
    cities = driver.find_elements_by_css_selector('.itemlist a')
    if city_index < len(cities):
        city_text = cities[city_index].text
        cities[city_index].click()
        store_locations = driver.find_elements_by_css_selector('.itemlist')
        city_list[city_text] = len(store_locations)
        driver.get(url)
        city_index += 1
    else:
        processing_cities = False
print(city_list)
One of the issues you were running into is that once you click on an element, your previously found elements become stale. You need to re-find previously found elements to interact with them again.
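As an aside, the staleness can also be handled explicitly by catching the exception and re-finding on demand; a small sketch, assuming the same '.itemlist a' selector as above:

from selenium.common.exceptions import StaleElementReferenceException

try:
    cities[city_index].click()
except StaleElementReferenceException:
    # The old reference died with the previous page load; re-find and retry.
    cities = driver.find_elements_by_css_selector('.itemlist a')
    cities[city_index].click()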

StaleElementReferenceException selenium webdriver python

I'm writing a crawler using Selenium, Python and PhantomJS to use Google's reverse image search. So far I've successfully been able to upload an image and crawl the search results on the first page. However, when I try to click on the search results navigation, I get a StaleElementReferenceException. I have read about it in many posts but still could not implement the solution. Here is the code that breaks:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
count = 0
for elem in ele5:
    if count <= 2:
        print str(elem.get_attribute("href"))
        elem.click()
        browser.implicitly_wait(20)
        ele6 = browser.find_elements_by_class_name("rc")
        for result in ele6:
            f = result.find_elements_by_class_name("r")
            for line in f:
                link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
                links.append(link)
                parsed_uri = urlparse(link)
                domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
        count += 1
The code breaks at print str(elem.get_attribute("href")). How can I solve this?
Thanks in advance.
Clicking a link causes the browser to go to another page, making references to the elements in the old page (ele5, elem) invalid.
Modify the code so that it does not reference invalid elements.
For example, you can collect the URLs before you visit the other pages:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
urls = [elem.get_attribute('href') for elem in ele5]  # <-----
browser.implicitly_wait(20)
for url in urls[:2]:  # <------
    print url
    browser.get(url)  # <------ used `browser.get` instead of `click`;
                      # using `element.click` will cause the error.
    ele6 = browser.find_elements_by_class_name("rc")
    for result in ele6:
        f = result.find_elements_by_class_name("r")
        for line in f:
            link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
            links.append(link)
            parsed_uri = urlparse(link)
            domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
