How to scrape multiple pages from search results all at once - python

I am trying to scrape multiple pages from search results and print them all at once, but I get an empty list instead.
Here is the code I used:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

element_list = []

for skip in range(0, 20, 10):
    page_url = "https://jdih.esdm.go.id/index.php/web/result?tahun_terbit=2022,2021,2020,2019,2018,2017,2016,2015,2014&skip=" + str(skip)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(page_url)

    Tahun = driver.find_elements(By.CSS_SELECTOR, 'div.numb separator')
    No_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'span.result-value')
    Nama_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__title')
    Deskripsi = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__desc')

    for i in range(len(Tahun)):
        element_list.append([Tahun[i].text, No_Peraturan[i].text, Nama_Peraturan[i].text, Deskripsi[i].text])

print(element_list)
driver.close()
The code only returns an empty list.
Note: the website does not use a 'page' parameter the way search results generally do; it uses 'skip' instead.
Can anyone help me with this?

The CSS selector used to find the Tahun elements is incorrect: the div has two classes assigned to it, so 'div.numb separator' matches nothing. As a result Tahun is an empty list, and since the loop that appends text to element_list runs over the length of Tahun, nothing gets appended.
Update the selector to below.
Tahun = driver.find_elements(By.CSS_SELECTOR, 'div.numb.separator')
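For reference, a minimal sketch of the corrected loop, same logic as in the question with only the selector fixed (creating the driver once outside the loop is an optional tidy-up):
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

element_list = []
driver = webdriver.Chrome(ChromeDriverManager().install())  # reuse one driver for every page

for skip in range(0, 20, 10):
    page_url = "https://jdih.esdm.go.id/index.php/web/result?tahun_terbit=2022,2021,2020,2019,2018,2017,2016,2015,2014&skip=" + str(skip)
    driver.get(page_url)

    # 'div.numb.separator' selects the div that carries both the 'numb' and 'separator' classes
    Tahun = driver.find_elements(By.CSS_SELECTOR, 'div.numb.separator')
    No_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'span.result-value')
    Nama_Peraturan = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__title')
    Deskripsi = driver.find_elements(By.CSS_SELECTOR, 'div.result__content__item__desc')

    # zip keeps the four lists aligned row by row
    for tahun, no, nama, desk in zip(Tahun, No_Peraturan, Nama_Peraturan, Deskripsi):
        element_list.append([tahun.text, no.text, nama.text, desk.text])

print(element_list)
driver.quit()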


How to extract all the google reviews from google map

I need to scrape all the Google reviews. There are 90,564 reviews on my page, but the code I wrote scrapes only the top 9 reviews; the rest are not scraped.
The code is given below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# specify the url of the business page on Google
url = 'https://www.google.com/maps/place/ISKCON+temple+Bangalore/@13.0098328,77.5510964,15z/data=!4m7!3m6!1s0x0:0x7a7fb24a41a6b2b3!8m2!3d13.0098328!4d77.5510964!9m1!1b1'
# create an instance of the Chrome driver
driver = webdriver.Chrome()
# navigate to the specified url
driver.get(url)
# Wait for the reviews to load
wait = WebDriverWait(driver, 20) # increased the waiting time
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'wiI7pd')))
# extract the text of each review
reviews = [element.text for element in review_elements]
# print the reviews
print(reviews)
# close the browser
driver.quit()
What should I edit/modify in the code to extract all the reviews?
Here is working code for you, to run after launching the URL:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time

totalRev = "div div.fontBodySmall"
username = ".d4r55"
reviews = "wiI7pd"

wait = WebDriverWait(driver, 20)
totalRevCount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).get_attribute("textContent").split(' ')[0].replace(',', '').replace('.', '')
print("totalRevCount - ", totalRevCount)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).click()

mydict = {}
found = 0

while found < int(totalRevCount):
    review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, reviews)))
    reviewer_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, username)))
    found = len(mydict)
    for rev, name in zip(review_elements, reviewer_names):
        mydict[name.text] = rev.text
        if len(rev.text) == 0:
            found = int(totalRevCount) + 1
            break
    # scroll down a little so the next batch of reviews gets loaded
    for i in range(8):
        ActionChains(driver).key_down(Keys.ARROW_DOWN).perform()
    print("found - ", found)
    print(mydict)
    time.sleep(2)
Explanation -
Get the locators for the username and the review, since we are going to build key-value pairs, which makes it easy to keep the results free of duplicates.
You first need to get the total number of reviews/ratings present for the given location.
Get the username and review for the "visible" part of the page and store them in the dictionary.
Scroll down the page and wait a few seconds.
Get the usernames and reviews again and add them to the dictionary; only new ones will be added.
As soon as a review with no text (only a rating) is reached, the loop ends and you have your results.
NOTE - If you want all reviews irrespective of whether review text is present, you can remove the "if" block.
I think you'll need to scroll down first, and then get all the reviews.
scroll_value = 230
driver.execute_script( 'window.scrollBy( 0, '+str(scroll_value)+ ' )' ) # to scroll by value
# to get the current scroll value on the y axis
scroll_Y = driver.execute_script( 'return window.scrollY' )
That might be because the elements don't get loaded otherwise.
Since there are over 90,000 reviews, you might consider scrolling down a little, getting the reviews, and repeating.
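A rough sketch of that scroll-collect-repeat idea, reusing driver and By from the question (this assumes window scrolling triggers loading of more reviews; on Google Maps you may need to scroll the reviews panel itself):
import time

collected = {}        # keyed by review text so repeated passes don't add duplicates
scroll_value = 230

for _ in range(50):   # arbitrary cap on scroll rounds; tune or replace with a smarter stop condition
    driver.execute_script('window.scrollBy(0, ' + str(scroll_value) + ')')
    time.sleep(1)     # give the newly revealed reviews time to load
    for element in driver.find_elements(By.CLASS_NAME, 'wiI7pd'):
        if element.text:
            collected[element.text] = True

reviews = list(collected)
print(len(reviews))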
Resource: https://stackoverflow.com/a/74508235/20443541

how to use driver.get(url) from extracted href list from the page?

I want to go to https://www.bookmaker.com.au/sports/soccer and extract the soccer URLs, which the code does. I then want to go to each of those webpages through driver.get(url), extract the data from each of those URLs, and place it in pandas. I am stuck at running driver.get(url) for each of the extracted links. Any help appreciated.
CSS selector for the href links to pass to driver.get(url):
a[class *= 'matches-filter__region']
import time
import pandas as pd
import webdriver_manager.chrome
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
###########################################################################################################################################################
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_experimental_option("detach", True)
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(service=Service(webdriver_manager.chrome.ChromeDriverManager().install()), options=options)
driver.get('https://www.bookmaker.com.au/sports/soccer')
aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")))
################################################################################################################
for url in aa:
    aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
    driver.get(aa)
##############################################################################
#Full Code https://pastebin.com/W0VqaKVD
After using a VPN I could connect, and I found a few problems with the code:
You have to get aa before the for-loop.
find_elements() gives references to objects in the browser's memory, but when you use get() it removes these objects from memory to create the objects for the new page, so the results from find_elements() become useless. You have to use .get_attribute('href') first to get the URLs as strings.
You have to use a for-loop to run get() for every string in the list, and you have to run the rest of the code inside this loop. After the loop you can create the DataFrame.
This is the code without the scraping code inside the loop, but at least it visits all the URLs.
# --- before loop ---

all_a = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")

# get URLs as strings
all_urls = []
for item in all_a:
    all_urls.append(item.get_attribute('href'))

# shorter
#all_urls = [item.get_attribute('href') for item in all_a]

team1List = []
backOddsList = []
team2List = []
layOddsList = []

# --- loop ---

# visit all pages and get teams
for url in all_urls:
    print(url)
    driver.get(url)

    # here (inside loop) all code to get teams from active page

# --- after loop ---

df = pd.DataFrame({
    'Team1': team1List,
    'Back Odds': backOddsList,
    'Team2': team2List,
    'Lay Odds': layOddsList
})

df.to_excel('bookmaker.xlsx', engine='openpyxl', sheet_name='Sheet_name_1', index=False)
Updated code; I checked it and it is working, navigating to all 15 URLs in the same browser window:
diff_country_urls = []
for i in range(len(aa)):
    diff_country_urls.append(aa[i].get_attribute("href"))

for url in diff_country_urls:
    driver.get(url)

Screenshot all child elements of element

I need to take a screenshot of each of the divs and p's within a selected element individually; here's what I have so far:
import selenium
from selenium import webdriver

url = 'www.example.com'
driver = webdriver.Firefox()
driver.get(url)

i = 0
body = driver.find_element_by_id('body-text')
for element in body:
    i = i + 1
    image_title = "pic" + str(i) + ".jpg"
    print("saving" + image_title)
    item.screenshot(image_title)
What is the proper way to go through each element individually?
Thank you
for element in body.find_elements_by_xpath(".//p | .//div"):
    driver.execute_script("arguments[0].scrollIntoView();", element)
    #insert your code
    element.screenshot(image_title)
To get all divs and p elements regardless of nesting you can do the following.
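A minimal sketch of that, assuming the same body element and screenshot naming as in the question:
# collect every <div> and <p> anywhere under the selected element, however deeply nested
children = body.find_elements_by_xpath(".//div | .//p")

for i, element in enumerate(children, start=1):
    image_title = "pic" + str(i) + ".jpg"
    print("saving " + image_title)
    # scroll the element into view before taking its screenshot
    driver.execute_script("arguments[0].scrollIntoView();", element)
    element.screenshot(image_title)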

How can I scrape information from a web page?

I am new to programming and need some help with my web-crawler.
At the moment, I have my code opening up every web-page in the list. However, I wish to extract information from each one it loads. This is what I have.
from selenium import webdriver
import csv

driver = webdriver.Firefox()

links_code = driver.find_elements_by_xpath('//a[@class="in-match"]')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)

for i in first_two_links:
    driver.get(i)
This loops through the first two pages but scrapes no info. So I tried adding to the for-loop as follows
odds = []
for i in first_two_links:
    driver.get(i)
    driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)
However, this runs into an error.
Any help much appreciated.
You are not actually appending anything! You need to assign a variable to
driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
and then append it to the list:
from selenium import webdriver;
import csv;

driver = webdriver.Firefox();

links_code : list = driver.find_elements_by_xpath('//a[@class="in-match"]');
first_two : list = links_code[0:2];
first_two_links : list = [];
i : int;

for i in first_two:
    link = i.get_attribute("href");
    first_two_links.append(link);

for i in first_two_links:
    driver.get(i);

odds : list = [];
i : int;

for i in first_two_links:
    driver.get(i);
    o = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]');
    odds.append(o);
First, after you start the driver you need to go to a website...
Second, in the second for loop you are trying to append the wrong object... use i, not odd, or assign odd = driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
If you can provide the URL or the HTML we can help more!
Try this (I have used Google as an example; you will need to change the code...):
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.google.com")

links_code = driver.find_elements_by_xpath('//a')
first_two = links_code[0:2]
first_two_links = []

for i in first_two:
    link = i.get_attribute("href")
    first_two_links.append(link)
    print(link)

odds = []
for i in first_two_links:
    driver.get(i)
    odd = driver.page_source
    print(odd)
    # driver.find_element_by_xpath('//span[@class="table-main__detail-odds--hasarchive"]')
    odds.append(odd)

looping through a dropdown menu using Selenium and Python

I'm trying to loop through a dropdown menu at this URL: https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006
For example, the first dropdown menu, under Options, lists different materials. I want to select each one in turn and then gather some other information from the webpage before moving on to the next material. Here is my current code:
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()

select = Select(driver.find_element_by_name('Wiqj7mb4rsAq9LB'))
options = select.options
optionsList = []

driver.find_elements_by_class_name('select-wrapper')[0].click()

element = driver.find_element_by_xpath("//select[@name='Wiqj7mb4rsAq9LB']")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
# driver.execute_script("arguments[0].scrollIntoView();", element)

for option in options: # iterate over the options, place attribute value in list
    optionsList.append(option.get_attribute("value"))

for optionValue in optionsList:
    print("starting loop on option %s" % optionValue)
    # select = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='Wiqj7mb4rsAq9LB']")))
    # select = Select(select)
    select.select_by_value(optionValue)
I started with just the loop, but got this error:
ElementNotInteractableException: Message: Element <option> could not be scrolled into view
I then added the WebDriverWait and got a TimeoutException error.
I then realized I should probably click on the wrapper in which the dropdown is held, so I added the click, which does pop up the menu, but I still got the TimeoutException.
So I thought maybe I should move to the element, which I tried with the action chain lines, and I got this error:
WebDriverException: Message: TypeError: rect is undefined
I tried to avoid that error by using this code instead:
# driver.execute_script("arguments[0].scrollIntoView();", element)
That just resulted in the TimeoutException again.
I'm pretty new to Python and Selenium and have basically just been modifying code from SO answers to similar questions, but nothing has worked.
I'm using python 3.6 and the current versions of Selenium and firefox webdriver.
If anything is unclear or if you need more info just let me know.
Thanks so much!
EDIT: Based on the answer and comments by Kajal Kunda, I've updated my code to the following:
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)

materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")

for material in materials:
    # material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
    # driver.execute_script("arguments[0].click();", material_dropdown)
    # materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_ele.click()
        time.sleep(5)
        price = driver.find_element_by_class_name("dataPriceDisplay")
        print(price.text)
The result is that it successfully prints the price for the first type of material, but then it returns:
StaleElementReferenceException: Message: The element reference of <li class=""> is stale;...
I've tried variations of having the commented-out lines inside and outside of the loop, but I always get a version of the StaleElementReferenceException error.
Any suggestions?
Thanks!
You could do the whole thing with requests. Grab the option values from each dropdown, then concatenate those value attributes into a requests URL that retrieves JSON containing all the info on the page. The same principle applies for adding in the other dropdown values. The ids for each dropdown selection are the value attributes of the options in the dropdown, and they appear in the URL I show, separated by // for each dropdown selection.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//WHFIw3xXmQx8zlz//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
startURL = 'https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006'
res = requests.get(startURL)
soup = bs(res.content, 'lxml')
materials = [item['value'] for item in soup.select('#Wiqj7mb4rsAq9LB option')]
sizes = [item['value'] for item in soup.select('#WvXESrTyQjM3Ciw option')]
languages = [item['value'] for item in soup.select('#WUYWGMePtpmpmhy option')]
units = [item['value'] for item in soup.select('#W91eqaJ0WPXwe9b option')]
for material in materials:
    data = requests.get(url.format(material)).json()
    soup = bs(data['dataMaterialBullets'], 'lxml')
    lines = [item.text for item in soup.select('li')]
    print(lines)
    print(data['dataPriceDisplay'])
    # etc......
Sample of JSON:
Try the below code. It should work.
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()

material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)

# Code for material dropdown
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
material_optionsList = []
for material in materials:
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_optionsList.append(material_ele.text)

print(material_optionsList)
driver.execute_script("arguments[0].click();", material_dropdown)

size_dropdown = driver.find_element_by_xpath("(//input[@class='select-dropdown'])[2]")
driver.execute_script("arguments[0].click();", size_dropdown)

# Code for size dropdown
Sizes = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
size_optionsList = []
for size in Sizes:
    size_ele = size.find_element_by_tag_name('span')
    if size_ele.text != '':
        size_optionsList.append(size_ele.text)

driver.execute_script("arguments[0].click();", size_dropdown)
Output :
[u'Adhesive Vinyl', u'Plastic', u'Adhesive Dura-Vinyl', u'Aluminum', u'Dura-Plastic\u2122', u'Aluma-Lite\u2122', u'Dura-Fiberglass\u2122', u'Accu-Shield\u2122']
Hope you can do the remaining part. Let me know if it works for you.
EDIT: Code to loop through the materials and get the price value for each.
for material in range(len(materials)):
    material_ele = materials[material]
    if material_ele.text != '':
        #material_optionsList.append(material_ele.text)
        #material_ele.click()
        driver.execute_script("arguments[0].click();", material_ele)
        time.sleep(2)
        price = driver.find_element_by_id("priceDisplay")
        print(price.text)
        time.sleep(2)
        material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
        driver.execute_script("arguments[0].click();", material_dropdown)
        materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
        material += 2
Output :
$8.31
$9.06
$13.22
$15.91
$15.91
