Selenium only prints first result and I don't know why - python

My code:
from selenium.webdriver.common.by import By
from selenium import webdriver
import pandas as pd

url = 'https://www.tajeran-group.de/fahrzeuge/'
PATH = 'C:\\Users\\czoca\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Python 3.6\\chromedriver.exe'
driver = webdriver.Chrome(PATH)
driver.get(url)
driver.maximize_window()  # maximize the window
driver.implicitly_wait(10)  # implicit wait of 10 seconds

dealers = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div[1]/div/div[1]')
for n in dealers:
    name = n.find_element(By.XPATH, "/html/body/div[1]/div[4]/div/div[3]/div[1]/div/div[1]/div[1]/h3/a")
    km = n.find_element(By.XPATH, "/html/body/div[1]/div[4]/div/div[3]/div[1]/div/div[1]/div[2]/div/div[2]/div/div[1]/ul/li[1]/span")
    firstreg = n.find_element(By.XPATH, "/html/body/div[1]/div[4]/div/div[3]/div[1]/div/div[1]/div[2]/div/div[2]/div/div[1]/ul/li[2]/span")
    print(name.text, km.text, firstreg.text)
    #print(email.text)
I tried adding an "s" to "element" and it did not work, and trying just print(n.text) only gives me one result. The website is https://www.tajeran-group.de/fahrzeuge/ and I want to get all the info for each car, that's it. Any ideas? Thanks

Try with something like this:
dealers = driver.find_elements(By.XPATH, "//div[@class='uk-card uk-card-small uk-box-shadow-small uk-card-default uk-margin-bottom']//div[@class='uk-card-header']")
or at least match all the elements, not only the first one:
/html/body/div[1]/div[4]/div/div[3]/div[1]/div/div[*]
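If it helps, here is a rough sketch of how the loop could look with one element per card and relative child locators; the card class comes from the selector above, while the child XPaths are assumptions you may need to adjust to the live page:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.tajeran-group.de/fahrzeuge/')
driver.implicitly_wait(10)

# one element per vehicle card instead of only the first one
cards = driver.find_elements(By.XPATH, "//div[@class='uk-card uk-card-small uk-box-shadow-small uk-card-default uk-margin-bottom']")
for card in cards:
    # relative XPaths (leading ".") keep each lookup inside the current card;
    # the child paths below are assumptions, adjust them to the real markup
    name = card.find_element(By.XPATH, ".//h3/a").text
    details = [span.text for span in card.find_elements(By.XPATH, ".//ul/li/span")]
    print(name, details)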

Related

How to use driver.get(url) with an href list extracted from the page?

I want to go to https://www.bookmaker.com.au/sports/soccer and extract the soccer URLs, which the script does. I then want to visit each of those pages through driver.get(url), collect the data for each URL, and place it in pandas. I am stuck at calling driver.get(url) for each of the extracted links. Any help appreciated.
CSS selector for the hrefs passed to driver.get(url):
a[class *= 'matches-filter__region']
import time
import pandas as pd
import webdriver_manager.chrome
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
###########################################################################################################################################################
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_experimental_option("detach", True)
service = Service('driver/chromedriver.exe')
driver = webdriver.Chrome(service=Service(webdriver_manager.chrome.ChromeDriverManager().install()), options=options)
driver.get('https://www.bookmaker.com.au/sports/soccer')
aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")))
################################################################################################################
for url in aa:
    aa = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")
    driver.get(aa)
##############################################################################
#Full Code https://pastebin.com/W0VqaKVD
After using a VPN I could connect, and I found a few problems with the code:
you have to get aa before the for-loop
find() gives references to objects in the browser's memory, but when you use get() it removes those objects from memory to create new objects for the new page, so the results from find() become useless (stale). You have to use .get_attribute('href') to get the URLs as strings.
you have to use a for-loop to run get() for every string from the list, and you have to run the rest of the scraping code inside this loop. After the loop you create the DataFrame.
This is the code without the scraping code inside the loop, but at least it visits all the URLs.
# --- before loop ---

all_a = driver.find_elements(By.CSS_SELECTOR, "a[class *= 'matches-filter__region']")

# get URLs as strings
all_urls = []
for item in all_a:
    all_urls.append(item.get_attribute('href'))

# shorter
#all_urls = [item.get_attribute('href') for item in all_a]

team1List = []
backOddsList = []
team2List = []
layOddsList = []

# --- loop ---

# visit all pages and get teams
for url in all_urls:
    print(url)
    driver.get(url)
    # here (inside loop) all code to get teams from the active page

# --- after loop ---

df = pd.DataFrame({
    'Team1': team1List,
    'Back Odds': backOddsList,
    'Team2': team2List,
    'Lay Odds': layOddsList
})

df.to_excel('bookmaker.xlsx', engine='openpyxl', sheet_name='Sheet_name_1', index=False)
I updated the code and checked it; it works, navigating to all 15 URLs in the same browser window:
diff_country_urls = []
for i in range(len(aa)):
    diff_country_urls.append(aa[i].get_attribute("href"))

for url in diff_country_urls:
    driver.get(url)

Webscraping Multiple Pages in Python with Selenium - loop not working

I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages; the script below just repeats the data from the first page 5 times. I want to scrape all the pages for 'BR1'. Any help would be great.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

with open('rightmove.csv', 'w') as file:
    file.write('PropertyCardcontent \n')

PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)

elem = driver.find_element(By.NAME, 'searchLocation')  # Find the search box
elem.send_keys('BR1' + Keys.RETURN)

try:
    content = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'content'))
    )
finally:
    time.sleep(3)

for p in range(5):
    sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
    for solds in sold:
        address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
        for addresses in address:
            result = addresses.find_elements(By.CLASS_NAME, 'results ')
            for results in result:
                card = results.find_elements(By.CLASS_NAME, 'propertyCard')
                for propertyCard in card:
                    header = propertyCard.find_elements(By.CLASS_NAME, 'propertyCard-content')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'title')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'subTitle')
                        for subtitle in road:
                            bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
    with open('rightmove.csv', 'a') as file:
        for i in range(len(result)):
            file.write(header[i].text + '\n')
    button = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
    button.click()

file.close()
time.sleep(3)
driver.quit()
Since the website link has the page number in it, I recommend you use "https://www.rightmove.co.uk/house-prices/br1.html?page=1" as the base URL and loop through the pages, changing the page number at the end of the URL with a format string.
One other thing: you don't need all those nested for-loops. You can simply assign each variable its value directly, since everything you need is inside one HTML block that is easy to navigate.
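A minimal sketch of that idea (the page count of 5 is hard-coded here purely for illustration):
base_url = "https://www.rightmove.co.uk/house-prices/br1.html?page={}"
for page in range(1, 6):  # 5 pages, just an illustrative placeholder
    driver.get(base_url.format(page))
    # ... collect the propertyCard data for this page here ...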
Update:
I'm sorry for being late, I had unexpected stuff (...).
I've made some changes since I use Brave, so make sure you set it up for your browser, Chrome I believe; the chromedriver (ver. 102) stays the same (or whichever matches your Chrome version).
I've also got the Price and Date and stored them in a tuple.
Every record is stored as a list [Title, propertyType, tuple of (Price, Date)].
At the end, it creates a CSV and stores everything inside with ";" as the delimiter.
You can split the price and date for later use if you prefer, up to you.
Note: this looping method only applies to websites where the page number is included in the URL. In this case, both the search key and the page number are in the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools

options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options=options, service=Service("chromedriver.exe"))

key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)

# Number of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.strip('of'))

WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)

data = []
pc = 0
for p in range(1, pages + 1):
    driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={p}")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
        date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
        data.append([title, propertyType])
        for p, d in itertools.zip_longest(price_list, date_list, fillvalue=None):
            try:
                price = p.text
                date = d.text
                data[pc].append((price, date))
            except Exception as e:
                print(e)
        pc += 1
    time.sleep(random.randint(1, 4))

print(data)

with open('rightmove.csv', 'w') as file:
    header = "Title;propertyType;Price_Date\n"
    file.write(header)
    for record in data:
        file.write("{};{};{}\n".format(record[0], record[1], record[2:]))

driver.quit()
You don't have to go down the DOM element by element; you can just use an XPath or class name (if it's unique, otherwise an XPath or CSS selector is better) and get the item you are looking for directly.
Anyway, follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")

# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)

# wait for the results to be loaded
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)

# get amount of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))

data = []
i = 1
while i <= pages:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
    ).click()
    # wait for the page to load the results
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    # loop over the results and store the data
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))
    time.sleep(1)
    i += 1

print("you reach the last page")

# print the collected results
print(data)

driver.close()
I use a list of tuples because in your example you want to store 2 items; if you want to store more data you can use a dict and then write it to CSV with csv.DictWriter directly. Enjoy.
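For example, a minimal DictWriter sketch (the field names and records below are only placeholders):
import csv

# illustrative records only; in practice build these dicts inside your scraping loop
records = [
    {"title": "Flat 1, Example Road", "propertyType": "Flat", "price_date": ("£300,000", "1 Jan 2022")},
    {"title": "2 Example Street", "propertyType": "Terraced", "price_date": ("£450,000", "15 Feb 2022")},
]

with open("rightmove.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "propertyType", "price_date"], delimiter=";")
    writer.writeheader()
    writer.writerows(records)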

Having Trouble Clicking in Date Field with Selenium

I'm trying to scrape a table from the 1/30/2022 slate. However, I get the 'unable to locate element' error when I attempt to click in the date field and change the date from 2/6 to 1/30. I've tried finding by class name as well. Is there another way to do this, or is there something I'm doing wrong?
from ast import Return
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
import time
path = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(path)
driver.get('https://rotogrinders.com/resultsdb/nfl')
time.sleep(5)
driver.maximize_window()
time.sleep(10)
search = driver.find_element_by_xpath('//*[@id="navbar-demo1-mobile"]/div[1]/div/span/div')
search.click()
previous = driver.find_element_by_class_name('react-datepicker__navigation react-datepicker__navigation--previous')
previous.click()
time.sleep(5)
date = driver.find_element_by_class_name('react-datepicker__day react-datepicker__day--030 react-datepicker__day--weekend')
date.click()
You are not able to find it because it is inside an iframe. You have to switch to the iframe first, and then try to access the element.
Also, I see that the date picker allows typing the date in, so you could use send_keys to enter it, which makes your code a little simpler. But you could also write code to click through the date picker UI; it's your choice.
Having said that, here is the code:
driver.get("https://rotogrinders.com/resultsdb/nfl")
time.sleep(10)
frame = driver.find_element(By.XPATH, "//iframe")
driver.switch_to.frame(frame)
date_picker = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='react-datepicker__input-container']//input")))
date_picker.send_keys("01/16/2022")
time.sleep(10)
Try to change the time.sleep calls to explicit waits if possible (WebDriverWait).
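For instance, the final time.sleep(10) above could be replaced with an explicit wait on whatever element you read next; the locator below is only a placeholder:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# placeholder locator: wait for the element you actually need instead of sleeping
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
)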
wait=WebDriverWait(driver,60)
driver.get('https://rotogrinders.com/resultsdb/nfl')
wait.until(EC.frame_to_be_available_and_switch_to_it((By.XPATH,"//iframe")))
date = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.react-datepicker__input-container input')))
date.send_keys("01/16/2022")
First wait for the iframe and then proceed to click the search element and then send keys.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
It might be possible to avoid Selenium here; it's just a matter of pulling out some IDs to feed into the direct URL.
import requests
import datetime
import pandas as pd

dateStr = input('Enter date (YYYY-MM-DD): ')
dateStr_alpha = datetime.datetime.strptime(dateStr, '%Y-%m-%d').strftime('%Y%m%d')

url = f'https://service.fantasylabs.com/contest-sources/?sport_id=1&date={dateStr}'
jsonData = requests.get(url).json()
groupId = jsonData['contest-sources'][0]['draft_groups'][0]['id']

url = f'https://service.fantasylabs.com/live-contests/?sport=NFL&contest_group_id={groupId}'
jsonData = requests.get(url).json()

tables = {}
for each in jsonData['live_contests']:
    contestId = each['contest_id']
    if each['contest_name'] not in tables.keys():
        tables[each['contest_name']] = {}

    url = f'https://dh5nxc6yx3kwy.cloudfront.net/contests/nfl/{dateStr_alpha}/{contestId}/data/'
    jsonData = requests.get(url).json()

    contestUsers = pd.DataFrame(jsonData['users']).T.reset_index(drop=True)
    tables[each['contest_name']]['users'] = contestUsers

    fieldExposures = pd.DataFrame(jsonData['players']).T
    for k, v in jsonData['exposures'].items():
        exposureDf = pd.DataFrame(v['exposureCounts']).T
        exposureDf.columns = [x + f'_top_{k}%' for x in exposureDf.columns]
        fieldExposures = pd.merge(fieldExposures, exposureDf, how='left', left_index=True, right_index=True)
    fieldExposures = fieldExposures.fillna(0).reset_index(drop=True)
    tables[each['contest_name']]['exposures'] = fieldExposures

    print('****** ' + each['contest_name'] + ' ******')
    print(contestUsers, fieldExposures)
Now just call the table by its contest name:
print(tables['NFL $100K Conference Special [$20K to 1st]'])

Indeed Webscrape (Selenium): Script only returning one page of data frame into CSV/Long Run Time

I am currently learning Python in order to webscrape and am running into an issue with my current script. After closing the pop-up on page 2 of Indeed and cycling through the pages, the script only returns one page into the data frame written to CSV, although it does print each page in my terminal. It also occasionally returns only part of the data from a page: for example, page 2 will return info for the first 3 jobs in print(df_da), but nothing for the next 12. Additionally, the script takes a very long time to run (around 6 minutes 45 seconds for the 5 pages, roughly 1 to 1.5 minutes per page). Any suggestions? I've attached my script and can also attach the output of print(df_da) if needed. Thank you in advance!
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
options.add_argument("window-size=1400,1400")

PATH = "C://Program Files (x86)//chromedriver.exe"
driver = webdriver.Chrome(PATH)

for i in range(0, 50, 10):
    driver.get('https://www.indeed.com/jobs?q=chemical%20engineer&l=united%20states&start=' + str(i))
    driver.implicitly_wait(5)

    jobtitles = []
    companies = []
    locations = []
    descriptions = []

    jobs = driver.find_elements_by_class_name("slider_container")
    for job in jobs:
        jobtitle = job.find_element_by_class_name('jobTitle').text.replace("new", "").strip()
        jobtitles.append(jobtitle)
        company = job.find_element_by_class_name('companyName').text.replace("new", "").strip()
        companies.append(company)
        location = job.find_element_by_class_name('companyLocation').text.replace("new", "").strip()
        locations.append(location)
        description = job.find_element_by_class_name('job-snippet').text.replace("new", "").strip()
        descriptions.append(description)
    try:
        WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.popover-x-button-close.icl-CloseButton"))).click()
    except:
        pass

    df_da = pd.DataFrame()
    df_da['JobTitle'] = jobtitles
    df_da['Company'] = companies
    df_da['Location'] = locations
    df_da['Description'] = descriptions
    print(df_da)

    df_da.to_csv('C:/Users/Dan/Desktop/AZNext/file_name1.csv')
You are defining df_da inside the outer for-loop, so df_da will contain the data from the last page only.
You should define it outside the loops and fill it only after all the data has been collected.
I guess you are not getting all the jobs on the second page because of the pop-up, so you should close it before collecting the job details on that page.
Also, you can drop the wait for the pop-up element from all the loop iterations and keep it for the second iteration only.
Your code can be something like this:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
options.add_argument("window-size=1400,1400")

PATH = "C://Program Files (x86)//chromedriver.exe"
driver = webdriver.Chrome(PATH)

jobtitles = []
companies = []
locations = []
descriptions = []

for i in range(0, 50, 10):
    driver.get('https://www.indeed.com/jobs?q=chemical%20engineer&l=united%20states&start=' + str(i))
    driver.implicitly_wait(5)

    jobs = driver.find_elements_by_class_name("slider_container")
    for idx, job in enumerate(jobs):
        if idx == 1:
            try:
                WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.popover-x-button-close.icl-CloseButton"))).click()
            except:
                pass
        jobtitle = job.find_element_by_class_name('jobTitle').text.replace("new", "").strip()
        jobtitles.append(jobtitle)
        company = job.find_element_by_class_name('companyName').text.replace("new", "").strip()
        companies.append(company)
        location = job.find_element_by_class_name('companyLocation').text.replace("new", "").strip()
        locations.append(location)
        description = job.find_element_by_class_name('job-snippet').text.replace("new", "").strip()
        descriptions.append(description)

df_da = pd.DataFrame()
df_da['JobTitle'] = jobtitles
df_da['Company'] = companies
df_da['Location'] = locations
df_da['Description'] = descriptions

print(df_da)

df_da.to_csv('C:/Users/Dan/Desktop/AZNext/file_name1.csv')

Extract information from products on Website after pressing "more items" button with Selenium

I managed to extract the names, specs, prices, and priceUnits from the products on this page: https://www.bauhaus.info/baustoffe/c/10000819.
I do, however, only manage to get the first 36 products visible on the page. How would I extract all the products on this page that appear when pressing on the button for "more items"?
Any help is very much appreciated!
This is my code:
from selenium import webdriver
import pandas as pd
import re

browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
browser.get('https://www.bauhaus.info/baustoffe/c/10000819')

names = []
specs = []
prices = []
priceUnit = []

for li in browser.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li"):
    names.append(li.find_element_by_class_name("product-list-tile__info__name").text)
    specs.append(li.find_element_by_class_name("product-list-tile__info__attributes").text)
    prices.append(li.find_element_by_class_name("price-tag__box").text.split('\n')[0] + "€")
    p = li.find_element_by_class_name("price-tag__sales-unit").text.split('\n')[0]
    priceUnit.append(p[p.find("(")+1:p.find(")")])

df2 = pd.DataFrame()
df2['names'] = names
df2['specs'] = specs
df2['prices'] = prices
df2['priceUnit'] = priceUnit
I was able to click on the "More" option continuously with the code below. Try to incorporate this into your code.
# Imports required for explicit waits:
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get("https://www.bauhaus.info/baustoffe/c/10000819")
wait = WebDriverWait(driver, 30)

options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
print(len(options))

# Using the `count` variable to keep track of the number of clicks on the "More" option.
# Remove the `count` part of the code to keep clicking on the "More" option continuously.
count = 0
try:
    while True:
        if count > 5:  # Click on the "More" option only 5 times
            break
        moreoption = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-message='adb-show-more-products-button']")))
        driver.execute_script("arguments[0].scrollIntoView(true);", moreoption)
        driver.execute_script("window.scrollBy(0,-300);")
        time.sleep(2)
        moreoption.click()
        count += 1
        time.sleep(2)
        options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
        print(len(options))
except:
    pass
First try to click on the "More Products" button until it is disabled, i.e. until all products are listed, and then use the common XPath for locating the product info; a sketch of this idea follows below.
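A rough sketch of that approach, assuming the button can still be located via its data-message attribute as in the snippet above (selectors may need adjusting to the live page):
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
while True:
    try:
        more = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//button[@data-message='adb-show-more-products-button']")))
    except TimeoutException:
        # button no longer clickable (disabled or gone) -> all products are listed
        break
    driver.execute_script("arguments[0].click();", more)

products = driver.find_elements(By.XPATH, "//ul[@class='product-list-tiles row list-unstyled']/li")
print(len(products))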
Alternatively, for each page add a scroll to the "more items" element and click it; see below an example of the scroll-to-element implementation:
from selenium.webdriver.common.action_chains import ActionChains
element = driver.find_element_by_id("more_items")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
