I took a Python class my junior year of college but have forgotten a lot. For work I was asked to find a way to web scrape some data from a website. I have a Python file that does something similar for a different site I use. Here is that code:
from bs4 import BeautifulSoup
import io
import requests
soup = BeautifulSoup(requests.get("https://servicenet.dewalt.com/Parts/Search?searchedNumber=N365763").content)
rows = soup.select("#customerList tbody tr")
with io.open("data.txt", "w", encoding="utf-8") as f:
    f.write(u", ".join([row.select_one("td a").text for row in rows]))
This gets a list of model numbers for power tool parts from that site. Now I basically want to do the same thing, but I don't know where to begin. The site is https://www.powertoolreplacementparts.com/briggs-stratton-part-finder/#/s/BRG//498260/1/y
You click on the "Where Used" button and then there is a list of model numbers: "093412-0011-01", "093412-0011-02", etc. I want those numbers sent to a text file separated by commas, just like in my first code ("093412-0011-01, 093412-0011-02,..."). Any help is much appreciated. Thanks!
I used Selenium to be able to navigate the page.
Code:
import io
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Selenium Initializations
driver = webdriver.Chrome()
driver.get('https://www.powertoolreplacementparts.com/briggs-stratton-part-finder/#/s/BRG//498260/1/y')
wait = WebDriverWait(driver, 30)
driver.maximize_window()
# Locate and click the "Where Used" button
driver.find_element_by_xpath("//input[@id='aripartsSearch_whereUsedBtn_0'][@class='ariPartListWhereUsed ariImageOverride'][@title='Where Used']").click()
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ari_searchResults_Grid"]/ul')))
# Initialize BS4 and look for the "Show More" button
soup = BeautifulSoup(driver.page_source, "html.parser")
show = soup.find('li', {'class': 'ari-search-showMore'})
# Keep clicking the "Show More" button until it is no longer visible
while show is not None:
    time.sleep(2)
    hidden_element = driver.find_element_by_css_selector('#ari-showMore-unhide')
    if hidden_element.is_displayed():
        print("Element found")
        hidden_element.click()
        # Re-parse the page source so the newly revealed rows and button state are picked up
        soup = BeautifulSoup(driver.page_source, "html.parser")
        show = soup.find('li', {'class': 'ari-search-showMore'})
    else:
        print("Element not found")
        break
# Write the parsed data to the text file "data.txt"
soup = BeautifulSoup(driver.page_source, "html.parser")
with io.open("data.txt", "w", encoding="utf-8") as f:
    rows = soup.findAll('li', {'class': 'ari-ModelByPrompt'})
    for row in rows:
        part = str(row.text).replace(" ", "").replace("\n", "")
        print(part)
        f.write(part + ",")
Output:
Element found
Element found
Element found
Element not found
093412-0011-01
093412-0011-02
093412-0015-01
093412-0039-01
093412-0060-01
093412-0136-01
093412-0136-02
093412-0139-01
093412-0150-01
093412-0153-01
093412-0154-01
093412-0169-01
093412-0169-02
093412-0172-01
093412-0174-01
093412-0315-A1
093412-0339-A1
093412-0360-A1
093412-0636-A1
093412-0669-A1
093412-1015-E1
093412-1039-E1
093412-1060-E1
093412-1236-E1
093412-1236-E2
093412-1253-E1
093412-1254-E1
093412-1269-E1
093412-1274-E1
093412-1278-E1
093432-0035-01
093432-0035-02
093432-0035-03
093432-0036-01
093432-0036-03
093432-0036-04
093432-0037-01
093432-0038-01
093432-0038-03
093432-0041-01
093432-0140-01
093432-0145-01
093432-0149-01
093432-0152-01
093432-0157-01
093432-0158-01
093432-0160-01
093432-0192-B1
093432-0335-A1
093432-0336-A1
093432-0337-A1
093432-0338-A1
093432-1035-B1
093432-1035-E1
093432-1035-E2
093432-1035-E4
093432-1036-B1
093432-1036-E1
093432-1037-E1
093432-1038-B1
093432-1038-E1
093432-1240-B1
093432-1240-E1
093432-1257-E1
093432-1258-E1
093432-1280-B1
093432-1280-E1
093432-1281-B1
093432-1281-E1
093432-1282-B1
093432-1282-E1
093432-1286-B1
093452-0049-01
093452-0141-01
093452-0168-01
093452-0349-A1
093452-1049-B1
093452-1049-E1
093452-1049-E5
093452-1241-E1
093452-1242-E1
093452-1277-E1
093452-1283-B1
093452-1283-E1
09A412-0267-E1
09A413-0201-E1
09A413-0202-E1
09A413-0202-E2
09A413-0202-E3
09A413-0203-E1
09A413-0522-E1
09K432-0022-01
09K432-0023-01
09K432-0024-01
09K432-0115-01
09K432-0116-01
09K432-0116-02
09K432-0117-01
09K432-0118-01
120502-0015-E1
Content of the file:
093412-0011-01,093412-0011-02,093412-0015-01,093412-0039-01,093412-0060-01,093412-0136-01,093412-0136-02,093412-0139-01,093412-0150-01,093412-0153-01,093412-0154-01,093412-0169-01,093412-0169-02,093412-0172-01,093412-0174-01,093412-0315-A1,093412-0339-A1,093412-0360-A1,093412-0636-A1,093412-0669-A1,093412-1015-E1,093412-1039-E1,093412-1060-E1,093412-1236-E1,093412-1236-E2,093412-1253-E1,093412-1254-E1,093412-1269-E1,093412-1274-E1,093412-1278-E1,093432-0035-01,093432-0035-02,093432-0035-03,093432-0036-01,093432-0036-03,093432-0036-04,093432-0037-01,093432-0038-01,093432-0038-03,093432-0041-01,093432-0140-01,093432-0145-01,093432-0149-01,093432-0152-01,093432-0157-01,093432-0158-01,093432-0160-01,093432-0192-B1,093432-0335-A1,093432-0336-A1,093432-0337-A1,093432-0338-A1,093432-1035-B1,093432-1035-E1,093432-1035-E2,093432-1035-E4,093432-1036-B1,093432-1036-E1,093432-1037-E1,093432-1038-B1,093432-1038-E1,093432-1240-B1,093432-1240-E1,093432-1257-E1,093432-1258-E1,093432-1280-B1,093432-1280-E1,093432-1281-B1,093432-1281-E1,093432-1282-B1,093432-1282-E1,093432-1286-B1,093452-0049-01,093452-0141-01,093452-0168-01,093452-0349-A1,093452-1049-B1,093452-1049-E1,093452-1049-E5,093452-1241-E1,093452-1242-E1,093452-1277-E1,093452-1283-B1,093452-1283-E1,09A412-0267-E1,09A413-0201-E1,09A413-0202-E1,09A413-0202-E2,09A413-0202-E3,09A413-0203-E1,09A413-0522-E1,09K432-0022-01,09K432-0023-01,09K432-0024-01,09K432-0115-01,09K432-0116-01,09K432-0116-02,09K432-0117-01,09K432-0118-01,120502-0015-E1,
1) Open chrome to https://www.powertoolreplacementparts.com/briggs-stratton-part-finder/#/s/BRG//498260/1/y
2) open network tab
3) click on "Where used"
4) See API call to endpoint 'GetModelSearchModelsForPrompt'
5) Copy url https://partstream.arinet.com/Search/GetModelSearchModelsForPrompt?cb=jsonp1506134982932&arib=BRG&arisku=498260&modelName=&responsive=true&arik=AjydG6MJi4Y9noWP0hFB&aril=en-US&ariv=https%253A%252F%252Fwww.powertoolreplacementparts.com%252Fbriggs-stratton-part-finder%252F
6) Open that URL with requests; you will need some clever thinking to parse it, because they return the HTML inside a "JSON" (JSONP) response. A rough sketch of that approach follows below.
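For reference, here is a minimal sketch of that approach, assuming the endpoint and its query parameters are unchanged and that the JSONP payload embeds HTML fragments containing the same ari-ModelByPrompt list items the rendered page shows (the payload handling below is an assumption, not a documented API):

import json
import re

import requests
from bs4 import BeautifulSoup

# JSONP endpoint copied from the browser's network tab (parameters assumed unchanged)
url = ("https://partstream.arinet.com/Search/GetModelSearchModelsForPrompt"
       "?cb=jsonp1506134982932&arib=BRG&arisku=498260&modelName=&responsive=true"
       "&arik=AjydG6MJi4Y9noWP0hFB&aril=en-US"
       "&ariv=https%253A%252F%252Fwww.powertoolreplacementparts.com%252Fbriggs-stratton-part-finder%252F")

raw = requests.get(url).text

# Strip the JSONP wrapper, e.g. jsonp1506134982932({...});  ->  {...}
payload = json.loads(re.search(r"^[^(]*\((.*)\)[\s;]*$", raw, re.S).group(1))


def collect_strings(node):
    """Recursively gather every string value from the decoded JSON."""
    if isinstance(node, str):
        yield node
    elif isinstance(node, dict):
        for value in node.values():
            yield from collect_strings(value)
    elif isinstance(node, list):
        for value in node:
            yield from collect_strings(value)


# The response is assumed to embed HTML fragments; parse them and pull out the model numbers
soup = BeautifulSoup("".join(collect_strings(payload)), "html.parser")
models = [li.get_text(strip=True) for li in soup.find_all("li", {"class": "ari-ModelByPrompt"})]

with open("data.txt", "w", encoding="utf-8") as f:
    f.write(", ".join(models))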
Related
I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tags. However, when I tried to store them in a pandas dataframe, I received the following error: ValueError: All arrays must be of the same length.
I also wrote code to get the content of each article using the links, and I got an error there as well. I would be thankful for any assistance.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import timedelta
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
art_title = [] # to store the titles of all news article
art_date = [] # to store the dates of all news article
art_link = [] # to store the links of all news article
pagesToGet = ['south-africa/eastern-cape']
for i in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.iol.co.za' + str(pagesToGet[i])
    print(url)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    # time.sleep(5)  # allows you to sleep your code before you retrieve the elements from the webpage. Additionally, to
    # prevent the chrome driver opening a new instance for every url, open the browser outside of the loop.
    # an exception might be thrown, so the code should be in a try-except block
    try:
        # use the browser to get the url. This is a suspicious command that might blow up.
        driver.get("https://www.iol.co.za/news/" + str(pagesToGet[i]))
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and the line that threw the exception
        continue  # ignore this page. Abandon this and go back.
    time.sleep(3)  # Allow 3 seconds for the web page to open
    # Scroll the screen to the end and click on "more news" till the 15th page before scraping all the news
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web page
        i = 1
        while True:
            # scroll one screen height each time
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # update the scroll height each time after scrolling, as the scroll height can change after we scroll the page
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if (screen_height) * i > scroll_height:
                break
        driver.find_element(By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1
        time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))
    # Getting titles, dates, and links
    for j in news:
        # Article title
        title = j.findAll(re.compile('^h[1-6]'))
        for news_title in title:
            art_title.append(news_title.text)
        # Article dates
        dates = j.find('p', attrs={'class': 'sc-cIShpX'})
        if dates is not None:
            date = dates.text
            split_date = date.rsplit('|', 1)[1][10:].rsplit('<', 1)[0]
            art_date.append(split_date)
        # Article links
        address = j.find('a').get('href')
        news_link = 'https://www.iol.co.za' + address
        art_link.append(news_link)

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})
# Getting contents
new_articles = ...struggling to write the code
df['Content'] = news_articles
df.to_csv('data.csv')
driver.quit()
I think this is what you are looking for:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# Take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//article//*[(name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6') and string-length(text()) > 0]/ancestor::article")))
# For every article we take what we want
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']")
    print(header.get_attribute('textContent'))
    author_and_date = article.find_elements(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, ".//a")
    print(link.get_attribute('href'))
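If you also need the results in a DataFrame and a CSV, as in the question, a hedged way to extend the loop above is to collect one row per article and use a None placeholder whenever a field is missing; unequal list lengths are what caused the original "All arrays must be of the same length" error. A minimal sketch, assuming the same locators as above:

import pandas as pd
from selenium.webdriver.common.by import By

heading_xpath = ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']"

rows = []
for article in articles:  # `articles` comes from the wait.until call above
    title = article.find_element(By.XPATH, heading_xpath).get_attribute('textContent')
    date_els = article.find_elements(By.XPATH, heading_xpath + "/following-sibling::p[1]")
    link_els = article.find_elements(By.XPATH, ".//a")
    rows.append({
        'Article_Title': title,
        # None keeps every column the same length when a field is missing
        'Date': date_els[0].get_attribute('textContent') if date_els else None,
        'Source': link_els[0].get_attribute('href') if link_els else None,
    })

df = pd.DataFrame(rows)
df.to_csv('data.csv', index=False)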
I want to scrape the 'title' and 'product description' for all the products on all the pages, and then save them into a '.csv' file.
URL: https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried:
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []
myDict = {'body-art': 3024}
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)
for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            requests.get(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            for i in range(0, 2):  # Since it's a huge amount of data, I have taken only 2 products per page; otherwise it would be range(0, 30) to cover all the products on each page.
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    title_data = browser.find_elements(
                        By.CLASS_NAME, 'css-1gc4x7i').text
                    print(title_data)
                    for t in title_data:
                        title_list.append(t)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1
title_list = [i.split('.css', 1)[0] for i in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")
data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
"Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, due to the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it's blank. Maybe I am wrong. Please help me with this. Do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets a timeout: how long the driver will keep polling the DOM trying to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So if you need to put a pause in your code you should use time.sleep, although this is not recommended.
Also, it is much preferable to use explicit waits with Expected Conditions rather than browser.implicitly_wait, since the implicit wait only checks for element presence, i.e. it releases as soon as the element appears in the DOM, even though it may not be completely rendered yet.
To wait until an element is completely rendered and contains its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.
I managed to extract the names, specs, prices, and priceUnits from the products on this page: https://www.bauhaus.info/baustoffe/c/10000819.
I do, however, only manage to get the first 36 products visible on the page. How would I extract all the products that appear when pressing the "more items" button?
Any help is very much appreciated!
This is my code:
from selenium import webdriver
import pandas as pd
import re
browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
browser.get('https://www.bauhaus.info/baustoffe/c/10000819')
names= []
specs = []
prices = []
priceUnit = []
for li in browser.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li"):
    names.append(li.find_element_by_class_name("product-list-tile__info__name").text)
    specs.append(li.find_element_by_class_name("product-list-tile__info__attributes").text)
    prices.append(li.find_element_by_class_name("price-tag__box").text.split('\n')[0] + "€")
    p = li.find_element_by_class_name("price-tag__sales-unit").text.split('\n')[0]
    priceUnit.append(p[p.find("(")+1:p.find(")")])
df2 = pd.DataFrame()
df2['names'] = names
df2['specs'] = specs
df2['prices'] = prices
df2['priceUnit'] = priceUnit
I was able to click on the "More" option continuously with the code below. Try to incorporate this into your code.
# Imports required for explicit waits:
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get("https://www.bauhaus.info/baustoffe/c/10000819")
wait = WebDriverWait(driver, 30)

options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
print(len(options))

# The `count` variable keeps track of the number of clicks on the "More" option.
# Remove the `count` part of the code to click on the "More" option continuously.
count = 0
try:
    while True:
        if count > 5:  # Click on the "More" option only 5 times
            break
        moreoption = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-message='adb-show-more-products-button']")))
        driver.execute_script("arguments[0].scrollIntoView(true);", moreoption)
        driver.execute_script("window.scrollBy(0,-300);")
        time.sleep(2)
        moreoption.click()
        count += 1
        time.sleep(2)
        options = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
        print(len(options))
except:
    pass
First, try to click on the "More Products" button until it gets disabled, i.e. until all products are listed, and then use the common XPath to locate the product info.
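A rough, self-contained sketch of that idea, reusing the button and product-list XPaths from the answer above (both are assumptions about the current markup of the page):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.bauhaus.info/baustoffe/c/10000819")
wait = WebDriverWait(driver, 10)

# Button XPath taken from the answer above; assumed to still match the live page
more_xpath = "//button[@data-message='adb-show-more-products-button']"

# Keep clicking "More Products" until it can no longer be clicked (disabled or removed)
while True:
    try:
        more = wait.until(EC.element_to_be_clickable((By.XPATH, more_xpath)))
    except TimeoutException:
        break
    driver.execute_script("arguments[0].scrollIntoView(true);", more)
    driver.execute_script("window.scrollBy(0, -300);")  # avoid a sticky header covering the button
    more.click()

# All products should now be in the DOM and can be located with the common XPath
products = driver.find_elements_by_xpath("//ul[@class='product-list-tiles row list-unstyled']/li")
print(len(products))

Once the loop exits, the name/spec/price extraction from the question can run over the full product list.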
For each page, scroll to the "more items" element and click it; see below an example of a scroll-to-element implementation:
from selenium.webdriver.common.action_chains import ActionChains
element = driver.find_element_by_id("more_items")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
I'm trying to save data from Google Scholar using Selenium (webdriver). So far I can print the data that I want, but when I save it into a CSV it only saves the first page.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
#driver.maximize_window()
for j in range(len(button_locators)):
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators[j])))
    address = driver.find_elements_by_class_name("gsc_1usr")
    # for post in address:
    #     print(post.text)
    time.sleep(4)
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address
            # if addresst == 'NONE':
            #     addresst = str(address)
            # else:
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    button_link.click()
    time.sleep(4)
# driver.quit()
#driver.quit()
You only get the first page of data because your program stops after it clicks the next page button. You have to put all of that in a for loop.
Notice I wrote range(7) because I know there are 7 pages to open; in reality, we should never do that. Imagine if we had thousands of pages. We should add some logic to check whether the "next page button" exists and loop until it doesn't (see the sketch at the end of this answer).
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)
# 7 pages. In reality, we should get this number programmatically
for page in range(7):
    # read data from the new page
    address = driver.find_elements_by_class_name("gsc_1usr")
    # write to file
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    # find and click the next page button
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    button_link.click()
    time.sleep(4)
Also, in the future you should look to change all these time.sleep calls to wait.until, because sometimes your page loads quicker and the program could do its job faster. Or, even worse, your network might lag and that would break your script.
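Here is a rough sketch of the "loop until the next-page button is gone" idea mentioned above. It reuses the gsc_1usr class and the button locator from the question, and it assumes the button is reported as disabled on the last page; both are assumptions about the current page markup:

import time
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe")
driver.get("https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909")
time.sleep(4)

next_button_xpath = '//*[@id="gsc_authors_bottom_pag"]/div/button[2]'  # locator taken from the question

with open('post.csv', 'a') as s:
    while True:
        # scrape the author cards on the current page
        for card in driver.find_elements_by_class_name("gsc_1usr"):
            s.write(card.text.replace('\n', ',') + '\n')
        # stop when the next-page button is missing or disabled (assumed behaviour on the last page)
        buttons = driver.find_elements_by_xpath(next_button_xpath)
        if not buttons or not buttons[0].is_enabled():
            break
        buttons[0].click()
        time.sleep(4)

driver.quit()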
I'm trying to loop through a list, have the code click the search button, print the result, and repeat. I get this error:
Traceback (most recent call last):
File "qtest.py", line 17, in
list = [PR311, PR311, 5, 7, 9]
NameError: name 'PR311' is not defined
This is my code:
# Imports, of course
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
# Initialize a Firefox webdriver
driver = webdriver.Firefox()
# Grab the web page
driver.get("https://mnlairport.ph/terminal-finder")
# We use .find_element_by_id here because we know the id
text_input = driver.find_element_by_xpath("/html/body/div[1]/div/div/div/div[2]/div/div[2]/div/form/div/input")
list = [PR311, PR3345, PR323, PR355, PR3987]
# Using a for loop
for i in list:
    # Then we'll fake typing into it
    text_input.send_keys(list)
    # Now we can grab the search button and click it
    search_button = driver.find_element_by_xpath("/html/body/div[1]/div/div/div/div[2]/div/div[2]/div/form/div/button")
    search_button.click()
    # We can feed that into Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    form = soup.find('div', class_='info-box')
    for post in form:
        print(post)
UPDATED CODE: The issue now is that it doesn't loop properly.
csv_file = open('test.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['post'])

list = ["PR311", "XC827", "KD271", "5J745", "SQ916"]

# Using a for loop
for i in list:
    # Locate the search input by a relative XPath
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    # Then we'll fake typing into it
    text_input.send_keys(i)
    # Now we can grab the search button and click it
    search_button = driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]")
    search_button.click()
    # We can feed that into Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")
    form = soup.find_all('div', attrs={'class': 'info-box'})
    for post in form:
        print(post)
        csv_writer.writerow([post.text])
    # Clear previous inputs
    text_input.clear()

csv_file.close()

# Close the webdriver
driver.close()
I close out each loop iteration by clearing the search bar, but it skips some items in the list or doesn't return the right value.
Are the elements in your list strings? If so, replace the list with this; your current code tries to find variables with those names:
list = ["PR311", "PR3345", "PR323", "PR355", "PR3987"]
Also, you will have to get the input element at the start or end of the loop each time, and you will run into problems with that absolute XPath definition:
for i in list:
    # Re-locate the input on each iteration and clear the previous search
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    # Clear previous inputs
    text_input.clear()
    text_input.send_keys(i)
    search_button = driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]")
    search_button.click()
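If the results still lag behind the input (i.e. the loop reads the previous flight's result box), a hedged sketch that adds an explicit wait before parsing, reusing the info-box class from the question and assuming the driver is already on the terminal-finder page, could look like this:

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)

for i in ["PR311", "XC827", "KD271", "5J745", "SQ916"]:
    text_input = driver.find_element_by_xpath("//input[contains(@class, 'form-control')]")
    text_input.clear()
    text_input.send_keys(i)
    driver.find_element_by_xpath("//button[contains(@class, 'search-btn')]").click()

    # Wait until at least one result box is visible before reading the page source,
    # so we do not parse the page before the new results have rendered
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "info-box")))

    soup = BeautifulSoup(driver.page_source, "html.parser")
    for post in soup.find_all('div', attrs={'class': 'info-box'}):
        print(post.text)

If the old result box stays on the page between searches, waiting for EC.staleness_of on the previous result element would be more robust than waiting for visibility alone.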