Can this logic be done with scrapy? - python

I did a project, where I looped over every single student ID I have in my college to get the results of each individual student to create analytical dashboard for each student and send their results to them via Email with a nice report done later. I scraped the website our college uploads our results in.
the code for it was this:
#Importing The Neccessary modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
#Reading the data
our_ids = pd.read_excel("All Our IDs.xlsx")
total_students = our_ids.shape[0]
df_to_hold_all_data = pd.DataFrame()
#Defining Functions to use in the script
def make_request(student_id):
"""
Makes a response for the student ID given, Keeps repeating it till it's a successful response.
"""
url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp' #Base URL to our college website
params = {
'z_dep': '=',
'z_st_name': 'LIKE',
'z_st_settingno': '=',
'x_st_settingno': f'{student_id}',
'x_st_name': '',
'z_gro': '=',
'x_gro': '',
'x_dep': '',
'z_sec': 'LIKE',
'x_sec': '',
'Submit': '++++حفظ++++'
}
response_state = 0
while response_state != 200 :
try:
response = requests.get(url,params= params, timeout= 10 )
except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
print("Requesting Again...")
continue
response_state = response.status_code
return response
def make_the_second_request_with_selenium(link):
# Create a headless Edge driver
options = Options()
options.add_argument('--headless')
driver = webdriver.Edge(options=options)
# Set timeout for the request and try to navigate to a website
timeout = 10 # seconds
try:
driver.get(link)
WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b')))
return driver # Will Eventually return this.
except (TimeoutException, NoSuchElementException): # If the request takes more than 10 seconds or the request failed for any reason, repeat the request again
print("Requesting Again...")
make_the_second_request_with_selenium(link)
this_loop = 0
#Looping for all students
for student_id in our_ids['IDS'].unique():
print(f"\nNow Looping for {student_id}\n")
response = make_request(student_id) # Making our response
print(f"{response.status_code}")
# Parse the response and create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a',{'href': True})
link_to_natega = ''
for link in links:
if "StdCode" in link['href']:
# get the link we want to go to eventually, Each Student has a unique link.
link_to_natega = f"http://app1.helwan.edu.eg/Commerce/{link['href']}"
print(link_to_natega)
try:
driver = make_the_second_request_with_selenium(link_to_natega)
name = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b').text
id_of_student = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[4]/div/font/b').text
department = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[5]/td[2]/div/font/b').text
first_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[2]/div/font/b').text
first_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[4]/div/font/b').text
second_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[2]/div/font/b').text
second_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[4]/div/font/b').text
third_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[2]/div/font/b').text
third_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[4]/div/font/b').text
fourth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[2]/div/font/b').text
fourth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[4]/div/font/b').text
fifth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[2]/div/font/b').text
fifth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[4]/div/font/b').text
sixth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[2]/div/font/b').text
sixth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[4]/div/font/b').text
data = {'name': name , 'ID' : id_of_student , "Department" : department , \
"Subject" : [first_sub,second_sub,third_sub,fourth_sub,fifth_sub,sixth_sub],\
"Score": [first_sub_score,second_sub_score,third_sub_score,fourth_sub_score,fifth_sub_score,sixth_sub_score]
}
df = pd.DataFrame(data) #Create a DataFrame
df_to_hold_all_data = df_to_hold_all_data.append(df) # Append it to the dataframe we created above.
# Close the driver
driver.quit()
print(f"The shape of the data now is: {df_to_hold_all_data.shape}")
except:
print(f'failed to get data for {student_id}')
this_loop += 1
remaining_students = total_students - this_loop
print(f'Done Looping For {student_id} The remaining students: {remaining_students}')
df_to_hold_all_data.to_excel("All Our Results.xlsx",index=False)
I don't know if it's possible to create this with scrapy?
If yes, How much would it make the process faster?
Is it worth investing the time and effort to learn it and rewrite the code again?
edit: Sorry for poor structure, Data Analysis and statistics is the part where I am actually good :D
Your help would be appreciated.

Related

Scraping: can't get stable results

I'm doing a scraping exercise on a job searching webpage. I want to get the link, name of the company, job title, salary, location and posting date. I've run the same code multiple times, and sometimes it gives the expected results in the salary item (salary if the info is displayed, "N/A" otherwise) and sometimes it gives me something different: salary if the info is displayed, "N/A", and some random character values in columns whose values should be "N/A". I have no problems with the other elements. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://ca.indeed.com/')
#Inputs a job title and location into the input boxes
input_box = driver.find_element(By.XPATH,'//*[#id="text-input-what"]')
input_box.send_keys('data analyst')
location = driver.find_element(By.XPATH,'//*[#id="text-input-where"]')
location.send_keys('toronto')
#Clicks on the search button
button = driver.find_element(By.XPATH,'//*[#id="jobsearch"]/button').click()
#Creates a dataframe
df = pd.DataFrame({'Link':[''], 'Job Title':[''], 'Company':[''], 'Location':[''],'Salary':[''], 'Date':['']})
#This loop goes through every page and grabs all the details of each posting
#Loop will only end when there are no more pages to go through
while True:
#Imports the HTML of the current page into python
soup = BeautifulSoup(driver.page_source, 'lxml')
#Grabs the HTML of each posting
postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
len(postings)
#grabs all the details for each posting and adds it as a row to the dataframe
for post in postings:
link = post.find('a').get('href')
link_full = 'https://ca.indeed.com'+link
name = post.find('h2', tabindex = '-1').text.strip()
company = post.find('span', class_ = 'companyName').text.strip()
try:
location = post.find('div', class_ = 'companyLocation').text.strip()
except:
location = 'N/A'
try:
salary = post.find('div', attrs = {'class':'heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly'}).text.strip()
except:
salary = 'N/A'
date = post.find('span', class_ = 'date').text.strip()
df = df.append({'Link':link_full, 'Job Title':name, 'Company':company, 'Location':location,'Salary':salary, 'Date':date},
ignore_index = True)
#checks if there is a button to go to the next page, and if not will stop the loop
try:
button = soup.find('a', attrs = {'aria-label': 'Next'}).get('href')
driver.get('https://ca.indeed.com'+button)
except:
break
Can I fix my code to get the expected results everytime I run it? Also, an additional issue: I'm scraping around 60 pages. But usually the program stops between 20 and 30 pages before the last page. Is there a way to fix the code so that it scrapes until the last page everytime?
Here is a simplified example with requests library:
import requests
from bs4 import BeautifulSoup
cookies = {}
headers = {}
params = {
'q': 'data analyst',
'l': 'toronto',
'from': 'searchOnHP',
}
response = requests.get('https://ca.indeed.com/jobs', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text)
postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
len(postings)
prints
15

Data Mining IMDB Reviews - Only extracting the first 25 reviews

I am currently trying to extract all the reviews on Spiderman Homecoming movie but I am only able to get the first 25 reviews. I was able to load more in IMDB to get all the reviews as originally it only shows the first 25 but for some reason I am unable to mine all the reviews after every review has been loaded. Does anyone know what I am doing wrong?
Below is the code I am running:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#Set the web browser
driver = webdriver.Chrome(executable_path=r"C:\Users\Kent_\Desktop\WorkStudy\chromedriver.exe")
#Go to Google
driver.get("https://www.imdb.com/title/tt6320628/reviews?ref_=tt_urv")
#Loop load more button
wait = WebDriverWait(driver,10)
while True:
try:
driver.find_element_by_css_selector("button#load-more-trigger").click()
wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR,".ipl-load-more__load-indicator")))
soup = BeautifulSoup(driver.page_source, 'lxml')
except Exception:break
#Scrape IMBD review
ans = driver.current_url
page = requests.get(ans)
soup = BeautifulSoup(page.content, "html.parser")
all = soup.find(id="main")
#Get the title of the movie
all = soup.find(id="main")
parent = all.find(class_ ="parent")
name = parent.find(itemprop = "name")
url = name.find(itemprop = 'url')
film_title = url.get_text()
print('Pass finding phase.....')
#Get the title of the review
title_rev = all.select(".title")
title = [t.get_text().replace("\n", "") for t in title_rev]
print('getting title of reviews and saving into a list')
#Get the review
review_rev = all.select(".content .text")
review = [r.get_text() for r in review_rev]
print('getting content of reviews and saving into a list')
#Make it into dataframe
table_review = pd.DataFrame({
"Title" : title,
"Review" : review
})
table_review.to_csv('Spiderman_Reviews.csv')
print(title)
print(review)
Well, actually, there's no need to use Selenium. The data is available via sending a GET request to the websites API in the following format:
https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey=MY-KEY
where you have to provide a key for the paginationKey in the URL (...&paginationKey=MY-KEY)
The key is found in the class load-more-data:
<div class="load-more-data" data-key="g4wp7crmqizdeyyf72ux5nrurdsmqhjjtzpwzouokkd2gbzgpnt6uc23o4zvtmzlb4d46f2swblzkwbgicjmquogo5tx2">
</div>
So, to scrape all the reviews into a DataFrame, try:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = (
"https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey={}"
)
key = ""
data = {"title": [], "review": []}
while True:
response = requests.get(url.format(key))
soup = BeautifulSoup(response.content, "html.parser")
# Find the pagination key
pagination_key = soup.find("div", class_="load-more-data")
if not pagination_key:
break
# Update the `key` variable in-order to scrape more reviews
key = pagination_key["data-key"]
for title, review in zip(
soup.find_all(class_="title"), soup.find_all(class_="text show-more__control")
):
data["title"].append(title.get_text(strip=True))
data["review"].append(review.get_text())
df = pd.DataFrame(data)
print(df)
Output (truncated):
title review
0 Terrific entertainment Spiderman: Far from Home is not intended to be...
1 THe illusion of the identity of Spider man. Great story in continuation of spider man home...
2 What Happened to the Bad Guys I believe that Quinten Beck/Mysterio got what ...
3 Spectacular One of the best if not the best Spider-Man mov...
...
...

is there a way to wait for an a tag element that could be one of two based on their href value in selenium?

So I'm not sure if this practically valid, but was wondering if there's a way in selenium to wait for an <a tag - out of two based on their href value or the text contained after the tag closes.
What I'm trying to do is to power up this page https://www.coingecko.com/en/exchanges, iterate through the exchanges links, visit each one of them, then click on the about tab of each of those newly opened pages as they contain the info to be extracted. The code actually worked up until halfway through when it failed to identify the tab properly through a StaleElementException and elementNotFound as I did it through driver.find_element_by_text.
The problem is that the 'about' tab changes from one page to the other, so it's either //ul[#role='tablist']/li[3] or li[2], and that's why I'm trying to wait and click on the right element based on its href value. That is since one of the a tags on the page href's value contains the text # about ---> //ul[#role='tablist']/li[3]/a
Apologies if it wasn't straightforward but I was trying to pinpoint what the issue was until recently :)
This is the code that I've attempted so far if anyone can gratefully point me in the right direction
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
num_of_pages = 4
exchanges_list = []
names_list = []
websites_list = []
emails_list = []
years_list = []
countries_list = []
twitter_list = []
for i in range(num_of_pages):
url = 'https://www.coingecko.com/en/exchanges?page=' + str(i+1)
driver.get(url)
links = driver.find_elements_by_xpath("//tbody[#data-target='exchanges-list.tableRows']/tr/td[2]/div/span[2]/a")
links = [url.get_attribute('href') for url in links]
time.sleep(0.5)
for link in links:
driver.get(link)
wait = WebDriverWait(driver, 2)
wait.until(EC.text_to_be_present_in_element_value((By.XPATH, "//ul[#role='tablist']/li[position()=2 or position()=3]/a"), '#about'))
try:
name = driver.find_element_by_xpath("//div[#class='exchange-details-header-content']/div/h1").text
website = driver.find_element_by_xpath("//div[#class='row no-gutters']/div[8]/a").get_attribute('href')
email = driver.find_element_by_xpath("//div[#class='row no-gutters']/div[9]/a").get_attribute('href')
year_est = driver.find_element_by_xpath("//div[#class='row no-gutters']/div[10]").text
inc_country = driver.find_element_by_xpath("//div[#class='row no-gutters']/div[12]").text
twitter = driver.find_element_by_xpath("//div[#class='row no-gutters']/div[16]/div[2]/div[2]/a").get_attribute('title')
except:
pass
try:
print('---------------')
print('exchange name is : {}'.format(name))
print('exchange website is : {}'.format(website))
print('exchange email is : {}'.format(email))
print('exchange established in year: {}'.format(year_est))
print('exchange incorporated in : {}'.format(Inc_country))
print('exchange twitter handle is: {}'.format(twitter))
except:
pass
try:
names_list.append(name)
websites_list.append(website)
emails_list.append(email)
years_list.append(year_est)
countries_list.append(Inc_country)
twitter_list.append(twitter)
except:
pass
df = pd.DataFrame(list(zip(names_list, websites_list,emails_list, years_list, countries_list, twitter_list)), columns=['Ex_Names', 'Website', 'Support Email', 'Inc Year', 'Inc Country', 'Twitter Handle' ])
CoinGecko2_data = df.to_csv('CoinGecko4.csv', index=False)
If you know the href just wait for: //a[contains(#href, 'my-href')]
I am not sue if there is any but you can create your custom wait. Here is an example:
https://seleniumbyexamples.github.io/waitcustom

CNN Scraper sporadically working in python

I've tried to create a Web Scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get an output for some of the scraped pages and sometimes it doesn't work at all.
I am using selenium and BeautifulSoup packages in Jupiter Notebook. I am iterating over the pages via the url parameters &page={}&from={}. I tried by.XPATH before and simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd
#3 ------------CNN SCRAPER
#3.1 ----------Define Funktion
def CNN_Scraper(max_pages):
base = "https://edition.cnn.com/"
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
load_content = browser.implicitly_wait(30)
base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'
#-------------Define empty lists to be scraped
CNN_title = []
CNN_date = []
CNN_article = []
article_count = 0
#-------------iterate over pages and extract
for page in range(1, max_pages + 1):
print("Page %d" % page)
url= base_url + "&page=%d&from=%d" % (page, article_count)
browser.get(url)
load_content
soup = BeautifulSoup(browser.page_source,'lxml')
search_results = soup.find('div', {'class':'cnn-search__results-list'})
contents = search_results.find_all('div', {'class':'cnn-search__result-contents'})
for content in contents:
try:
title = content.find('h3').text
print(title)
link = content.find('a')
link_url = link['href']
date = content.find('div',{'class':'cnn-search__result-publish-date'}).text.strip()
article = content.find('div',{'class':'cnn-search__result-body'}).text
except:
print("loser")
continue
CNN_title.append(title)
CNN_date.append(date)
CNN_article.append(article)
article_count += 100
print("-----")
#-------------Save in DF
df = pd.DataFrame()
df['title'] = CNN_title
df['date'] = CNN_date
df['article'] = CNN_article
df['link']=CNN_link
return df
#print("Complete")
browser.quit()
#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details check my previous answer
import requests
import json
def main(url):
with requests.Session() as req:
for item in range(1, 1000, 100):
r = req.get(url.format(item)).json()
for a in r['result']:
print("Headline: {}, Url: {}".format(
a['headline'], a['url']))
main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

how to get hidden href tag in selenium with "::before"

Im trying to get a url from a PLP and visit each of the elements to get certain keywords from the PDP and dump it into json file. However, the list only returns 1 data only. Im suspecting the website is trying to block the action. Im using this program once a month to see what new features are added in new items.
The code between the "***" is the part I am having trouble with. It returns the correct value but only returns 1 data.How can I get more data?In the example below I am only getting the product names to make it simple.
sample url: "https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3"
Actual element
<div class="exp-product-wall clearfix">
::before
<div class="grid-item fullSize" data-pdpurl="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf" data-column-index="0" data-item-index="1">
<div class="grid-item-box">
<div class="grid-item-content">
<div class="grid-item-image">
<div class="grid-item-image-wrapper sprite-sheet sprite-index-1">
<a href="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf">
<img src="https://images.nike.com/is/image/DotCom/pwp_sheet2?$NIKE_PWPx3$&$img0=BQ8928_001&$img1=BQ8928_003&$img2=BQ8928_005">
Below working code
import selenium
import json
import time
import re
import string
import requests
import bs4
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
domain = 'website url goes here'
def prepare_driver(url):
'''Returns a Firefox Webdriver.'''
options = Options()
# options.add_argument('-headless')
driver = webdriver.Chrome(executable_path='location to chromedriver')
driver.get(url)
wait = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
(By.CLASS_NAME, 'product-name ')))
time.sleep(2)
return driver
def fill_form(driver, search_argument):
'''Finds all the input tags in form and makes a POST requests.'''
#search_field = driver.find_element_by_id('q')
#search_field.send_keys(search_argument)
# We look for the search button and click it
#driver.find_element_by_class_name('search__submit')\
#.click()
wait = WebDriverWait(driver, timeout=10).until(
EC.presence_of_all_elements_located(
(By.CLASS_NAME, 'product-name ')))
def scrape_results(driver, n_results):
'''Returns the data from n_results amount of results.'''
products_urls = list()
products_data = list()
***for product_title in driver.find_elements_by_xpath('//div[#class="exp-gridwall-content clearfix"]'):
products_urls.append(product_title.find_element_by_xpath(
'//div[#class="grid-item fullSize"]').get_attribute('data-pdpurl'))***
for url in range(0, n_results):
if url == n_results:
break
url_data = scrape_product_data(driver, products_urls[url])
products_data.append(url_data)
return products_data
def scrape_product_data(driver, product_url):
'''Visits an product page and extracts the data.'''
if driver == None:
driver = prepare_driver(product_url)
driver.get(product_url)
time.sleep(12)
product_fields = dict()
# Get the product name
product_fields['product_name'] = driver.find_element_by_xpath(
'//h1[#id="pdp_product_title"]').get_attribute('textContent')
# .text.strip('name')
return product_fields
if __name__ == '__main__':
try:
driver = prepare_driver(domain)
#fill_form(driver, 'juniole tf')
products_data = scrape_results(driver, 2)
products_data = json.dumps(products_data, indent=4,ensure_ascii=False) #ensure_acii => changes japanese to correct character
with open('data.json', 'w') as f:
f.write(products_data)
finally:
driver.quit()
Desired Output in json:
[
{
"product_name": "Nike Epic React Flyknit 2",
"descr": "The Nike Epic React Flyknit 2 takes a step up from its predecessor with smooth, lightweight performance and a bold look. An updated Flyknit upper conforms to your foot with a minimal, supportive design. Underfoot, durable Nike React technology defies the odds by being both soft and responsive, for comfort that lasts as long as you can run."
},
{
"product_name": "Nike Zoom Fly SP Fast Nathan Bell",
"descr": "The Nike Zoom Fly SP Fast Nathan Bell is part of a collaboration with artist Nathan Bell, featuring hand-drawn graphics that celebrate running as a competition with yourself. It's designed to meet the demands of your toughest tempo runs, long runs and race day with a responsive construction that turns the pressure of each stride into energy return for the next."
}
]
You can easily get the urls with requests. I targeted the data-pdpurl attribute. In the selenium loop you may need to add some handling of requests for location. A short wait is needed during loop to prevent false claims of product not available.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
d = webdriver.Chrome()
results = []
r = requests.get('https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3')
soup = bs(r.content, 'lxml')
products = []
listings = soup.select('.grid-item')
for listing in listings:
url = listing['data-pdpurl']
title = listing.select_one('.product-display-name').text
row = {'title' :title ,
'url' : url}
products.append(row)
for product in products:
url = product['url']
d.get(url)
try:
d.get(url)
desc = WebDriverWait(d,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".description-preview")))
results.append({'product_name': product['title'],
'descr' : desc.text})
except Exception as e:
print(e, url)
finally:
time.sleep(1)
d.quit()
print(results)

Categories

Resources