I'm struggling to get this code to extract the desired information from one single page.
I've tried all the usual selenium tactics and added a time delay. Hopefully, it's something simple. I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep
# Render one ASDA product page in headless Chrome, then read the product
# code, image URL, title and price out of the rendered HTML.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', desired_capabilities=capabilities, options=options)
url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
try:
    browser.get(url)
    sleep(3)  # crude fixed pause to give the page's scripts time to render
    source_data = browser.page_source
finally:
    # Always release the Chrome process, even when loading the page fails.
    browser.quit()
bs_data = bs(source_data, "html.parser")

# Each field falls back to "n/a" when its element is missing. find() returns
# None in that case, so an explicit check replaces the former bare ``except``
# that also hid genuine programming errors.

# product id
product_id_tag = bs_data.find('span', {'class': 'pdp-main-details__product-code'})
if product_id_tag is None:
    product_id = "n/a"
else:
    product_id = product_id_tag.get_text().replace('Product code:', '').strip()

# image address
image_holder = bs_data.find("div", {"class": "s7staticimage"})
image_tag = image_holder.find('img') if image_holder is not None else None
image_url = image_tag.get('src', "n/a") if image_tag is not None else "n/a"

# product description
# NOTE(review): the class name carries a doubled "pdp-main-" prefix; confirm
# it against the live page markup.
product_desc_tag = bs_data.find(class_='pdp-main-pdp-main-details__title')
product_desc = product_desc_tag.get_text().strip() if product_desc_tag is not None else "n/a"

# product price
# A class_ string containing a space is matched against the exact value of
# the element's class attribute.
product_price_tag = bs_data.find(class_='co-product__price pdp-main-details__price')
product_price = product_price_tag.get_text().strip() if product_price_tag is not None else "n/a"

print(url, '|', image_url, '|', product_id, '|', product_desc, '|', product_price)
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough. I don't know why you treated the elements as lists, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Read the product code and image URL straight from the live DOM with
# Selenium; no BeautifulSoup pass is needed.
options = Options()
options.add_argument('--headless')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe', desired_capabilities=capabilities, options=options)
url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
try:
    browser.get(url)
    # Element lookups below poll for up to 15 seconds before giving up.
    browser.implicitly_wait(15)
    product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
    print(product_id.text)
    # XPath attribute tests are written with "@" (here: @id).
    image = browser.find_element_by_xpath('//*[@id="s7viewer_flyout"]/div[1]/img[1]')
    image_url = image.get_attribute('src')
    print(image_url)
finally:
    # Shut the headless Chrome process down even if a lookup fails.
    browser.quit()
Output:-
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
Related
I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business
At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.
Here is my code, which I tried but couldn't make the dictionary.
Import all the necessary functions
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from time import sleep
import uuid
import datetime
import time
from fake_useragent import UserAgent
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys
from fake_useragent import UserAgent
import warnings
warnings.filterwarnings('ignore')
import re
from tqdm import tqdm
import pandas as pd
import datetime
def string_to_date(x: str) -> datetime.datetime:
    """Parse a ``YYYY/MM/DD`` string into a ``datetime.datetime``.

    Args:
        x: Date text in the form ``2022/10/25``.

    Returns:
        The corresponding datetime at midnight (no timezone attached).

    Raises:
        ValueError: If *x* does not match the ``%Y/%m/%d`` format.
    """
    return datetime.datetime.strptime(x, '%Y/%m/%d')
def datee(pp: datetime.datetime) -> str:
    """Return the calendar date of *pp* as text.

    Args:
        pp: A datetime-like object exposing ``.date()``.

    Returns:
        ``str(pp.date())``, i.e. ``YYYY-MM-DD`` for a ``datetime.datetime``.
    """
    return str(pp.date())
To get the links,
def get_link(res):
    """Collect the ``href`` of every ``<a>`` found in *res*.

    Args:
        res: Either one parsed element (anything with a ``select`` method) or
            an iterable of such elements.

    Returns:
        A list of href strings in document order. Anchors without an
        ``href`` attribute are skipped.
    """
    # A single element exposes ``select``; wrap it so both call styles work.
    elements = [res] if hasattr(res, 'select') else res
    href_list = []
    for element in elements:
        for link in element.select('a'):
            href = link.get('href')
            if href:
                href_list.append(href)
    return href_list
To get the article body, title, authors, date and utc date from every link
def get_article(url):
    """Fetch one article page and extract its text and metadata.

    Args:
        url: Absolute URL of the article. The publication date is read from
            the ``/YYYY/MM/DD/`` segment of its path.

    Returns:
        A 5-tuple of lists ``(news_list, title_list, authors, dates,
        utc_dates)``: article bodies, headlines, one author string per body
        (``""`` when no byline matches), the date as ``YYYY-MM-DD``, and the
        same date with ``" 00:00:00"`` appended. Both date lists are empty
        when the URL carries no valid date.
    """
    page = requests.get(url)
    bsobj = soup(page.content, 'html.parser')

    title_list = [
        title.text.strip()
        for title in bsobj.findAll('h1', {'class': 'view-article-title serif'})
    ]
    news_list = [
        news.text.strip()
        for news in bsobj.findAll('div', {'class': 'article-content-left pb-30'})
    ]

    # Byline: the text from "BY" up to the bracketed part that follows it.
    byline = re.compile(r"BY\b(.+)(?=\[.+\])")
    authors = []
    for body in news_list:
        match = byline.search(body)
        # [3:] drops the leading "BY " from the matched text.
        authors.append(match.group(0).strip()[3:] if match is not None else "")

    # The date is part of every article link, so take it from the URL.
    dates = []
    utc_dates = []
    found = re.search(r'/(\d{4}/\d{2}/\d{2})/', url)
    if found is not None:
        try:
            day = str(datetime.datetime.strptime(found.group(1), '%Y/%m/%d').date())
        except ValueError:
            day = None  # digits present but not a real calendar date
        if day is not None:
            dates.append(day)
            # No separate time is available, so 00:00:00 stands in for UTC.
            utc_dates.append(day + " 00:00:00")

    return news_list, title_list, authors, dates, utc_dates
Here n denotes the number of pages I want to scrape:
def _first(values):
    """Return the first item of *values*, or ``""`` when it is empty."""
    return values[0] if values else ""


def scrape_the_article(n):
    """Expand the business section *n* times and scrape every listed article.

    Args:
        n: How many times to press the element with class
            ``service-more-btn`` (falling back to a small scroll when the
            click fails).

    Returns:
        A dict of parallel lists keyed ``date``, ``utc_date_time``,
        ``title``, ``author``, ``content`` and ``link``, one entry per
        article link found on the expanded page.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")
    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=options)  # paste your own chromedriver path
    url = "https://koreajoongangdaily.joins.com/section/business"
    try:
        driver.get(url)
        for page in tqdm(range(1, n + 1)):  # set the page range here, how many pages you want to scrape
            time.sleep(2)
            try:
                driver.find_element(By.CLASS_NAME, "service-more-btn").click()
            except Exception:
                # Best effort: nudge the page when the button cannot be clicked.
                print("trying to scroll")
                driver.execute_script("window.scrollBy(0, 100);")
            print("Page: ", page)
        # NOTE(review): assumes each click appends to the same document, so
        # one snapshot after the loop holds every loaded article — confirm.
        html = driver.page_source
    finally:
        # Release the browser even if navigation fails part-way.
        driver.quit()

    bs = BeautifulSoup(html, 'html.parser')
    res_list = bs.select('div[class="mid-article3"]')

    data = {'date': [], 'utc_date_time': [], 'title': [], 'author': [],
            'content': [], 'link': []}
    seen = set()
    for res in res_list:
        for link in get_link(res):
            # One teaser can link to the same article more than once.
            if not link or link in seen:
                continue
            seen.add(link)
            news_list, title_list, authors, dates, utc_dates = get_article(link)
            data['date'].append(_first(dates))
            data['utc_date_time'].append(_first(utc_dates))
            data['title'].append(_first(title_list))
            data['author'].append(_first(authors))
            data['content'].append(_first(news_list))
            data['link'].append(link)
    return data


data = scrape_the_article(4)
And at the end I wanna make a dictionary which will look like this,
# Target shape: one key per field, each holding the list of scraped values.
data = {
    'date': new_set4,
    'utc_date_time': utc_date,
    'title': title_list,
    'author': authors,
    'content': news_list,
    'link': href_list,
}
But I couldn't get back the dictionary I wanted to get back. Please help me with this. Thank you!
There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.
NOTE There's no author key in the response, so if you really need this, you'll have to visit each article URL.
Here's how to get the first 10 items:
import datetime
import requests
# Ask the section endpoint for its first page of results as JSON.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}
api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}
results = requests.post(api_endpoint, headers=headers, data=payload, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page.
results.raise_for_status()
for result in results.json()['RESULT_LIST']:
    # service_date arrives as a compact 14-digit timestamp; reformat it.
    date = (
        datetime.datetime
        .strptime(result['service_date'], '%Y%m%d%H%M%S')
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    print(date)
    print(f"{result['list_title']}\n{result['cmss_url']}")
    print(f"{result['summary']}")
    print("-" * 50)
Output:
2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------
and more ...
To paginate the API, try this example:
import time

import requests

# Walk through several result pages of the section endpoint with one session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}
api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}
with requests.Session() as s:
    # NOTE(review): the step of 10 requests currPage 1, 11, 21, ...; confirm
    # whether the endpoint expects a page number or an item offset.
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload, timeout=30)
        results.raise_for_status()
        for result in results.json()['RESULT_LIST']:
            print(result['service_date'])
            print(f"{result['list_title']}\n{result['cmss_url']}")
            print(f"{result['summary']}")
            print("-" * 50)
        # Throttle: pause between requests to go easy on the server.
        time.sleep(1)
NOTE: I'd highly recommend throttling the request to a 1 - 3 seconds between each attempt.
I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. go to the webpage:
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. insert the ISIN in the search bar
3. select the autocomplete-results from the container msci-ac-search-data-dropdown (click)
4. read the value from the "div class: ratingdata-outercircle esgratings-profile-header-green" to get the text: "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
so far i have tried the following:
import requests
from bs4 import BeautifulSoup
isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
# Seed the payload with the search form's pre-filled inputs. Only inputs that
# carry both a name and a value can become key/value pairs.
payload = {
    field['name']: field['value']
    for field in soup.select('form[action="https://www.msci.com/search"] input[name][value]')
}
# Finally add the search term itself.
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup
# Two-step lookup: search the fund by ISIN, then load its rating profile and
# read the CSS classes of the rating badge.
isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}
search_params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
matches = requests.get(url, params=search_params, headers=headers).json()
if not matches:
    # An unknown ISIN yields no hits; stop before indexing into the list.
    raise SystemExit(f"No fund found for ISIN {isin}")
fund = matches[0]

profile_params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": fund["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    fund["encodedTitle"], fund["url"]
)
soup = BeautifulSoup(
    requests.get(url, params=profile_params, headers=headers).content, "html.parser"
)
rating_tag = soup.select_one(".ratingdata-fund-rating")
if rating_tag is None:
    raise SystemExit("Rating element not found in the profile response")
# The rating is encoded in the element's class list.
print(rating_tag["class"])
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press enter, you send another request, which already shows the search result. Here is an example of how to get what you want
import requests
isin = 'IE00B4L5Y983'
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
# Passing the query as a dict lets requests URL-encode every value.
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
# The response is a JSON list of matching funds; print each fund's title.
for title in requests.get(url, params=params).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and to follow them you can only use Selenium. The following code does exactly that and gets the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Drive the search box exactly as a user would: type the ISIN, pick the
# autocomplete entry, then read the class of the rating badge.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
try:
    browser.get(url)
    wait = WebDriverWait(browser, 20)
    # until() returns the located element, so no second lookup is needed.
    wait.until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
    wait.until(EC.visibility_of_element_located((By.ID, 'ui-id-1'))).click()
    header = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
    result = header.find_element(By.TAG_NAME, "div").get_attribute('class')
    print(result)
finally:
    # Close the headless browser even when one of the waits times out.
    browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa
I am using Beautifulsoup to filter data from a website. To do this, I pass several search terms in a loop using the site's built-in search box.
If the search term does not find any content, the following loop in soup breaks.
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_25352/1005464644.py in <cell line: 10>()
21
22 soup = BeautifulSoup(driver.page_source, "html.parser")
---> 23 results = soup.find('ul', {'class':'result-list'}).find_all('li')
24
25 for i in results:
AttributeError: 'NoneType' object has no attribute 'find_all'
Because no data was found, logically no data could be transferred to the soup.
How can I catch this error?
Thanks for your help.
Here is the code:
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.google.com")
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
ausschreibungsliste = []
sb_1 = ['66512200', '85140000', '75000000', '75130000', '75131000', '79200000' , '79210000' , '79510000']
for z in sb_1:
time.sleep(1)
driver.get('https://www.service.bund.de/Content/DE/Ausschreibungen/Suche/Formular.html')
was_sb1 = driver.find_element("xpath", '//input[#id="f4641464d4642144"]')
was_sb1.send_keys(z)
was_sb1.send_keys(Keys.RETURN)
while True:
soup = BeautifulSoup(driver.page_source, "html.parser")
results = soup.find('ul', {'class':'result-list'}).find_all('li')
for i in results:
# Liste erzeugen
# Ausschreibung
ausschreibung = i.find_all('h3')[0].get_text().strip().replace(u'Ausschreibung', u'').replace(u'\xad', u'')
# Vergabestelle
organisation = i.find_all('p')[0].get_text().strip().replace(u'Vergabestelle ', u'')
# Ausschreibungsdatum
verdatum = i.find_all('p')[1].get_text().strip().replace(u'Veröffentlicht ', u'')
# Frist
frist = i.find_all('p')[2].get_text().replace(u'Angebotsfrist ', u'')
# Typ
typ = 'Ausschreibung'
# Website
website = 'service.bund.de'
# Prüfung ab
pruefdatum_format = 'fehlt'
# Datei erzeugt
jetzt = 'fehlt'
i_info = {
'Vergabedatum': verdatum,
'Frist': frist,
'Organisation': organisation,
'Ausschreibung': ausschreibung,
'Typ': typ,
'Website': website,
'Prüfung ab': pruefdatum_format,
'Datei erzeugt': jetzt
}
ausschreibungsliste.append(i_info)
try:
time.sleep(2)
driver.find_element("xpath", '//*[#id="main"]/div/div/section[2]/div[1]/div/form/fieldset/div[2]/div[1]/ul/li[2]/a').click()
except NoSuchElementException:
break
Something in the lines like:
# Look the list up once; find() returns None when the search had no hits.
result_list = soup.find('ul', {'class': 'result-list'})
if result_list is not None:
    results = result_list.find_all('li')
    for i in results:
        ...  # etc. (process each hit as before)
I have now found a solution based on the comments of Scott Hunter and AudioBaten.
Here is the (shortened) code:
ausschreibungsliste = []
# CPV codes to search for
cpvcode = ['32441300', '64226000', '66512200']
for z in cpvcode:
    time.sleep(1)
    driver.get('the_url')
    suchfeld = driver.find_element("xpath", '//input[@id="f4641464d4642144"]')
    suchfeld.clear()
    suchfeld.send_keys(z)
    suchfeld.send_keys(Keys.RETURN)
    while True:
        # Parse the page on every pass so each result page is read once.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        result_list = soup.find('ul', {'class': 'result-list'})
        if result_list is None:
            # No hits for this code: move on to the next one.
            break
        for i in result_list.find_all('li'):
            # Build the record
            # .... etc. ....
            i_info = {
                'Vergabedatum': verdatum,
                'Frist': frist,
                'Organisation': organisation,
                'Ausschreibung': ausschreibung,
                'CPV-Code': z,
                'Link': linkausschreibung,
                'Typ': typ,
                'Website': website,
                'Prüfung ab': pruefdatum_format,
                'Datei erzeugt': jetzt
            }
            ausschreibungsliste.append(i_info)
        # Load the next page until the last page is reached.
        # select_one() takes a CSS selector; the class belongs in the selector
        # itself ("span.disabled"), not in a second argument.
        # NOTE(review): assumes a disabled span only appears once paging is
        # exhausted — confirm against the site's pager markup.
        if soup.select_one('span.disabled'):
            print('Ausschreibungen gefunden :', len(ausschreibungsliste))
            break
        try:
            driver.find_element("xpath", '//*[@id="main"]/div/div/section[2]/div[1]/div/form/fieldset/div[2]/div[1]/ul/li[2]/a').click()
        except NoSuchElementException:
            # No "next" link on this page: nothing more to fetch for this code.
            break
Thanks for your help.
I am trying to scrape a website through beautiful soup + selenium and getting their image URLs under <img> tag with src as an attribute. I don't want to scrape through div class names. Here is what i am scraping through:
<img src="https://secure.gravatar.com/avatar/f1fb5ec60129b029e968f0522fe4828c?s=100&d=retro&f=y" alt="" width="55" height="55">
I want to get all URLs under image tag. Here is my code which is giving me an error:
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/80.0.3987.132 Safari/537.36'
options = Options()
options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
try:
    driver.get("https://python-forum.io/Thread-Using-beautiful-soup-to-get-html-attribute-value")
    page = Soup(driver.page_source, features='html.parser')
finally:
    # The rendered HTML is all that is needed; release the browser.
    driver.quit()
divs = page.select("img")
for product in divs:
    # NOTE: this is the line the error below is about. ``divs`` is the whole
    # result set, not a single tag, so ``.find`` raises AttributeError.
    ele = divs.find('src')
    print(ele)
It's giving me attribute error :
AttributeError: ResultSet object has no attribute 'find'.
You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Any of your help would be appreciated...
Initially I thought that ele = divs.find('src') should be ele = product.find('src'), but that didn't work, so I implemented it in the following way. Change this
# "Before" version, kept as posted: calling .find on the result set fails.
page = Soup(driver.page_source, features='html.parser')
divs = page.select("img")
for product in divs:
    ele = divs.find('src')
    print(ele)
to this
page = Soup(driver.page_source, features='html.parser')
# src=True keeps only <img> tags that actually carry a src attribute, so the
# subscript below cannot raise KeyError.
divs = page.find_all("img", src=True)
print(divs)
for product in divs:
    # Attributes of a tag are read by subscripting the tag itself.
    ele = product['src']
    print(ele)
This should give you values in the src attribute of the img tag.
import requests
from bs4 import BeautifulSoup
def main(url):
    """Print the ``content`` of every ``og:image`` meta tag found at *url*.

    Args:
        url: Address of the page to download with ``requests``.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    # content=True skips meta tags that lack the attribute being read.
    target = [
        item['content']
        for item in soup.find_all("meta", {'property': "og:image"}, content=True)
    ]
    print(target)


main("https://python-forum.io/Thread-Using-beautiful-soup-to-get-html-attribute-value")
Output:
['https://python-forum.io/images/facebook.png', 'https://secure.gravatar.com/avatar/f1fb5ec60129b029e968f0522fe4828c?s=100&d=retro&f=y']
I want to get first 10 images url from google search (not base64).
I have code:
import os
import base64
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
searchterm = 'bananas'  # will also be the name of the folder
url = "https://www.google.com/search?q=banan&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj-75rDlJLoAhWLHHcKHStFC6EQ_AUoAXoECA4QAw&biw=1867&bih=951"
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
browser = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)
browser.get(url)
actions = webdriver.common.action_chains.ActionChains(browser)
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0
# Create the output folder once; an existing folder is fine.
os.makedirs(searchterm, exist_ok=True)
# Click the first 10 thumbnails in turn and print the src of the preview
# image that opens for each one.
for i in range(10):
    time.sleep(1)
    thumbnails = browser.find_elements_by_xpath('//*[@id="islrg"]/descendant::img')
    if i >= len(thumbnails):
        break  # fewer than 10 results on the page
    thumbnails[i].click()
    ba = browser.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[2]/a/img')
    print(ba.get_attribute('src'))
It returns image urls, but sometimes base64. How to make the script always return image url?
Thank you.
Change the XPath to get the link rather than the image, and then get the href.
# Select the anchor that wraps the result and read its target URL.
ba = browser.find_element_by_xpath("//div[@class='islrc']//a[@href][@rel='noopener']")
print(ba.get_attribute("href"))
You can always get only Image URLs if you scrape another search engine DuckDuckGo using the following code:
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

search_query = 'what you want to find'
num_images = 10
driver_location = '/put/location/of/your/driver/here'
# setting up the driver
ser = Service(driver_location)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)
try:
    # searching the query (URL-encoded so spaces and symbols survive)
    driver.get(f'https://duckduckgo.com/?q={quote_plus(search_query)}&kl=us-en&ia=web')
    # going to Images Section
    ba = driver.find_element(By.XPATH, "//a[@class='zcm__link js-zci-link js-zci-link--images']")
    ba.click()
    # getting the images URLs
    for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link')[:num_images]:
        imageURL = result.get_attribute('data-id')
        print(f'{imageURL}\n')
finally:
    # Close the browser even if one of the lookups fails.
    driver.quit()