I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. Go to the webpage:
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. Insert the ISIN in the search bar.
3. Select the autocomplete result from the container msci-ac-search-data-dropdown (click).
4. Read the value from the div with class "ratingdata-outercircle esgratings-profile-header-green" to get the text "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:
import requests
from bs4 import BeautifulSoup

isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']

payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup

isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

# First request: search for the ISIN to get the fund's share-class id.
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()

# Second request: load the ratings profile for the share class found above.
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)
soup = BeautifulSoup(
    requests.get(url, params=params, headers=headers).content, "html.parser"
)

data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
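If you only need the rating grade itself rather than the class list, you can derive it from that output; a minimal sketch, assuming the esg-fund-ratings-circle-<grade> naming convention holds:

def extract_rating(classes):
    # e.g. ['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa'] -> 'AAA'
    for cls in classes:
        if cls.startswith('esg-fund-ratings-circle-'):
            return cls.rsplit('-', 1)[-1].upper()
    return None

print(extract_rating(['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']))  # AAA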
When you press Enter, another request is sent, which already contains the search result. Here is an example of how to get what you want:
import requests
isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"
for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and to follow those instructions you can only use Selenium. The following code works to the letter and gets the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa
I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business
At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.
Here is my code; I tried it but couldn't build the dictionary.
Import all the necessary modules:
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from fake_useragent import UserAgent
import pandas as pd
import datetime
import time
from time import sleep
import uuid
import os
import sys
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
def string_to_date(x):
    return datetime.datetime.strptime(x, '%Y/%m/%d')

def datee(pp):
    return str(pp.date())
To get the links,
def get_link(res):
    href_list = []
    for res in res_list:  # h3
        link_list = res.select('a')
        for link in link_list:  # a
            href = link.get('href')
            href_list.append(href)
    return href_list
To get the article body, title, authors, date and utc date from every link
def get_article(url):
    news_list = []
    title_list = []
    page = requests.get(url)
    bsobj = soup(page.content)
    for title in bsobj.findAll('h1', {'class': 'view-article-title serif'}):
        title_list.append(title.text.strip())
    for news in bsobj.findAll('div', {'class': 'article-content-left pb-30'}):
        news = news_list.append(news.text.strip())
    author_list = []
    for f in news:
        author = ""
        pattern = r"BY\b(.+)(?=\[.+\])"
        resultsss = re.search(pattern, f)
        if resultsss != None:
            author = resultsss.group(0).strip()[3:]
        authors = author_list.append(author)
    # there is a date in every article link, hence we can use that
    date_list_1 = []
    separator = '/business'
    for link in href_list:
        new_set1 = link.replace('https://koreajoongangdaily.joins.com/', '')
        new_set2 = new_set1.split(separator, 1)[0]
        new_set3 = date_list_1.append(new_set2)
    new_set4 = list(map(datee, new_set_4))
    # no separate time is given, so add 00:00:00 for UTC
    p = []
    for x in new_set4:
        utc_date = p.append(str(x) + " 00:00:00")
    # print(news_list)
    return news_list, title_list, authors, new_set4, utc_date
The n denotes the number of pages I want to scrape:
def scrape_the_article(n):
    options = webdriver.ChromeOptions()
    lists = ['disable-popup-blocking']
    caps = DesiredCapabilities().CHROME
    caps["pageLoadStrategy"] = "normal"
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")
    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=options)  # paste your own chromedriver path
    url = "https://koreajoongangdaily.joins.com/section/business"
    driver.get(url)
    page = 0
    for step in tqdm(range(n)):  # set the page range here: how many pages you want to scrape
        page += 1
        time.sleep(2)
        try:
            button = driver.find_element_by_class_name("service-more-btn")
            button.click()
        except Exception as e:
            print("trying to scroll")
            driver.execute_script("window.scrollBy(0, 100);")
        print("Page: ", page)
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    res_list = bs.select('div[class="mid-article3"]')
    for res in res_list:
        links = get_article_links(res)
        article = get_article(links)

scrape_the_article(4)
And at the end I want to make a dictionary which will look like this:
data = {'date': new_set4, 'utc_date_time': utc_date, 'title': title_list, 'author': authors,
        'content': news_list, 'link': href_list}
But I couldn't build the dictionary I wanted. Please help me with this. Thank you!
There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.
NOTE: There's no author key in the response, so if you really need this, you'll have to visit each article URL.
Here's how to get the first 10 items:
import datetime
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

results = requests.post(api_endpoint, headers=headers, data=payload)
for result in results.json()['RESULT_LIST']:
    date = (
        datetime.datetime
        .strptime(result['service_date'], '%Y%m%d%H%M%S')
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    print(date)
    print(f"{result['list_title']}\n{result['cmss_url']}")
    print(f"{result['summary']}")
    print("-" * 50)
Output:
2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------
and more ...
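To connect this back to the dictionary the question asks for, here is a rough sketch (not part of the original answer) that reshapes the same response into that structure; 'summary' stands in for the full article body and 'author' is left blank, since neither is returned by the API and you would have to visit each cmss_url for those.

import datetime
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}
payload = {"url": "/section/business", "currPage": "1"}
items = requests.post(
    "https://koreajoongangdaily.joins.com/section/business",
    headers=headers, data=payload
).json()['RESULT_LIST']

data = {'date': [], 'utc_date_time': [], 'title': [], 'author': [], 'content': [], 'link': []}
for item in items:
    parsed = datetime.datetime.strptime(item['service_date'], '%Y%m%d%H%M%S')
    data['date'].append(parsed.strftime('%Y-%m-%d'))
    data['utc_date_time'].append(parsed.strftime('%Y-%m-%d %H:%M:%S'))
    data['title'].append(item['list_title'])
    data['author'].append('')  # not present in the API response
    data['content'].append(item['summary'])  # summary only; full body requires visiting cmss_url
    data['link'].append(item['cmss_url'])

print({k: v[:2] for k, v in data.items()})  # peek at the first two entries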
To paginate the API, try this example:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

with requests.Session() as s:
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        for result in results.json()['RESULT_LIST']:
            print(result['service_date'])
            print(f"{result['list_title']}\n{result['cmss_url']}")
            print(f"{result['summary']}")
            print("-" * 50)
NOTE: I'd highly recommend throttling the requests to 1 - 3 seconds between attempts.
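For example, a minimal sketch of such throttling inside the pagination loop:

import random
import time

# after each s.post(...) call inside the pagination loop:
time.sleep(random.uniform(1, 3))  # wait roughly 1-3 seconds before the next request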
I need to scrape the titles of all blog post articles loaded via a Load More button, over the range set by for i in range(1,3):
At present I'm only able to capture the titles from the first page, even though I'm able to navigate to the next page using Selenium.
Update:
In a previous question of mine (How To Scrape Content With Load More Pages Using Selenium Python), the pagination URL was captured via:
Network Tab > Reload Page > Click Show more button > Select wp-admin/admin-ajax.php?...... Right Click Copy > Copy Link Address.
However, I do not know how to capture a similar URL for the site learnwoo.com/blog. I'm not sure if it uses a different technique.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions

# Get website url
urls = "https://learnwoo.com/blog/"
r = requests.get(urls)

driver = webdriver.Chrome(executable_path="C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)

productlist = []

for i in range(1, 3):
    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')

    for single_item in items:
        title = single_item.find('h3').text.strip()
        print('Title:', title)

        product = {
            'Title': title,
        }

        productlist.append(product)
    print()
    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).send_keys(Keys.ENTER)

driver.close()

# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
Alternative solution: you can use the API response to extract the desired data. From the API response, I'm getting 74 items in total, where each page contains 6 items.
import pandas as pd
import requests
from bs4 import BeautifulSoup

params = {
    'id': '',
    'post_id': '0',
    'slug': 'home',
    'canonical_url': 'https://jooble.org/blog/',
    'posts_per_page': '6',
    'page': '0',
    'offset': '20',
    'post_type': 'post',
    'repeater': 'default',
    'seo_start_page': '1',
    'preloaded': 'false',
    'preloaded_amount': '0',
    'lang': 'en',
    'order': 'DESC',
    'orderby': 'date',
    'action': 'alm_get_posts',
    'query_type': 'standard',
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

api_url = 'https://jooble.org/blog/wp-admin/admin-ajax.php'

productlist = []
for params['page'] in range(0, 13):
    req = requests.get(api_url, params=params, headers=headers)
    e = req.json()['html']
    soup = BeautifulSoup(e, 'lxml')
    items = soup.find_all('div', class_='front__news-content-wrapper')
    for single_item in items:
        title = single_item.find('div', class_='front__news-title')
        title = title.text.strip() if title else None
        product = {
            'Title': title,
        }
        productlist.append(product)

df = pd.DataFrame(productlist)
print(df)
Output:
Title
0 How to become an anesthesiologist
1 How to Become a Flight Attendant
2 How To Become An Influencer
3 How to Become an Electrician
4 3 Common Job Scams You Should Stay Away From
.. ...
69 Exploring Main Types of Remote Work
70 14 books HR specialist should read. Part 2
71 14 books HR specialist should read. Part 1
72 Don’t do that: 7 mistakes ruining your job int...
73 Virtual job interview. Jooble tips how to nail it
[74 rows x 1 columns]
To answer your question in a Selenium context, you could call .click():
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()
Concerning your XHR request comment - note that here it is not a GET but a POST request (https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11), and you have to send some additional payload with requests.
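A rough sketch of such a POST with requests is below; the payload is deliberately left as a placeholder, since the exact form-data fields have to be copied from the browser's network tab:

import requests

# Endpoint from the comment above; the payload keys are placeholders.
api_url = "https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11"
payload = {
    # copy the form-data fields shown in the network tab for the
    # "Load more" request here, e.g. the action name and the page number
}
headers = {"x-requested-with": "XMLHttpRequest"}

response = requests.post(api_url, data=payload, headers=headers)
print(response.status_code, response.text[:200])  # inspect what the endpoint returns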
Example
This example is based on Selenium 4 and uses its imports; you may check https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/#python-1
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

urls = "https://learnwoo.com/blog/"
driver.get(urls)

productlist = []

for i in range(1, 3):
    soup = BeautifulSoup(driver.page_source)
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        product = {
            'Title': single_item.find('h3').text.strip(),
        }
        productlist.append(product)
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()

pd.DataFrame(productlist)
Output
Title
0 5 Futuristic eCommerce Trends
1 9 Best Shopify Apps to Print Shipping Labels
2 Cloud Hosting VS VPS Hosting: A Comparison
3 10 Best WooCommerce Facebook Integration Plugins
4 Complete Guide to BigCommerce Security
... ...
91 How To Calculate ROI of Your Moodle LMS?
92 How and Where to Find Help for WordPress Begin...
93 Expert Speaks: In Conversation with Amir Helze...
94 A Complete Analysis: NetSuite WooCommerce Inte...
95 Review of Currency Switcher & Converter for Sh...
96 rows × 1 columns
Here is an alternative to HedgHog's response: maybe the better way here is to use a while loop, as we don't know how many entries there are. I used a counter to break out of the loop after the 5th load - if you want to get all the entries, just remove the counter.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)

big_list = []
counter = 1
url = 'https://learnwoo.com/blog/'
browser.get(url)

while True:
    try:
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
        load_button.click()
        counter = counter + 1
        print('clicked to load more')
        t.sleep(3)
        entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class="entry-title td-module-title"]')))
        print('we have', len(entries), 'articles')
        big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]
        if counter > 5:
            break
    except Exception as e:
        print('all done')
        break

df = pd.DataFrame(big_list, columns=['Article', 'Url'])
print(df)
Result:
Article Url
0 5 Futuristic eCommerce Trends https://learnwoo.com/future-ecommerce-trends/
1 9 Best Shopify Apps to Print Shipping Labels https://learnwoo.com/best-shopify-apps-print-s...
2 Cloud Hosting VS VPS Hosting: A Comparison https://learnwoo.com/cloud-hosting-vps-hosting/
3 10 Best WooCommerce Facebook Integration Plugins https://learnwoo.com/best-woocommerce-facebook...
4 Complete Guide to BigCommerce Security https://learnwoo.com/bigcommerce-security-guide/
... ... ...
286 A Comparison Between Omnichannel and Multichan... https://learnwoo.com/omnichannel-multichannel-...
287 8 Winning Techniques for Off-page SEO https://learnwoo.com/winning-techniques-for-of...
288 WooCommerce – How to Understand User Roles and... https://learnwoo.com/woocommerce-understand-us...
289 7 Best Free WooCommerce Catalog Mode Plugins (... https://learnwoo.com/free-woocommerce-catalog-...
290 Different WooCommerce Product Types Explained ... https://learnwoo.com/woocommerce-different-pro...
291 rows × 2 columns
I was trying to make a crawler to follow links, with this code
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)
        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        contents = response.css('div.page-glossary-detail__main-content')
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            desc = response.css('p ::text').getall()
            yield {
                'desc': desc
            }
        for content in contents:
            yield {
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get().strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
            }
I was able to get the links, but the part where I enter the links and extract the information I need was not working, so a friend helped me come up with this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_auth_code():
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]


def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']


def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
              ",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]


def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
              "\"conclusionId\":\"" + conclusionId + "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text


auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))
It gets the JSON with the page items, but around iteration 230 of the loop it starts returning the following error and never recovers:
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is write all of this to a file, so I can check whether it's getting all the items I need from the page, and then produce a JSON with just the information I need rather than everything the site returns now.
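A minimal sketch of both ideas, reusing the helper functions above: refresh the token whenever the API returns ME667 and dump every detail payload to a file. This is an illustration only, not the original poster's solution.

import json

auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)

details = []
for conclusion in conclusion_list:
    detail = json.loads(get_details(conclusion, auth_token, device_id))
    if detail.get("error", {}).get("code") == "ME667":
        # token expired: log in again and retry this conclusion once
        auth_token, device_id = get_auth_token_and_device_id()
        detail = json.loads(get_details(conclusion, auth_token, device_id))
    details.append(detail)

with open("conclusions.json", "w", encoding="utf-8") as f:
    json.dump(details, f, ensure_ascii=False, indent=2)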
After many sleepless nights I solved my problem; I'll leave it here in case it helps someone.
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)

        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []

        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))

        for link in urls:
            driver.get(link)
            myElem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH,
                    "//div[@class='mdk-conclusion-detail__main-title']"
                )))
            nome_source = driver.find_element(By.XPATH,
                "//div[@class='mdk-conclusion-detail__main-title']"
            ).text
            nome_doenca.append(nome_source)
            driver.back()

        print(nome_doenca)
        driver.quit()
I just modified my code and didn't use scrapy, just the selenium selectors.
I'm trying to retrieve stock information from 2 different URLs and write the information to a pandas DataFrame. However, I keep getting errors. Could anyone please help me out here?
I'm pretty new to python, so my code will probably look very ugly :D
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

PATH = 'C:\Program Files (x86)\chromedriver.exe'
options = Options()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("--window-size=2550,1440")
s = Service('C:\Program Files (x86)\chromedriver.exe')
driver = webdriver.Chrome(PATH, options=options)
driver.implicitly_wait(10)

# create a dataframe
dn = []


def accept_cookies():
    try:
        driver.find_element(By.ID, 'accept-choices').click()
    except:
        print('fu')


stocklist = ["FB", "KLIC"]

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat(df)
    dn.to_excel(xlwriter, sheet_name='key', index=False)
    xlwriter.save()
pd.concat needs a list of objects to concatenate, whereas you have only given it df.
So I think you should replace pd.concat(df) with pd.concat([df, new_df]) and create new_df = pd.DataFrame() before the for loop.
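A minimal sketch of that suggestion, reusing the driver, accept_cookies and stocklist from the question, and writing the accumulated frame (rather than the empty dn list) to Excel at the end:

import pandas as pd

new_df = pd.DataFrame()  # empty frame to accumulate into, created before the loop

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat([df, new_df])  # append this ticker's table

with pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter') as xlwriter:
    new_df.to_excel(xlwriter, sheet_name='key', index=False)

Using ExcelWriter as a context manager saves the file on exit, which avoids the deprecated save() call.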
In case there is no issue with the .read_html() part, you should push each df to a list of data frames:
dflist = []

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])
After finishing the iteration you can simply concat the list of data frames into a single one:
xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
Example:
dflist = []

for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])

xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
I'm struggling to get this code to extract the desired information from one single page.
I've tried all the usual selenium tactics and added a time delay. Hopefully, it's something simple. I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}

browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', desired_capabilities=capabilities, options=options)

url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)

source_data = browser.page_source
bs_data = bs(source_data, "html.parser")

# product id
try:
    product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
    product_id = product_id.replace('Product code:', '').strip()
except:
    product_id = "n/a"

# image address
try:
    for image in bs_data.find("div", {"class": "s7staticimage"}):
        image_url = image.find('img')['src']
except:
    image_url = "n/a"

# product description
try:
    product_desc = bs_data.find('class', {'pdp-main-pdp-main-details__title'})
    product_desc = product_desc.get_text().strip()
except:
    product_desc = "n/a"

# product price
try:
    product_price = bs_data.find('class', {'co-product__price pdp-main-details__price'})
    product_price = product_price.get_text().strip()
except:
    product_price = "n/a"

print(url, '|', image_url, '|', product_id, '|', product_desc, '|', product_price)

browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough here. I don't know why you treated the elements as lists, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
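The question also extracts a description and a price; a similar sketch follows, assuming the class names from the original attempt ('pdp-main-details__title' and 'co-product__price') are present on the rendered page:

# Sketch only: the class names below are taken from the question's attempt
# and are assumed to exist on the rendered page.
product_desc = browser.find_element_by_class_name('pdp-main-details__title').text
product_price = browser.find_element_by_class_name('co-product__price').text
print(product_desc, '|', product_price)

browser.quit()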