Unable to scrape and make a dictionary from a news website - python

I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business
At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.
Here is my code, which I tried but couldn't make the dictionary.
Import all the necessary functions
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from time import sleep
import uuid
import datetime
import time
from fake_useragent import UserAgent
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys
from fake_useragent import UserAgent
import warnings
warnings.filterwarnings('ignore')
import re
from tqdm import tqdm
import pandas as pd
import datetime
def string_to_date(x):
return datetime.datetime.strptime(x, '%Y/%m/%d')
def datee(pp):
return str(pp.date())
To get the links,
def get_link(res):
href_list = []
for res in res_list: # h3
link_list = res.select('a')
for link in link_list: # a
href = link.get('href')
href_list.append(href)
return href_list
To get the article body, title, authors, date and utc date from every link
def get_article(url):
news_list = []
title_list= []
page = requests.get(url)
bsobj = soup(page.content)
for title in bsobj.findAll('h1',{'class':'view-article-title serif'}):
title_list.append(title.text.strip())
for news in bsobj.findAll('div',{'class':'article-content-left pb-30'}):
news = news_list.append(news.text.strip())
author_list = []
for f in news:
author = ""
pattern = r"BY\b(.+)(?=\[.+\])"
resultsss = re.search(pattern, f)
if resultsss != None:
author = resultsss.group(0).strip()[3:]
authors = author_list.append(author)
#there is date given in every links of the articles hence we can use that
date_list_1 = []
separator = '/business'
for link in href_list:
new_set1 = link.replace('https://koreajoongangdaily.joins.com/', '')
new_set2 = new_set1.split(separator, 1)[0]
new_set3 = date_list_1.append(new_set2)
new_set4 = list(map(datee, new_set_4))
#no separate time so add 00:00:00 for UTC
p=[]
for x in new_set4:
utc_date = p.append(str(x) + " 00:00:00")
#print(news_list)
return news_list, title_list, authors, new_set4, utc_date
The n denotes the number of page I want to scrape,
def scrape_the_article(n):
options = webdriver.ChromeOptions()
lists = ['disable-popup-blocking']
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "normal"
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")
options.add_argument("--disable-notifications")
options.add_argument("--disable-Advertisement")
options.add_argument("--disable-popup-blocking")
driver = webdriver.Chrome(executable_path= r"E:\chromedriver\chromedriver.exe", options=options) #paste your own choromedriver path
url = "https://koreajoongangdaily.joins.com/section/business"
driver.get(url)
page = 0
for step in tqdm(range(n)): # set the page range here, how many page you want to scrape
page += 1
time.sleep(2)
try:
button = driver.find_element_by_class_name("service-more-btn")
button.click()
except Exception as e:
print("trying to scroll")
driver.execute_script("window.scrollBy(0, 100);")
print("Page: ", page)
html = driver.page_source
bs = BeautifulSoup(html, 'html.parser')
res_list = bs.select('div[class="mid-article3"]')
for res in res_list:
links = get_article_links(res)
article = get_article(links)
scrape_the_article(4)
And at the end I wanna make a dictionary which will look like this,
data = {'date': new_set4, 'utc_date_time': utc_date, 'title': title_list,'author': authors,
'content': news_list,'link': href_list}
But I couldn't get back the dictionary I wanted to get back. Please help me with this. Thank you!

There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.
NOTE There's no author key in the response, so if you really need this, you'll have to visit each article URL.
Here's how to get the first 10 items:
import datetime
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
'X-Requested-With': 'XMLHttpRequest'
}
api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
"url": "/section/business",
"currPage": "1",
}
results = requests.post(api_endpoint, headers=headers, data=payload)
for result in results.json()['RESULT_LIST']:
date = (
datetime.datetime
.strptime(result['service_date'], '%Y%m%d%H%M%S')
.strftime('%Y-%m-%d %H:%M:%S')
)
print(date)
print(f"{result['list_title']}\n{result['cmss_url']}")
print(f"{result['summary']}")
print("-" * 50)
Output:
2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------
and more ...
To paginate the API, try this example:
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
'X-Requested-With': 'XMLHttpRequest'
}
api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
"url": "/section/business",
"currPage": "1",
}
with requests.Session() as s:
for page in range(1, 100, 10):
payload["currPage"] = str(page)
results = s.post(api_endpoint, headers=headers, data=payload)
for result in results.json()['RESULT_LIST']:
print(result['service_date'])
print(f"{result['list_title']}\n{result['cmss_url']}")
print(f"{result['summary']}")
print("-" * 50)
NOTE: I'd highly recommend throttling the request to a 1 - 3 seconds between each attempt.

Related

How to scrape a site by loading content via triggering load more button?

I need to scrape the titles for all blog post articles via a Load More button as set by my desired range for i in range(1,3):
At present I'm only able to capture the titles for the first page even though i'm able to navigate to the next page using selenium.
Update:
In a previous question (How To Scrape Content With Load More Pages Using Selenium Python) by myself the pagination url was captured via:
Network Tab > Reload Page > Click Show more button > Select wp-admin/admin-ajax.php?...... Right Click Copy > Copy Link Address.
However, i do not know how to capture similar url for the site learnwoo.com/blog. I'm not sure if it uses a different technique.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions
# Get website url
urls = "https://learnwoo.com/blog/"
r = requests.get(urls)
driver = webdriver.Chrome(executable_path="C:\webdrivers\chromedriver.exe",options=options)
driver.get(urls)
productlist = []
for i in range(1,3):
# Get Page Information
soup = BeautifulSoup(r.content, features='lxml')
items = soup.find_all('div', class_ = 'td_module_1')
print(f'LOOP: start [{len(items)}]')
for single_item in items:
title = single_item.find('h3').text.strip()
print('Title:', title)
product = {
'Title': title,
}
productlist.append(product)
print()
time.sleep(5)
WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH,"//a[#id='next-page-tdi_5']"))).send_keys(Keys.ENTER)
driver.close()
# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
Alternative solution: You can use API response to extract the desired data.From API response,I'm getting total 74 items where each page contains 6 items.
import pandas as pd
import requests
from bs4 import BeautifulSoup
params = {
'id': '',
'post_id': '0',
'slug': 'home',
'canonical_url': 'https://jooble.org/blog/',
'posts_per_page': '6',
'page': '0',
'offset': '20',
'post_type': 'post',
'repeater': 'default',
'seo_start_page': '1',
'preloaded': 'false',
'preloaded_amount': '0',
'lang': 'en',
'order': 'DESC',
'orderby': 'date',
'action': 'alm_get_posts',
'query_type': 'standard',
}
headers= {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
'x-requested-with': 'XMLHttpRequest'
}
api_url='https://jooble.org/blog/wp-admin/admin-ajax.php'
productlist= []
for params['page'] in range(0,13):
req = requests.get(api_url,params=params,headers=headers)
e = req.json()['html']
soup = BeautifulSoup(e,'lxml')
items = soup.find_all('div', class_ = 'front__news-content-wrapper')
for single_item in items:
title = single_item.find('div', class_ = 'front__news-title')
title=title.text.strip() if title else None
product = {
'Title': title,
}
productlist.append(product)
df = pd.DataFrame(productlist)
print(df)
Output:
Title
0 How to become an anesthesiologist
1 How to Become a Flight Attendant
2 How To Become An Influencer
3 How to Become an Electrician
4 3 Common Job Scams You Should Stay Away From
.. ...
69 Exploring Main Types of Remote Work
70 14 books HR specialist should read. Part 2
71 14 books HR specialist should read. Part 1
72 Don’t do that: 7 mistakes ruining your job int...
73 Virtual job interview. Jooble tips how to nail it
[74 rows x 1 columns]
To answer your question in selenium context, you could call .click():
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH,"//a[#id='next-page-tdi_5']"))).click()
Concerning your xhr request comment - Note: Here it is not a GET it is a POST request (https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11) and you have to send some additional payload with requests
Example
This example is based on selenium 4 and uses its imports, may check https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/#python-1
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
urls = "https://learnwoo.com/blog/"
driver.get(urls)
productlist = []
for i in range(1,3):
soup = BeautifulSoup(driver.page_source)
items = soup.find_all('div', class_ = 'td_module_1')
print(f'LOOP: start [{len(items)}]')
for single_item in items:
product = {
'Title': single_item.find('h3').text.strip(),
}
productlist.append(product)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH,"//a[#id='next-page-tdi_5']"))).click()
pd.DataFrame(productlist)
Output
Title
0 5 Futuristic eCommerce Trends
1 9 Best Shopify Apps to Print Shipping Labels
2 Cloud Hosting VS VPS Hosting: A Comparison
3 10 Best WooCommerce Facebook Integration Plugins
4 Complete Guide to BigCommerce Security
... ...
91 How To Calculate ROI of Your Moodle LMS?
92 How and Where to Find Help for WordPress Begin...
93 Expert Speaks: In Conversation with Amir Helze...
94 A Complete Analysis: NetSuite WooCommerce Inte...
95 Review of Currency Switcher & Converter for Sh...
96 rows × 1 columns
Here is an alternative to HedgHog's response: maybe the better way here is to use a while loop, as we don't know how many entries there are. I used a counter to breakout of the loop after the 5th loading - if you want to get all those entries, you just remove the counter.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)
big_list = []
counter = 1
url = 'https://learnwoo.com/blog/'
browser.get(url)
while True:
try:
load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
load_button.click()
counter = counter + 1
print('clicked to load more')
t.sleep(3)
entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class="entry-title td-module-title"]')))
print('we have', len(entries), 'articles')
big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]
if counter > 5:
break
except Exception as e:
print('all done')
break
df = pd.DataFrame(big_list, columns = ['Article', 'Url'])
print(df)
Result:
Article Url
0 5 Futuristic eCommerce Trends https://learnwoo.com/future-ecommerce-trends/
1 9 Best Shopify Apps to Print Shipping Labels https://learnwoo.com/best-shopify-apps-print-s...
2 Cloud Hosting VS VPS Hosting: A Comparison https://learnwoo.com/cloud-hosting-vps-hosting/
3 10 Best WooCommerce Facebook Integration Plugins https://learnwoo.com/best-woocommerce-facebook...
4 Complete Guide to BigCommerce Security https://learnwoo.com/bigcommerce-security-guide/
... ... ...
286 A Comparison Between Omnichannel and Multichan... https://learnwoo.com/omnichannel-multichannel-...
287 8 Winning Techniques for Off-page SEO https://learnwoo.com/winning-techniques-for-of...
288 WooCommerce – How to Understand User Roles and... https://learnwoo.com/woocommerce-understand-us...
289 7 Best Free WooCommerce Catalog Mode Plugins (... https://learnwoo.com/free-woocommerce-catalog-...
290 Different WooCommerce Product Types Explained ... https://learnwoo.com/woocommerce-different-pro...
291 rows × 2 columns

Following links and crawling them

I was trying to make a crawler to follow links, with this code
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class DicionarioSpider(scrapy.Spider):
name = 'dicionario'
allowed_domains = ['www.mediktor.com']
start_urls = ['http://www.mediktor.com/']
def start_requests(self):
url = "https://www.mediktor.com/pt-br/glossario"
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(10)
doencas = driver.find_elements(
By.XPATH, "//a[#class='mdk-dictionary-list__glossary-item']")
for doenca in doencas:
url = doenca.get_attribute('href')
yield scrapy.Request(url)
driver.quit()
def parse(self, response):
urls = response.css(
'.mdk-dictionary-list__glossary-item a::attr(href)')
for url in urls:
yield response.follow(url.get(), callback=self.parse_info)
def parse_info(self, response):
contents = response.css('div.page-glossary-detail__main-content')
for desc in response.css('div.mdk-conclusion-detail__main-description'):
desc = response.css('p ::text').getall()
yield {
'desc': desc
}
for content in contents:
yield{
'name': content.css(
'div.mdk-conclusion-detail__main-title ::text').get().strip(),
'espec': content.css(
'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
}
I was able to get the links but the part of entering the links and getting the information I need was not working, so a friend helped me to come up with this code
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_auth_code():
url = "https://www.mediktor.com/vendor.js"
response = requests.get(url)
start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
end_index = response.text.index('"', start_index)
return response.text[start_index:end_index]
def get_auth_token_and_device_id():
url = "https://euapi01.mediktor.com/backoffice/services/login"
payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
"\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
"\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
headers = {
'authorization': f'Basic {get_auth_code()}',
'Content-Type': 'text/plain'
}
response = requests.request("POST", url, headers=headers, data=payload)
return response.json()['authToken'], response.json()['deviceId']
def get_conclusion_list(auth_token, device_id):
url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
"\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
headers = {
'accept': 'application/json, text/plain, */*',
'authorization': f'Bearer {auth_token}',
'content-type': 'application/json;charset=UTF-8'
}
response = requests.request("POST", url, headers=headers, data=payload)
return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]
def get_details(conclusionId, auth_token, device_id):
url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
"\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
"\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
"\"conclusionId\":\"" + conclusionId + "\"," \
"\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
headers = {
'authorization': f'Bearer {auth_token}',
'content-type': 'application/json;charset=UTF-8'
}
response = requests.request("POST", url, headers=headers, data=payload)
return response.text
auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
print(get_details(conclusion, auth_token, device_id))
It gets the json with the page items, but in loop number 230 it starts returning the following error and won't leave the loop
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is, pass this all to a file so I can see if it's getting all the items on the page I need and then leave a json with just the information I need, not everything from the site as it's returning now
I after many sleepless nights solved my problem, I will leave it here in case it helps someone.
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class DicionarioSpider(scrapy.Spider):
name = 'dicionario'
allowed_domains = ['www.mediktor.com']
start_urls = ['http://www.mediktor.com/']
def parse(self, response):
url = "https://www.mediktor.com/pt-br/glossario"
option = Options()
option.headless = True
driver = webdriver.Chrome(options=option)
driver.get(url)
time.sleep(10)
el_links = driver.find_elements(
By.XPATH, "//a[#class='mdk-dictionary-list__glossary-item']")
urls = []
nome_doenca = []
for i in range(len(el_links)):
urls.append(el_links[i].get_attribute('href'))
for link in urls:
driver.get(link)
myElem = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.XPATH,
"//div[#class='mdk-conclusion-detail__main-title']"
)))
nome_source = driver.find_element(By.XPATH,
"//div[#class='mdk-conclusion-detail__main-title']"
).text
nome_doenca.append(nome_source)
driver.back()
print(nome_doenca)
driver.quit()
I just modified my code and didn't use scrapy, just the selenium selectors.

Insert value in searchbar, select autocomplete result and get value by bs4

I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
go to the webpage:
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
insert the ISIN in the searchbar
3. select the autocomplete-results from the container msci-ac-search-data-dropdown (click)
4. read the value from the "div class: ratingdata-outercircle esgratings-profile-header-green" to get the text: "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
so far i have tried the following:
import requests
from bs4 import BeautifulSoup
isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
soup = BeautifulSoup( requests.get(url).content, 'html.parser' )
payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup
isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"X-Requested-With": "XMLHttpRequest",
}
params = {
"p_p_id": "esg_fund_ratings_profile",
"p_p_lifecycle": "2",
"p_p_state": "normal",
"p_p_mode": "view",
"p_p_resource_id": "searchFundRatingsProfiles",
"p_p_cacheability": "cacheLevelPage",
"_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()
params = {
"p_p_id": "esg_fund_ratings_profile",
"p_p_lifecycle": "2",
"p_p_state": "normal",
"p_p_mode": "view",
"p_p_resource_id": "showEsgFundRatingsProfile",
"p_p_cacheability": "cacheLevelPage",
"_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
data[0]["encodedTitle"], data[0]["url"]
)
soup = BeautifulSoup(
requests.get(url, params=params, headers=headers).content, "html.parser"
)
data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press enter, you send another request, which already shows the search result. Here is an example of how to get what you want
import requests
isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"
for title in requests.get(url).json():
print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer this is either an education related test, either a job interview related test. As such, following the exact instructions is paramount. In order to follow said instructions, you can only use selenium. The following code will work 'a la point', and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa

Scraping: can't get stable results

I'm doing a scraping exercise on a job searching webpage. I want to get the link, name of the company, job title, salary, location and posting date. I've run the same code multiple times, and sometimes it gives the expected results in the salary item (salary if the info is displayed, "N/A" otherwise) and sometimes it gives me something different: salary if the info is displayed, "N/A", and some random character values in columns whose values should be "N/A". I have no problems with the other elements. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://ca.indeed.com/')
#Inputs a job title and location into the input boxes
input_box = driver.find_element(By.XPATH,'//*[#id="text-input-what"]')
input_box.send_keys('data analyst')
location = driver.find_element(By.XPATH,'//*[#id="text-input-where"]')
location.send_keys('toronto')
#Clicks on the search button
button = driver.find_element(By.XPATH,'//*[#id="jobsearch"]/button').click()
#Creates a dataframe
df = pd.DataFrame({'Link':[''], 'Job Title':[''], 'Company':[''], 'Location':[''],'Salary':[''], 'Date':['']})
#This loop goes through every page and grabs all the details of each posting
#Loop will only end when there are no more pages to go through
while True:
#Imports the HTML of the current page into python
soup = BeautifulSoup(driver.page_source, 'lxml')
#Grabs the HTML of each posting
postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
len(postings)
#grabs all the details for each posting and adds it as a row to the dataframe
for post in postings:
link = post.find('a').get('href')
link_full = 'https://ca.indeed.com'+link
name = post.find('h2', tabindex = '-1').text.strip()
company = post.find('span', class_ = 'companyName').text.strip()
try:
location = post.find('div', class_ = 'companyLocation').text.strip()
except:
location = 'N/A'
try:
salary = post.find('div', attrs = {'class':'heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly'}).text.strip()
except:
salary = 'N/A'
date = post.find('span', class_ = 'date').text.strip()
df = df.append({'Link':link_full, 'Job Title':name, 'Company':company, 'Location':location,'Salary':salary, 'Date':date},
ignore_index = True)
#checks if there is a button to go to the next page, and if not will stop the loop
try:
button = soup.find('a', attrs = {'aria-label': 'Next'}).get('href')
driver.get('https://ca.indeed.com'+button)
except:
break
Can I fix my code to get the expected results everytime I run it? Also, an additional issue: I'm scraping around 60 pages. But usually the program stops between 20 and 30 pages before the last page. Is there a way to fix the code so that it scrapes until the last page everytime?
Here is a simplified example with requests library:
import requests
from bs4 import BeautifulSoup
cookies = {}
headers = {}
params = {
'q': 'data analyst',
'l': 'toronto',
'from': 'searchOnHP',
}
response = requests.get('https://ca.indeed.com/jobs', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text)
postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
len(postings)
prints
15

CNN Scraper sporadically working in python

I've tried to create a Web Scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get an output for some of the scraped pages and sometimes it doesn't work at all.
I am using selenium and BeautifulSoup packages in Jupiter Notebook. I am iterating over the pages via the url parameters &page={}&from={}. I tried by.XPATH before and simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd
#3 ------------CNN SCRAPER
#3.1 ----------Define Funktion
def CNN_Scraper(max_pages):
base = "https://edition.cnn.com/"
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
load_content = browser.implicitly_wait(30)
base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'
#-------------Define empty lists to be scraped
CNN_title = []
CNN_date = []
CNN_article = []
article_count = 0
#-------------iterate over pages and extract
for page in range(1, max_pages + 1):
print("Page %d" % page)
url= base_url + "&page=%d&from=%d" % (page, article_count)
browser.get(url)
load_content
soup = BeautifulSoup(browser.page_source,'lxml')
search_results = soup.find('div', {'class':'cnn-search__results-list'})
contents = search_results.find_all('div', {'class':'cnn-search__result-contents'})
for content in contents:
try:
title = content.find('h3').text
print(title)
link = content.find('a')
link_url = link['href']
date = content.find('div',{'class':'cnn-search__result-publish-date'}).text.strip()
article = content.find('div',{'class':'cnn-search__result-body'}).text
except:
print("loser")
continue
CNN_title.append(title)
CNN_date.append(date)
CNN_article.append(article)
article_count += 100
print("-----")
#-------------Save in DF
df = pd.DataFrame()
df['title'] = CNN_title
df['date'] = CNN_date
df['article'] = CNN_article
df['link']=CNN_link
return df
#print("Complete")
browser.quit()
#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details check my previous answer
import requests
import json
def main(url):
with requests.Session() as req:
for item in range(1, 1000, 100):
r = req.get(url.format(item)).json()
for a in r['result']:
print("Headline: {}, Url: {}".format(
a['headline'], a['url']))
main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

Categories

Resources