Following links and crawling them - python

I was trying to make a crawler to follow links, with this code:
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)
        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        contents = response.css('div.page-glossary-detail__main-content')
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            desc = response.css('p ::text').getall()
            yield {
                'desc': desc
            }
        for content in contents:
            yield {
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get().strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
            }
I was able to get the links, but the part of following the links and extracting the information I need was not working, so a friend helped me come up with this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_auth_code():
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]


def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']


def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
              ",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]


def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
              "\"conclusionId\":\"" + conclusionId + "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text


auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))
It gets the JSON with the page items, but around iteration 230 it starts returning the following error and never recovers:
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is write all of this to a file so I can check whether it's getting every item I need from the page, and then produce a JSON file with just the information I need, not everything the site returns as it does now.
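For reference, here is a minimal sketch of one way to deal with the ME667 error and the file output, assuming the token can simply be re-issued by calling the login service again once the API reports it has expired (it reuses the helper functions from the script above):
import json

auth_token, device_id = get_auth_token_and_device_id()
results = []
for conclusion in get_conclusion_list(auth_token, device_id):
    detail = json.loads(get_details(conclusion, auth_token, device_id))
    # ME667 = "Expired user identification token": fetch a fresh token and retry once
    if detail.get('error', {}).get('code') == 'ME667':
        auth_token, device_id = get_auth_token_and_device_id()
        detail = json.loads(get_details(conclusion, auth_token, device_id))
    results.append(detail)

# Dump everything to a file so the output can be inspected before trimming it down
with open('mediktor_conclusions.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)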

After many sleepless nights I solved my problem. I will leave it here in case it helps someone.
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)
        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []
        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))
        for link in urls:
            driver.get(link)
            myElem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH,
                    "//div[@class='mdk-conclusion-detail__main-title']"
                )))
            nome_source = driver.find_element(By.XPATH,
                "//div[@class='mdk-conclusion-detail__main-title']"
            ).text
            nome_doenca.append(nome_source)
            driver.back()
        print(nome_doenca)
        driver.quit()
I just modified my code and didn't use scrapy, just the selenium selectors.
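If someone also needs the description and a JSON file at the end, a small, untested extension of the same loop might look like this (it assumes the description sits in the mdk-conclusion-detail__main-description div that the original spider's CSS selector pointed at, and that driver, urls, WebDriverWait, EC and By are set up as above):
import json

resultados = []
for link in urls:
    driver.get(link)
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@class='mdk-conclusion-detail__main-title']")))
    nome = driver.find_element(
        By.XPATH, "//div[@class='mdk-conclusion-detail__main-title']").text
    # Assumption: the description lives in this div, as in the original spider
    descricao = driver.find_element(
        By.XPATH, "//div[@class='mdk-conclusion-detail__main-description']").text
    resultados.append({'name': nome, 'desc': descricao})

with open('dicionario.json', 'w', encoding='utf-8') as f:
    json.dump(resultados, f, ensure_ascii=False, indent=2)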

Related

Proxycurl api doesnt return data properly

First, I'm sorry for this long message, but I have an issue that is blocking me from advancing in my project. Let me quickly explain the workflow: the user enters a search query -> a search is made on LinkedIn with this query -> the URLs of the users are grabbed (depending on the number of pages) -> those users are looked up in Proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> their info is grabbed with a function and stored in my DB -> info about the experiences of the scraped users is grabbed -> another Proxycurl search is made, this time for the companies -> company info is grabbed and stored in the DB -> the employees of each company are searched (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> the URL of the CTO is grabbed -> the contact API is used to grab the CTO's info (https://nubela.co/proxycurl/docs#contact-api-personal-contact-number-lookup-endpoint and https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint) -> everything is stored in the database.
OK, so I manage to grab the URLs and search for the users in the API, but I never manage to get the 'extra' information with my code, while I can grab it for the same profiles in Postman; the same goes for personal_email, personal_contact_number and github_profile_id.
Then I manage to grab the data about the companies, but the same problem remains: I can't retrieve the 'extra' information, or 'funding_data' or 'acquisitions', even though I include them in my code.
I really don't know what's wrong with my code (I'm assuming something is wrong because everything works perfectly with Postman), and I could use a little help here, thanks for your time! (Full code below.)
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests

cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importation des packages')

# Task 1: webdriver configuration
driver = webdriver.Chrome(ChromeDriverManager().install())

# Task 1.1: Open Chrome and Access Linkedin
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)

# Task 1.2: Import username and password
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)

# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)

# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')

search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)

try:
    driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
    print("Element not found")


def GetURL():  # function to grab linkedin urls
    page_source = BeautifulSoup(driver.page_source, features='lxml')
    a_elements = page_source.find_all('a', {'class': "app-aware-link"})
    all_urls = []
    for element in a_elements:
        url = element.get('href')
        all_urls.append(url)
    return all_urls


## Pagination
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
    URLs_one_page = GetURL()
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scrolling to the end of the page
    sleep(3)
    next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
    driver.execute_script("arguments[0].click();", next_button)
    sleep(2)
    if URLs_one_page is not None:
        URLs_all_page = URLs_all_page + URLs_one_page
        print(URLs_all_page)
    else:
        print('variable stores a None value')
        sleep(2)
        print(URLs_all_page)
sleep(1)


def get_profile_info(url):  # function to make api calls for users
    api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    params = {
        'url': url,
        'fallback_to_cache': 'on-error',
        'use_cache': 'if-present',
        'skills': 'include',
        'inferred_salary': 'include',
        'personal_email': 'include',
        'personal_contact_number': 'include',
        'twitter_profile_id': 'include',
        'facebook_profile_id': 'include',
        'github_profile_id': 'include',
        'extra': 'include',
    }
    try:
        response = requests.get(api_endpoint, headers=headers, params=params)
        if response.status_code != 404:
            data_profile = response.json()
            return data_profile
        else:
            return None
    except requests.exceptions.RequestException as e:
        print(e)
        return None


def get_company_info(url):  # function to make api calls for companies
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    params = {
        'resolve_numeric_id': 'true',
        'categories': 'include',
        'funding_data': 'include',
        'extra': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'url': 'include',
        'use_cache': 'if-present',
    }
    try:
        response = requests.get(api_endpoint, params={'url': url}, headers=headers)
        if response.status_code == 404:
            print("Company not found for URL:", url)
            return None
        else:
            data_company = response.json()
            print(data_company)
            if 'extra' in data_company:
                print("Extra information found:", data_company['extra'])
            else:
                print("No extra information found in JSON response.")
            return data_company
    except requests.exceptions.RequestException as e:
        print(e)
        return None


def get_company_employee_url(company_linkedin_profile_url):
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'page_size': '10',
        'linkedin_company_profile_url': company_linkedin_profile_url,
        'keyword_regex': '[Cc][Tt][Oo]',
        'enrich_profiles': 'enrich',
        'resolve_numeric_id': 'false',
    }
    response = requests.get(api_endpoint,
                            params=params,
                            headers=header_dic)
    print(response.status_code)
    print(response.text)
    if response.status_code == 404:
        print("No employees found for URL:", url)
        return None
    else:
        data_employees = response.json()
        if 'employees' in data_employees:
            print("Employees found:", data_employees['employee_search_results'])
        else:
            print("No employees found in JSON response.")
        # return and store profile_url in data_employees:
        for employee in data_employees['employee_search_results']:
            profile_url = employee['profile_url']
            print(profile_url)


def get_company_employee_info(profile_url):
    api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
    }
    response = requests.get(api_endpoint,
                            params=params,
                            headers=header_dic)


# Initialize visited URLs + data_list
visited_urls = []
for url in URLs_all_page:
    if url in visited_urls:
        print("Profile already exists in the database for URL:", url)
        continue
    data = get_profile_info(url)
    if data and "error" in data:
        print(data["error"])
    if not data or "experiences" not in data:
        continue
    data["search_query"] = search_query  # Add the search_query to the data
    db.reference('profiles').push(data)  # Store data in the candidates table
    visited_urls.append(url)
    print("Profile data and search query successfully added to the candidates table for URL:", url)
    for item in data['experiences']:
        company_name = str(item['company'])
        company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name)  # Error handling when pushing code to db, replacement of illegal values
        company_linkedin_profile_url = item['company_linkedin_profile_url']
        company_description = item['description']
        company_data = get_company_info(company_linkedin_profile_url)
        if company_name_push:
            filtered_company = db.reference('companies/' + company_name_push).get()
        else:
            continue
        if filtered_company is None:
            db.reference('companies').push({
                'company_name': company_name_push,
                'company_linkedin_profile_url': company_linkedin_profile_url,
                'company_description': company_description,
                'company_data': company_data
            })
            print("Company data successfully added for URL:", company_linkedin_profile_url)
        else:
            print("Company already exists in the database for URL:", company_linkedin_profile_url)
        experiences = {
            'candidate_name': data['full_name'],
            'title': item['title'],
            'company': item['company'],
            'location': item['location'],
            'start_date': item['starts_at'],
            'end_date': item['ends_at'],
            'description': item['description'],
        }
        db.reference('experiences').push(experiences)
        company_employee_url = get_company_employee_url(company_linkedin_profile_url)
        company_employee_data = get_company_employee_info(company_employee_url)
        if company_employee_data:
            db.reference('company_employees/' + company_name_push).push(company_employee_data)
            print("Company employee data successfully added for company:", company_name)
        else:
            print("No data found for company employees for company:", company_name)

Unable to scrape and make a dictionary from a news website

I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business
At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.
Here is the code I tried, but I couldn't build the dictionary.
Import all the necessary modules:
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from time import sleep
import uuid
import datetime
import time
from fake_useragent import UserAgent
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys
from fake_useragent import UserAgent
import warnings
warnings.filterwarnings('ignore')
import re
from tqdm import tqdm
import pandas as pd
import datetime
def string_to_date(x):
    return datetime.datetime.strptime(x, '%Y/%m/%d')


def datee(pp):
    return str(pp.date())
To get the links,
def get_link(res):
    href_list = []
    for res in res_list:  # h3
        link_list = res.select('a')
        for link in link_list:  # a
            href = link.get('href')
            href_list.append(href)
    return href_list
To get the article body, title, authors, date and utc date from every link
def get_article(url):
    news_list = []
    title_list = []
    page = requests.get(url)
    bsobj = soup(page.content)
    for title in bsobj.findAll('h1', {'class': 'view-article-title serif'}):
        title_list.append(title.text.strip())
    for news in bsobj.findAll('div', {'class': 'article-content-left pb-30'}):
        news = news_list.append(news.text.strip())
    author_list = []
    for f in news:
        author = ""
        pattern = r"BY\b(.+)(?=\[.+\])"
        resultsss = re.search(pattern, f)
        if resultsss != None:
            author = resultsss.group(0).strip()[3:]
            authors = author_list.append(author)
    # there is a date in every article link, hence we can use that
    date_list_1 = []
    separator = '/business'
    for link in href_list:
        new_set1 = link.replace('https://koreajoongangdaily.joins.com/', '')
        new_set2 = new_set1.split(separator, 1)[0]
        new_set3 = date_list_1.append(new_set2)
    new_set4 = list(map(datee, new_set_4))
    # no separate time, so add 00:00:00 for UTC
    p = []
    for x in new_set4:
        utc_date = p.append(str(x) + " 00:00:00")
    # print(news_list)
    return news_list, title_list, authors, new_set4, utc_date
The n denotes the number of pages I want to scrape:
def scrape_the_article(n):
    options = webdriver.ChromeOptions()
    lists = ['disable-popup-blocking']
    caps = DesiredCapabilities().CHROME
    caps["pageLoadStrategy"] = "normal"
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")
    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=options)  # paste your own chromedriver path
    url = "https://koreajoongangdaily.joins.com/section/business"
    driver.get(url)
    page = 0
    for step in tqdm(range(n)):  # set the page range here: how many pages you want to scrape
        page += 1
        time.sleep(2)
        try:
            button = driver.find_element_by_class_name("service-more-btn")
            button.click()
        except Exception as e:
            print("trying to scroll")
            driver.execute_script("window.scrollBy(0, 100);")
        print("Page: ", page)
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    res_list = bs.select('div[class="mid-article3"]')
    for res in res_list:
        links = get_article_links(res)
        article = get_article(links)


scrape_the_article(4)
And at the end I want to make a dictionary which will look like this:
data = {'date': new_set4, 'utc_date_time': utc_date, 'title': title_list, 'author': authors,
        'content': news_list, 'link': href_list}
But I couldn't get the dictionary I wanted. Please help me with this. Thank you!
There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.
NOTE: There's no author key in the response, so if you really need it, you'll have to visit each article URL.
Here's how to get the first 10 items:
import datetime
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

results = requests.post(api_endpoint, headers=headers, data=payload)

for result in results.json()['RESULT_LIST']:
    date = (
        datetime.datetime
        .strptime(result['service_date'], '%Y%m%d%H%M%S')
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    print(date)
    print(f"{result['list_title']}\n{result['cmss_url']}")
    print(f"{result['summary']}")
    print("-" * 50)
Output:
2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------
and more ...
To paginate the API, try this example:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

with requests.Session() as s:
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        for result in results.json()['RESULT_LIST']:
            print(result['service_date'])
            print(f"{result['list_title']}\n{result['cmss_url']}")
            print(f"{result['summary']}")
            print("-" * 50)
NOTE: I'd highly recommend throttling the requests to 1 to 3 seconds between each attempt.
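To get closer to the dictionary the question asks for, here is a rough sketch along the same lines (same endpoint, payload and pagination stepping as above; author is left empty because the API response doesn't expose it, and a short sleep is added between pages as recommended):
import time
import datetime
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}
api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {"url": "/section/business", "currPage": "1"}

articles = []
with requests.Session() as s:
    for page in range(1, 31, 10):  # same stepping as the pagination example above
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        for result in results.json()['RESULT_LIST']:
            date = datetime.datetime.strptime(result['service_date'], '%Y%m%d%H%M%S')
            articles.append({
                'date': date.strftime('%Y/%m/%d'),
                'utc_date_time': date.strftime('%Y-%m-%d %H:%M:%S'),
                'title': result['list_title'],
                'author': '',                  # not present in the API response
                'content': result['summary'],  # the list API only returns the summary
                'link': result['cmss_url'],
            })
        time.sleep(2)  # throttle between page requests

print(articles[:2])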

Insert value in searchbar, select autocomplete result and get value by bs4

I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. Go to the webpage: url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. Insert the ISIN in the search bar.
3. Select the autocomplete result from the container msci-ac-search-data-dropdown (click).
4. Read the value from the "div class: ratingdata-outercircle esgratings-profile-header-green" to get the text: "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:
import requests
from bs4 import BeautifulSoup
isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
soup = BeautifulSoup( requests.get(url).content, 'html.parser' )
payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup
isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()

params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)

soup = BeautifulSoup(
    requests.get(url, params=params, headers=headers).content, "html.parser"
)

data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press Enter, you send another request, which already shows the search result. Here is an example of how to get what you want:
import requests
isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"
for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and in order to follow said instructions you can only use selenium. The following code will work exactly as described and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa

Failed to parse content from a webpage using requests

I'm trying to create a script using the requests module (without using a session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from the dropdown.
Get the links to the detail page.
Grab these two fields from the detail page.
While creating the script using plain requests, I tried to make use of cookies, but I ended up getting an AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link


def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link


def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion


if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What possible change should I bring about to make the script work?
There's a redirect that occurs in fetch_detail_page_link. Python Requests follows redirects by default. When your script obtains the cookies, it is only grabbing the cookies for the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it was preserving those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However if you insist on not using Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}


def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link


def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)  # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link


def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion


if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))

How to improve this webscraping python script?

Brief context: I started Python two weeks ago, so don't hesitate to correct any mistake or improvement you see. I am trying to scrape data from the results club list of the site www.fff.fr.
My way of organizing it is:
Go to Homepage
Accept Cookies
Use search bar for cityname
Get result list
Follow each url of the result page
Go to each "Staff" sub-section
Extract data from this page
I started to build the Python code below, which is not working so far. I'd be really interested in feedback on how to actually make it work.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from shutil import which

chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get("https://fff.fr")

cookie_btn = driver.find_element_by_id("didomi-notice-agree-button")
cookie_btn.click()

search_input = driver.find_element_by_xpath("/html//form[@id='proximiteSearch']//input[@id='fff_club_form_club_near_to_search_address']")
search_input.send_keys("Paris")
search_input.send_keys(Keys.ENTER)

self.html = driver.page_source
driver.close()


def parse(self, response):
    resp = Selector(text=self.html)
    clubs = resp.xpath("(//ul[contains(@id, 'listresulclub')])/li/text()")
    for club in clubs:
        name = club.xpath(".//text()").get()
        name_link = club.xpath(".//@href").get()
        url = f"https://www.ffr.fr{name_link}"
        absolute_url = url[:-10] + "/le-staff"
        # absolute_url = response.urljoin()
        yield scrapy.Request(url=absolute_url, meta={'club_name': name})
        # yield response.follow(url=name_link, callback=self.parse_country, meta={'club_name': name})


def parse_country(self, response):
    name = response.request.meta['club_name']
    contacts = response.xpath("//div[@class='coor-block-content']/ol")
    for contact in contacts:
        contact_nom = contact.xpath(".//li[1]/text()").get()
        yield {
            'club_name': name,
            'correspondant_nom': contact_nom
        }
You can try the same thing without selenium and it works:
import bs4
import requests
import sys
import re
import unicodedata
import os
import random
import datetime
Current_Date_Formatted = datetime.datetime.today().strftime ('%d-%b-%Y')
time = str(Current_Date_Formatted)
filename = "footballstuff"
cityname = sys.argv[1]
filename=r"D:\Huzefa\Desktop\\" +filename+ ".txt"
url = "https://www.fff.fr/resultats?search="+cityname
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "lxml")
file = open(filename , 'wb')
for i in soup.select("a"):
    f = i.text
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", f)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
file.close()
