BeautifulSoup : Web scraping information after submit button is clicked

BeautifulSoup : Web scraping information after submit button is clicked - python

I'm relatively new to python coding, and i'm currently trying to extract data from a website but the information only shows up after a submit button is clicked. The webpage is https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre
Button I have to click : button
When I inspect the website, I was able to retrieve the url of the information contained/displayed after the button click (through the network tab when inspecting website).
Here is a preview of the information output the button URL gives : info output
What i'd like to know is if it's possible to keep the information classified by DIV elements, like it does when I click the button on the site... Thank you!
Code :
import requests
from bs4 import BeautifulSoup
import re
URL = "https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-
bassins-main-oeuvre"
page = requests.get(URL)
soup = BeautifulSoup(page.content,features="html.parser")
btn4 = soup.find('button',{"id":"get-labourpools"})
btn4_click = btn4['onclick']

There's an endpoint you can query to get the table data you're after.
Here's how:
import json
import requests
region_id = "01"
occupation_id = "110"
url = f"https://www.ccq.org/api/labourpools?regionId={region_id}&occupationId={occupation_id}"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
"X-Requested-With": "XMLHttpRequest",
}
data = requests.get(url, headers=headers).json()
print(json.dumps(data, indent=2))
Output:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
EDIT:
And if you want to get all tables for all regions and occupations, you can create all possible API request urls and get the data.
Here's how:
import json
import requests
from bs4 import BeautifulSoup
base_url = "https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre"
api_url = "https://www.ccq.org/api/labourpools?"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
"X-Requested-With": "XMLHttpRequest",
}
def get_ids(id_value: str) -> list:
return [
i["value"] for i
in soup.find("select", {"id": id_value}).find_all("option")[1:]
]
with requests.Session() as session:
soup = BeautifulSoup(session.get(base_url, headers=headers).text, "lxml")
region_ids = get_ids("dropdown-region")
occupation_ids = get_ids("dropdown-occupation")
all_query_urls = [
f"{api_url}regionId={region_id}&occupationId={occupation_id}"
for region_id in region_ids for occupation_id in occupation_ids
]
for query_url in all_query_urls[:2]: # remove [:2] to get all combinations
data = session.get(query_url, headers=headers).json()
print(json.dumps(data, indent=2))
This should output two entries:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "130",
"Name": "Calorifugeur",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]

Related

Python: Make duplicate keys unique json response

I am extracting data through POST method and let's say we have different post data in a list and for each request there will be a different post data like this:
cookies = {
"_sp_id.cf1a": "205e16a5-8970-4c92-97b8-969eebfcbb63.1647289721.7.1648545896.1648465133.73852927-e047-4c36-bae7-90c001509900",
"_sp_ses.cf1a": "*",
}
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
"Accept": "text/plain, */*; q=0.01",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.tradingview.com/",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Origin": "https://www.tradingview.com",
"Connection": "keep-alive",
# Requests sorts cookies= alphabetically
# 'Cookie': '_sp_id.cf1a=205e16a5-8970-4c92-97b8-969eebfcbb63.1647289721.7.1648545896.1648465133.73852927-e047-4c36-bae7-90c001509900; _sp_ses.cf1a=*',
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"Sec-GPC": "1",
# Requests doesn't support trailers
# 'TE': 'trailers',
}
data = [
'{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","close","change","change_abs","Recommend.All","volume","Value.Traded","market_cap_basic","price_earnings_ttm","earnings_per_share_basic_ttm","number_of_employees","sector","description","type","subtype","update_mode","pricescale","minmov","fractional","minmove2","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}',
'{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","change|1","change|5","change|15","change|60","change|240","change","change|1W","change|1M","Perf.3M","Perf.6M","Perf.YTD","Perf.Y","beta_1_year","Volatility.D","description","type","subtype","update_mode","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}',
]
with httpx.Client() as client:
for d in data:
r = client.post(
"https://scanner.tradingview.com/america/scan",
headers=headers,
cookies=cookies,
data=d,
)
I store post data in a list and iterating over it and passing it in request object, so far so good. Post requests returns json object and problem begins here, every request returns different json object except keys, they are the same for e.g
'{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","change|1","change|5","change|15","change|60","change|240","change","change|1W","change|1M","Perf.3M","Perf.6M","Perf.YTD","Perf.Y","beta_1_year","Volatility.D","description","type","subtype","update_mode","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}'
this post data returns this json:
{
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
0,
0.13602918,
0.2662209,
0.37497288,
0.6400696,
0.39670241,
0.39670241,
9.60952832,
20.52236029,
48.81477398,
19.62333826,
51.75676942,
0.5128781,
1.56710337,
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
"USD",
"USD"
]
}
]
}
and post data
{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","close","change","change_abs","Recommend.All","volume","Value.Traded","market_cap_basic","price_earnings_ttm","earnings_per_share_basic_ttm","number_of_employees","sector","description","type","subtype","update_mode","pricescale","minmov","fractional","minmove2","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}'
returns this which is similar to the last one i.e. keys but values are different
{
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
161.97,
0.39670241,
0.64,
0.42121212,
4516453,
731529892.41,
286085169370,
25.01356652,
6.4775,
50000,
"Health Technology",
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
100,
1,
"false",
0,
"USD",
"USD"
]
}
]
}
Hence I am getting just one json object which is the last one in the loop instead of both.
Desired Output:
{
"overview": {
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
161.97,
0.39670241,
0.64,
0.42121212,
4516453,
731529892.41,
286085169370,
25.01356652,
6.4775,
50000,
"Health Technology",
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
100,
1,
"false",
0,
"USD",
"USD"
]
}
]
},
"performance": {
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
0,
0.13602918,
0.2662209,
0.37497288,
0.6400696,
0.39670241,
0.39670241,
9.60952832,
20.52236029,
48.81477398,
19.62333826,
51.75676942,
0.5128781,
1.56710337,
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
"USD",
"USD"
]
}
]
}
}
I am looking for some solution to handle this kind of duplication in json response and get data from different post data on each request by making the duplicate keys to unique.

unable to scrape status of product

I want to scrape price and status of website. I am able to scrape price but unable to scrape status. Couldn't find in JSON as well.
here is link: https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true
from requests import get
from bs4 import BeautifulSoup
resp = get(url)
soup = BeautifulSoup(resp.text, 'lxml')
# print(soup.prettify())
price = soup.find('div', class_ = 'product-price')
status = soup.find('div', class_ = 'avl-status buy-box__shipping-item')
print(status.text)

You can use Json microformat embedded inside the page to obtain availability (price, images, description...).
For example:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true"
soup = BeautifulSoup( requests.get(url).content, 'html.parser' )
data = json.loads(soup.select_one('script[type="application/ld+json"]').contents[0])
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
print('Price : ', data['offers']['price'])
print('Availability: ', data['offers']['availability'])
Prints:
Price : 17.13
Availability: http://schema.org/InStock
EDIT: You can observe all product data that is embedded within the page:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zoro.com/baldwin-filters-filter-service-kit-thermo-king-bk6092/i/G1609513/"
# url = 'https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true'
soup = BeautifulSoup( requests.get(url).content, 'html.parser' )
data = json.loads(soup.select_one('div.hidden[data-state]')['data-state'] )
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
_, product_data = data['product']['productDetailsData'].popitem()
print(json.dumps(product_data, indent=4))
print()
print('isExpeditable = ', product_data['isExpeditable'])
When this key isExpeditable is set to False, it means Drop Shipping (I think). When I tested it with product that is in stock, it prints True.
The output:
{
"packageQty": 1,
"isMotorCompliant": false,
"zoroNo": "G1609513",
"brand": "Baldwin Filters",
"salesStatus": "TP",
"orderChannel": "Default",
"description": "Filter Service Kit, For Vehicle Type - Filter Kits Thermo King, Includes Lube Spin-On, Fuel, Water Separator Element, Fuel Spin-On",
"restrictedStates": [],
"title": "Filter Service Kit",
"categoryPaths": [
[
{
"name": "Automotive Filters",
"slug": "automotive-filters",
"code": "7540"
},
{
"name": "Filter Service Kits",
"slug": "filter-service-kits",
"code": "10660"
}
]
],
"restrictedSaleItemCode": "",
"slug": "baldwin-filters-filter-service-kit-thermo-king-bk6092",
"energyGuideLabelFileName": "",
"variants": null,
"isForcedOutOfStock": false,
"lightingFactLabelFileName": "",
"isExpeditable": false,
"erpId": "2770121",
"californiaProp65Message": null,
"isHazmat": false,
"leadTime": 8,
"mfrNo": "BK6092",
"attributes": [
{
"name": "For Vehicle Type - Filter Kits",
"value": "Thermo King"
},
{
"name": "Item",
"value": "Filter Service Kit"
},
{
"name": "For Use With",
"value": "Thermo King"
},
{
"name": "Includes",
"value": "Lube Spin-On, Fuel, Water Separator Element, Fuel Spin-On"
},
{
"name": "Country of Origin (subject to change)",
"value": "United States"
}
],
"originalPrice": null,
"isCircleECompliant": false,
"lowLeadComplianceLevel": "",
"priceUnit": "EA",
"isDropShipDirect": false,
"minRetailQty": 1,
"price": 118.29,
"media": [
{
"name": "Z1qr7ymcpEx_.JPG",
"type": "image/jpeg"
}
]
}
isExpeditable = False

Unable to send an application in the right way using post requests having multiple parameters

I'm trying to send an application after filling in a form available in a webpage using python. I've tried to mimic the process that I see in chrome dev tools but it seems I've gone somewhere wrong and that is the reason when I execute the following script I get this error:
{
"message":"415 Unsupported Media Type returned for /apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions with message: ",
"key":"Exception_server_error",
"errorId":"d6b128bd-426d-4bee-8dbb-03e232829f5e"
}
It seems to me that I need to use the value of token and version in an automatic manner as they are different in every application but I don't find them in page source and stuff.
I've selected No as value for all the dropdowns (when there is any) within Additional Information.
Link to the application page
Link to the attachment that I've used thrice.
I've tried with:
import requests
main_link = "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE"
post_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions"
payload = {
"candidateIdentity":{"firstName":"syed","lastName":"mushfiq","email":"mthmt80#gmail.com"},
"answeredDocuments":[{"documentType":"answeredForm","formId":"hsbc_bewerbungsprozess_pers_nliche_daten",
"answers":[
{"questionId":"form_of_address","type":"options","value":["form_of_address_m"]},
{"questionId":"academic_title","type":"simple","value":" Dr.","questionIds":[]},
{"questionId":"first_name","type":"simple","value":"syed","questionIds":[]},
{"questionId":"last_name","type":"simple","value":"mushfiq","questionIds":[]},
{"questionId":"e-mail_address","type":"simple","value":"mthmt80#gmail.com","questionIds":[]},
{"questionId":"phone__mobile_","type":"phone","countryCode":"+880","isoCountryCode":"BD","subscriberNumber":"1790128884"}]},
{"documentType":"answeredForm","formId":"hsbc_bewerbungsprozess_standard_fragebogen","answers":[{"questionId":"custom_question_450","type":"options","value":["custom_question_450_ja"]},
{"questionId":"custom_question_451","type":"options","value":["custom_question_451_nein"]},
{"questionId":"custom_question_452","type":"options","value":["custom_question_452_unter_keine_der_zuvor_genannten"]},
{"questionId":"custom_question_580","type":"options","value":["custom_question_580_nein_978"]},
{"questionId":"custom_question_637","type":"options","value":["custom_question_637_nein"]},
{"questionId":"custom_question_579","type":"options","value":["custom_question_579_nein"]},
{"questionId":"custom_question_583","type":"options","value":["custom_question_583_hsbc_deutschland_karriereseite"]}]},
#============The following three lines are supposed to help upload three files============
{"documentType":"attachment","attachmentId":"cover_letter","token":"2d178469-cdb5-4d65-9f67-1e7637896953","filename": open("demo.pdf","rb")},
{"documentType":"attachment","attachmentId":"attached_resume","token":"81a5a661-66bb-4918-a35c-ec260ffb7d02","filename": open("demo.pdf","rb")},
{"documentType":"attachment","attachmentId":"otherattachment","token":"4c3f7500-b072-48d4-83cf-0af1399bc8ba","filename": open("demo.pdf","rb")}],
#============The version's value should not be hardcoded=========================
"version":"V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5","lastModifiedDate":1562056029000,"answeredDataPrivacyConsents":[{"identifier":"urn:lms:ta:tlk:data-privacy-consent:mtu531:101","consentProvided":True},
{"identifier":"urn:lms:ta:tlk:data-privacy-consent:mtu531:102","consentProvided":True}],
"metaInformation":{"applicationFormUrl":"https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE","jobsToLink":[]}
}
def send_application(s,link):
res = s.post(link,data=payload)
print(res.text)
if __name__ == '__main__':
with requests.Session() as s:
send_application(s,post_link)
How can I send the application in the right way?
PS I can send the application manually multiple times using the same documents to the same email.

The best way to go about something like this is to open the page in a browser and view the network tab in the developer tools. From there as you're filling out the form you'll be able to see that each time you attach a document it sends an ajax request and receives the token in a json response. With those tokens you can build the final payload which should be submitted in json format.
Here's some example code that's working:
import requests
headers = {
'Host': 'emea3.recruitmentplatform.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'apply-config-key': 'AAACEwAA-55cd88d4-c9fd-41ce-95a4-f238402b898f',
'Origin': 'https://karriere.hsbc.de',
'DNT': '1',
'Connection': 'close',
'Referer': 'https://karriere.hsbc.de/',
'Cookie': 'lumesse_language=de_DE'
}
main_link = "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE"
post_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions"
ajax_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/attachments"
def build_payload(cover_letter_token, attached_resume_token, otherattachment_token):
return {
"candidateIdentity": {
"firstName": "Syed",
"lastName": "Mushfiq",
"email": "mthmt80#gmail.com"
},
"answeredDocuments": [
{
"documentType": "answeredForm",
"formId": "hsbc_bewerbungsprozess_pers_nliche_daten",
"answers": [
{
"questionId": "form_of_address",
"type": "options",
"value": [
"form_of_address_m"
]
},
{
"questionId": "academic_title",
"type": "simple",
"value": "prof",
"questionIds": []
},
{
"questionId": "first_name",
"type": "simple",
"value": "Syed",
"questionIds": []
},
{
"questionId": "last_name",
"type": "simple",
"value": "Mushfiq",
"questionIds": []
},
{
"questionId": "e-mail_address",
"type": "simple",
"value": "mthmt80#gmail.com",
"questionIds": []
},
{
"questionId": "phone__mobile_",
"type": "phone",
"countryCode": "+49",
"isoCountryCode": "DE",
"subscriberNumber": "30 33850062"
}
]
},
{
"documentType": "answeredForm",
"formId": "hsbc_bewerbungsprozess_standard_fragebogen",
"answers": [
{
"questionId": "custom_question_450",
"type": "options",
"value": [
"custom_question_450_ja"
]
},
{
"questionId": "custom_question_451",
"type": "options",
"value": [
"custom_question_451_nein"
]
},
{
"questionId": "custom_question_452",
"type": "options",
"value": [
"custom_question_452_unter_keine_der_zuvor_genannten"
]
},
{
"questionId": "custom_question_580",
"type": "options",
"value": [
"custom_question_580_ja"
]
},
{
"questionId": "custom_question_637",
"type": "options",
"value": [
"custom_question_637_nein"
]
},
{
"questionId": "custom_question_579",
"type": "options",
"value": [
"custom_question_579_nein"
]
},
{
"questionId": "custom_question_583",
"type": "options",
"value": [
"custom_question_583_linkedin"
]
}
]
},
{
"documentType": "attachment",
"attachmentId": "cover_letter",
"token": cover_letter_token,
"filename": "demo.pdf"
},
{
"documentType": "attachment",
"attachmentId": "attached_resume",
"token": attached_resume_token,
"filename": "demo.pdf"
},
{
"documentType": "attachment",
"attachmentId": "otherattachment",
"token": otherattachment_token,
"filename": "demo.pdf"
}
],
"version": "V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5",
"lastModifiedDate": "1562056029000",
"answeredDataPrivacyConsents": [
{
"identifier": "urn:lms:ta:tlk:data-privacy-consent:mtu531:101",
"consentProvided": "true"
},
{
"identifier": "urn:lms:ta:tlk:data-privacy-consent:mtu531:102",
"consentProvided": "true"
}
],
"metaInformation": {
"applicationFormUrl": "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE",
"jobsToLink": []
}
}
def submit_attachment(s, link, f):
d = open(f, 'rb').read()
r = s.post(link, files={'file':('demo.pdf', d),'applicationProcessVersion':(None, 'V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5')})
r_data = r.json()
return r_data.get('token')
def send_application(s,link,p):
res = s.post(link, json=p)
return res
if __name__ == '__main__':
attachment_list = ["cover_letter_token", "attached_resume_token", "otherattachment_token"]
token_dict = {}
with requests.Session() as s:
s.headers.update(headers)
for at in attachment_list:
rt = submit_attachment(s, ajax_link, "demo.pdf")
token_dict[at] = rt
payload = build_payload(token_dict['cover_letter_token'], token_dict['attached_resume_token'], token_dict['otherattachment_token'])
rd = send_application(s, post_link, payload)
print(rd.text)
print(rd.status_code)

Scraping wsj.com

I wanted to scrape some data from wsj.com and print it. The actual website is: https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main and the data is NYSE Issues Advancing, Declining and NYSE Share Volume Advancing, Declining.
I tried using beautifulsoup after watching a youtube video but I can't get any of the classes to return a value inside body.
Here is my code:
from bs4 import BeautifulSoup
import requests
source = requests.get('https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main').text
soup = BeautifulSoup(source, 'lxml')
body = soup.find('body')
adv = body.find('td', class_='WSJTables--table__cell--2dzGiO7q WSJTheme--table__cell--1At-VGNg ')
print(adv)
Also while inspecting elements in Network I noticed that this data is also available as a JSON.
Here is the link: https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary
So I wrote another script to try and parse this data using JSON but again its not working.
Here is the code:
import json
import requests
url = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'
response = json.loads(requests.get(url).text)
print(response)
The error I get is:
File "C:\Users\User\Anaconda3\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting value
I also tried a few different methods from this link and none seem to work.
Can you please set me on the right path how to scrape this data?

from bs4 import BeautifulSoup
import requests
import json
params = {
'id': '{"application":"WSJ","marketsDiaryType":"overview"}',
'type': 'mdc_marketsdiary'
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0"
}
r = requests.get(
"https://www.wsj.com/market-data/stocks", params=params, headers=headers).json()
data = json.dumps(r, indent=4)
print(data)
Output:
{
"id": "{\"application\":\"WSJ\",\"marketsDiaryType\":\"overview\"}",
"type": "mdc_marketsdiary",
"data": {
"instrumentSets": [
{
"headerFields": [
{
"value": "name",
"label": "Issues"
}
],
"instruments": [
{
"name": "Advancing",
"NASDAQ": "169",
"NYSE": "69"
},
{
"name": "Declining",
"NASDAQ": "3,190",
"NYSE": "2,973"
},
{
"name": "Unchanged",
"NASDAQ": "24",
"NYSE": "10"
},
{
"name": "Total",
"NASDAQ": "3,383",
"NYSE": "3,052"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Issues At"
}
],
"instruments": [
{
"name": "New Highs",
"NASDAQ": "53",
"NYSE": "14"
},
{
"name": "New Lows",
"NASDAQ": "1,406",
"NYSE": "1,620"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Share Volume"
}
],
"instruments": [
{
"name": "Total",
"NASDAQ": "4,454,691,895",
"NYSE": "7,790,947,818"
},
{
"name": "Advancing",
"NASDAQ": "506,192,012",
"NYSE": "219,412,232"
},
{
"name": "Declining",
"NASDAQ": "3,948,035,191",
"NYSE": "7,570,377,893"
},
{
"name": "Unchanged",
"NASDAQ": "464,692",
"NYSE": "1,157,693"
}
]
}
],
"timestamp": "4:00 PM EDT 3/09/20"
},
"hash": "{\"id\":\"{\\\"application\\\":\\\"WSJ\\\",\\\"marketsDiaryType\\\":\\\"overview\\\"}\",\"type\":\"mdc_marketsdiary\",\"data\":{\"instrumentSets\":[{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues\"}],\"instruments\":[{\"name\":\"Advancing\",\"NASDAQ\":\"169\",\"NYSE\":\"69\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,190\",\"NYSE\":\"2,973\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"24\",\"NYSE\":\"10\"},{\"name\":\"Total\",\"NASDAQ\":\"3,383\",\"NYSE\":\"3,052\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues At\"}],\"instruments\":[{\"name\":\"New Highs\",\"NASDAQ\":\"53\",\"NYSE\":\"14\"},{\"name\":\"New Lows\",\"NASDAQ\":\"1,406\",\"NYSE\":\"1,620\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Share Volume\"}],\"instruments\":[{\"name\":\"Total\",\"NASDAQ\":\"4,454,691,895\",\"NYSE\":\"7,790,947,818\"},{\"name\":\"Advancing\",\"NASDAQ\":\"506,192,012\",\"NYSE\":\"219,412,232\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,948,035,191\",\"NYSE\":\"7,570,377,893\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"464,692\",\"NYSE\":\"1,157,693\"}]}],\"timestamp\":\"4:00 PM EDT 3/09/20\"}}"
}
Note: You can access it as dict print(r.keys()).

You need to add a header on the url so that it will not return error=404.
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
url = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'
# put a header on the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0'}
req = urllib.request.Request(url=url, headers=headers)
with urlopen(req) as response:
page_html = response.read()
df = pd.DataFrame()
data = json.loads(page_html).get('data')
for instrumentSets in data.get('instrumentSets'):
for k,v in instrumentSets.items():
if k == 'instruments':
df = df.append(pd.DataFrame(v))
df=df.rename(columns = {'name':'Issues'})
df
Result:

Is there a way to scrape ads' url from SeLoger?

I'm trying to scrape the french website SeLoger, I can find and scrape all ads and put it in an Json.
The issues is I can't find the final url of the ads with this way.
The Url is in a div called "cartouche" with the class c-pa-link link_AB.
import requests
from bs4 import BeautifulSoup
import json
url = 'https://www.seloger.com/list.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=440109'
headers = {
'User-Agent': '*',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
s = requests.Session()
s.headers.update(headers)
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for script_item in soup.find_all('script'):
if 'var ava_data' in script_item.text:
raw_json = script_item.text.split('var ava_data = ')[1].split('};')[0] + "}"
data = json.loads(raw_json)
print(data)
I expected to put a field in the json like this.
{
"url":"https://www.seloger.com/annonces/achat/appartement/nantes-44/centre-ville/144279775.htm?enterprise=0&natures=1,4&places=%5b%7bci%3a440109%7d%5d&projects=2,5&qsversion=1.0&types=1,2&bd=ListToDetail",
"idannonce": "149546457",
"idagence": "294918",
"idtiers": "323172",
"typedebien": "Appartement",
"typedetransaction": [
"viager"
],
"idtypepublicationsourcecouplage": "SL",
"position": "2",
"codepostal": "44100",
"ville": "Nantes",
"departement": "Loire-Atlantique",
"codeinsee": "440109",
"produitsvisibilite": "AD:AC:BX:AW",
"affichagetype": [
{
"name": "liste",
"value": "True"
}
],
"cp": "44100",
"etage": "0",
"idtypechauffage": "0",
"idtypecommerce": "0",
"idtypecuisine": "séparée équipée",
"naturebien": "1",
"si_balcon": "1",
"nb_chambres": "1",
"nb_pieces": "2",
"si_sdbain": "0",
"si_sdEau": "0",
"nb_photos": "15",
"prix": "32180",
"surface": "41"
}
Thanks for your help.

You can use zip() functions to "tie" products from json data to URLs in the webpage:
import requests
from bs4 import BeautifulSoup
import json
url = 'https://www.seloger.com/list.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=440109'
headers = {
'User-Agent': '*',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
s = requests.Session()
s.headers.update(headers)
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for script_item in soup.find_all('script'):
if 'var ava_data' in script_item.text:
raw_json = script_item.text.split('var ava_data = ')[1].split('};')[0] + "}"
data = json.loads(raw_json)
for a, p in zip(soup.select('.c-pa-info > a'), data['products']):
p['url'] = a['href']
print(json.dumps(data, indent=4))
Prints:
...
{
"idannonce": "139994713",
"idagence": "48074",
"idtiers": "24082",
"typedebien": "Appartement",
"typedetransaction": [
"vente"
],
"idtypepublicationsourcecouplage": "SL9",
"position": "16",
"codepostal": "44000",
"ville": "Nantes",
"departement": "Loire-Atlantique",
"codeinsee": "440109",
"produitsvisibilite": "AM:AC:BB:BX:AW",
"affichagetype": [
{
"name": "liste",
"value": true
}
],
"cp": "44000",
"etage": "0",
"idtypechauffage": "0",
"idtypecommerce": "0",
"idtypecuisine": "0",
"naturebien": "2",
"si_balcon": "0",
"nb_chambres": "0",
"nb_pieces": "3",
"si_sdbain": "0",
"si_sdEau": "0",
"nb_photos": "4",
"prix": "147900",
"surface": "63",
"url": "https://www.selogerneuf.com/annonces/achat/appartement/nantes-44/139994713/#?cmp=INTSL_ListToDetail"
},
{
"idannonce": "146486955",
"idagence": "334754",
...
NOTE: Some URLs have different structure from
https://www.seloger.com/annonces/achat/appartement/nantes-44/centre-ville/{idannonce}.htm?ci=440109&enterprise=0&idtt=2,5&idtypebien=2,1&naturebien=1,2,4&tri=initial&bd=ListToDetail
for example
https://www.selogerneuf.com/annonces/investissement/appartement/nantes-44/146486955/#?cmp=INTSL_ListToDetail

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

BeautifulSoup : Web scraping information after submit button is clicked - python

Related

Python: Make duplicate keys unique json response

unable to scrape status of product

Unable to send an application in the right way using post requests having multiple parameters

Scraping wsj.com

Is there a way to scrape ads' url from SeLoger?

Categories

Resources