Is there a way to scrape ads' url from SeLoger? - python

I'm trying to scrape the French website SeLoger. I can find and scrape all the ads and put them into JSON.
The issue is that I can't find the final URL of each ad this way.
The Url is in a div called "cartouche" with the class c-pa-link link_AB.
import requests
from bs4 import BeautifulSoup
import json

# Search-results page for the Nantes area (codeinsee 440109).
url = 'https://www.seloger.com/list.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=440109'

# Browser-like headers so the request is less likely to be rejected.
headers = {
    'User-Agent': '*',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

s = requests.Session()
s.headers.update(headers)
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# The listing metadata is embedded in an inline <script> tag as
# "var ava_data = {...};" — extract that object and parse it as JSON.
# NOTE: the original paste had lost all indentation, which is a SyntaxError
# in Python; the loop/conditional bodies are restored here.
for script_item in soup.find_all('script'):
    if 'var ava_data' in script_item.text:
        # Take everything after the assignment, drop everything after the
        # closing "};", then restore the "}" that the split removed.
        raw_json = script_item.text.split('var ava_data = ')[1].split('};')[0] + "}"
        data = json.loads(raw_json)
        print(data)
I expected to put a field in the json like this.
{
"url":"https://www.seloger.com/annonces/achat/appartement/nantes-44/centre-ville/144279775.htm?enterprise=0&natures=1,4&places=%5b%7bci%3a440109%7d%5d&projects=2,5&qsversion=1.0&types=1,2&bd=ListToDetail",
"idannonce": "149546457",
"idagence": "294918",
"idtiers": "323172",
"typedebien": "Appartement",
"typedetransaction": [
"viager"
],
"idtypepublicationsourcecouplage": "SL",
"position": "2",
"codepostal": "44100",
"ville": "Nantes",
"departement": "Loire-Atlantique",
"codeinsee": "440109",
"produitsvisibilite": "AD:AC:BX:AW",
"affichagetype": [
{
"name": "liste",
"value": "True"
}
],
"cp": "44100",
"etage": "0",
"idtypechauffage": "0",
"idtypecommerce": "0",
"idtypecuisine": "séparée équipée",
"naturebien": "1",
"si_balcon": "1",
"nb_chambres": "1",
"nb_pieces": "2",
"si_sdbain": "0",
"si_sdEau": "0",
"nb_photos": "15",
"prix": "32180",
"surface": "41"
}
Thanks for your help.

You can use the zip() function to "tie" products from the JSON data to URLs in the webpage:
import requests
from bs4 import BeautifulSoup
import json

# Search-results page for the Nantes area (codeinsee 440109).
url = 'https://www.seloger.com/list.htm?tri=initial&enterprise=0&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=440109'

# Browser-like headers so the request is less likely to be rejected.
headers = {
    'User-Agent': '*',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

s = requests.Session()
s.headers.update(headers)
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# NOTE: the original paste had lost all indentation (a SyntaxError in
# Python); the loop/conditional bodies are restored here.
for script_item in soup.find_all('script'):
    if 'var ava_data' in script_item.text:
        # Extract the "var ava_data = {...};" object embedded in the page.
        raw_json = script_item.text.split('var ava_data = ')[1].split('};')[0] + "}"
        data = json.loads(raw_json)
        # The ad cards appear in the same order as the entries in
        # data["products"], so zip() pairs each product dict with its link.
        for a, p in zip(soup.select('.c-pa-info > a'), data['products']):
            p['url'] = a['href']
        print(json.dumps(data, indent=4))
Prints:
...
{
"idannonce": "139994713",
"idagence": "48074",
"idtiers": "24082",
"typedebien": "Appartement",
"typedetransaction": [
"vente"
],
"idtypepublicationsourcecouplage": "SL9",
"position": "16",
"codepostal": "44000",
"ville": "Nantes",
"departement": "Loire-Atlantique",
"codeinsee": "440109",
"produitsvisibilite": "AM:AC:BB:BX:AW",
"affichagetype": [
{
"name": "liste",
"value": true
}
],
"cp": "44000",
"etage": "0",
"idtypechauffage": "0",
"idtypecommerce": "0",
"idtypecuisine": "0",
"naturebien": "2",
"si_balcon": "0",
"nb_chambres": "0",
"nb_pieces": "3",
"si_sdbain": "0",
"si_sdEau": "0",
"nb_photos": "4",
"prix": "147900",
"surface": "63",
"url": "https://www.selogerneuf.com/annonces/achat/appartement/nantes-44/139994713/#?cmp=INTSL_ListToDetail"
},
{
"idannonce": "146486955",
"idagence": "334754",
...
NOTE: Some URLs have different structure from
https://www.seloger.com/annonces/achat/appartement/nantes-44/centre-ville/{idannonce}.htm?ci=440109&enterprise=0&idtt=2,5&idtypebien=2,1&naturebien=1,2,4&tri=initial&bd=ListToDetail
for example
https://www.selogerneuf.com/annonces/investissement/appartement/nantes-44/146486955/#?cmp=INTSL_ListToDetail

Related

Python: Make duplicate keys unique json response

I am extracting data through POST method and let's say we have different post data in a list and for each request there will be a different post data like this:
import httpx  # was missing from the original paste: httpx.Client() is used below

cookies = {
    "_sp_id.cf1a": "205e16a5-8970-4c92-97b8-969eebfcbb63.1647289721.7.1648545896.1648465133.73852927-e047-4c36-bae7-90c001509900",
    "_sp_ses.cf1a": "*",
}

# Headers copied from the browser's network tab for the scanner request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Accept": "text/plain, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.tradingview.com/",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://www.tradingview.com",
    "Connection": "keep-alive",
    # Requests sorts cookies= alphabetically
    # 'Cookie': '_sp_id.cf1a=205e16a5-8970-4c92-97b8-969eebfcbb63.1647289721.7.1648545896.1648465133.73852927-e047-4c36-bae7-90c001509900; _sp_ses.cf1a=*',
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-site",
    "Sec-GPC": "1",
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# Two pre-serialized scan payloads: same filter, different "columns" lists
# (an "overview" set and a "performance" set).
data = [
    '{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","close","change","change_abs","Recommend.All","volume","Value.Traded","market_cap_basic","price_earnings_ttm","earnings_per_share_basic_ttm","number_of_employees","sector","description","type","subtype","update_mode","pricescale","minmov","fractional","minmove2","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}',
    '{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","change|1","change|5","change|15","change|60","change|240","change","change|1W","change|1M","Perf.3M","Perf.6M","Perf.YTD","Perf.Y","beta_1_year","Volatility.D","description","type","subtype","update_mode","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}',
]

# NOTE: the original paste had lost all indentation; the with/for bodies are
# restored here. Each response overwrites r — collect r.json() per iteration
# if both results are needed (the question's actual problem).
with httpx.Client() as client:
    for d in data:
        r = client.post(
            "https://scanner.tradingview.com/america/scan",
            headers=headers,
            cookies=cookies,
            data=d,
        )
I store the POST data in a list, iterate over it, and pass each entry to the request object — so far so good. Each POST request returns a JSON object, and the problem begins here: every request returns a different JSON object, but the keys are the same. For example:
'{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","change|1","change|5","change|15","change|60","change|240","change","change|1W","change|1M","Perf.3M","Perf.6M","Perf.YTD","Perf.Y","beta_1_year","Volatility.D","description","type","subtype","update_mode","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}'
this post data returns this json:
{
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
0,
0.13602918,
0.2662209,
0.37497288,
0.6400696,
0.39670241,
0.39670241,
9.60952832,
20.52236029,
48.81477398,
19.62333826,
51.75676942,
0.5128781,
1.56710337,
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
"USD",
"USD"
]
}
]
}
and post data
{"filter":[{"left":"name","operation":"nempty"},{"left":"exchange","operation":"in_range","right":["AMEX","NASDAQ","NYSE"]},{"left":"High.All","operation":"eless","right":"high"},{"left":"is_primary","operation":"equal","right":true},{"left":"subtype","operation":"nequal","right":"preferred"}],"options":{"lang":"en","active_symbols_only":true},"markets":["america"],"symbols":{"query":{"types":[]},"tickers":[]},"columns":["logoid","name","close","change","change_abs","Recommend.All","volume","Value.Traded","market_cap_basic","price_earnings_ttm","earnings_per_share_basic_ttm","number_of_employees","sector","description","type","subtype","update_mode","pricescale","minmov","fractional","minmove2","currency","fundamental_currency_code"],"sort":{"sortBy":"name","sortOrder":"asc"},"range":[0,150]}'
returns this which is similar to the last one i.e. keys but values are different
{
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
161.97,
0.39670241,
0.64,
0.42121212,
4516453,
731529892.41,
286085169370,
25.01356652,
6.4775,
50000,
"Health Technology",
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
100,
1,
"false",
0,
"USD",
"USD"
]
}
]
}
Hence I am getting just one json object which is the last one in the loop instead of both.
Desired Output:
{
"overview": {
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
161.97,
0.39670241,
0.64,
0.42121212,
4516453,
731529892.41,
286085169370,
25.01356652,
6.4775,
50000,
"Health Technology",
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
100,
1,
"false",
0,
"USD",
"USD"
]
}
]
},
"performance": {
"totalCount": 141,
"data": [
{
"s": "NYSE:ABBV",
"d": [
"abbvie",
"ABBV",
0,
0.13602918,
0.2662209,
0.37497288,
0.6400696,
0.39670241,
0.39670241,
9.60952832,
20.52236029,
48.81477398,
19.62333826,
51.75676942,
0.5128781,
1.56710337,
"AbbVie Inc.",
"stock",
"common",
"delayed_streaming_900",
"USD",
"USD"
]
}
]
}
}
I am looking for some solution to handle this kind of duplication in json response and get data from different post data on each request by making the duplicate keys to unique.

BeautifulSoup : Web scraping information after submit button is clicked

I'm relatively new to python coding, and i'm currently trying to extract data from a website but the information only shows up after a submit button is clicked. The webpage is https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre
Button I have to click : button
When I inspect the website, I was able to retrieve the url of the information contained/displayed after the button click (through the network tab when inspecting website).
Here is a preview of the information output the button URL gives : info output
What i'd like to know is if it's possible to keep the information classified by DIV elements, like it does when I click the button on the site... Thank you!
Code :
import requests
from bs4 import BeautifulSoup
import re

# The URL must be a single string literal — in the original paste it was
# broken across two lines, which is a SyntaxError. Rejoined here.
URL = ("https://www.ccq.org/fr-CA/qualification-acces-industrie/"
       "bassins-main-oeuvre/etat-bassins-main-oeuvre")

page = requests.get(URL)
soup = BeautifulSoup(page.content, features="html.parser")

# The button that triggers the AJAX load; its onclick attribute holds the
# JavaScript call the author was trying to reverse-engineer.
btn4 = soup.find('button', {"id": "get-labourpools"})
btn4_click = btn4['onclick']
There's an endpoint you can query to get the table data you're after.
Here's how:
import json
import requests

# Hit the labour-pools API endpoint directly instead of scraping the page.
selected_region = "01"
selected_occupation = "110"
endpoint = f"https://www.ccq.org/api/labourpools?regionId={selected_region}&occupationId={selected_occupation}"

# The XHR header mirrors what the site's own JavaScript sends.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
    "X-Requested-With": "XMLHttpRequest",
}

response = requests.get(endpoint, headers=request_headers)
print(json.dumps(response.json(), indent=2))
Output:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
EDIT:
And if you want to get all tables for all regions and occupations, you can create all possible API request urls and get the data.
Here's how:
import json
import requests
from bs4 import BeautifulSoup

base_url = "https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre"
api_url = "https://www.ccq.org/api/labourpools?"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
    "X-Requested-With": "XMLHttpRequest",
}


# NOTE: the original paste had lost all indentation (a SyntaxError in
# Python); the def/with/for bodies are restored here.
def get_ids(id_value: str) -> list:
    """Return the option values of the <select> with the given id, skipping
    the first (placeholder) option. Relies on the module-level `soup`."""
    return [
        i["value"] for i
        in soup.find("select", {"id": id_value}).find_all("option")[1:]
    ]


with requests.Session() as session:
    soup = BeautifulSoup(session.get(base_url, headers=headers).text, "lxml")
    region_ids = get_ids("dropdown-region")
    occupation_ids = get_ids("dropdown-occupation")
    # Cartesian product of every region with every occupation.
    all_query_urls = [
        f"{api_url}regionId={region_id}&occupationId={occupation_id}"
        for region_id in region_ids for occupation_id in occupation_ids
    ]
    for query_url in all_query_urls[:2]:  # remove [:2] to get all combinations
        data = session.get(query_url, headers=headers).json()
        print(json.dumps(data, indent=2))
This should output two entries:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "130",
"Name": "Calorifugeur",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]

Web scraping Using request headers but it's returning page html instead of the Ajax data

I'm trying to scrape this site https://app.mybodygallery.com/#/
I checked the Network tab for the XHR data, and the data is neatly packed in there, so I tried to use the request headers, but I'm getting the HTML for the page instead of the data.
import requests
import json

# Session plus a full set of browser headers copied from the network tab.
client = requests.Session()

URL = 'https://app.mybodygallery.com/#/?age=30'

request_headers = {
    'authority': 'app.mybodygallery.com',
    'method': 'GET',
    'path': '/classes/Photo?where=%7B%22gender%22:%22female%22,%22status%22:%22APPROVED%22,%22weight%22:%7B%22$gte%22:47,%22$lte%22:53%7D%7D',
    'scheme': 'https',
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'referer': URL,
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile': '?1',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'
}

# Fetching the page URL (not the XHR endpoint) returns the app's HTML shell.
response = client.get(URL, headers=request_headers)
print(response.text)
This is what my code looks like, i appreciate any help
You can try that code
import requests
import json

# One session for the API calls.
api = requests.Session()

# The Parse-server endpoint that actually serves the gallery data.
url = 'https://app.mybodygallery.com/classes/Photo'

# Application credentials taken from the site's JavaScript bundle.
credentials = {
    'apiKey': 'yFhgrXWYwWoXbH6T6CFhVpROwWaYeldrv7GiDUrZ',
    'apiId': 'LNBLKqkrOMxfDgN18GcdJWfd8scviStNIRpzAoBm'
}

# Parse expects the filter as a JSON-encoded "where" query parameter.
params = {
    'where': json.dumps({"gender": "female", "status": "APPROVED", "age": 30})
}

# Authenticate every request by attaching the keys to the session headers.
api.headers.update({
    'x-parse-application-id': credentials['apiId'],
    'x-parse-javascript-key': credentials['apiKey'],
})

result = api.get(url, params=params).json()
print(json.dumps(result, indent=4))
{
"results": [
{
"objectId": "65e4a670b0",
"createdAt": "2010-08-15T23:28:56.000Z",
"updatedAt": "2019-07-23T11:44:11.884Z",
"user": {
"__type": "Pointer",
"className": "_User",
"objectId": "65e4a7510f"
},
"sourceId": 1327,
"sourceName": "0/863-1",
"source": "v1Portal",
"sourceViews": 14852,
"height": 170,
"weight": 52.2,
"age": 30,
"pant": 30,
"shirt": 36,
"bodytype": "hourglass",
"status": "APPROVED",
"gender": "female",
"featured": false,
"file": {
"__type": "File",
"name": "c6767c0e1bb46d997bd0f7b2fcc44f45_logo.jpeg",
"url": "https://parsefiles.back4app.com/LNBLKqkrOMxfDgN18GcdJWfd8scviStNIRpzAoBm/c6767c0e1bb46d997bd0f7b2fcc44f45_logo.jpeg"
}
},
...
P.S. if I help you - please mark answer as correct :)

Web Scraping content-type:JSON

I am attempting to scrape location details from
here.
Using BeautifulSoup I got an empty list as a result. The issue is that the data I want to scrape is not available when viewing the page source. In Developer tools > Network, the content type is JSON. So I tried the code below:
# Attempt to POST a job-search query to Taleo; fails with JSONDecodeError
# because the response to this request is not JSON (see notes below).
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
# NOTE(review): this is the HTML search page (.ftl), not the JSON API —
# presumably the JSON lives at a different /rest/ endpoint; verify in the
# browser's network tab.
url = 'https://ngc.taleo.net/careersection/ng_pro_intl_aujobs/jobsearch.ftl?lang=en_GB&location=756140022608&radiusType=K&searchExpanded=true&radius=1&portal=34140031600&_ga=2.197392303.1699610010.1604351575-1311873605.1579627290'
s = requests.Session()
# Cookies copied from the browser; the "^%^"-style sequences look like
# Windows cmd caret-escaping from a copied cURL command — TODO confirm.
cookies = {
    'locale': 'en-GB',
    '_gcl_au': '1.1.79711829.1614933155',
    '_ga': 'GA1.2.693390019.1614933178',
    '__atssc': 'google^%^3B1',
    '_gid': 'GA1.2.1213481278.1618077337',
    '__atuvc': '1^%^7C10^%^2C0^%^7C11^%^2C9^%^7C12^%^2C14^%^7C13^%^2C28^%^7C14',
    '__atuvs': '6071e67dc413e3d6001',
}
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'sec-ch-ua': '^\\^Google',
    'tzname': 'Asia/Calcutta',
    'sec-ch-ua-mobile': '?0',
    'tz': 'GMT+05:30',
    'Content-Type': 'application/json',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': '###MY USER AGENT HERE####',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'https://ngc.taleo.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://ngc.taleo.net/careersection/ng_pro_intl_aujobs/jobsearch.ftl?lang=en_GB&location=756140022608&radiusType=K&searchExpanded=true&radius=1&portal=34140031600&_ga=2.197392303.1699610010.1604351575-1311873605.1579627290',
    'Accept-Language': 'en-US,en;q=0.9',
}
params = (
    ('lang', 'en_GB'),
    ('portal', '34140031600'),
)
# The body is riddled with '^' characters (again apparently cmd-style
# escaping of a copied cURL command), so what is sent is NOT valid JSON —
# which is the direct cause of the JSONDecodeError on the response.
data = '^{^\\^multilineEnabled^\\^:true,^\\^sortingSelection^\\^:^{^\\^sortBySelectionParam^\\^:^\\^3^\\^,^\\^ascendingSortingOrder^\\^:^\\^false^\\^^},^\\^fieldData^\\^:^{^\\^fields^\\^:^{^\\^KEYWORD^\\^:^\\^^\\^,^\\^LOCATION^\\^:^\\^756140022608^\\^,^\\^JOB_TITLE^\\^:^\\^^\\^^},^\\^valid^\\^:true^},^\\^filterSelectionParam^\\^:^{^\\^searchFilterSelections^\\^:^[^{^\\^id^\\^:^\\^POSTING_DATE^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^LOCATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_FIELD^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_TYPE^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_SCHEDULE^\\^,^\\^selectedValues^\\^:^[^]^}^]^},^\\^advancedSearchFiltersSelectionParam^\\^:^{^\\^searchFilterSelections^\\^:^[^{^\\^id^\\^:^\\^ORGANIZATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^LOCATION^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_FIELD^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_NUMBER^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^URGENT_JOB^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^EMPLOYEE_STATUS^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^STUDY_LEVEL^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^WILL_TRAVEL^\\^,^\\^selectedValues^\\^:^[^]^},^{^\\^id^\\^:^\\^JOB_SHIFT^\\^,^\\^selectedValues^\\^:^[^]^}^]^},^\\^pageNo^\\^:1^}'
# .json() raises JSONDecodeError here because the server returns HTML.
response = s.post(url, headers=headers, cookies=cookies, data=data).json()
#res_json = json.loads(response)
#print(response.status_code)
But in response line I got an error as JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Would really appreciate any help on this!!
I am unfortunately currently limited to using only requests or other popular python libraries.
Thanks in advance..
You need to send JSON data, you can use json = data with the python requests module. You also need to format the data into a dictionary:
import requests


def _empty_selections(filter_ids):
    """Build a searchFilterSelections block: each id with an empty list."""
    return {
        "searchFilterSelections": [
            {"id": filter_id, "selectedValues": []} for filter_id in filter_ids
        ]
    }


# The search payload the site's own XHR sends, rebuilt as a Python dict so
# requests can serialize it with json= (which also sets the content type).
payload = {
    "multilineEnabled": True,
    "sortingSelection": {
        "sortBySelectionParam": "3",
        "ascendingSortingOrder": "false",
    },
    "fieldData": {
        "fields": {
            "KEYWORD": "",
            "LOCATION": "756140022608",
            "JOB_TITLE": "",
        },
        "valid": True,
    },
    "filterSelectionParam": _empty_selections(
        ["POSTING_DATE", "LOCATION", "JOB_FIELD", "JOB_TYPE", "JOB_SCHEDULE"]
    ),
    "advancedSearchFiltersSelectionParam": _empty_selections(
        [
            "ORGANIZATION",
            "LOCATION",
            "JOB_FIELD",
            "JOB_NUMBER",
            "URGENT_JOB",
            "EMPLOYEE_STATUS",
            "STUDY_LEVEL",
            "WILL_TRAVEL",
            "JOB_SHIFT",
        ]
    ),
    "pageNo": 1,
}

r = requests.post(
    "https://ngc.taleo.net/careersection/rest/jobboard/searchjobs",
    params={
        "lang": "en_GB",
        "location": "756140022608",
        "radiusType": "K",
        "searchExpanded": "true",
        "radius": "1",
        "portal": "34140031600",
    },
    headers={
        "tzname": "Asia/Calcutta",
        "tz": "GMT+05:30",
    },
    json=payload,
)
print(r.json())

Scraping wsj.com

I wanted to scrape some data from wsj.com and print it. The actual website is: https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main and the data is NYSE Issues Advancing, Declining and NYSE Share Volume Advancing, Declining.
I tried using beautifulsoup after watching a youtube video but I can't get any of the classes to return a value inside body.
Here is my code:
from bs4 import BeautifulSoup
import requests

# Fetch the markets page and look for the "Advancing" cell in the body.
page_source = requests.get('https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main').text
soup = BeautifulSoup(page_source, 'lxml')
page_body = soup.find('body')
# Class string (trailing space included) copied verbatim from the markup.
adv = page_body.find('td', class_='WSJTables--table__cell--2dzGiO7q WSJTheme--table__cell--1At-VGNg ')
print(adv)
Also while inspecting elements in Network I noticed that this data is also available as a JSON.
Here is the link: https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary
So I wrote another script to try and parse this data as JSON, but again it's not working.
Here is the code:
import json
import requests

# The JSON endpoint spotted in the browser's network tab.
endpoint = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'

# Fetch the raw body, then decode it as JSON in two explicit steps.
raw_body = requests.get(endpoint).text
response = json.loads(raw_body)
print(response)
The error I get is:
File "C:\Users\User\Anaconda3\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting value
I also tried a few different methods from this link and none seem to work.
Can you please set me on the right path how to scrape this data?
from bs4 import BeautifulSoup
import requests
import json

# Query-string parameters exactly as the site's own XHR sends them.
query = {
    'id': '{"application":"WSJ","marketsDiaryType":"overview"}',
    'type': 'mdc_marketsdiary'
}

# A browser User-Agent keeps the endpoint from rejecting the request.
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0"
}

response = requests.get(
    "https://www.wsj.com/market-data/stocks", params=query, headers=ua_headers)
pretty = json.dumps(response.json(), indent=4)
print(pretty)
Output:
{
"id": "{\"application\":\"WSJ\",\"marketsDiaryType\":\"overview\"}",
"type": "mdc_marketsdiary",
"data": {
"instrumentSets": [
{
"headerFields": [
{
"value": "name",
"label": "Issues"
}
],
"instruments": [
{
"name": "Advancing",
"NASDAQ": "169",
"NYSE": "69"
},
{
"name": "Declining",
"NASDAQ": "3,190",
"NYSE": "2,973"
},
{
"name": "Unchanged",
"NASDAQ": "24",
"NYSE": "10"
},
{
"name": "Total",
"NASDAQ": "3,383",
"NYSE": "3,052"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Issues At"
}
],
"instruments": [
{
"name": "New Highs",
"NASDAQ": "53",
"NYSE": "14"
},
{
"name": "New Lows",
"NASDAQ": "1,406",
"NYSE": "1,620"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Share Volume"
}
],
"instruments": [
{
"name": "Total",
"NASDAQ": "4,454,691,895",
"NYSE": "7,790,947,818"
},
{
"name": "Advancing",
"NASDAQ": "506,192,012",
"NYSE": "219,412,232"
},
{
"name": "Declining",
"NASDAQ": "3,948,035,191",
"NYSE": "7,570,377,893"
},
{
"name": "Unchanged",
"NASDAQ": "464,692",
"NYSE": "1,157,693"
}
]
}
],
"timestamp": "4:00 PM EDT 3/09/20"
},
"hash": "{\"id\":\"{\\\"application\\\":\\\"WSJ\\\",\\\"marketsDiaryType\\\":\\\"overview\\\"}\",\"type\":\"mdc_marketsdiary\",\"data\":{\"instrumentSets\":[{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues\"}],\"instruments\":[{\"name\":\"Advancing\",\"NASDAQ\":\"169\",\"NYSE\":\"69\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,190\",\"NYSE\":\"2,973\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"24\",\"NYSE\":\"10\"},{\"name\":\"Total\",\"NASDAQ\":\"3,383\",\"NYSE\":\"3,052\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues At\"}],\"instruments\":[{\"name\":\"New Highs\",\"NASDAQ\":\"53\",\"NYSE\":\"14\"},{\"name\":\"New Lows\",\"NASDAQ\":\"1,406\",\"NYSE\":\"1,620\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Share Volume\"}],\"instruments\":[{\"name\":\"Total\",\"NASDAQ\":\"4,454,691,895\",\"NYSE\":\"7,790,947,818\"},{\"name\":\"Advancing\",\"NASDAQ\":\"506,192,012\",\"NYSE\":\"219,412,232\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,948,035,191\",\"NYSE\":\"7,570,377,893\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"464,692\",\"NYSE\":\"1,157,693\"}]}],\"timestamp\":\"4:00 PM EDT 3/09/20\"}}"
}
Note: You can access it as dict print(r.keys()).
You need to add a User-Agent header to the request so that it does not return a 404 error.
import json                       # was missing: json.loads() is used below
import urllib.request             # was missing: urllib.request.Request is used below
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup as soup  # kept from the original, though unused here

url = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'
# put a header on the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0'}
req = urllib.request.Request(url=url, headers=headers)
with urlopen(req) as response:
    page_html = response.read()

data = json.loads(page_html).get('data')

# One DataFrame per instrument set. DataFrame.append() was removed in
# pandas 2.x, so build the frames first and concatenate them once.
frames = [
    pd.DataFrame(instrument_set['instruments'])
    for instrument_set in data.get('instrumentSets')
    if 'instruments' in instrument_set
]
df = pd.concat(frames) if frames else pd.DataFrame()
df = df.rename(columns={'name': 'Issues'})
df
Result:

Categories

Resources