Scraping wsj.com

Scraping wsj.com - python

I wanted to scrape some data from wsj.com and print it. The actual website is: https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main and the data is NYSE Issues Advancing, Declining and NYSE Share Volume Advancing, Declining.
I tried using beautifulsoup after watching a youtube video but I can't get any of the classes to return a value inside body.
Here is my code:
from bs4 import BeautifulSoup
import requests
source = requests.get('https://www.wsj.com/market-data/stocks?mod=md_home_overview_stk_main').text
soup = BeautifulSoup(source, 'lxml')
body = soup.find('body')
adv = body.find('td', class_='WSJTables--table__cell--2dzGiO7q WSJTheme--table__cell--1At-VGNg ')
print(adv)
Also while inspecting elements in Network I noticed that this data is also available as a JSON.
Here is the link: https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary
So I wrote another script to try and parse this data using JSON but again its not working.
Here is the code:
import json
import requests
url = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'
response = json.loads(requests.get(url).text)
print(response)
The error I get is:
File "C:\Users\User\Anaconda3\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting value
I also tried a few different methods from this link and none seem to work.
Can you please set me on the right path how to scrape this data?

from bs4 import BeautifulSoup
import requests
import json
params = {
'id': '{"application":"WSJ","marketsDiaryType":"overview"}',
'type': 'mdc_marketsdiary'
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0"
}
r = requests.get(
"https://www.wsj.com/market-data/stocks", params=params, headers=headers).json()
data = json.dumps(r, indent=4)
print(data)
Output:
{
"id": "{\"application\":\"WSJ\",\"marketsDiaryType\":\"overview\"}",
"type": "mdc_marketsdiary",
"data": {
"instrumentSets": [
{
"headerFields": [
{
"value": "name",
"label": "Issues"
}
],
"instruments": [
{
"name": "Advancing",
"NASDAQ": "169",
"NYSE": "69"
},
{
"name": "Declining",
"NASDAQ": "3,190",
"NYSE": "2,973"
},
{
"name": "Unchanged",
"NASDAQ": "24",
"NYSE": "10"
},
{
"name": "Total",
"NASDAQ": "3,383",
"NYSE": "3,052"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Issues At"
}
],
"instruments": [
{
"name": "New Highs",
"NASDAQ": "53",
"NYSE": "14"
},
{
"name": "New Lows",
"NASDAQ": "1,406",
"NYSE": "1,620"
}
]
},
{
"headerFields": [
{
"value": "name",
"label": "Share Volume"
}
],
"instruments": [
{
"name": "Total",
"NASDAQ": "4,454,691,895",
"NYSE": "7,790,947,818"
},
{
"name": "Advancing",
"NASDAQ": "506,192,012",
"NYSE": "219,412,232"
},
{
"name": "Declining",
"NASDAQ": "3,948,035,191",
"NYSE": "7,570,377,893"
},
{
"name": "Unchanged",
"NASDAQ": "464,692",
"NYSE": "1,157,693"
}
]
}
],
"timestamp": "4:00 PM EDT 3/09/20"
},
"hash": "{\"id\":\"{\\\"application\\\":\\\"WSJ\\\",\\\"marketsDiaryType\\\":\\\"overview\\\"}\",\"type\":\"mdc_marketsdiary\",\"data\":{\"instrumentSets\":[{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues\"}],\"instruments\":[{\"name\":\"Advancing\",\"NASDAQ\":\"169\",\"NYSE\":\"69\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,190\",\"NYSE\":\"2,973\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"24\",\"NYSE\":\"10\"},{\"name\":\"Total\",\"NASDAQ\":\"3,383\",\"NYSE\":\"3,052\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Issues At\"}],\"instruments\":[{\"name\":\"New Highs\",\"NASDAQ\":\"53\",\"NYSE\":\"14\"},{\"name\":\"New Lows\",\"NASDAQ\":\"1,406\",\"NYSE\":\"1,620\"}]},{\"headerFields\":[{\"value\":\"name\",\"label\":\"Share Volume\"}],\"instruments\":[{\"name\":\"Total\",\"NASDAQ\":\"4,454,691,895\",\"NYSE\":\"7,790,947,818\"},{\"name\":\"Advancing\",\"NASDAQ\":\"506,192,012\",\"NYSE\":\"219,412,232\"},{\"name\":\"Declining\",\"NASDAQ\":\"3,948,035,191\",\"NYSE\":\"7,570,377,893\"},{\"name\":\"Unchanged\",\"NASDAQ\":\"464,692\",\"NYSE\":\"1,157,693\"}]}],\"timestamp\":\"4:00 PM EDT 3/09/20\"}}"
}
Note: You can access it as dict print(r.keys()).

You need to add a header on the url so that it will not return error=404.
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
url = 'https://www.wsj.com/market-data/stocks?id=%7B%22application%22%3A%22WSJ%22%2C%22marketsDiaryType%22%3A%22overview%22%7D&type=mdc_marketsdiary'
# put a header on the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0'}
req = urllib.request.Request(url=url, headers=headers)
with urlopen(req) as response:
page_html = response.read()
df = pd.DataFrame()
data = json.loads(page_html).get('data')
for instrumentSets in data.get('instrumentSets'):
for k,v in instrumentSets.items():
if k == 'instruments':
df = df.append(pd.DataFrame(v))
df=df.rename(columns = {'name':'Issues'})
df
Result:

Related

BeautifulSoup : Web scraping information after submit button is clicked

I'm relatively new to python coding, and i'm currently trying to extract data from a website but the information only shows up after a submit button is clicked. The webpage is https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre
Button I have to click : button
When I inspect the website, I was able to retrieve the url of the information contained/displayed after the button click (through the network tab when inspecting website).
Here is a preview of the information output the button URL gives : info output
What i'd like to know is if it's possible to keep the information classified by DIV elements, like it does when I click the button on the site... Thank you!
Code :
import requests
from bs4 import BeautifulSoup
import re
URL = "https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-
bassins-main-oeuvre"
page = requests.get(URL)
soup = BeautifulSoup(page.content,features="html.parser")
btn4 = soup.find('button',{"id":"get-labourpools"})
btn4_click = btn4['onclick']

There's an endpoint you can query to get the table data you're after.
Here's how:
import json
import requests
region_id = "01"
occupation_id = "110"
url = f"https://www.ccq.org/api/labourpools?regionId={region_id}&occupationId={occupation_id}"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
"X-Requested-With": "XMLHttpRequest",
}
data = requests.get(url, headers=headers).json()
print(json.dumps(data, indent=2))
Output:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
EDIT:
And if you want to get all tables for all regions and occupations, you can create all possible API request urls and get the data.
Here's how:
import json
import requests
from bs4 import BeautifulSoup
base_url = "https://www.ccq.org/fr-CA/qualification-acces-industrie/bassins-main-oeuvre/etat-bassins-main-oeuvre"
api_url = "https://www.ccq.org/api/labourpools?"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0",
"X-Requested-With": "XMLHttpRequest",
}
def get_ids(id_value: str) -> list:
return [
i["value"] for i
in soup.find("select", {"id": id_value}).find_all("option")[1:]
]
with requests.Session() as session:
soup = BeautifulSoup(session.get(base_url, headers=headers).text, "lxml")
region_ids = get_ids("dropdown-region")
occupation_ids = get_ids("dropdown-occupation")
all_query_urls = [
f"{api_url}regionId={region_id}&occupationId={occupation_id}"
for region_id in region_ids for occupation_id in occupation_ids
]
for query_url in all_query_urls[:2]: # remove [:2] to get all combinations
data = session.get(query_url, headers=headers).json()
print(json.dumps(data, indent=2))
This should output two entries:
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "110",
"Name": "Briqueteur-ma\u00e7on",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]
[
{
"Id": "01",
"Name": "Iles de la Madeleine",
"Occupations": [
{
"Id": "130",
"Name": "Calorifugeur",
"Pool": {
"IsOpen": true,
"IsLessThan10": true,
"IsLessThan30": true
}
}
],
"EffectiveDate": "17 janvier 2022"
}
]

Get data from Pendo via Python api call

There are some reports created in Pendo (Like adobe ) . I want to pull this report via python api call .
I have written code which is partially incorrect .Please let me know what I'm doing wrong?
The output should be in form of table which contains 4 columns and their records.
This code is only returning - {'overall': {}, 'fields': {'pipeline': 'Required'}}
Which is strange.
pendo_key='123fgh21-ab89-ab23-21ad-bvh3r11r2sv5.ca'
import matplotlib.pyplot as pt
import pandas as pd
import numpy as np
import requests
import os
import json
import pandas as pd
from io import StringIO
pd.set_option('display.max_colwidth',0)
url= "https://app.pendo.io/api/v1/aggregation"
headers = {
'X-Pendo-Integration-Key':pendo_key,
'Content-Type':'application/json'}
payload1= json.dumps({
"response": {
"location": "request",
"mimeType": "application/json"
},
"requests": [
{
"name": "SalesEventAggregation",
"pipeline": [
{
"source": {
"salesEvents": {
"blacklist": "apply",
"salesTypeId": "dg_5w_fgtdergJ67vFdfR8kWsxi"
},
"timeSeries": {
"period": "dayRange",
"first": "now()",
"count": 1
}
}
}],
"requestId": "saleseventAggregation-rId-etr1231-561s-6d6c-7d12-351f1d21gww2"
}
]
})
response = requests.request("POST",url,headers=headers, data= payload1)
print("Status code:", response.status_code)
response_dict = json.loads(response.text)
response_dict
Status code: 422
Output:
{'overall': {}, 'fields': {'pipeline': 'Required'}}
Expected:
There are 4 columns in form of table which is not showing up and their relevant records is not showing up.

The "request" object is not an array, and the "key" should be named as "request"(singular) not as "requests"(plural).
Try
payload1= json.dumps(
{
"response": {
"location": "request",
"mimeType": "application/json"
},
"request":
{
"name": "SalesEventAggregation",
"pipeline": [
{
"source": {
"salesEvents": {
"blacklist": "apply",
"salesTypeId": "dg_5w_fgtdergJ67vFdfR8kWsxi"
},
"timeSeries": {
"period": "dayRange",
"first": "now()",
"count": 1
}
}
}],
"requestId": "saleseventAggregation-rId-etr1231-561s-6d6c-7d12-351f1d21gww2"
}
})

Preserving sort order of .json file that gets created from api response

I am having a problem with getting the correct sort order of a .json file that gets created from an api response using PyCharm Community Edition with python 3.7.
This is the api request:
import requests
import json
url = "https://pokemon-go1.p.rapidapi.com/pokemon_names.json"
headers = {
'x-rapidapi-key': "c061ae2dffmshc2a33d10b00cee7p121f42jsn11f39d53dd1e",
'x-rapidapi-host': "pokemon-go1.p.rapidapi.com"
}
response = requests.request("GET", url, headers=headers)
If i now print(response.text), I get the following output: (That's how I want my .json file to look like later)
{
"1": {
"id": 1,
"name": "Bulbasaur"
},
"2": {
"id": 2,
"name": "Ivysaur"
},
"3": {
"id": 3,
"name": "Venusaur"
},
"4": {
"id": 4,
"name": "Charmander"
},
"5": {
"id": 5,
"name": "Charmeleon"
}
}
After that I write the response to the file "pokemondata.json" by doing this:
response_json = json.loads(response.text)
writeFile = open("pokemondata.json", "w")
writeFile.write(json.dumps(response_json, indent=4, sort_keys=True))
writeFile.close()
And then the file looks like this:
{
"1": {
"id": 1,
"name": "Bulbasaur"
},
"10": {
"id": 10,
"name": "Caterpie"
},
"100": {
"id": 100,
"name": "Voltorb"
},
"101": {
"id": 101,
"name": "Electrode"
},
"102": {
"id": 102,
"name": "Exeggcute"
}
}
I could not figure out how to get it done so that the file is sorted by ids (or the numbers before the id, e.g. "1") correctly. Could anybody please explain to me, how I can fix it?

import requests
import json
url = "https://pokemon-go1.p.rapidapi.com/pokemon_names.json"
headers = {
'x-rapidapi-key': "c061ae2dffmshc2a33d10b00cee7p121f42jsn11f39d53dd1e",
'x-rapidapi-host': "pokemon-go1.p.rapidapi.com"
}
response = requests.request("GET", url, headers=headers)
response_json = response.json()
int_keys = {int(k):v for k,v in response_json.items()}
with open("sample.json", 'w') as file_obj:
json.dump(int_keys, file_obj, indent=4, sort_keys=True)
Issue is with keys in your json which are in string format. Convert them to integers and save in the file

Filtering JSON in python

I want to filter a json file where it only show me entries where content-type is application/json.
For now this is my code :
import json
with open('rob.json', 'r', encoding="utf8") as original_file:
data = json.load(original_file)
for line in data:
if line['value'] == 'application/json':
print(line)
The code I have written is very basic as I am quite a beginner when it comes to scripting. However it is not working and I have an error:
TypeError: string indices must be integers
I require some help on why I am having this error and whether there is a better alternative to filter a JSON file.
TIA

You have to understand the structure of the returned data. It is a dictionary containing one key ("log") that is also a dictionary. That dictionary contains an "entries" key which is a list. That list consists of dictionaries that have keys for "request" and "response". The "request" key has a "headers" key, which is a list of dictionaries containing "name" and "value" keys.
import json
with open('rob.json',encoding='utf8') as f:
data = json.load(f)
# Traverse the list of log entries:
for entry in data['log']['entries']:
# Traverse the list of headers:
for header in entry['response']['headers']:
# Look for the appropriate name and value.
if header['name'] == 'Content-Type' and header['value'] == 'application/json':
# I just print the request as the response is very long...
print(json.dumps(entry['request'],indent=2))
Output:
{
"method": "GET",
"url": "http://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.map",
"httpVersion": "HTTP/1.1",
"headers": [
{
"name": "Pragma",
"value": "no-cache"
},
{
"name": "Accept-Encoding",
"value": "gzip,deflate,sdch"
},
{
"name": "Host",
"value": "ajax.googleapis.com"
},
{
"name": "Accept-Language",
"value": "en-US,en;q=0.8"
},
{
"name": "User-Agent",
"value": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
},
{
"name": "Accept",
"value": "*/*"
},
{
"name": "Referer",
"value": "http://ericduran.github.io/chromeHAR/"
},
{
"name": "Connection",
"value": "keep-alive"
},
{
"name": "Cache-Control",
"value": "no-cache"
}
],
"queryString": [],
"cookies": [],
"headersSize": 412,
"bodySize": 0
}

import json
with open('rob.json','r',encoding="utf8") as original_file:
data = json.load(original_file)
for entry in data["log"]["entries"]:
res = entry["response"]
for header in res["headers"]:
if "application/json" in header["value"]:
print(header)
I really don't know what you want to look for but pretty sure this code will print out the header which has value including "application/json".

Unable to send an application in the right way using post requests having multiple parameters

I'm trying to send an application after filling in a form available in a webpage using python. I've tried to mimic the process that I see in chrome dev tools but it seems I've gone somewhere wrong and that is the reason when I execute the following script I get this error:
{
"message":"415 Unsupported Media Type returned for /apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions with message: ",
"key":"Exception_server_error",
"errorId":"d6b128bd-426d-4bee-8dbb-03e232829f5e"
}
It seems to me that I need to use the value of token and version in an automatic manner as they are different in every application but I don't find them in page source and stuff.
I've selected No as value for all the dropdowns (when there is any) within Additional Information.
Link to the application page
Link to the attachment that I've used thrice.
I've tried with:
import requests
main_link = "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE"
post_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions"
payload = {
"candidateIdentity":{"firstName":"syed","lastName":"mushfiq","email":"mthmt80#gmail.com"},
"answeredDocuments":[{"documentType":"answeredForm","formId":"hsbc_bewerbungsprozess_pers_nliche_daten",
"answers":[
{"questionId":"form_of_address","type":"options","value":["form_of_address_m"]},
{"questionId":"academic_title","type":"simple","value":" Dr.","questionIds":[]},
{"questionId":"first_name","type":"simple","value":"syed","questionIds":[]},
{"questionId":"last_name","type":"simple","value":"mushfiq","questionIds":[]},
{"questionId":"e-mail_address","type":"simple","value":"mthmt80#gmail.com","questionIds":[]},
{"questionId":"phone__mobile_","type":"phone","countryCode":"+880","isoCountryCode":"BD","subscriberNumber":"1790128884"}]},
{"documentType":"answeredForm","formId":"hsbc_bewerbungsprozess_standard_fragebogen","answers":[{"questionId":"custom_question_450","type":"options","value":["custom_question_450_ja"]},
{"questionId":"custom_question_451","type":"options","value":["custom_question_451_nein"]},
{"questionId":"custom_question_452","type":"options","value":["custom_question_452_unter_keine_der_zuvor_genannten"]},
{"questionId":"custom_question_580","type":"options","value":["custom_question_580_nein_978"]},
{"questionId":"custom_question_637","type":"options","value":["custom_question_637_nein"]},
{"questionId":"custom_question_579","type":"options","value":["custom_question_579_nein"]},
{"questionId":"custom_question_583","type":"options","value":["custom_question_583_hsbc_deutschland_karriereseite"]}]},
#============The following three lines are supposed to help upload three files============
{"documentType":"attachment","attachmentId":"cover_letter","token":"2d178469-cdb5-4d65-9f67-1e7637896953","filename": open("demo.pdf","rb")},
{"documentType":"attachment","attachmentId":"attached_resume","token":"81a5a661-66bb-4918-a35c-ec260ffb7d02","filename": open("demo.pdf","rb")},
{"documentType":"attachment","attachmentId":"otherattachment","token":"4c3f7500-b072-48d4-83cf-0af1399bc8ba","filename": open("demo.pdf","rb")}],
#============The version's value should not be hardcoded=========================
"version":"V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5","lastModifiedDate":1562056029000,"answeredDataPrivacyConsents":[{"identifier":"urn:lms:ta:tlk:data-privacy-consent:mtu531:101","consentProvided":True},
{"identifier":"urn:lms:ta:tlk:data-privacy-consent:mtu531:102","consentProvided":True}],
"metaInformation":{"applicationFormUrl":"https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE","jobsToLink":[]}
}
def send_application(s,link):
res = s.post(link,data=payload)
print(res.text)
if __name__ == '__main__':
with requests.Session() as s:
send_application(s,post_link)
How can I send the application in the right way?
PS I can send the application manually multiple times using the same documents to the same email.

The best way to go about something like this is to open the page in a browser and view the network tab in the developer tools. From there as you're filling out the form you'll be able to see that each time you attach a document it sends an ajax request and receives the token in a json response. With those tokens you can build the final payload which should be submitted in json format.
Here's some example code that's working:
import requests
headers = {
'Host': 'emea3.recruitmentplatform.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'apply-config-key': 'AAACEwAA-55cd88d4-c9fd-41ce-95a4-f238402b898f',
'Origin': 'https://karriere.hsbc.de',
'DNT': '1',
'Connection': 'close',
'Referer': 'https://karriere.hsbc.de/',
'Cookie': 'lumesse_language=de_DE'
}
main_link = "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE"
post_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/submissions"
ajax_link = "https://emea3.recruitmentplatform.com/apply-app/rest/jobs/PIDFK026203F3VBQB79V77VIY-87592/attachments"
def build_payload(cover_letter_token, attached_resume_token, otherattachment_token):
return {
"candidateIdentity": {
"firstName": "Syed",
"lastName": "Mushfiq",
"email": "mthmt80#gmail.com"
},
"answeredDocuments": [
{
"documentType": "answeredForm",
"formId": "hsbc_bewerbungsprozess_pers_nliche_daten",
"answers": [
{
"questionId": "form_of_address",
"type": "options",
"value": [
"form_of_address_m"
]
},
{
"questionId": "academic_title",
"type": "simple",
"value": "prof",
"questionIds": []
},
{
"questionId": "first_name",
"type": "simple",
"value": "Syed",
"questionIds": []
},
{
"questionId": "last_name",
"type": "simple",
"value": "Mushfiq",
"questionIds": []
},
{
"questionId": "e-mail_address",
"type": "simple",
"value": "mthmt80#gmail.com",
"questionIds": []
},
{
"questionId": "phone__mobile_",
"type": "phone",
"countryCode": "+49",
"isoCountryCode": "DE",
"subscriberNumber": "30 33850062"
}
]
},
{
"documentType": "answeredForm",
"formId": "hsbc_bewerbungsprozess_standard_fragebogen",
"answers": [
{
"questionId": "custom_question_450",
"type": "options",
"value": [
"custom_question_450_ja"
]
},
{
"questionId": "custom_question_451",
"type": "options",
"value": [
"custom_question_451_nein"
]
},
{
"questionId": "custom_question_452",
"type": "options",
"value": [
"custom_question_452_unter_keine_der_zuvor_genannten"
]
},
{
"questionId": "custom_question_580",
"type": "options",
"value": [
"custom_question_580_ja"
]
},
{
"questionId": "custom_question_637",
"type": "options",
"value": [
"custom_question_637_nein"
]
},
{
"questionId": "custom_question_579",
"type": "options",
"value": [
"custom_question_579_nein"
]
},
{
"questionId": "custom_question_583",
"type": "options",
"value": [
"custom_question_583_linkedin"
]
}
]
},
{
"documentType": "attachment",
"attachmentId": "cover_letter",
"token": cover_letter_token,
"filename": "demo.pdf"
},
{
"documentType": "attachment",
"attachmentId": "attached_resume",
"token": attached_resume_token,
"filename": "demo.pdf"
},
{
"documentType": "attachment",
"attachmentId": "otherattachment",
"token": otherattachment_token,
"filename": "demo.pdf"
}
],
"version": "V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5",
"lastModifiedDate": "1562056029000",
"answeredDataPrivacyConsents": [
{
"identifier": "urn:lms:ta:tlk:data-privacy-consent:mtu531:101",
"consentProvided": "true"
},
{
"identifier": "urn:lms:ta:tlk:data-privacy-consent:mtu531:102",
"consentProvided": "true"
}
],
"metaInformation": {
"applicationFormUrl": "https://karriere.hsbc.de/stellenangebote/stellenboerse/apply?jobId=PIDFK026203F3VBQB79V77VIY-87592&langCode=de_DE",
"jobsToLink": []
}
}
def submit_attachment(s, link, f):
d = open(f, 'rb').read()
r = s.post(link, files={'file':('demo.pdf', d),'applicationProcessVersion':(None, 'V2:3:14dfac80702d099625d0274121b0dba68ac0fd96:861836b7d86adae8cc1ce69198b69b8ca59e2ed5')})
r_data = r.json()
return r_data.get('token')
def send_application(s,link,p):
res = s.post(link, json=p)
return res
if __name__ == '__main__':
attachment_list = ["cover_letter_token", "attached_resume_token", "otherattachment_token"]
token_dict = {}
with requests.Session() as s:
s.headers.update(headers)
for at in attachment_list:
rt = submit_attachment(s, ajax_link, "demo.pdf")
token_dict[at] = rt
payload = build_payload(token_dict['cover_letter_token'], token_dict['attached_resume_token'], token_dict['otherattachment_token'])
rd = send_application(s, post_link, payload)
print(rd.text)
print(rd.status_code)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scraping wsj.com - python

Related

BeautifulSoup : Web scraping information after submit button is clicked

Get data from Pendo via Python api call

Preserving sort order of .json file that gets created from api response

Filtering JSON in python

Unable to send an application in the right way using post requests having multiple parameters

Categories

Resources