Python Scraping AJAX Post Request - python

import requests
import json
import csv
import pandas as pd
import time
from bs4 import BeautifulSoup
from requests import Session
# Store-locator AJAX endpoint and the request parameters sent to it.
url = 'https://www.agathaparis.com/ajax.V1.php/en_US/Rbs/Storelocator/Store/'
payload={"websiteId":603593,"sectionId":603593,"pageId":868982,"data":{"currentStoreId":0,"distanceUnit":"kilometers","distance":"50kilometers","coordinates":{"latitude":48.856614,"longitude":2.3522219},"commercialSign":0},"dataSets":"coordinates,address,card,allow","URLFormats":"canonical,contextual","visualFormats":"original,listItem","pagination":"0,50","referer":"https://www.agathaparis.com/our-stores.html"}
# Open a session and load the store page first, so the session keeps any
# cookies that page sets.
s=requests.Session()
s.get('https://www.agathaparis.com/our-stores.html')
# Browser-like headers for the POST below.
headers={
'Content-Type': 'application/json',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-gb',
'Host': 'www.agathaparis.com',
'Origin': 'https://www.agathaparis.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
'Connection': 'keep-alive',
'Referer': 'https://www.agathaparis.com/our-stores.html',
# NOTE(review): requests derives Content-Length from the body it actually
# sends, so this hand-copied value looks redundant - confirm.
'Content-Length': '407',
# NOTE(review): hard-coded cookie string (includes a PHPSESSID); presumably
# copied from a browser session and may be stale - confirm.
'Cookie': '_fbp=fb.1.1607609084947.2075070555; _ga=GA1.2.964068958.1607609084; _gid=GA1.2.1470390017.1607868080; _gat_UA-33249847-1=1; rbsWebsiteTrackerHasConsent=true; rbsWebsiteTrackerHasConsentGdpr=%7B%22technical%22%3Atrue%2C%22analytics%22%3Atrue%2C%22advertising%22%3Atrue%7D; PHPSESSID=n4uf5tfuf96k141vemo5s9g99g',
}
# NOTE(review): data= with a dict sends a form-encoded body, which does not
# match the 'application/json' Content-Type declared above; json=payload
# would send the dict as JSON.
resp = s.post(url,data=payload,headers=headers)
I am trying to extract the stores list through this post request. I don't understand what I am missing. Thanks in advance for your help

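Your main error is that the body is sent in the wrong format: `data=payload` form-encodes the dict (application/x-www-form-urlencoded) even though the Content-Type header says JSON. Pass it as `json=payload` so that it is actually sent as JSON. The headers below also add `x-http-method-override: GET` and no longer hard-code Connection, Content-Length and Cookie: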
# Headers for the store-locator POST. The entries that are commented out are
# no longer sent explicitly.
headers={
'Content-Type': 'application/json',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-gb',
'Host': 'www.agathaparis.com',
'Origin': 'https://www.agathaparis.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
# 'Connection': 'keep-alive',
'Referer': 'https://www.agathaparis.com/our-stores.html',
# NOTE(review): presumably the endpoint handles this POST as a GET because of
# the override header - confirm against the request the browser sends.
'x-http-method-override': 'GET',
# 'Content-Length': '407',
# 'Cookie': '_fbp=fb.1.1607609084947.2075070555; _ga=GA1.2.964068958.1607609084; _gid=GA1.2.1470390017.1607868080; _gat_UA-33249847-1=1; rbsWebsiteTrackerHasConsent=true; rbsWebsiteTrackerHasConsentGdpr=%7B%22technical%22%3Atrue%2C%22analytics%22%3Atrue%2C%22advertising%22%3Atrue%7D; PHPSESSID=n4uf5tfuf96k141vemo5s9g99g',
}
# json= makes requests serialize payload as a JSON body (data= would
# form-encode it instead).
resp = s.post(url, json=payload, headers=headers)

Related

Website not returning data that I want using beautifulsoup, but it shows up fine in my browser

I'm trying to scrape some data from this website but I'm getting a 403 error. When I open the URL in my browser, I don't get the error. Help would be appreciated. This is my first time trying any web scraping. I think I need something different in my headers, but I'm not sure. Thanks.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Projections endpoint including the league_id, per_page and single_stat
# query parameters.
pp_props_url = 'https://api.prizepicks.com/projections?league_id=7&per_page=250&single_stat=true'
# Browser-like request headers.
headers = {
'Connection': 'keep-alive',
'Accept': 'application/json; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'Access-Control-Allow-Credentials': 'true',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Referer': 'https://app.prizepicks.com/',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9'
}
# NOTE(review): pp_props_url above is never used; the request below goes to
# this bare endpoint without any query parameters.
url = 'https://api.prizepicks.com/projections'
r = requests.get(url, headers=headers)
print(r)
# Flatten the 'data' entry of the JSON response into a DataFrame.
df = pd.json_normalize(r.json()['data'])
print(df)
I get a 403 error and it's not returning the data I want.
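The following code should work. The only change is that the request now goes to `pp_props_url`, which carries the query parameters, instead of the bare `url`: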
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Projections endpoint including the league_id, per_page and single_stat
# query parameters.
pp_props_url = 'https://api.prizepicks.com/projections?league_id=7&per_page=250&single_stat=true'
# Browser-like request headers.
headers = {
'Connection': 'keep-alive',
'Accept': 'application/json; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'Access-Control-Allow-Credentials': 'true',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Referer': 'https://app.prizepicks.com/',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9'
}
# Request the full URL (with the query string), not the bare endpoint.
r = requests.get(pp_props_url, headers=headers)
print(r)
# Raise a clear HTTPError on a 4xx/5xx reply (such as a 403) instead of
# failing later while decoding or indexing the response body.
r.raise_for_status()
# Flatten the 'data' entry of the JSON response into a DataFrame.
df = pd.json_normalize(r.json()['data'])
print(df)

python how can capture result get after post

i have this code
import requests
import time
import re
import json
from datetime import datetime
# Account page URL; used for both the login POST and the follow-up GET.
url = 'https://www.cryptocommando.io/mio-account/'
# Browser-like request headers.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://www.cryptocommando.io',
'Referer': 'https://www.cryptocommando.io/mio-account/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Linux"'
}
# Login form fields.
# NOTE(review): the nonce is hard-coded; presumably it is issued per page load
# and a stale value may be rejected - confirm.
data = {
'username': "mylogin",
'password': '123456',
'woocommerce-login-nonce': '030f97cc6f',
'_wp_http_referer': '/mio-account/',
'login': 'Accedi'
}
# Submit the login form. This is a standalone call, not made through a Session.
response = requests.post(url, headers=headers, data=data)
#response_headers = response.headers
#print(response.status_code)
#print(response.text)
# NOTE(review): this is a second, independent request, so cookies set by the
# POST above are not sent with it. It also passes the form fields as the body
# of a GET and overwrites the POST response stored in `response`.
response = requests.get(url, headers=headers,data=data)
print(response.headers)
When I submit the login in the browser, the POST is followed by a GET, and when I copy that request's headers I can see my wordpress_logged_in_... cookie. When I make the same calls from Python, I don't get that cookie. Why?
I tried:
# Submit the login form and keep the headers of the response that comes back.
# NOTE(review): requests follows redirects by default, so these are the headers
# of the final response; earlier responses in a redirect chain are available
# in response.history.
response = requests.post(url, headers=headers, data=data)
response_headers = response.headers

How do I get access token from website automatically using python?

I am trying to do some web scraping from here but I am struggling to get the access token automatically. Every time I do the web scraping, I need to manually update the Bearer token. Is there a way to do this automatically?
Let me show you how I do it manually:
# Loan quotation endpoint.
url_WiZink = 'https://www.creditopessoal.wizink.pt/gravitee/gateway/api-chn-loans/v1/loans/quotation'
# Request headers; the Authorization entry carries the bearer token that is
# currently pasted in by hand.
headers_WiZink = {'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'pt-PT',
'Authorization': 'Bearer de6ea490-381e-417f-ab77-3aad0d7eb63c',
'Connection': 'keep-alive',
# NOTE(review): requests derives Content-Length from the body it actually
# sends, so this hand-copied value looks redundant - confirm.
'Content-Length': '266',
'Content-Type': 'application/json;charset=UTF-8',
'Host': 'www.creditopessoal.wizink.pt',
'Origin': 'https://www.wizink.pt',
'Referer': 'https://www.wizink.pt/',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
'X-Channel-Id': 'LOANSIMULATOR',
'X-Client-Id': 'simWzkPt',
'X-Country-Id': 'PRT',
'X-Device-UUID': 'd14e9b629804cbba1ac7c3e78ab39a56'}
# Quotation request body; sent as JSON via json= below.
payload_WiZink = {"productCode":"WZP01","fixedTermLoanId":84,"impositionAmount":{"amount":10000,"currency":"EUR"},"settlementDay":"5","dueOrAdvanceInterestIndicator":"3","nominalInterest":"8.0000000","feeRateId":"05","settlementFrequencyId":"0001","deprecationFrequencyId":"0001"}
# POST the quotation request and decode the JSON reply.
# verify=False turns off TLS certificate verification for this call.
response_WiZink = requests.post(url_WiZink, headers=headers_WiZink, json=payload_WiZink, verify=False).json()
For that website, you can get an access token by calling their oauth/token endpoint:
import requests
# Token endpoint of the same gateway.
token_url = 'https://www.creditopessoal.wizink.pt/gravitee/gateway/api-chn-auth-server/v1/oauth/token'
# Basic credentials go in the header; the grant type goes in the form body.
basic_auth_headers = {'Authorization': 'Basic c2ltV3prUHQ6YmllZTktZmR6dzAzLXBvZWpuY2Q='}
grant_fields = {'grant_type': 'client_credentials'}
# Request a token and pull it out of the JSON reply.
token_response = requests.post(token_url, headers=basic_auth_headers, data=grant_fields)
access_token = token_response.json()['access_token']
print(access_token)

Getting 403 response with requests

I want to get the source page of 'g2.com' website.
I included all the headers in the request and still get 403 response.
# Request headers copied from a browser session.
# NOTE(review): 'Authority', 'Method' and 'Scheme' look like the HTTP/2
# pseudo-headers shown in browser dev tools; here they are sent as ordinary
# headers - confirm they are wanted.
headers = {
'Authority': 'www.g2.com',
'Method': 'GET',
'Scheme': 'https',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US, en; q=0.9',
'Referer': 'https://www.g2.com/products/adobe-marketo-engage/reviews?__cf_chl_tk=Fo5h38ejOyOQlLESaOeFuwQwgN_UhEf3GEk.D8oGJUI-1657274049-0-gaNycGzNCmU',
'Pragma': 'no-cache',
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-User': '?1',
# NOTE(review): hard-coded cookie string, including a cf_clearance value;
# presumably tied to the browser session it was copied from and may no longer
# be valid - confirm.
'cookie': 'events_distinct_id=c2b8f12b-8533-4f3b-8116-5d1697f9fb9a; _ga=GA1.2.1805609855.1656582972; ajs_anonymous_id=c2b8f12b-8533-4f3b-8116-5d1697f9fb9a; intercom-id-rzpwcktf=ca9542d9-48e1-43e9-9c03-2e6f764bd313; _gcl_au=1.1.1117117031.1656582974; _delighted_web={"h7nzI49oCCJbJbS4":{"_delighted_fst":{"t":"1656582990103"}}}; __adroll_fpc=be0a43b909a1ee55f7fbbe0ff435677b-1656582993099; intercom-session-rzpwcktf=; cf_clearance=bZFBSAXN._7HrbMYKIHlwGhxjEpv1LcxhdgUZJvANuA-1657274052-0-150; __ar_v4=|C6MKFN32KVBHZAS4DKYVVW:20220707:1|EEPCTRZ5RNC6ZCBB2PJM4J:20220707:1|NBMTYK27EJFT3GYAV7FM56:20220707:1; _g2_session_id=741c7ae825402c34f500c9e045907672; _gid=GA1.2.923101367.1657517161; AWSALB=OS0LwCLYUCHeMYMSmWfhqAWaatiLtCuQhpwkGLfmjOb8/p6Lc/wFvC/DJ0MD7qnMTML3BbUpq9caoDwAQuU/EStHqVpw+cO7juiwSVs551sfSWPPYgiP8lwLahU+; AWSALBCORS=OS0LwCLYUCHeMYMSmWfhqAWaatiLtCuQhpwkGLfmjOb8/p6Lc/wFvC/DJ0MD7qnMTML3BbUpq9caoDwAQuU/EStHqVpw+cO7juiwSVs551sfSWPPYgiP8lwLahU+',
# The two adjacent string literals below are joined into one User-Agent value.
'User-Agent': 'Mozilla/5.0(Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/103.0.0.0 Safari/537.36',
'Content-Type': 'text/html; charset=UTF-8',
}
# NOTE(review): `url` is not defined anywhere in this snippet.
response = requests.get(url, headers=headers)
I also tried the pytor package, but I got the same problem.
I can view the website in all browsers but when I try to get the source page with requests I get 403.

Python Requests post on a site not working

I am trying to scrape property information from https://www.ura.gov.sg/realEstateIIWeb/resiRental/search.action using Python Requests. Using Chrome I have inspected the POST request and emulated it using requests. I use sessions to maintain cookies. When I try my code, the return from the website is "missing parameters in search query" so obviously something is wrong with my requests (though it is not obvious what).
Doing some digging there was one cookie that I did not get when doing request.get on the search side, so I added that manually. Still no go. I tried emulating the request headers exactly as well, still does not return the correct results.
The only time I have gotten it to work is when I manually copy the cookies from my browser to the Python request object.
# Submit URL template; the {} placeholder is filled with the session id via
# url.format(...) when the form is posted.
url = 'https://www.ura.gov.sg/realEstateIIWeb/resiRental/submitSearch.action;jsessionid={}'
# Form fields for the search submission.
values = {'submissionType': 'pn',
'from_Date_Prj': 'JAN-2014',
'to_Date_Prj': 'JAN-2016',
'__multiselect_projectNameList': '',
'selectedProjects': '10 SHELFORD',
'__multiselect_selectedProjects': '',
'propertyType': 'lp',
'from_Date': 'JAN-2016',
'to_Date': 'JAN-2016',
'__multiselect_postalDistrictList': '',
'__multiselect_selectedPostalDistricts': ''}
# Headers for the initial GET of the search page.
header1 = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.8,nb;q=0.6,no;q=0.4',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': 'www.ura.gov.sg',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
# Headers for the form POST; unlike header1 they also set Content-Type,
# Origin and Referer.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.8,nb;q=0.6,no;q=0.4',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.ura.gov.sg',
'Origin': 'https://www.ura.gov.sg',
'Referer': 'https://www.ura.gov.sg/realEstateIIWeb/resiRental/search.action',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
# Use one Session for both requests so cookies from the search page are reused
# by the form POST.
with requests.Session() as r:
    # Load the search page first; the session stores the cookies it sets.
    page1 = r.get('https://www.ura.gov.sg/realEstateIIWeb/resiRental/search.action', headers=header1)
    # Add by hand the one cookie that the GET above did not provide.
    requests.utils.add_dict_to_cookiejar(r.cookies, {'BIGipServerpl-prod_iis_web_v4': '3334383808.20480.0000'})
    # Put the session's JSESSIONID into the submit URL and post the search form.
    page2 = r.post(url.format(r.cookies.get_dict()['JSESSIONID']), data=values, headers=headers)

Categories

Resources