Is it possible to do this from JavaScript or something else? I found this Python code. Does it actually work for getting a PDF file from Course Hero?
import requests

headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.3987.78 Safari/537.36'
}
data = {
    "client": "web",
    "query": "scrape",
    "view": "list_w",
    "filters": {
        "type": ["document"],
        "doc_type": [],
    },
    "sort": "relevancy",
    "limit": 20,
    "offset": 0,
    "callout_types": ["textbook"]
}
response = requests.post(
    'https://www.coursehero.com/api/v2/search/', headers=headers, json=data)
data = response.json()

for result in data['results']:
    url = f"https://www.coursehero.com/file/{result['document']['db_filename']}"
    print(f"'{result['core']['title']}' URL: {url}")

# Login and extract download URL from HTML
#
# response = requests.get(url, headers=headers)
# soup = BeautifulSoup(response.content, 'lxml')
# download_url = soup.select('...')
#
# OR
#
# Download file via direct HTTP request if URL is returned via XHR request
#
# download_url = 'https://www.coursehero.com/...'
# requests.get(download_url, headers=headers)
The Course Hero front end sends a POST request to https://www.coursehero.com/api/v2/search/ and renders the search results with JavaScript, so you can just fetch the JSON via an HTTP request. The full example is above. I don't have a paid account, so the last part of the code is commented out; treat it as pseudo-code.
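As a side note, the limit and offset fields in the payload suggest the endpoint is paginated. Here is a minimal sketch of walking through several pages, assuming the API honors those two parameters the way their names imply (not verified against the real API):
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.3987.78 Safari/537.36'
}
base_data = {
    "client": "web",
    "query": "scrape",
    "view": "list_w",
    "filters": {"type": ["document"], "doc_type": []},
    "sort": "relevancy",
    "limit": 20,
    "offset": 0,
    "callout_types": ["textbook"]
}

for page in range(3):  # first three pages as an example
    base_data["offset"] = page * base_data["limit"]
    response = requests.post(
        'https://www.coursehero.com/api/v2/search/', headers=headers, json=base_data)
    results = response.json().get('results', [])
    if not results:  # stop once the API returns an empty page
        break
    for result in results:
        print(result['core']['title'])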
I am trying to get the content from a TV guide source: https://sinematv.com.tr/Yayin-Akisi
With the code below, I receive an HTML page, but it is just a runtime error message.
Most probably I am making a mistake when posting the payload or the headers.
What could be the problem?
import requests

url = 'https://sinematv.com.tr/Yayin-Akisi'
session = requests.Session()
request = session.get(url)
token = request.cookies['__RequestVerificationToken']
cookies = request.cookies
headers = {
    "authority": "sinematv.com.tr",
    "path": "/Asset/GetTvGuide/",
    "scheme": "https",
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "tr-TR,tr;q=0.9,en-TR;q=0.8,en;q=0.7,en-US;q=0.6",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://sinematv.com.tr",
    "referer": "https://sinematv.com.tr/Yayin-Akisi",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
}
payload = {"date": "16.10.2020", "__RequestVerificationToken": token}
response = requests.post(
    "https://sinematv.com.tr/Asset/GetTvGuide",
    data=payload,
    headers=headers,
    cookies=cookies,
)
print(response.content)
The logic for getting the token is wrong. It comes not from a cookie, but from a hidden input element on the page.
Try this:
import requests
from bs4 import BeautifulSoup

url = 'https://sinematv.com.tr/Yayin-Akisi'
session = requests.Session()
request = session.get(url)
soup = BeautifulSoup(request.content, "html.parser")
token = soup.find("input", {"name": "__RequestVerificationToken"})["value"]
payload = {
    "date": "16.10.2020",
    "__RequestVerificationToken": token
}
response = session.post(
    "https://sinematv.com.tr/Asset/GetTvGuide",
    data=payload
)
print(response.content)
I am trying to pull the JSON from this location:
https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY
This opens fine in my browser, but using requests in Python throws a 401 permission error. I have tried adding headers with different arguments, but to no avail.
Interestingly, the JSON on this page does not open in the browser either until https://www.nseindia.com has been opened separately. I believe it requires some kind of authentication, but I am surprised it works in the browser without any.
Is there a way to extract the information from this URL? Any help is much appreciated.
Here is my implementation:
import requests
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
# This throws a 401 response error
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
# This throws a 'Connection aborted' error
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)"})
To get the correct data, first load cookies from another URL with requests.get(), and then do the Ajax request to load the JSON:
import json
import requests

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'

with requests.Session() as s:
    # load cookies:
    s.get('https://www.nseindia.com/get-quotes/derivatives?symbol=BANKNIFTY', headers=headers)
    # get data:
    data = s.get(url, headers=headers).json()
    # print data to screen:
    print(json.dumps(data, indent=4))
Prints:
{
    "records": {
        "expiryDates": [
            "03-Sep-2020",
            "10-Sep-2020",
            "17-Sep-2020",
            "24-Sep-2020",
            "01-Oct-2020",
            "08-Oct-2020",
            "15-Oct-2020",
            "22-Oct-2020",
            "29-Oct-2020",
            "26-Nov-2020"
        ],
        "data": [
            {
                "strikePrice": 18100,
                "expiryDate": "03-Sep-2020",
                "CE": {
                    "strikePrice": 18100,
                    "expiryDate": "03-Sep-2020",
                    "underlying": "BANKNIFTY",
                    "identifier": "OPTIDXBANKNIFTY03-09-2020CE18100.00",
                    "openInterest": 1,
                    "changeinOpenInterest": 1,
                    "pchangeinOpenInterest": 0,
                    "totalTradedVolume": 2,
                    "impliedVolatility": 95.51,
                    "lastPrice": 6523.6,
                    "change": 2850.1000000000004,
                    "pChange": 77.58540901048048,
                    "totalBuyQuantity": 2925,
                    "totalSellQuantity": 2800,
                    "bidQty": 25,
                    "bidprice": 6523.6,
                    "askQty": 25,
                    "askPrice": 6570.3,
                    "underlyingValue": 24523.8
                },
                "PE": {
                    "strikePrice": 18100,
                    "expiryDate": "03-Sep-2020",
                    "underlying": "BANKNIFTY",
                    ...and so on.
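Once the JSON has loaded, individual fields can be pulled out of the structure shown above. A minimal sketch, assuming data holds the parsed response from the session code above:
# list the available expiry dates
for expiry in data['records']['expiryDates']:
    print(expiry)

# print strike price and call-side last price for each row that has a CE leg
for row in data['records']['data']:
    if 'CE' in row:
        print(row['strikePrice'], row['CE']['lastPrice'])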
I'm trying to log in to a website via a script, but when I print the website's HTML content, I can't see any of the data that is available after login.
Can someone tell me what I am missing? Thank you!
import requests

def main():
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36",
    }
    s = requests.Session()
    s.headers.update(headers)
    s.get('https://www.e-ams.at/eams-sfa-account/p/index.jsf')
    # Generate the post data
    data = {
        'url': 'https://www.e-ams.at/eams-sfa-account/p/index.jsf',
        'j_username': 'username',
        'j_password': 'password'
    }
    # Perform the post request
    r = s.post('https://www.e-ams.at/eams-sfa-account/p/index.jsf', data=data)
    # Try to get data only available after login
    r = s.get('https://www.e-ams.at/eams-sfa-account/p/EsaSBasisdaten.jsf?eamsTrack=1524234335254')
    print(r.url)
    print(r.text)
    print(r.status_code)
If it is not part of the HTML form's inputs, specifying the url key in the data dict is not correct.
Your request should be as follows:
data = {
    'j_username': 'username',
    'j_password': 'password'
}
r = s.post('https://www.e-ams.at/eams-sfa-account/p/index.jsf', data=data)
Generally speaking, all of the form's input tags (both visible and hidden) must be included in the data dict; a sketch of collecting them automatically follows below.
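A minimal sketch, assuming the login form is the first <form> element on the page (the actual page structure has not been inspected):
import requests
from bs4 import BeautifulSoup

s = requests.Session()
r = s.get('https://www.e-ams.at/eams-sfa-account/p/index.jsf')
soup = BeautifulSoup(r.content, 'html.parser')

# assumption: the login form is the first <form> on the page
form = soup.find('form')

# collect all named inputs, visible and hidden, with their default values
data = {
    inp['name']: inp.get('value', '')
    for inp in form.find_all('input')
    if inp.get('name')
}
data['j_username'] = 'username'
data['j_password'] = 'password'

r = s.post('https://www.e-ams.at/eams-sfa-account/p/index.jsf', data=data)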
I am trying to scrape this website: https://ssweb.seap.minhap.es/portalEELL/consulta_alcaldes
When you choose Alicante from the first menu and then Ayuntamiento de Abengibre from the second, you will see a table with results. That table is what I want.
I saw in the Chrome console that choosing values in the drop-downs generates a POST request, so I thought it would be straightforward to obtain the data with requests.post:
params = {
    "consulta_alcalde[_csrf_token]": "dd1546dd35bf0f1af4a1f3aac165a1b5",
    "consulta_alcalde[id_provincia]": "2",
    "consulta_alcalde[id_entidad]": "17926"
}
r = requests.post("https://ssweb.seap.minhap.es/portalEELL/consulta_alcaldes", params)
But when I check what r.text contains, I get a 200 response but can't see the data from the table. What am I doing wrong?
I am aware this can be done with Selenium, but I am trying to avoid it as it's very slow.
EDIT:
As per Brian's suggestion, I have modified my code as follows:
params = {
    "consulta_alcalde[_csrf_token]": "dd1546dd35bf0f1af4a1f3aac165a1b5",
    "consulta_alcalde[id_provincia]": "2",
    "consulta_alcalde[id_entidad]": "17951",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
}
with requests.Session() as s:
    s.get("https://ssweb.seap.minhap.es/portalEELL/consulta_alcaldes")
    r = s.post("https://ssweb.seap.minhap.es/portalEELL/consulta_alcaldes", data=params)
But still no luck...
The "csrf_token" is not static, you'll have to parse the page with bs4 to get it.
Also the site provides content via xhr request, so you need to have "XMLHttpRequest" in the headers. Code:
import requests
from bs4 import BeautifulSoup

url = 'https://ssweb.seap.minhap.es/portalEELL/consulta_alcaldes'
s = requests.Session()
r = s.get(url, verify=False)
soup = BeautifulSoup(r.content, 'html.parser')
csrf_token = soup.find('input', id="consulta_alcalde__csrf_token")['value']
data = {
    "consulta_alcalde[_csrf_token]": csrf_token,
    "consulta_alcalde[id_provincia]": "2",
    "consulta_alcalde[id_entidad]": "17951"
}
headers = {"X-Requested-With": "XMLHttpRequest"}
r = s.post(url, data=data, headers=headers, verify=False)
print(r.content)
With a POST request, the payload should be sent as the body of the request. To do this, pass the payload explicitly using the data keyword argument:
requests.post(url, data=payload)
If the endpoint requires JSON, you can either serialize the payload yourself with json.dumps and send it via data, or simply pass the payload to the json keyword argument instead:
requests.post(url, json=payload)
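A rough sketch of the difference between the two, using https://httpbin.org/post purely as an echo endpoint:
import json
import requests

payload = {"consulta_alcalde[id_provincia]": "2"}

# form-encoded body, Content-Type: application/x-www-form-urlencoded
r1 = requests.post("https://httpbin.org/post", data=payload)

# JSON body, Content-Type: application/json (requests sets the header for you)
r2 = requests.post("https://httpbin.org/post", json=payload)

# equivalent to r2, but the header must be set manually
r3 = requests.post(
    "https://httpbin.org/post",
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
)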
I have this small function:
import httplib2
from urllib.parse import urlencode

def wp_login_check(url, username, password):
    UA = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"
    headers = {'User-Agent': UA, 'Content-type': 'application/x-www-form-urlencoded', 'Cookie': 'wordpress_test_cookie=WP+Cookie+check'}
    http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=True)
    http.follow_redirects = True
    body = {'log': username, 'pwd': password, 'wp-submit': 'Login', 'testcookie': '1'}
    response, content = http.request(url, 'POST', headers=headers, body=urlencode(body))
    url2 = url.replace('/wp-login.php', '/wp-admin/plugin-install.php')
    response1, content1 = http.request(url2)
    print(content1)
I need to use the cookies from the first request in the second request. How can I do this?
Edited to use httplib2
Grab the cookie from the Set-Cookie response header and include it in subsequent requests:
import httplib2
from urllib.parse import urlencode

def wp_login_check(url, username, password):
    UA = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"
    headers = {'User-Agent': UA, 'Content-type': 'application/x-www-form-urlencoded', 'Cookie': 'wordpress_test_cookie=WP+Cookie+check'}
    http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=True)
    http.follow_redirects = True
    body = {'log': username, 'pwd': password, 'wp-submit': 'Login', 'testcookie': '1'}
    response, content = http.request(url, 'POST', headers=headers, body=urlencode(body))
    # Grab the cookie for later presentation
    headers = {'Cookie': response['set-cookie']}
    url2 = url.replace('/wp-login.php', '/wp-admin/plugin-install.php')
    response1, content1 = http.request(url2, headers=headers)
    print(content1)
Alternative
If you can, use a requests module session instead of httplib2:
import requests
s = requests.Session()
resp1 = s.post(url, headers=headers, data=body)
resp2 = s.post(...)
You will find that the cookie will be persisted in the session and then presented to the server on subsequent requests.
You will also find requests to be a much more pleasant module to work with.
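For completeness, here is a sketch of the WordPress login flow from above rewritten with requests; the form fields and URLs are copied from the httplib2 version, and whether the login actually succeeds of course depends on the target site:
import requests

def wp_login_check(url, username, password):
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"}
    body = {'log': username, 'pwd': password, 'wp-submit': 'Login', 'testcookie': '1'}
    with requests.Session() as s:
        s.headers.update(headers)
        # WordPress expects this test cookie to be present on login
        s.cookies.set('wordpress_test_cookie', 'WP+Cookie+check')
        # any cookies set by the login response are stored in the session...
        s.post(url, data=body, timeout=10)
        # ...and sent back automatically on this second request
        url2 = url.replace('/wp-login.php', '/wp-admin/plugin-install.php')
        r = s.get(url2, timeout=10)
        print(r.text)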