How to send a request to a Cloudflare-protected website? - python

I am trying to send a request to a website, but I am getting a 503 status code. It seems the website is protected by Cloudflare. Is it possible to get a request through to a Cloudflare-protected website with the python-requests library? I have sent cookies and headers along with the request, but it still didn't get through.
Below is my code.
import requests

cookies = {
    'SSPV_C': 'BPwAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAA',
    'locale': 'en',
    'cookieID': '390778282271656143963365',
    'uui': '800.606.6969|',
    'cartId': '42951851197',
    'mapp': '0',
    '__cfruid': '7f4badea550ab7327454d1e2bac7cdec7c0701cf-1656593179',
    '__cf_bm': 'gn_yevoOR3SIcU9B8oDZQ.r_e9300kA61vY264Hls64-1656593179-0-AaRJibHSFeN0Z7jkQTvjq+HQMc3lRPlhM850slZTqy7uy5QzXhmRx3M6rxlwk78kIU+zC8Vb7eDsPpuhdnNOhAkil4ZdBSaZW4pRvSMX53Xd',
    'SSLB_C': '0',
    'SSID_C': 'CQD72x0AAAAAAABbwLZiApVDB1vAtmICAAAAAAAAAAAAHJu9YgANyA',
    'SSSC_C': '333.G7113084158674703618.2|0.0',
    'SSRT_C': 'HJu9YgAAAA',
    'lpi': 'cat=2,cur=USD,app=D,lang=E,view=L,lgdin=N,cache=release-WEB-20220629v10-BHJ-DVB31150-11,ipp=24,view=L,sort=BS,priv=N,state=',
    'sessionKey': 'f6fbd948-2fed-41f9-bcf3-7defa626f36a',
    'dpi': 'cat=2,cur=USD,app=D,lang=E,view=L,lgdin=N,cache=release-WEB-20220629v10-BHJ-DVB31150-11',
    'utkn': '97655ab6781ce66340f0d2aa809c3f68',
    'build': '20220629v10-20220629v10',
    'aperture-be-commit-id': 'n/a',
    'JSESSIONID': 'SUq0pea5K0bNUyQEJscyjUnJFvvEGjW7!622981770',
    'sshc': '61f5b3f36d4907c548b3efc82cfcecd9',
    '_pxhd': '53bIlMthB4XG3X644UXFOgn-jRSXY56BvM49fjHfOdSg53A7NqKSOXYc0jBByweKQ4NgEZR/R61UG9ouHxGSUw==:Kfr40D-EuMhLJ4qxdatLAMna184C2zbBIJV3xlOVy2hTdUEI3sN3kCGBQV73oDxdiOoVZAKilYlJZn--t492StGQHTm21i-GiwB5xxziLd8=',
    'cf-colo': 'KHI',
    'TS0188dba5': '01ec39615f5a1331c083e7ac7ff7f2895322c069326ea3e7a0fb426c2906479f8fdba41c2cbebcf1669847d2488313d23495cf506ce0991eb9af796b9032458b1a715a28e71e7a31b64b6791644a6f092364bff1d8e79d027277b851adf5faa365dd8e2609',
    'TS01d628c4': '01ec39615f8b9833712bd8ae68ec8c0798bd1df2e408a949a17c3772b8419cc7bbfe911b2b2798bd33f09b9e2fa7d6837ec5814f8ca97bd51f8eccc8779214eac7cd387b8f1f1d5097bca3b926c8d264dd80d59d7e4879197618d3a0ef6777bdb5902263106d9d95ac8fd7d92cd8458f02fb7c1409230f71f6b3a638107bbd8a73aa1629da3456ce69fd32f210cf1826979006e713',
    'TopBarCart': '0|0',
    'dlc': '%43%4D%5F%4D%4D%43%3D%7C%54%59%50%45%44%56%41%4C%55%45%3D%7C%45%4D%4C%45%3D%7C%55%4E%42%49%3D%6E%75%6C%6C%7C%4C%4F%4E%47%3D%37%30%2E%30%30%30%30%30%7C%4C%41%54%3D%33%30%2E%30%30%30%30%30%7C',
    'app_cookie': '1656593927',
    'TS01e1f1fd': '01ec39615fa2aeefd67a3c8e74158e94069993ea3308a949a17c3772b8419cc7bbfe911b2b55db8474b97fd606a862d187b6fdf539dfd177a32a93169e75a1c8599fc7428443914075f1081235d9564cc0fc8b69460d7a08aef755b5c296a42cf6b735f4953465ca238a6965b0625b2de8e4934e04',
    'forterToken': 'a1ba6a2e88e74edb91df3bcf567bdd45_1656593924573_588_dUAL43-mnts-ants_13ck',
}

headers = {
    'authority': 'www.bhphotovideo.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-PK,en;q=0.9,ur-PK;q=0.8,ur;q=0.7,en-GB;q=0.6,en-US;q=0.5,sv;q=0.4,it;q=0.3',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'referer': 'https://www.bhphotovideo.com/c/buy/Notebooks/ci/6782/N/4110474287',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}

response = requests.get('https://www.bhphotovideo.com/c/product/1663923-REG/lenovo_82h801ekus_ip3_15itl6_i3_1115g4_8gb.html', cookies=cookies, headers=headers)
print(response)

You can use cloudscraper to get past the Cloudflare protection:
from bs4 import BeautifulSoup
import cloudscraper

# create_scraper returns a requests.Session subclass that solves Cloudflare's
# JS challenge; delay overrides the challenge wait and browser sets the UA string
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0'})
url = 'https://www.bhphotovideo.com/c/product/1663923-REG/lenovo_82h801ekus_ip3_15itl6_i3_1115g4_8gb.html'
req = scraper.get(url)
print(req)
Output:
<Response [200]>
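Since the answer already imports BeautifulSoup, a short follow-up can parse the page it fetched. Printing the <title> tag is only an illustration and assumes nothing about the page beyond it being HTML:
soup = BeautifulSoup(req.text, 'html.parser')
print(soup.title.get_text(strip=True))  # e.g. the product name from the page title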

Related

How to get full html body of google search using requests

I have the following problem: I would like to get the full HTML body of a Google search results page.
Suppose I would like to google the Everton stadium address. This is my Python code:
import urllib.request as urllib2

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = "https://www.google.com/search?q=Everton+stadium+address"
headers = {'User-Agent': user_agent}
request = urllib2.Request(url, None, headers)
response = urllib2.urlopen(request)
data = response.read()
But when I print my data, I can see that the HTML for the right-hand part of the page is missing (the area highlighted in red in the original screenshot, not reproduced here).
How can I get the full HTML body, including that part?
This works for me:
import requests

headers = {
    'authority': 'www.google.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'de,de-DE;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,fr;q=0.5,de-CH;q=0.4,es;q=0.3',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'sec-ch-ua': '"Not_A Brand";v="99", "Microsoft Edge";v="109", "Chromium";v="109"',
    'sec-ch-ua-arch': '"x86"',
    'sec-ch-ua-bitness': '"64"',
    'sec-ch-ua-full-version': '"109.0.1518.78"',
    'sec-ch-ua-full-version-list': '"Not_A Brand";v="99.0.0.0", "Microsoft Edge";v="109.0.1518.78", "Chromium";v="109.0.5414.120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-model': '""',
    'sec-ch-ua-platform': '"Windows"',
    'sec-ch-ua-platform-version': '"10.0.0"',
    'sec-ch-ua-wow64': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
}
params = {
    'q': 'Everton stadium address',
}
response = requests.get('https://www.google.com/search', params=params, headers=headers)
print(response.content)
This request serves the full HTML, including the section you mentioned.
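As a follow-up, the page can then be parsed with BeautifulSoup. Result titles usually sit in <h3> tags, but Google's markup changes frequently, so treat this selector as an assumption rather than a stable API:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
for h3 in soup.find_all('h3'):
    print(h3.get_text(strip=True))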

Python - How to log into a website that has a 302 page (while also passing on a cookie) after the initial sign in?

So here is the basic login process mapped out (when you successfully log in on the site) for a high-level understanding. Below it is my code, which only gets me the first cookie, when I need the second cookie to access the JSON I need. I have working code that currently gets me the file, but that (second) cookie expires after a week, so it is not sustainable.
Step 1 - Purpose: Get first cookie that will be "c1" below
Request URL: https://www.fakesite.com/sign-in.html
Request Method: GET
Status Code: 200
Step 2 - Purpose: Use "c1" to make a POST request to obtain second cookie "c2" <<<<-Can't get past here
Request URL: https://www.fakesite.com/sign-in.html
Request Method: POST
Status Code: 302
Step 3 - Purpose: Use "c2" and auth_token (doesn't change) to GET the json file I need <<<I have working code when manually getting 7-day cookie
Request URL: https://www.fakesite.com/ap1/file.json
Request Method: GET
Status Code: 200
import requests

headers = {
    'authority': 'www.fakesite.com',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://www.fakesite.com/goodbye.html?service=logout',
    'accept-language': 'en-US,en;q=0.9',
}

s = requests.Session()
r1 = s.get('https://www.fakesite.com/sign-in.html', headers=headers)
c1 = s.cookies['fakesite']
print(r1.status_code)
print(c1)

c2 = 'fakesite=' + c1 + '; language=en'

headers = {
    'authority': 'www.fakesite.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'origin': 'https://www.fakesite.com',
    'content-type': 'application/x-www-form-urlencoded',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://www.fakesite.com/sign-in.html',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': c2,
}
data = {
    'csrf_token': '12345=',
    'username': 'userme',
    'password': 'passwordme',
    'returnUrl': '/sign-in.html',
    'service': 'login',
    'Login': 'Login',
}
r2 = requests.post('https://www.fakesite.com/sign-in.html', headers=headers, data=data)
c3 = s.cookies['fakesite']
print(r2.status_code)
print(c3)
OUTPUT:
200
151f3e3ba82030e5b7c03bc310ed5ad5
200
151f3e3ba82030e5b7c03bc310ed5ad5
My code just returns the first cookie over again when I try to print all of them. I feel like I have tried everything, to no avail. When I try to go to the last URL while logging in directly, I get a 401, because I need the first cookie in order to get the second.
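No answer is shown here, but note (as an observation, not a confirmed fix) that the POST in the last step goes through requests.post rather than the session s, so the session's cookie jar never sees the Set-Cookie from the 302 response. A minimal sketch of the likely fix:
# post through the same session; requests follows the 302 automatically
# and any new cookie lands in s.cookies
r2 = s.post('https://www.fakesite.com/sign-in.html', headers=headers, data=data)
c3 = s.cookies['fakesite']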

Using python to login to a website that use cookies

I'm trying to log in to this website (https://phishtank.org/login.php) to use it with Python, but the site uses cookies. I tried this:
import requests

cookies = {
    'PHPSESSID': '3hdp8jeu933e8t4hvh240i8rp840p06j',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://phishtank.org/login_required.php',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
    'TE': 'trailers',
}
response = requests.get('https://phishtank.org/add_web_phish.php', headers=headers, cookies=cookies)
print(response.text)
It works, but after a few minutes the cookie just expires. What can I do to avoid this limitation? Maybe something that requests new cookies for me and uses them.
Use requests sessions instead; a session persists cookies for you:
import requests

session = requests.Session()
session.headers.update(headers)   # reuse the headers dict from the question
session.cookies.update(cookies)   # seed the jar; the session keeps it up to date
session.get(<url>)
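A session only persists cookies, though; it cannot renew an expired server-side PHP session by itself. Below is a sketch of logging in through the session so fresh cookies are issued automatically. The login.php endpoint comes from the question, but the username/password field names are assumptions, so check the actual form:
login_data = {'username': 'you', 'password': 'secret'}  # hypothetical field names
session.post('https://phishtank.org/login.php', data=login_data)
# any Set-Cookie from the login response is now stored in session.cookies
response = session.get('https://phishtank.org/add_web_phish.php')
print(response.text)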

Need assistance with Post/Get Request

I have an issue with the particular website https://damas.terna.it/DMSPCAS08.
I am trying either to scrape the data or to fetch the Excel file that is included.
I tried to fetch the Excel file with a POST request.
import requests
from bs4 import BeautifulSoup
import json
import datetime

url = 'https://damas.terna.it/api/Ntc/GetNtc'
headers = {
    'Host': 'damas.terna.it',
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://damas.terna.it/DMSPCAS08',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cookie': '__RequestVerificationToken=5mfiSM2dKte532I8fd3MRdn6nnHbSezkQX29r3fyF2tMegXsvnOInpxy8JvFuDyRVS6pZs03y-NL3CsNsItN1yboc128Kk51yEiuUU0mel41; pers_damas_2019=378972352.20480.0000; rxVisitor=1619766836714T8SRPFLUAH62F1R9K6KG3EKK104BSDFE; dtCookie=7$EC34ED7BFB4A503D379D8D8C69242174|846cd19ce7947380|1; rxvt=1619774462088|1619771198468; dtPC=7$369396404_351h-vIDMPHTCURIGKVKBWVVEWOAMRMKDNWCUH-0e1; DamasNetCookie=F-evmYb7kIS_YTMr2mwuClMB1zazemmhl9vzSXynWeuCII_keDb_jQr4VLSYif9t3juDS6LkOuIXKFfe8pydxSzHPfZzGveNB6xryj2Czp9J1qeWFFT9dYFlRXFWAHuaEIyUQQDJmzWfDBrFCWr309mZoE6hkCKzDtoJgIoor9bed1kQgcdeymAH9lrtrKxwsheaQm2qA-vWWqKjCiruO1VkJ6II_bcrAXU2A_ZPQPznE1_8AEC_AwXmBXETubMQwFAnDXsOLDfEYeQ61TGAupF3d-wz3aZfRs5eCA3kw-en-kpEbS0trwFBQzB-098610GIbIPki9ThVitZ2LN2zza6nn1A8qchqfQC_CZEgu6Jt1glfhHceWS6tvWCuyOEqo2jJpxAajMYXPB6mzlHzX13TiV-jgeFSPehugMAgms_exqocw9w27e4lI5laYZu0rkKkznpZ1mJLOhORwny8-bKa3nRUt7erFv7ul3nLLrgd3FP907tHpTh-qXt1Bmr6OqknDZr_EBN8GY_B2YHV-8hC0AjdqQqpS0xOpp7z_CzzgByTOHSNdeKjVgQfZLQ7amnp71lhxgPeJZvOIl_mIWOr_gWRy_iK6UuzrA3udCTV7bAnUXKB8gX89d9ShQf5tZDxPFchrAQBtdmDChQOA; dtLatC=2; dtSa=true%7CC%7C-1%7CExport%20to%20.xls%7C-%7C1619772685174%7C369396404_351%7Chttps%3A%2F%2Fdamas.terna.it%2FDMSPCAS08%7CTerna%20%5Ep%20NTC%7C1619772662568%7C%7C',
}
parameters = {
    'busDay': '2021-05-01',
    'busDayTill': '2021-05-01',
    'borderDirId': '1',
    'borderDirName': 'TERNA-APG',
}
response = requests.post(url, data=parameters, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())
I am receiving this error:
The parameters dictionary contains an invalid entry for parameter 'parameters' for method 'System.Web.Mvc.ActionResult GetNtc(Damas.Core.Data.DataSource.Data.ParametersModel)' in 'Terna.Web.Controllers.CapacityManagement.NtcController'. The dictionary contains a value of type 'System.Collections.Generic.Dictionary`2[System.String,System.Object]', but the parameter requires a value of type 'Damas.Core.Data.DataSource.Data.ParametersModel'.
Parameter name: parameters
Please don't post the answer to your question in the question's body; instead, post it in the answer box:
response = requests.post(url, data=json.dumps(parameters), headers=headers)
This seems to solve the issue.
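For what it's worth, requests can also serialize the payload itself via the json= keyword argument, which additionally sets the Content-Type: application/json header, so this should be equivalent here:
# requests serializes parameters to JSON and sets the Content-Type header
response = requests.post(url, json=parameters, headers=headers)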

Python requests gives 403 error when requesting from papara.com

I'm trying to get into papara.com using Python. When I make a request, it always gives 403 as a response. I got the cookies from my browser. Here is my code:
import requests

headers = {
    'authority': 'www.papara.com',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': '__cfruid=64370d0d06d80a1e1a701ae8bee5a4b85c1de1af-1610296629',
}
response = requests.get('https://www.papara.com/', headers=headers)
I tried different user agents, and I tried removing the cookie from the headers, but it didn't work.
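No accepted answer is shown here, but the __cfruid cookie indicates the site sits behind Cloudflare, so the 403 is likely the same kind of block as in the question above. The cloudscraper approach from the top answer is worth a try; whether it gets past papara.com's particular configuration is untested:
import cloudscraper
scraper = cloudscraper.create_scraper()  # solves the Cloudflare JS challenge where possible
response = scraper.get('https://www.papara.com/')
print(response.status_code)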
