webscraping returning backend data only [closed]

webscraping returning backend data only [closed] - python

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 1 year ago.
Improve this question
I built this code to scrape TSX website key data:
import requests
from bs4 import BeautifulSoup
# Only genuine request headers belong here. The Access-Control-* entries that
# were in this dict are CORS *response* headers, which a client does not send.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/92.0.4515.131 Safari/537.36',
}
url = "https://money.tmx.com/en/quote/EIT.UN"
# Pass the dict by keyword: the second positional parameter of requests.get()
# is `params`, so `requests.get(url, headers)` would put these values into the
# query string and send no custom headers at all.
req = requests.get(url, headers=headers)
# Parse the raw response body and print it in indented form.
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
But I get an extract that doesn't contain anything from the page:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="#000000" name="theme-color"/>
<link href="/favicon.ico" rel="icon"/>
<link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link color="#5bbad5" href="/safari-pinned-tab.svg" rel="mask-icon"/>
<link href="/manifest.json" rel="manifest"/>
<meta content="#1e222b" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<title>
TMX Money
</title>
<link href="https://unpkg.com/normalize.css@8.0.1/normalize.css" rel="stylesheet"/>
<script async="" src="//tags-cdn.deployads.com/a/tmx.com.js">
</script>
<link as="font" crossorigin="" href="/fonts/DINPro_Regular.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="" href="/fonts/DINPro_Medium.woff2" rel="preload" type="font/woff2"/>
<meta content="oxLF6WU3pPwwIFLyZVG13lWRJMcqvfYNAX1IOvQiUPI" name="google-site-verification"/>
<script>
window.onpageshow=function(o){o.persisted&&window.location.reload()}
</script>
<script>
"undefined"!=typeof navigator&&(navigator.userAgent.includes("Prerender")||navigator.userAgent.includes("prerender"))&&(SC_DISABLE_SPEEDY=!0)
</script>
<link href="/static/css/45.0b0c574e.chunk.css" rel="stylesheet"/>
</head>
<body>
<noscript>
You need to enable JavaScript to run this app.
</noscript>
<div id="root">
</div>
<div id="modal-root">
</div>
<script type="text/javascript">
piAId="564512",piCId="20383",piHostname="pi.pardot.com",function(){function t(){var
t=document.createElement("script");t.type="text/javascript",t.src=("https:"==document.location.protocol?"https://pi":"http://cdn")+".pardot.com/pd.js";var
e=document.getElementsByTagName("script")[0];e.parentNode.insertBefore(t,e)}window.attachEvent?window.attachEvent("onload",t):window.addEventListener("load",t,!1)}()
!function(e){function a(a){for(var c,r,d=a[0],o=a[1],i=a[2],p=0,u=[];p<d.length;p++)r=d[p],Object.prototype.hasOwnProperty.call(n,r)&&n[r]&&u.push(n[r][0]),n[r]=0;for(c
in
o)Object.prototype.hasOwnProperty.call(o,c)&&(e[c]=o[c]);for(s&&s(a);u.length;)u.shift()();return
f.push.apply(f,i||[]),t()}function t(){for(var
e,a=0;a<f.length;a++){for(var t=f[a],c=!0,r=1;r<t.length;r++){var
o=t[r];0!==n[o]&&(c=!1)}c&&(f.splice(a--,1),e=d(d.s=t[0]))}return
e}var c={},r={44:0},n={44:0},f=[];function d(a){if(c[a])return
c[a].exports;var t=c[a]={i:a,l:!1,exports:{}};return
e[a].call(t.exports,t,t.exports,d),t.l=!0,t.exports}d.e=function(e){var
a=[];r[e]?a.push(r[e]):0!==r[e]&&{0:1,1:1,4:1,15:1,39:1,46:1}[e]&&a.push(r[e]=new
Promise((function(a,t){for(var
c="static/css/"+({18:"footer",19:"header",21:"page.admincompanycontent",22:"page.admindashboard",23:"page.admininfo",24:"page.advancedchart",25:"page.alerts",26:"page.authcallback",27:"page.devpanel",28:"page.etfcentre",29:"page.etfcomparison",30:"page.home",31:"page.marketssummary",32:"page.notfound",33:"page.quote",34:"page.search",35:"page.settings",36:"page.signin",37:"page.stocklist",38:"page.stocklists",39:"page.stockscreener",40:"page.terms",41:"page.tsx302021",42:"page.verifyemail",43:"page.watchlist"}[e]||e)+"."+{0:"c46f6ca1",1:"d488a845",2:"31d6cfe0",3:"31d6cfe0",4:"e69ef5e8",5:"31d6cfe0",6:"31d6cfe0",7:"31d6cfe0",8:"31d6cfe0",9:"31d6cfe0",10:"31d6cfe0",11:"31d6cfe0",12:"31d6cfe0",13:"31d6cfe0",14:"31d6cfe0",15:"d34d0f8b",16:"31d6cfe0",17:"31d6cfe0",18:"31d6cfe0",19:"31d6cfe0",21:"31d6cfe0",22:"31d6cfe0",23:"31d6cfe0",24:"31d6cfe0",25:"31d6cfe0",26:"31d6cfe0",27:"31d6cfe0",28:"31d6cfe0",29:"31d6cfe0",30:"31d6cfe0",31:"31d6cfe0",32:"31d6cfe0",33:"31d6cfe0",34:"31d6cfe0",35:"31d6cfe0",36:"31d6cfe0",37:"31d6cfe0",38:"31d6cfe0",39:"5cf87ee8",40:"31d6cfe0",41:"31d6cfe0",42:"31d6cfe0",43:"31d6cfe0",46:"94d2147f",47:"31d6cfe0",48:"31d6cfe0",49:"31d6cfe0",50:"31d6cfe0",51:"31d6cfe0",52:"31d6cfe0",53:"31d6cfe0",54:"31d6cfe0",55:"31d6cfe0",56:"31d6cfe0",57:"31d6cfe0",58:"31d6cfe0",59:"31d6cfe0",60:"31d6cfe0",61:"31d6cfe0",62:"31d6cfe0",63:"31d6cfe0",64:"31d6cfe0",65:"31d6cfe0",66:"31d6cfe0",67:"31d6cfe0"}[e]+".chunk.css",n=d.p+c,f=document.getElementsByTagName("link"),o=0;o<f.length;o++){var i=(s=f[o]).getAttribute("data-href")||s.getAttribute("href");if("stylesheet"===s.rel&&(i===c||i===n))return
a()}var
p=document.getElementsByTagName("style");for(o=0;o<p.length;o++){var
s;if((i=(s=p[o]).getAttribute("data-href"))===c||i===n)return a()}var
u=document.createElement("link");u.rel="stylesheet",u.type="text/css",u.onload=a,u.onerror=function(a){var
c=a&&a.target&&a.target.src||n,f=new Error("Loading CSS chunk "+e+"
failed.\n("+c+")");f.code="CSS_CHUNK_LOAD_FAILED",f.request=c,delete
r[e],u.parentNode.removeChild(u),t(f)},u.href=n,document.getElementsByTagName("head")[0].appendChild(u)})).then((function(){r[e]=0})));var
t=n[e];if(0!==t)if(t)a.push(t[2]);else{var c=new
Promise((function(a,c){t=n[e]=[a,c]}));a.push(t[2]=c);var
f,o=document.createElement("script");o.charset="utf-8",o.timeout=120,d.nc&&o.setAttribute("nonce",d.nc),o.src=function(e){return
d.p+"static/js/"+({18:"footer",19:"header",21:"page.admincompanycontent",22:"page.admindashboard",23:"page.admininfo",24:"page.advancedchart",25:"page.alerts",26:"page.authcallback",27:"page.devpanel",28:"page.etfcentre",29:"page.etfcomparison",30:"page.home",31:"page.marketssummary",32:"page.notfound",33:"page.quote",34:"page.search",35:"page.settings",36:"page.signin",37:"page.stocklist",38:"page.stocklists",39:"page.stockscreener",40:"page.terms",41:"page.tsx302021",42:"page.verifyemail",43:"page.watchlist"}[e]||e)+"."+{0:"3bf886dc",1:"36a077ef",2:"304b93f2",3:"aad23197",4:"b347aa4d",5:"8d07c059",6:"1a942e20",7:"3eace955",8:"684593b3",9:"a59fae53",10:"c86ddf7c",11:"c15a76a9",12:"2ebc2b8d",13:"5a9662c3",14:"98c1b9e7",15:"71ab84f5",16:"ab482800",17:"a0a7a872",18:"c1361d6f",19:"695f2560",21:"30610631",22:"88cd3df4",23:"02e7e23f",24:"7ac96b36",25:"b46712a8",26:"defeb6a3",27:"ea288e40",28:"1f2df7fa",29:"31f26ed5",30:"c025e5e2",31:"d6d116eb",32:"c1a96e84",33:"e61043f0",34:"53152b1c",35:"98c1f6a3",36:"f03094c0",37:"2d009271",38:"cf9680f3",39:"7101ddac",40:"0b15da7e",41:"91471a80",42:"f1ae28a6",43:"1a5a65d3",46:"4d4b1467",47:"8543c258",48:"7e804703",49:"b62fad4b",50:"dce0e3cb",51:"0ec82fe9",52:"9329bc73",53:"4279abc7",54:"93ee9948",55:"fcaa0f53",56:"7e64f2a0",57:"198998df",58:"b836b3c6",59:"7ef7187c",60:"b84c7ab4",61:"9f4229fa",62:"926a402c",63:"bc502904",64:"944dd1ae",65:"cdf5fd44",66:"fe991ddf",67:"8a557aa7"}[e]+".chunk.js"}(e);var
i=new Error;f=function(a){o.onerror=o.onload=null,clearTimeout(p);var
t=n[e];if(0!==t){if(t){var
c=a&&("load"===a.type?"missing":a.type),r=a&&a.target&&a.target.src;i.message="Loading
chunk "+e+" failed.\n("+c+":
"+r+")",i.name="ChunkLoadError",i.type=c,i.request=r,t[1](i)}n[e]=void
0}};var
p=setTimeout((function(){f({type:"timeout",target:o})}),12e4);o.onerror=o.onload=f,document.head.appendChild(o)}return
Promise.all(a)},d.m=e,d.c=c,d.d=function(e,a,t){d.o(e,a)||Object.defineProperty(e,a,{enumerable:!0,get:t})},d.r=function(e){"undefined"!=typeof
Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.t=function(e,a){if(1&a&&(e=d(e)),8&a)return e;if(4&a&&"object"==typeof e&&e&&e.__esModule)return e;var
t=Object.create(null);if(d.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:e}),2&a&&"string"!=typeof
e)for(var c in e)d.d(t,c,function(a){return e[a]}.bind(null,c));return
t},d.n=function(e){var a=e&&e.__esModule?function(){return
e.default}:function(){return e};return
d.d(a,"a",a),a},d.o=function(e,a){return
Object.prototype.hasOwnProperty.call(e,a)},d.p="/",d.oe=function(e){throw
console.error(e),e};var
o=this["webpackJsonptmx-money-client"]=this["webpackJsonptmx-money-client"]||[],i=o.push.bind(o);o.push=a,o=o.slice();for(var
p=0;p<o.length;p++)a(o[p]);var s=i;t()}([])
Is there a security measure in place that blocks web scraping, or am I writing the code the wrong way?

The data is actually loaded from API calls that return JSON. If you disable JavaScript in the browser, you will see that the page goes blank, which means the content is rendered dynamically. That's why we can't get the data this way. Here is a working example:
Code:
import requests
import pandas as pd
import json
# Column values for the one-row DataFrame printed at the end.
op = []
so = []
# GraphQL payload: the operation name, its variables, and the query text.
body = {
    "operationName": "getQuoteBySymbol",
    "variables": {"symbol": "EIT.UN", "locale": "en"},
    "query": "query getQuoteBySymbol($symbol: String, $locale: String) {\n getQuoteBySymbol(symbol: $symbol, locale: $locale) {\n symbol\n name\n price\n priceChange\n percentChange\n exchangeName\n exShortName\n exchangeCode\n marketPlace\n sector\n industry\n volume\n openPrice\n dayHigh\n dayLow\n MarketCap\n MarketCapAllClasses\n peRatio\n prevClose\n dividendFrequency\n dividendYield\n dividendAmount\n dividendCurrency\n beta\n eps\n exDividendDate\n shortDescription\n longDescription\n website\n email\n phoneNumber\n fullAddress\n employees\n shareOutStanding\n totalDebtToEquity\n totalSharesOutStanding\n sharesESCROW\n vwap\n dividendPayDate\n weeks52high\n weeks52low\n alpha\n averageVolume10D\n averageVolume30D\n averageVolume50D\n priceToBook\n priceToCashFlow\n returnOnEquity\n returnOnAssets\n day21MovingAvg\n day50MovingAvg\n day200MovingAvg\n dividend3Years\n dividend5Years\n datatype\n issueType\n __typename\n }\n}\n",
}
headers = {
    # "content-length" is intentionally not set: requests computes it from the
    # encoded body, so a hard-coded value is stale as soon as the payload changes.
    # "accept-encoding" is intentionally not set either: requests advertises the
    # encodings it is able to decode, whereas a hard-coded "br" asks for Brotli
    # even when no Brotli decoder is installed.
    "accept-language": "en-US,en;q=0.9,bn;q=0.8,es;q=0.7,ar;q=0.6",
    "cache-control": "no-cache",
    "content-type": "application/json",
    "locale": "en",
    "origin": "https://money.tmx.com",
    "pragma": "no-cache",
    "referer": "https://money.tmx.com/",
    "sec-ch-ua-mobile": "?0",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
}
url = "https://app-money.tmx.com/graphql"
# `json=` lets requests serialise the payload itself.
r = requests.post(url, json=body, headers=headers)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError further down.
r.raise_for_status()
resp = r.json()['data']['getQuoteBySymbol']
op.append(resp['openPrice'])
so.append(resp['totalSharesOutStanding'])
df = pd.DataFrame({'Open_price': op, 'Listed Shares Outstanding': so})
print(df)
Output:
Open_price Listed Shares Outstanding
12.45 135671000

Related

Python Scrapy Response 200 but Javascript content not loading

I can send a GET request to the main page (in start_urls) with the correct header, my __RequestVerificationToken is working, and I can see the response content of the main page. But I do not see any content data of the subpages: the response is 200, but the content is just the plain HTML page source and does not contain the dynamically loaded content. It seems that the subpage is detecting Scrapy as a bot.
My Code:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import time
class eva_db_spider(scrapy.Spider):
    """Spider that walks the public project list of the DB supplier portal.

    Flow: load the portal page, read the ``auth_token`` embedded in its first
    ``<script>`` element, request the JSON project list with that token, then
    request the details page of every listed project.
    """

    name = 'eva'
    custom_settings = {
        #'DEPTH_PRIORITY': 0,
        #'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        #'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': True
    }
    start_urls = ['https://bieterportal.noncd.db.de/evergabe.bieter/eva/supplierportal/portal/tabs/vergaben']

    def parse(self, response):
        """Extract the verification token and request the JSON project list.

        Yields nothing when the token cannot be found, because every later
        request sends it in the ``__RequestVerificationToken`` header.
        """
        mainpage_100items = 'https://bieterportal.noncd.db.de/evergabe.bieter/api/supplier/project/publicProjects?cultureName=de-DE&pageIndex=0&pageSize=100&sortExpression=publicationDate%20desc'
        # The first <script> in <head> declares the token as
        #     var auth_token = '<token>';
        # `.get()` returns None when the element is missing, hence the `or ''`.
        script = response.xpath('/html/head/script[1]').get() or ''
        # Cut the value out by its delimiters rather than by fixed offsets,
        # so a token of a different length is still extracted correctly.
        token = script.partition("auth_token = '")[2].partition("'")[0]
        if not token:
            print('Token not found')
            # Stop here instead of sending requests with an invalid token.
            return
        # Stored on the instance so the later callbacks can reuse the headers.
        self.headers = {
            'Host': 'bieterportal.noncd.db.de',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
            'Referer': 'https://bieterportal.noncd.db.de/evergabe.bieter/eva/supplierportal/portal/tabs/vergaben',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            '__RequestVerificationToken': token
        }
        yield scrapy.Request(mainpage_100items, callback=self.parse_identifier_for_subpage, headers=self.headers)

    def parse_identifier_for_subpage(self, response):
        """Request the details page of every project in the JSON list."""
        data = json.loads(response.text)
        for item in data['projects']:
            identifier_in_url = item['identifier']
            subpage = f'https://bieterportal.noncd.db.de/evergabe.bieter/eva/supplierportal/portal/subproject/{identifier_in_url}/details'
            # No time.sleep() here: sleeping inside a callback stalls the whole
            # crawler, and request pacing is already set by DOWNLOAD_DELAY.
            yield scrapy.Request(subpage, callback=self.parse_subpage, headers=self.headers)

    def parse_subpage(self, response):
        """Print the raw body of a details page and wait for ENTER."""
        raw_data = response.text  # Output plain page source, no content
        print('+++ Subpage Content Start +++')
        print(raw_data)
        # Debugging pause between subpages; the entered text is not used.
        input('+++ Subpage Content End +++ \nPress ENTER to see the next subpage...')
# Run the spider from this script: create a crawler process, register the
# spider class, and start it.
# The settings request a CSV feed written to output.csv.
# NOTE(review): FEED_URI / FEED_FORMAT look like the legacy feed-export
# settings; newer Scrapy releases configure this through FEEDS — confirm
# against the installed version.
process = CrawlerProcess(settings = {
'FEED_URI': 'output.csv',
'FEED_FORMAT': 'csv'
})
process.crawl(eva_db_spider)
process.start()
Output:
b'<!DOCTYPE html><html lang="de-DE"><head><base href="/evergabe.bieter/ClientUI.2/dist/"/><meta charset="utf-8"><meta http-equiv="x-ua-compatible" content="ie=edge"><meta name="viewport" content="width=device-width,initial-scale=1"><title>eVa 4.9 Healy Hudson - Supplier</title><meta name="description" content="eVa 4.9 Healy Hudson - Supplier"><script>var auth_token = \'<A...token...will...be...visible...here...as...soon...as...you...visit...the...page>\';\r\n var baseAppPath = \'/evergabe.bieter\';\r\n var copyright = \'© 2021 Healy Hudson GmbH\';\r\n var version = \'4.9.21.120\';</script><!-- Configured Head Tags --><link rel="icon" type="image/x-icon" sizes="32x32" href="assets/icon/favicon.ico"><link rel="shortcut icon" type="image/x-icon" href="assets/icon/favicon.ico"><meta name="theme-color" content="#00bcd4"><!-- CSS will be injected by webpack here --><!-- Preload link tags will be injected by webpack here --><link href="vendor-main-b56c880e29d6fe6f0b60.css" rel="stylesheet"/><link href="main-73909145cca1a58643f2.css" rel="stylesheet"/><link rel="preload" href="vendor-main-polyfills.4be5abd490d0ad562c3e.chunk.js" as="script"/><link rel="preload" href="vendor-main.b56c880e29d6fe6f0b60.chunk.js" as="script"/><link rel="preload" href="vendor-polyfills.ee836bc0219a916f40d3.chunk.js" as="script"/><link rel="preload" href="polyfills.86ff653bff363d3ebeee.chunk.js" as="script"/><link rel="preload" href="main.73909145cca1a58643f2.chunk.js" as="script"/></head><body><app><!-- loading spinner layout replaced by app after startup --><div class="app-loading"><svg class="spinner" viewBox="25 25 50 50"><circle class="path" cx="50" cy="50" r="20" fill="none" stroke-width="2" stroke-miterlimit="10"></circle></svg></div></app><!-- Scripts will be injected by webpack here --><script type="text/javascript" src="runtime.cc477948046220dba513.bundle.js" async></script><script type="text/javascript" src="vendor-main-polyfills.4be5abd490d0ad562c3e.chunk.js"></script><script 
type="text/javascript" src="vendor-main.b56c880e29d6fe6f0b60.chunk.js"></script><script type="text/javascript" src="vendor-polyfills.ee836bc0219a916f40d3.chunk.js"></script><script type="text/javascript" src="polyfills.86ff653bff363d3ebeee.chunk.js"></script><script type="text/javascript" src="main.73909145cca1a58643f2.chunk.js" async></script></body></html>'
Any idea what could be the issue?

Scrapy does not include a JavaScript engine. If you need to access dynamically loaded content, the Scrapy documentation offers some tips. I would recommend using scrapy-splash, but if performance is a priority I would try to replicate the request that loads the dynamic
content.
https://docs.scrapy.org/en/latest/topics/dynamic-content.html
https://docs.scrapy.org/en/latest/topics/dynamic-content.html

Error when requesting page with requests.get python

I am trying to get the HTML of the Supreme main page in order to parse it.
Here is what I am trying:
from bs4 import BeautifulSoup
# Fetch the index page with browser-like headers.
index_response = requests.get(
    'https://www.supremenewyork.com/index',
    headers={
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    },
)
# Keep the decoded body text, then parse and print it.
all_page = index_response.text
all_page_html = BeautifulSoup(all_page, 'html.parser')
print(all_page_html)
But instead of html i get this response:
<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/><title>Supreme</title><meta content="Supreme. The official website of Supreme. EST 1994. NYC." name="description"/><meta content="telephone=no" name="format-detection"/><meta content="on" http-equiv="cleartype"/><meta content="notranslate" name="google"/><meta content="app-id=664573705" name="apple-itunes-app"/><link href="//www.google-analytics.com" rel="dns-prefetch"/><link href="//ssl.google-analytics.com" rel="dns-prefetch"/><link href="//d2flb1n945r21v.cloudfront.net" rel="dns-prefetch"/><script src="https://www.google.com/recaptcha/api.js">async defer</script><meta content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1, user-scalable=no" id="viewport" name="viewport"/><link href="//d17ol771963kd3.cloudfront.net/assets/application-2000eb9ad53eb6df5a7d0fd8c85c0c03.css" media="all" rel="stylesheet"/><script \
e.t.c
Is this a kind of a block or maybe i am missing something? I even added requested headers but still i get this type of response instead of a normal one.

Well, that's actually how the page is. The response says that it's an HTML page with some CSS and JavaScript running. You should use "Inspect Element" to search for the elements you want to grab, and maybe write down the class they are stored in so you can find them more easily.

How to Bypass Google Recaptcha while scraping with Requests

Python code to request the URL:
# Browser-like User-Agent, used to work around the blocking issue.
agent = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
}
# Make the request to the link.
response = requests.get(
    'https://www.naukri.com/jobs-in-andhra-pradesh',
    headers=agent,
)
Output when printing the html :
<!DOCTYPE html>
<html>
<head>
<title>Naukri reCAPTCHA</title> #the title in the actual title of the URL that I am requested for
<meta name="robots" content="noindex, nofollow">
<link rel="stylesheet" href="https://static.naukimg.com/s/4/101/c/common_v62.min.css" />
<script src="https://www.google.com/recaptcha/api.js" async defer></script>
</head>
</html>

Using Google Cache along with a referer (in the header) will help you bypass the captcha.
Things to note:
Don't send more than 2 requests/sec. You may get blocked.
The result you receive is a cached copy, so this will not be effective if you are trying to scrape real-time data.
Example:
# Browser-like User-Agent plus a Google referer.
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "referer": "https://www.google.com/",
}
# Request the Google cache copy of the page instead of the page itself.
r = requests.get(
    "http://webcache.googleusercontent.com/search?q=cache:www.naukri.com/jobs-in-andhra-pradesh",
    headers=header,
)
This gives:
>>> r.content
[Squeezed 2554 lines]

Webscraping with http shows "Web page blocked"

I am trying to scrape http website using proxies and when I am trying to extract text, it shows as "Web page Blocked". How could I avoid this error?
My code is as follows
# Page to fetch.
url = "http://campanulaceae.myspecies.info/"
# The same proxy endpoint is used for both http and https traffic.
proxy_dict = dict(
    http="174.138.54.49:8080",
    https="174.138.54.49:8080",
)
# Fetch through the proxy, parse the body text and print the parsed document.
page = requests.get(url, proxies=proxy_dict)
soup = BeautifulSoup(page.text, 'html.parser')
print(soup)
I get below output when I am trying to output text from the website.
<html>
<head>
<title>Web Page Blocked</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="NO-CACHE" http-equiv="PRAGMA"/>
<meta content="initial-scale=1.0" name="viewport"/>
........
<body bgcolor="#e7e8e9">
<div id="content">
<h1>Web Page Blocked</h1>
<p>Access to the web page you were trying to visit has been blocked in accordance with company policy. Please contact your system administrator if you believe this is in error.</p>

Because you did not specify a user-agent for the request headers.
Quite often, sites block requests that come from robot-like sources.
Try it like this:
# Identify the client as a regular desktop browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'
    ),
}
# Same request as before, now sent with the user-agent header.
page = requests.get(url, proxies=proxy_dict, headers=headers)

Dryscrape visit works only once in python

I want to visit a page in a loop.
Code is:
import dryscrape
# Start the virtual X server and open one session that is reused below.
dryscrape.start_xvfb()
sess = dryscrape.Session()
url = 'http://192.168.1.5'
loop = 1
while loop < 100000:
    # The session is configured on every pass because it is reset at the end.
    sess.set_header(
        'user-agent',
        'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    )
    sess.set_attribute('auto_load_images', False)
    sess.set_timeout(30)
    # Load the page and print its body.
    sess.visit(url)
    response = sess.body()
    print(response)
    print('loop:', loop)
    sess.reset()
    loop += 1
According to the output, the page is visited only once, and I don't understand why. In iterations 2, 3, ... there is no output:
('loop:', 1)
<!DOCTYPE html><html><head>
<meta charset="utf-8">
<title>Javascript scraping test</title>
</head>
<body>
<p id="intro-text">Yay! Supports javascript</p>
<script>
document.getElementById('intro-text').innerHTML = 'Yay! Supports javascript';
</script>
</body></html>
('loop:', 2)
('loop:', 3)
('loop:', 4)
('loop:', 5)
('loop:', 6)
('loop:', 7)
Can you help me? Thank you.

I had the same problem and solved it by wrapping the code in a function (def). Try this:
def fb(user, pwd):
    """Open a fresh dryscrape session and submit the login form on fb.com.

    user: value typed into the field whose name attribute is "email".
    pwd: value typed into the field whose name attribute is "pass".
    """
    import dryscrape as d
    d.start_xvfb()
    # A new session is created on every call.
    Br = d.Session()
    Br.visit('http://fb.com')
    # XPath selects an attribute with '@'; '#name' is not valid XPath.
    Br.at_xpath('//*[@name = "email"]').set(user)
    Br.at_xpath('//*[@name = "pass"]').set(pwd)
    Br.at_xpath('//*[@name = "login"]').click()
    #......Now Do Something you want.....#
After defining the function, call it like this:
# Example call with placeholder credentials.
fb('my@account.com', 'password')
This way you can log in automatically and run this command 100 times without error.
Please read and answer my question: "Same name links cant click python dryscrape".

After updating dryscrape and its dependencies to the latest version, it works fine now.
The versions are:
dryscrape-1.0, lxml-4.1.1, webkit-server-1.0, xvfbwrapper-0.2.9
The code:
import dryscrape
# Start the virtual X server and open one session that is reused below.
dryscrape.start_xvfb()
sess = dryscrape.Session()
url = 'http://192.168.1.5/jsSupport.html'
loop = 1
while loop < 100000:
    # The session is configured on every pass because it is reset at the end.
    sess.set_header(
        'user-agent',
        'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    )
    sess.set_attribute('auto_load_images', False)
    sess.set_timeout(30)
    # Load the page and print its body.
    sess.visit(url)
    response = sess.body()
    print(response)
    print('loop:', loop)
    sess.reset()
    loop += 1
Output:
'loop:' 1
<!DOCTYPE html><html><head>
<meta charset="utf-8">
<title>Javascript scraping test</title>
</head>
<body>
<p id="intro-text">Yay! Supports javascript</p>
<script>
document.getElementById('intro-text').innerHTML = 'Yay! Supports javascript';
</script>
</body></html>
'loop:' 2
<!DOCTYPE html><html><head>
<meta charset="utf-8">
<title>Javascript scraping test</title>
</head>
<body>
<p id="intro-text">Yay! Supports javascript</p>
<script>
document.getElementById('intro-text').innerHTML = 'Yay! Supports javascript';
</script>
</body></html>
'loop:' 3
<!DOCTYPE html><html><head>
<meta charset="utf-8">
<title>Javascript scraping test</title>
</head>
<body>
<p id="intro-text">Yay! Supports javascript</p>
<script>
document.getElementById('intro-text').innerHTML = 'Yay! Supports javascript';
</script>
</body></html>
If you can't update the modules, or don't want to, a quick fix is to visit another page at the end of the loop.
import dryscrape
# Start the virtual X server and open one session that is reused below.
dryscrape.start_xvfb()
sess = dryscrape.Session()
url = 'http://192.168.1.5/jsSupport.html'
otherurl = "http://192.168.1.5/test"
loop = 1
while loop < 100000:
    # The session is configured on every pass because it is reset further down.
    sess.set_header(
        'user-agent',
        'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    )
    sess.set_attribute('auto_load_images', False)
    sess.set_timeout(30)
    # Load the page and print its body.
    sess.visit(url)
    response = sess.body()
    print(response)
    print('loop:', loop)
    sess.reset()
    loop += 1
    # Visits the other url, so that when sess.visit(url) is called,
    # it is forced to visit the page again.
    sess.visit(otherurl)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

webscraping returning backend data only [closed] - python

Related

Python Scrapy Response 200 but Javascript content not loading

Error when requesting page with requests.get python

How to Bypass Google Recaptcha while scraping with Requests

Webscraping with http shows "Web page blocked"

Dryscrape visit works only once in python

Categories

Resources