I built this code to scrape key data from the TSX website:
import requests
from bs4 import BeautifulSoup

headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/92.0.4515.131 Safari/537.36'
}

url = "https://money.tmx.com/en/quote/EIT.UN"
req = requests.get(url, headers=headers)  # pass the dict as the headers keyword argument, not positionally
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
But the result doesn't contain any of the data shown on the page:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="#000000" name="theme-color"/>
<link href="/favicon.ico" rel="icon"/>
<link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link color="#5bbad5" href="/safari-pinned-tab.svg" rel="mask-icon"/>
<link href="/manifest.json" rel="manifest"/>
<meta content="#1e222b" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<title>
TMX Money
</title>
<link href="https://unpkg.com/normalize.css#8.0.1/normalize.css" rel="stylesheet"/>
<script async="" src="//tags-cdn.deployads.com/a/tmx.com.js">
</script>
<link as="font" crossorigin="" href="/fonts/DINPro_Regular.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="" href="/fonts/DINPro_Medium.woff2" rel="preload" type="font/woff2"/>
<meta content="oxLF6WU3pPwwIFLyZVG13lWRJMcqvfYNAX1IOvQiUPI" name="google-site-verification"/>
<script>
window.onpageshow=function(o){o.persisted&&window.location.reload()}
</script>
<script>
"undefined"!=typeof navigator&&(navigator.userAgent.includes("Prerender")||navigator.userAgent.includes("prerender"))&&(SC_DISABLE_SPEEDY=!0)
</script>
<link href="/static/css/45.0b0c574e.chunk.css" rel="stylesheet"/>
</head>
<body>
<noscript>
You need to enable JavaScript to run this app.
</noscript>
<div id="root">
</div>
<div id="modal-root">
</div>
<script type="text/javascript">
piAId="564512",piCId="20383",piHostname="pi.pardot.com",function(){function t(){var
t=document.createElement("script");t.type="text/javascript",t.src=("https:"==document.location.protocol?"https://pi":"http://cdn")+".pardot.com/pd.js";var
e=document.getElementsByTagName("script")[0];e.parentNode.insertBefore(t,e)}window.attachEvent?window.attachEvent("onload",t):window.addEventListener("load",t,!1)}()
!function(e){function a(a){for(var c,r,d=a[0],o=a[1],i=a[2],p=0,u=[];p<d.length;p++)r=d[p],Object.prototype.hasOwnProperty.call(n,r)&&n[r]&&u.push(n[r][0]),n[r]=0;for(c
in
o)Object.prototype.hasOwnProperty.call(o,c)&&(e[c]=o[c]);for(s&&s(a);u.length;)u.shift()();return
f.push.apply(f,i||[]),t()}function t(){for(var
e,a=0;a<f.length;a++){for(var t=f[a],c=!0,r=1;r<t.length;r++){var
o=t[r];0!==n[o]&&(c=!1)}c&&(f.splice(a--,1),e=d(d.s=t[0]))}return
e}var c={},r={44:0},n={44:0},f=[];function d(a){if(c[a])return
c[a].exports;var t=c[a]={i:a,l:!1,exports:{}};return
e[a].call(t.exports,t,t.exports,d),t.l=!0,t.exports}d.e=function(e){var
a=[];r[e]?a.push(r[e]):0!==r[e]&&{0:1,1:1,4:1,15:1,39:1,46:1}[e]&&a.push(r[e]=new
Promise((function(a,t){for(var
c="static/css/"+({18:"footer",19:"header",21:"page.admincompanycontent",22:"page.admindashboard",23:"page.admininfo",24:"page.advancedchart",25:"page.alerts",26:"page.authcallback",27:"page.devpanel",28:"page.etfcentre",29:"page.etfcomparison",30:"page.home",31:"page.marketssummary",32:"page.notfound",33:"page.quote",34:"page.search",35:"page.settings",36:"page.signin",37:"page.stocklist",38:"page.stocklists",39:"page.stockscreener",40:"page.terms",41:"page.tsx302021",42:"page.verifyemail",43:"page.watchlist"}[e]||e)+"."+{0:"c46f6ca1",1:"d488a845",2:"31d6cfe0",3:"31d6cfe0",4:"e69ef5e8",5:"31d6cfe0",6:"31d6cfe0",7:"31d6cfe0",8:"31d6cfe0",9:"31d6cfe0",10:"31d6cfe0",11:"31d6cfe0",12:"31d6cfe0",13:"31d6cfe0",14:"31d6cfe0",15:"d34d0f8b",16:"31d6cfe0",17:"31d6cfe0",18:"31d6cfe0",19:"31d6cfe0",21:"31d6cfe0",22:"31d6cfe0",23:"31d6cfe0",24:"31d6cfe0",25:"31d6cfe0",26:"31d6cfe0",27:"31d6cfe0",28:"31d6cfe0",29:"31d6cfe0",30:"31d6cfe0",31:"31d6cfe0",32:"31d6cfe0",33:"31d6cfe0",34:"31d6cfe0",35:"31d6cfe0",36:"31d6cfe0",37:"31d6cfe0",38:"31d6cfe0",39:"5cf87ee8",40:"31d6cfe0",41:"31d6cfe0",42:"31d6cfe0",43:"31d6cfe0",46:"94d2147f",47:"31d6cfe0",48:"31d6cfe0",49:"31d6cfe0",50:"31d6cfe0",51:"31d6cfe0",52:"31d6cfe0",53:"31d6cfe0",54:"31d6cfe0",55:"31d6cfe0",56:"31d6cfe0",57:"31d6cfe0",58:"31d6cfe0",59:"31d6cfe0",60:"31d6cfe0",61:"31d6cfe0",62:"31d6cfe0",63:"31d6cfe0",64:"31d6cfe0",65:"31d6cfe0",66:"31d6cfe0",67:"31d6cfe0"}[e]+".chunk.css",n=d.p+c,f=document.getElementsByTagName("link"),o=0;o<f.length;o++){var i=(s=f[o]).getAttribute("data-href")||s.getAttribute("href");if("stylesheet"===s.rel&&(i===c||i===n))return
a()}var
p=document.getElementsByTagName("style");for(o=0;o<p.length;o++){var
s;if((i=(s=p[o]).getAttribute("data-href"))===c||i===n)return a()}var
u=document.createElement("link");u.rel="stylesheet",u.type="text/css",u.onload=a,u.onerror=function(a){var
c=a&&a.target&&a.target.src||n,f=new Error("Loading CSS chunk "+e+"
failed.\n("+c+")");f.code="CSS_CHUNK_LOAD_FAILED",f.request=c,delete
r[e],u.parentNode.removeChild(u),t(f)},u.href=n,document.getElementsByTagName("head")[0].appendChild(u)})).then((function(){r[e]=0})));var
t=n[e];if(0!==t)if(t)a.push(t[2]);else{var c=new
Promise((function(a,c){t=n[e]=[a,c]}));a.push(t[2]=c);var
f,o=document.createElement("script");o.charset="utf-8",o.timeout=120,d.nc&&o.setAttribute("nonce",d.nc),o.src=function(e){return
d.p+"static/js/"+({18:"footer",19:"header",21:"page.admincompanycontent",22:"page.admindashboard",23:"page.admininfo",24:"page.advancedchart",25:"page.alerts",26:"page.authcallback",27:"page.devpanel",28:"page.etfcentre",29:"page.etfcomparison",30:"page.home",31:"page.marketssummary",32:"page.notfound",33:"page.quote",34:"page.search",35:"page.settings",36:"page.signin",37:"page.stocklist",38:"page.stocklists",39:"page.stockscreener",40:"page.terms",41:"page.tsx302021",42:"page.verifyemail",43:"page.watchlist"}[e]||e)+"."+{0:"3bf886dc",1:"36a077ef",2:"304b93f2",3:"aad23197",4:"b347aa4d",5:"8d07c059",6:"1a942e20",7:"3eace955",8:"684593b3",9:"a59fae53",10:"c86ddf7c",11:"c15a76a9",12:"2ebc2b8d",13:"5a9662c3",14:"98c1b9e7",15:"71ab84f5",16:"ab482800",17:"a0a7a872",18:"c1361d6f",19:"695f2560",21:"30610631",22:"88cd3df4",23:"02e7e23f",24:"7ac96b36",25:"b46712a8",26:"defeb6a3",27:"ea288e40",28:"1f2df7fa",29:"31f26ed5",30:"c025e5e2",31:"d6d116eb",32:"c1a96e84",33:"e61043f0",34:"53152b1c",35:"98c1f6a3",36:"f03094c0",37:"2d009271",38:"cf9680f3",39:"7101ddac",40:"0b15da7e",41:"91471a80",42:"f1ae28a6",43:"1a5a65d3",46:"4d4b1467",47:"8543c258",48:"7e804703",49:"b62fad4b",50:"dce0e3cb",51:"0ec82fe9",52:"9329bc73",53:"4279abc7",54:"93ee9948",55:"fcaa0f53",56:"7e64f2a0",57:"198998df",58:"b836b3c6",59:"7ef7187c",60:"b84c7ab4",61:"9f4229fa",62:"926a402c",63:"bc502904",64:"944dd1ae",65:"cdf5fd44",66:"fe991ddf",67:"8a557aa7"}[e]+".chunk.js"}(e);var
i=new Error;f=function(a){o.onerror=o.onload=null,clearTimeout(p);var
t=n[e];if(0!==t){if(t){var
c=a&&("load"===a.type?"missing":a.type),r=a&&a.target&&a.target.src;i.message="Loading
chunk "+e+" failed.\n("+c+":
"+r+")",i.name="ChunkLoadError",i.type=c,i.request=r,t1}n[e]=void
0}};var
p=setTimeout((function(){f({type:"timeout",target:o})}),12e4);o.onerror=o.onload=f,document.head.appendChild(o)}return
Promise.all(a)},d.m=e,d.c=c,d.d=function(e,a,t){d.o(e,a)||Object.defineProperty(e,a,{enumerable:!0,get:t})},d.r=function(e){"undefined"!=typeof
Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.t=function(e,a){if(1&a&&(e=d(e)),8&a)return e;if(4&a&&"object"==typeof e&&e&&e.__esModule)return e;var
t=Object.create(null);if(d.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:e}),2&a&&"string"!=typeof
e)for(var c in e)d.d(t,c,function(a){return e[a]}.bind(null,c));return
t},d.n=function(e){var a=e&&e.__esModule?function(){return
e.default}:function(){return e};return
d.d(a,"a",a),a},d.o=function(e,a){return
Object.prototype.hasOwnProperty.call(e,a)},d.p="/",d.oe=function(e){throw
console.error(e),e};var
o=this["webpackJsonptmx-money-client"]=this["webpackJsonptmx-money-client"]||[],i=o.push.bind(o);o.push=a,o=o.slice();for(var
p=0;p<o.length;p++)a(o[p]);var s=i;t()}([])
Is there some security mechanism blocking the scraping, or am I writing the code the wrong way?
The data is actually generated client-side from an API call that returns JSON. If you disable JavaScript, you will see that the page goes blank, which means the content is rendered dynamically. That's why you can't get the data this way. Here is a working example that calls the underlying GraphQL API directly:
Code:
import requests
import pandas as pd
import json

op = []
so = []

# GraphQL query that the TMX Money quote page itself sends to fetch the key data
body = {"operationName":"getQuoteBySymbol","variables":{"symbol":"EIT.UN","locale":"en"},"query":"query getQuoteBySymbol($symbol: String, $locale: String) {\n getQuoteBySymbol(symbol: $symbol, locale: $locale) {\n symbol\n name\n price\n priceChange\n percentChange\n exchangeName\n exShortName\n exchangeCode\n marketPlace\n sector\n industry\n volume\n openPrice\n dayHigh\n dayLow\n MarketCap\n MarketCapAllClasses\n peRatio\n prevClose\n dividendFrequency\n dividendYield\n dividendAmount\n dividendCurrency\n beta\n eps\n exDividendDate\n shortDescription\n longDescription\n website\n email\n phoneNumber\n fullAddress\n employees\n shareOutStanding\n totalDebtToEquity\n totalSharesOutStanding\n sharesESCROW\n vwap\n dividendPayDate\n weeks52high\n weeks52low\n alpha\n averageVolume10D\n averageVolume30D\n averageVolume50D\n priceToBook\n priceToCashFlow\n returnOnEquity\n returnOnAssets\n day21MovingAvg\n day50MovingAvg\n day200MovingAvg\n dividend3Years\n dividend5Years\n datatype\n issueType\n __typename\n }\n}\n"}

# note: no hard-coded content-length header; requests computes it from the body automatically
headers = {
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,bn;q=0.8,es;q=0.7,ar;q=0.6",
    "cache-control": "no-cache",
    "content-type": "application/json",
    "locale": "en",
    "origin": "https://money.tmx.com",
    "pragma": "no-cache",
    "referer": "https://money.tmx.com/",
    "sec-ch-ua-mobile": "?0",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
}

url = "https://app-money.tmx.com/graphql"
r = requests.post(url, data=json.dumps(body), headers=headers)

# the quote fields live under data.getQuoteBySymbol in the JSON response
resp = r.json()['data']['getQuoteBySymbol']
op.append(resp['openPrice'])
so.append(resp['totalSharesOutStanding'])

df = pd.DataFrame({'Open_price': op, 'Listed Shares Outstanding': so})
print(df)
Output:
Open_price Listed Shares Outstanding
12.45 135671000
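If you need the same key data for more than one ticker, the same request can simply be reused in a loop. A minimal sketch, assuming the body, headers and url defined above (the extra ticker symbols are just placeholders):

# sketch: reuse the same GraphQL request for several symbols
rows = []
for symbol in ["EIT.UN", "BCE", "ENB"]:   # example tickers only
    body["variables"]["symbol"] = symbol
    r = requests.post(url, data=json.dumps(body), headers=headers)
    quote = r.json()['data']['getQuoteBySymbol']
    rows.append({
        'Symbol': symbol,
        'Open_price': quote['openPrice'],
        'Listed Shares Outstanding': quote['totalSharesOutStanding']
    })

df = pd.DataFrame(rows)
print(df)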
I am trying to get the table from this site: https://www.burgrieden.de/index.php?id=77
I managed to scrape the first page, but I cannot access the other four pages.
The only explanations and examples I can find rely on a direct link to the next page or on simple URL manipulation.
I have tried inspecting the button and watching the network logger while pressing it,
but nothing worked.
How can I get to the next page with Scrapy?
Here is what I have so far:
from abc import ABC
import scrapy
from scrapy.crawler import CrawlerProcess
import re
from datetime import datetime


class TrashSpider(scrapy.Spider, ABC):
    name = "Trasher"
    start_urls = ['https://www.burgrieden.de/index.php?id=77']

    def parse(self, response, **kwargs):
        for row in response.xpath('//*[@class="contenttable"]//tr')[1:]:
            d = row.xpath('td//text()')[0].extract()
            match = re.search(r'\d{2}\.\d{2}\.\d{4}', d)
            date = datetime.strptime(match.group(), '%d.%m.%Y').date()
            entry = {
                'date': date,
                'type': row.xpath('td//text()')[2].extract()
            }
            yield entry


process = CrawlerProcess()
process.crawl(TrashSpider)
process.start()
[Screenshot of the browser's network inspector]
Thanks in advance for your help.
For everyone who has the same question: I figured it out.
The button triggers a POST request. To reproduce that request with Scrapy, you have to define the request headers and the request form data.
Both can be found with your browser's network analyzer:
On the left side you can see the method used to request the site; the marked entry says POST.
Now we need the headers, which can be found in the bottom-right panel, and put them into a dictionary in our spider class.
Make sure to ignore the Content-Length header. Leave it out of your dictionary entirely, because Scrapy sends its own Content-Length, and when more than one is sent the site rejects the request with a 400 instead of a 200.
The form data can be found, at least in Firefox, under the Request tab:
Here we need the whole line exactly as it is, stored in a variable so it can be used as the body of the actual request.
This is how it is supposed to look:
# request headers for the POST request
headers = {
    'Host': 'www.burgrieden.de',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded',
    # 'Content-Length': '219',  # left out on purpose, Scrapy sets its own
    'Origin': 'https://www.burgrieden.de',
    'Connection': 'keep-alive',
    'Referer': 'https://www.burgrieden.de/index.php?id=77',
    'Cookie': 'style=normal.css',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

# URL-encoded POST form data for page 1 of the table
form_data_p1 = 'publish%5BbtnStart%5D=+%7C%3C+&publish%5Bstart%5D=40&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
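If you prefer not to paste the raw encoded string, the same payload can be built from a plain dictionary, for example with urllib.parse.urlencode. A small sketch (the field values are simply the decoded form of the string above):

# sketch: build the same form-data string from a dict instead of hand-copying the encoding
from urllib.parse import urlencode

form_fields_p1 = {
    'publish[btnStart]': ' |< ',
    'publish[start]': '40',
    'publish[dayFrom]': '20',
    'publish[monthFrom]': '03',
    'publish[yearFrom]': '2021',
    'publish[dayTo]': '',
    'publish[monthTo]': '',
    'publish[yearTo]': '',
    'publish[fulltext]': '',
    'id': '77',
}
form_data_p1 = urlencode(form_fields_p1)  # equivalent to the hard-coded string above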
To make sure it works, you have to disable Scrapy's cookie handling.
Just put this in your custom spider class:

# custom scraper settings
custom_settings = {
    # pass cookies along with the headers instead of through Scrapy's cookie middleware
    'COOKIES_ENABLED': False
}
For the actual request, you have to use the start_requests() method.

# crawler's entry point
def start_requests(self):
    # make HTTP POST request
    # page 1
    yield scrapy.Request(
        url=self.start_url,
        method='POST',
        headers=self.headers,
        body=self.form_data_p1,
        callback=self.parse
    )
Now you can parse the response with your normal parse() method.
If you run into any problems, try leaving the Host entry in your headers blank or deleting it.
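For reference, a trimmed-down header dictionary without Host (and without Content-Length) could look like this; this is only a sketch of that suggestion, not something the site is guaranteed to accept (the full class below keeps the complete header set):

# sketch: reduced header set, letting Scrapy fill in Host and Content-Length itself
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://www.burgrieden.de',
    'Referer': 'https://www.burgrieden.de/index.php?id=77',
    'Cookie': 'style=normal.css'
}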
Here is the whole class code:
class TrashSpider(scrapy.Spider, ABC):
    name = "Trasher"
    start_url = "https://www.burgrieden.de/index.php?id=77"

    # request headers for the POST request
    headers = {
        'Host': 'www.burgrieden.de',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        # 'Content-Length': '219',  # left out on purpose, Scrapy sets its own
        'Origin': 'https://www.burgrieden.de',
        'Connection': 'keep-alive',
        'Referer': 'https://www.burgrieden.de/index.php?id=77',
        'Cookie': 'style=normal.css',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }

    # URL-encoded POST form data for every table page
    form_data_p1 = 'publish%5BbtnStart%5D=+%7C%3C+&publish%5Bstart%5D=40&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p2 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=0&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p3 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=10&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p4 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=20&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'
    form_data_p5 = 'publish%5BbtnNext%5D=+%3E%3E+&publish%5Bstart%5D=30&publish%5BdayFrom%5D=20&publish%5BmonthFrom%5D=03&publish%5ByearFrom%5D=2021&publish%5BdayTo%5D=&publish%5BmonthTo%5D=&publish%5ByearTo%5D=&publish%5Bfulltext%5D=&id=77'

    # custom scraper settings
    custom_settings = {
        # pass cookies along with the headers instead of through Scrapy's cookie middleware
        'COOKIES_ENABLED': False
    }

    entrys_crawled = []

    # crawler's entry point
    def start_requests(self):
        # make HTTP POST requests to burgrieden.de
        # page 1
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p1,
            callback=self.parse
        )
        # page 2
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p2,
            callback=self.parse
        )
        # page 3
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p3,
            callback=self.parse
        )
        # page 4
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p4,
            callback=self.parse
        )
        # page 5
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=self.form_data_p5,
            callback=self.parse
        )

    # parse date and description from the table "contenttable":
    # extract the date from the awkwardly formatted text and store it in a dictionary entry as a date
    def parse(self, response, **kwargs):
        for row in response.xpath('//*[@class="contenttable"]//tr')[1:]:
            d = row.xpath('td//text()')[0].extract()
            match = re.search(r'\d{2}\.\d{2}\.\d{4}', d)
            entry = {
                'date': datetime.strptime(match.group(), '%d.%m.%Y').date(),
                'type': row.xpath('td//text()')[2].extract()
            }
            self.entrys_crawled.append(entry)
There is probably a better way to process more than one POST request, but it worked for me. If someone wants to improve it, please feel free to do so; one possible simplification is sketched below.
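For example, the five near-identical yields in start_requests() could be collapsed into a single loop over the form-data strings. A minimal sketch, assuming the same class attributes as above:

# sketch: generate the five page requests from a list instead of repeating the yield block
def start_requests(self):
    form_data_pages = [
        self.form_data_p1,
        self.form_data_p2,
        self.form_data_p3,
        self.form_data_p4,
        self.form_data_p5,
    ]
    for form_data in form_data_pages:
        yield scrapy.Request(
            url=self.start_url,
            method='POST',
            headers=self.headers,
            body=form_data,
            callback=self.parse
        )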
For everyone who is wondering about entrys_crawled: I processed it into an .ics file in Scrapy's close method; a rough sketch of that step is below.
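That part is not shown in the original post, but the idea could look roughly like this, using the spider's closed() hook and writing minimal VCALENDAR entries by hand (the file name and SUMMARY format are placeholder assumptions):

# sketch: write the crawled entries to a simple .ics calendar when the spider finishes
# (file name and SUMMARY text are placeholders, not from the original post)
def closed(self, reason):
    lines = ['BEGIN:VCALENDAR', 'VERSION:2.0', 'PRODID:-//TrashSpider//EN']
    for entry in self.entrys_crawled:
        lines += [
            'BEGIN:VEVENT',
            'DTSTART;VALUE=DATE:' + entry['date'].strftime('%Y%m%d'),
            'SUMMARY:' + entry['type'],
            'END:VEVENT',
        ]
    lines.append('END:VCALENDAR')
    with open('trash_dates.ics', 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))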