Using scrapy-splash clicking a button - python

I am trying to use Scrapy-splash to click a button on a page that I'm being redirected to.
I have tested manually clicking on the page, and I am redirected to the correct page after I have clicked the button that gives my consent. I have written a small script to click the button when I am redirected to the page, but this is not working.
I have included a snippet of my spider below - am I missing something in my code?:
from sys import path
import os

dir_path = os.path.dirname(os.path.realpath(__file__))
path.append(dir_path)

import scrapy
from scrapy_splash import SplashRequest

script = """
function main(splash)
    splash:wait(1)
    splash:runjs('document.querySelector("form.consent-form").submit()')
    splash:wait(1)
    return {
        html = splash:html(),
    }
end
"""
class FoobarSpider(scrapy.Spider):
    name = "foobar"

    def start_requests(self):
        urls = ['https://uk.finance.yahoo.com/quote/ANTO.L?p=ANTO.L']
        for url in urls:
            yield SplashRequest(url=url, callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 3},
                                meta={'yahoo_url': url}
                                )

    def parse(self, response):
        url = response.url
        with open('temp.html', 'wb') as f:
            f.write(response.body)
        if 'https://guce.' in url:
            print('About to attempt to authenticate ...')
            yield SplashRequest(
                url,
                callback=self.get_price,
                endpoint='execute',
                args={'lua_source': script, 'timeout': 5},
                meta=response.meta
            )
        else:
            self.get_price(response)

    def get_price(self, response):
        print("Get price called!")
        yahoo_price = None
        try:
            # Get Price ...
            temp1 = response.css('div.D\(ib\).Mend\(20px\)')
            if temp1 and len(temp1) > 1:
                temp2 = temp1[1].css('span')
                if len(temp2) > 0:
                    yahoo_price = temp2[0].xpath('.//text()').extract_first().replace(',', '')
            if not yahoo_price:
                val = response.css('span.Trsdu\(0\.3s\).Trsdu\(0\.3s\).Fw\(b\).Fz\(36px\).Mb\(-4px\).D\(b\)').xpath('.//text()').extract_first().replace(',', '')
                yahoo_price = val
        except Exception as err:
            pass
        print("Price is: {0}".format(yahoo_price))

    def handle_error(self, failure):
        pass
How do I fix this so that I can correctly give consent and be directed to the page I want?

Rather than clicking the button, try submitting the form:
document.querySelector("form.consent-form").submit()
I tried running the JavaScript command document.querySelector("input.btn.btn-primary.agree").click() in my console and got an "Oops, Something went Wrong" error message, but the page loads when using the code above to submit the form.
Because I'm not in Europe I can't fully recreate your setup, but I believe that should get you past the issue. My guess is that a script on the page is interfering with the .click() method.
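For reference, here's a minimal sketch of how the form-submit approach could be wired into the execute request, based on the script in the question (the form.consent-form selector is an assumption about the consent page's markup, and the callback names are those from the question's spider):

from scrapy_splash import SplashRequest

# Sketch only: submit the consent form instead of clicking the button.
CONSENT_SCRIPT = """
function main(splash)
    splash:wait(1)
    -- assumes the interstitial contains <form class="consent-form">
    splash:runjs('document.querySelector("form.consent-form").submit()')
    splash:wait(2)
    return {html = splash:html(), url = splash:url()}
end
"""

def parse(self, response):
    # called on the consent interstitial; re-render it through the execute endpoint
    yield SplashRequest(
        response.url,
        callback=self.get_price,
        endpoint='execute',
        args={'lua_source': CONSENT_SCRIPT, 'timeout': 30},
        meta=response.meta,
    )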

Related

KeyError: 'No input element with the name None', Scrapy, WikiHow

Below is my source code; I am getting a KeyError: 'No input element with the name None' error.
import re
import json

import scrapy
from loginform import fill_login_form


class wikihowSpider(scrapy.Spider):
    name = "wikihow"
    start_urls = ['http://www.wikihow.com/Category:Arts-and-Entertainment']
    login_url = 'https://www.wikihow.com/Main-Page#wh-dialog-login'

    def start_requests(self):
        yield scrapy.Request(self.login_url, self.parse_login)

    def parse_login(self, response):
        print('Here')
        data, url, method = fill_login_form(response.url, response.body,
                                            'username', 'password')
        return scrapy.FormRequest(url, formdata=dict(data),
                                  method=method, callback=self.parse_main)

    def parse_main(self, response):
        # crawl
        pass
My use-case is to log in and then crawl, given a list of starting URLs.
I also tried using versions of the examples mentioned here but kept getting errors like Ignoring response <404 https://www.wikihow.com/wikiHowTo?search=&wpName=username&wpPassword=password&wpRemember=1&wploginattempt=Log+in>: HTTP status code is not handled or not allowed. Any help would be appreciated!
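If it helps, the field names visible in that 404 URL (wpName, wpPassword, wpRemember) look like a MediaWiki-style login form, so one alternative to loginform would be Scrapy's built-in FormRequest.from_response. A rough, untested sketch along those lines; the login URL and the assumption that from_response picks up the right form are both guesses:

import scrapy


class WikihowLoginSpider(scrapy.Spider):
    name = "wikihow_login"
    # Assumed login page; adjust if wikiHow serves the login form elsewhere
    login_url = "https://www.wikihow.com/Special:UserLogin"

    def start_requests(self):
        yield scrapy.Request(self.login_url, callback=self.parse_login)

    def parse_login(self, response):
        # Let Scrapy copy the form's hidden fields (tokens, wpRemember, ...) and
        # only override the credential fields seen in the error URL above.
        return scrapy.FormRequest.from_response(
            response,
            formdata={"wpName": "username", "wpPassword": "password"},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info("Logged in, landed on %s", response.url)
        # continue crawling the category pages from here
        yield scrapy.Request("http://www.wikihow.com/Category:Arts-and-Entertainment",
                             callback=self.parse_main)

    def parse_main(self, response):
        pass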

Scrapy Splash HTTP status code is not handled or not allowed

I'm using scrapy-splash to scrape a car dealership website that uses javascript to load the results but I keep getting error 504 Gateway Time-out.
I'm running Docker on Windows 10, and I don't think the problem is the Docker configuration because I can scrape another site with the same code.
import scrapy
from scrapy_splash import SplashRequest
from scrapy.loader import ItemLoader
from ..items import AutoItem


class Main_Spider(scrapy.Spider):
    name = 'dealers'
    allowed_domains = ['audidowntowntoronto.com']

    script = '''
    function main(splash)
        local scroll_delay = 3
        local is_down = splash:jsfunc("function() { return((window.innerHeight + window.scrollY) >= document.body.offsetHeight);}")
        local scroll_to = splash:jsfunc("window.scrollTo")
        local get_body_height = splash:jsfunc("function() {return document.body.scrollHeight;}")
        assert(splash:go(splash.args.url))
        while not is_down() do
            scroll_to(0, get_body_height())
            splash:wait(scroll_delay)
        end
        return splash:html()
    end
    '''

    def start_requests(self):
        yield SplashRequest(url="http://audidowntowntoronto.com/all/", callback=self.parse, endpoint="execute", args={'lua_source': self.script})

    def parse(self, response):
        url = response.xpath('//*[@itemprop="url"]/@href').extract()
        print(url)
You don't actually need a Lua script here. Use the render.json endpoint with the additional arguments shown below.
yield SplashRequest(
    url="http://audidowntowntoronto.com/all/",
    callback=self.parse,
    args={
        'html': 1,
        'wait': 5,
        'render_all': 1
    },
    endpoint='render.json')
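A note on consuming that response: with the render.json endpoint, scrapy-splash returns a SplashJsonResponse whose decoded JSON is exposed as response.data, so the rendered HTML can be pulled out of it and queried as usual. A minimal sketch of the parse callback under that assumption (the XPath mirrors the one in the question):

from scrapy import Selector

def parse(self, response):
    # response.data is the decoded render.json payload (html, url, geometry, ...)
    html = response.data.get('html', '')
    for href in Selector(text=html).xpath('//*[@itemprop="url"]/@href').getall():
        yield {'url': href}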

Having problems with a scrapy-splash script. I only get one result and my scraper does not parse other pages

I am trying to parse a list from a javascript website. When I run it, it only gives me back one entry on each column and then the spider shuts down. I have already set up my middleware settings. I am not sure what is going wrong. Thanks in advance!
import scrapy
from scrapy_splash import SplashRequest


class MalrusSpider(scrapy.Spider):
    name = 'malrus'
    allowed_domains = ['backgroundscreeninginrussia.com']
    start_urls = ['http://www.backgroundscreeninginrussia.com/publications/new-citizens-of-malta-since-january-2015-till-december-2017/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html')

    def parse(self, response):
        russians = response.xpath('//table[@id="tablepress-8"]')
        for russian in russians:
            yield {'name': russian.xpath('//*[@class="column-1"]/text()').extract_first(),
                   'source': russian.xpath('//*[@class="column-2"]/text()').extract_first()}

        script = """function main(splash)
            assert(splash:go(splash.args.url))
            splash:wait(0.3)
            button = splash:select("a[class=paginate_button next] a")
            splash:set_viewport_full()
            splash:wait(0.1)
            button:mouse_click()
            splash:wait(1)
            return {url = splash:url(),
                    html = splash:html()}
        end"""

        yield SplashRequest(url=response.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': script})
The .extract_first() (now .get()) you used always returns only the first matching result. It's not an iterator, so there is no point in calling it repeatedly. Try the .getall() method instead. That would be something like:
names = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-1"]/text()').getall()
sources = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-2"]/text()').getall()
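Pairing the two lists back into per-row items could then look something like this (a minimal sketch reusing the table id and column classes from the question):

def parse(self, response):
    table = response.xpath('//table[@id="tablepress-8"]')
    names = table.xpath('.//*[@class="column-1"]/text()').getall()
    sources = table.xpath('.//*[@class="column-2"]/text()').getall()
    # zip the parallel column lists back into one item per table row
    for name, source in zip(names, sources):
        yield {'name': name, 'source': source}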

Scrapy Splash cannot get the data of a React site

I need to scrape this site.
It appears to be made in React. I tried to extract the data with scrapy-splash. For example, I need the "a" element with class shelf-product-name, but the response is an empty array, even with a wait argument of about 5 seconds.
def start_requests(self):
    yield SplashRequest(
        url='https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6',
        callback=self.parse,
        args={'wait': 5}
    )

def parse(self, response):
    print(response.css("a.shelf-product-name"))
Actually there is no need to use Scrapy Splash here, because all the required data is stored inside a <script> tag of the raw HTML response as JSON-formatted data:
import scrapy
from scrapy.crawler import CrawlerProcess
import json


class JumboCLSpider(scrapy.Spider):
    name = "JumboCl"
    start_urls = ["https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6"]

    def parse(self, response):
        script = [script for script in response.css("script::text") if "window.__renderData" in script.extract()]
        if script:
            script = script[0]
            data = script.extract().split("window.__renderData = ")[-1]
            json_data = json.loads(data[:-1])
            for plp in json_data["plp"]["plp_products"]:
                for product in plp["data"]:
                    # yield {"productName": product["productName"]}  # data from css: a.shelf-product-name
                    yield product


if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    c.crawl(JumboCLSpider)
    c.start()

How do I pass URLs from one method to another by changing the yield SplashRequest call? (Scrapy + Splash with Python & Docker)

########### Note: I posted this question before realizing that Scrapy + Splash is not the best approach here, because I checked the rendered pages (http://localhost:8050/render.html?url=xxx) and they do not contain the data I want. That's why I'm now trying another approach, such as Selenium. Thank you!!
What I want to know: how should I modify the line yield SplashRequest(url, callback=self.parse, args = {"wait": 5}, endpoint = "render.html")?
I have no idea how I should change the code in order to pass URLs from one method to another when using Scrapy + Splash.
In my opinion, the place that needs correcting is
yield SplashRequest(url, callback=self.parse, args = {"wait": 5}, endpoint = "render.html")
in the start_requests method (#1).
There are 3 reasons.
1st reason: the logging call (#2) just before "yield SplashRequest(url, callback=self.parse, args = {"wait": 5}, endpoint = "render.html")" gave me the correct feedback.
2nd reason: I set up Scrapy + Splash using the Scrapy + Splash README as a reference (Scrapy+Splash for JavaScript integration, https://github.com/scrapy-plugins/scrapy-splash).
3rd reason: the logging call (#3) gave me NO output.
Here is my code.
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from bnb_sentiment.items import BnbItem
import re
import logging

logging.basicConfig(level=logging.INFO)
# __name__ is the name of this module
logger = logging.getLogger(__name__)


class BnbPriceTestSpider(scrapy.Spider):
    name = 'bnb_price_test'
    start_urls = [
        # Tokyo--Japan
        'https://www.airbnb.com/s/Tokyo--Japan/homes?refinement_paths%5B%5D=%2Fhomes&allow_override%5B%5D=&checkin=2018-07-07&checkout=2018-07-08&locale=en&min_beds=0&price_max=20000&price_min=10000&query=Tokyo%2C%20Japan&place_id=ChIJ51cu8IcbXWARiRtXIothAS4&s_tag=Mz88jJs1',
    ]

    def start_requests(self):
        for url in self.start_urls:
            logger.info(url)  #2
            yield SplashRequest(url, callback=self.parse, args={"wait": 5}, endpoint="render.html")  #1

    def parse(self, response):
        for href in response.xpath('//div[contains(@id, "listing-")]//a[contains(@href, "rooms")]/@href'):
            import pdb; pdb.set_trace()
            logger.info(href)
            url = response.urljoin(href.extract())
            import pdb; pdb.set_trace()
            logger.info(url)  #3
            yield SplashRequest(url, callback=self.parse_scrape)

    def parse_scrape(self, response):
        pass
(#2) Here is the output from the logging call:
/home/ubuntu/bnbsp/bnb_sentiment/bnb_sentiment/spiders/bnb_price.py(34)start_requests()
-> logger.info(url) *1
(Pdb) url
'url is same as start_urls'
