Scrapy Splash HTTP status code is not handled or not allowed - python

I'm using scrapy-splash to scrape a car dealership website that uses JavaScript to load the results, but I keep getting error 504 Gateway Time-out.
I'm running Docker on Windows 10, and I don't think the problem is the Docker configuration, because I can scrape another site with the same code.
import scrapy
from scrapy_splash import SplashRequest
from scrapy.loader import ItemLoader
from ..items import AutoItem

class Main_Spider(scrapy.Spider):
    name = 'dealers'
    allowed_domains = ['audidowntowntoronto.com']

    script = '''
    function main(splash)
        local scroll_delay = 3
        local is_down = splash:jsfunc("function() { return ((window.innerHeight + window.scrollY) >= document.body.offsetHeight); }")
        local scroll_to = splash:jsfunc("window.scrollTo")
        local get_body_height = splash:jsfunc("function() { return document.body.scrollHeight; }")
        assert(splash:go(splash.args.url))
        while not is_down() do
            scroll_to(0, get_body_height())
            splash:wait(scroll_delay)
        end
        return splash:html()
    end
    '''

    def start_requests(self):
        yield SplashRequest(url="http://audidowntowntoronto.com/all/", callback=self.parse, endpoint="execute", args={'lua_source': self.script})

    def parse(self, response):
        urls = response.xpath('//*[@itemprop="url"]/@href').extract()
        print(urls)

You don't need the Lua script here. Use the render.json endpoint with the additional arguments shown below:
yield SplashRequest(
    url="http://audidowntowntoronto.com/all/",
    callback=self.parse,
    args={
        'html': 1,
        'wait': 5,
        'render_all': 1,
    },
    endpoint='render.json')
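If the 504 persists even with render.json, it may simply be that the page needs more time than Splash's default 30-second budget. A minimal sketch of the same request with a longer per-request timeout (this assumes your Splash container was started with a high enough --max-timeout, e.g. docker run -p 8050:8050 scrapinghub/splash --max-timeout 300):
# Hedged sketch: give Splash more time before it returns 504 for a slow, JS-heavy page.
yield SplashRequest(
    url="http://audidowntowntoronto.com/all/",
    callback=self.parse,
    endpoint='render.json',
    args={
        'html': 1,
        'wait': 5,
        'render_all': 1,
        'timeout': 90,  # Splash HTTP API argument; must not exceed the server's --max-timeout
    })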

How to change scrapy closespider itemcount while parsing

I am new to Scrapy.
Is it possible to change CLOSESPIDER_ITEMCOUNT while the spider is running?
class TestSpider(scrapy.Spider):
    name = 'tester'
    custom_settings = {'CLOSESPIDER_ITEMCOUNT': 100}

    def start_requests(self):
        urls = ['https://google.com', 'https://amazon.com']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        if response.xpath('//*[@id="content"]') or True:  # only for testing
            # set CLOSESPIDER_ITEMCOUNT to 300
            # rest of code
I want to be able to change the value based on an if condition in the parse method.
You can access the crawler's settings object, unfreeze it, change the value, and then freeze it again. Note that since this is not documented, it may have unexpected effects.
class TestSpider(scrapy.Spider):
    name = 'tester'
    custom_settings = {'CLOSESPIDER_ITEMCOUNT': 100}

    def start_requests(self):
        urls = ['https://google.com', 'https://amazon.com']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        if response.xpath('//*[@id="content"]') or True:  # only for testing
            self.crawler.settings.frozen = False
            self.crawler.settings.set("CLOSESPIDER_ITEMCOUNT", 300)
            self.crawler.settings.frozen = True
            # add the rest of the code
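Be aware that the CloseSpider extension may read CLOSESPIDER_ITEMCOUNT only once at startup, so a mid-crawl change is not guaranteed to take effect. As a rough alternative sketch (not the documented way to change the setting), you can count items yourself and ask the engine to close the spider once your runtime-chosen limit is reached; item_limit and items_seen below are made-up names for illustration:
def parse(self, response):
    # Hypothetical limit chosen at runtime based on the page content.
    item_limit = 300 if response.xpath('//*[@id="content"]') else 100
    for row in response.xpath('//tr'):
        self.items_seen = getattr(self, 'items_seen', 0) + 1
        yield {'row': row.get()}
        if self.items_seen >= item_limit:
            # Ask Scrapy's engine to shut the spider down gracefully.
            self.crawler.engine.close_spider(self, 'item limit reached')
            return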

Having problems with a scrapy-splash script. I only get one result and my scraper does not parse other pages

I am trying to parse a list from a JavaScript-rendered website. When I run the spider, it only gives me back one entry in each column and then shuts down. I have already set up my middleware settings. I am not sure what is going wrong. Thanks in advance!
import scrapy
from scrapy_splash import SplashRequest

class MalrusSpider(scrapy.Spider):
    name = 'malrus'
    allowed_domains = ['backgroundscreeninginrussia.com']
    start_urls = ['http://www.backgroundscreeninginrussia.com/publications/new-citizens-of-malta-since-january-2015-till-december-2017/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html')

    def parse(self, response):
        russians = response.xpath('//table[@id="tablepress-8"]')
        for russian in russians:
            yield {'name': russian.xpath('//*[@class="column-1"]/text()').extract_first(),
                   'source': russian.xpath('//*[@class="column-2"]/text()').extract_first()}

        script = """function main(splash)
            assert(splash:go(splash.args.url))
            splash:wait(0.3)
            button = splash:select("a[class=paginate_button next] a")
            splash:set_viewport_full()
            splash:wait(0.1)
            button:mouse_click()
            splash:wait(1)
            return {url = splash:url(),
                    html = splash:html()}
        end"""

        yield SplashRequest(url=response.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': script})
The .extract_first() (now .get()) you used will always return the first result. It's not an iterator, so there is no point in calling it several times. You should try the .getall() method instead. That will be something like:
names = response.xpath('//table[@id="tablepress-8"]').xpath('//*[@class="column-1"]/text()').getall()
sources = response.xpath('//table[@id="tablepress-8"]').xpath('//*[@class="column-2"]/text()').getall()
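Those two calls give you parallel lists; to get back per-row items you could, for example, zip them together (a sketch reusing the same selectors as above):
def parse(self, response):
    table = response.xpath('//table[@id="tablepress-8"]')
    names = table.xpath('.//*[@class="column-1"]/text()').getall()
    sources = table.xpath('.//*[@class="column-2"]/text()').getall()
    # Pair up the two columns and yield one item per table row.
    for name, source in zip(names, sources):
        yield {'name': name, 'source': source}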

Scrapy Splash cannot get the data of a React site

I need to scrape this site.
It appears to be built with React, so I tried to extract the data with scrapy-splash. I need, for example, the a element with class shelf-product-name, but the response is an empty array. I used the wait argument with about 5 seconds and still only get an empty array.
def start_requests(self):
    yield SplashRequest(
        url='https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6',
        callback=self.parse,
        args={'wait': 5}
    )

def parse(self, response):
    print(response.css("a.shelf-product-name"))
Actually, there is no need to use Scrapy Splash, because all of the required data is stored inside a <script> tag of the raw HTML response as JSON-formatted data:
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class JumboCLSpider(scrapy.Spider):
    name = "JumboCl"
    start_urls = ["https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6"]

    def parse(self, response):
        script = [script for script in response.css("script::text") if "window.__renderData" in script.extract()]
        if script:
            script = script[0]
            data = script.extract().split("window.__renderData = ")[-1]
            json_data = json.loads(data[:-1])
            for plp in json_data["plp"]["plp_products"]:
                for product in plp["data"]:
                    # yield {"productName": product["productName"]}  # data from css: a.shelf-product-name
                    yield product

if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    c.crawl(JumboCLSpider)
    c.start()
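One fragile spot in the snippet above is json.loads(data[:-1]), which assumes exactly one trailing character after the JSON. A slightly more defensive sketch of that extraction step (same window.__renderData assumption):
import json

def parse(self, response):
    for script_text in response.css("script::text").getall():
        if "window.__renderData" not in script_text:
            continue
        data = script_text.split("window.__renderData = ", 1)[-1]
        # Strip trailing whitespace and an optional semicolon instead of
        # chopping a fixed number of characters.
        json_data = json.loads(data.strip().rstrip(";").strip())
        for plp in json_data["plp"]["plp_products"]:
            for product in plp["data"]:
                yield product
        break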

How do I pass URLs from method A to method B by changing the yield SplashRequest code? (Scrapy + Splash with Python & Docker)

########### Note: I posted this before finding out that scrapy + splash is not the best approach here, because the rendered pages (http://localhost:8050/render.html?url=xxx) do not contain the data I want. That's why I will try another way, such as Selenium. Thank you!!
[What I want to know: how should I modify the line yield SplashRequest(url, callback=self.parse, args = {"wait": 5}, endpoint = "render.html")?]
I have no idea how I should change the code in order to pass URLs from method A to method B when using scrapy + splash.
In my opinion, the place to correct is
yield SplashRequest(url, callback=self.parse, args = {"wait": 5}, endpoint = "render.html")
in the start_requests method (#1).
There are 3 reasons.
First, the logging call (#2) just before yield SplashRequest(...) gave me the correct feedback.
Second, I set up scrapy + splash by following the scrapy-splash README (Scrapy+Splash for JavaScript integration):
https://github.com/scrapy-plugins/scrapy-splash
Third, the logging call (#3) gave me NO information.
Here is my code.
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from bnb_sentiment.items import BnbItem
import re
import logging

logging.basicConfig(level=logging.INFO)
# __name__ is the name of the current module
logger = logging.getLogger(__name__)

class BnbPriceTestSpider(scrapy.Spider):
    name = 'bnb_price_test'
    start_urls = [
        # Tokyo--Japan
        'https://www.airbnb.com/s/Tokyo--Japan/homes?refinement_paths%5B%5D=%2Fhomes&allow_override%5B%5D=&checkin=2018-07-07&checkout=2018-07-08&locale=en&min_beds=0&price_max=20000&price_min=10000&query=Tokyo%2C%20Japan&place_id=ChIJ51cu8IcbXWARiRtXIothAS4&s_tag=Mz88jJs1',
    ]

    def start_requests(self):
        for url in self.start_urls:
            logger.info(url)  #2
            yield SplashRequest(url, callback=self.parse, args={"wait": 5}, endpoint="render.html")  #1

    def parse(self, response):
        for href in response.xpath('//div[contains(@id, "listing-")]//a[contains(@href, "rooms")]/@href'):
            import pdb; pdb.set_trace()
            logger.info(href)
            url = response.urljoin(href.extract())
            import pdb; pdb.set_trace()
            logger.info(url)  #3
            yield SplashRequest(url, callback=self.parse_scrape)

    def parse_scrape(self, response):
        pass
(#2) Here is the output of the logging call:
/home/ubuntu/bnbsp/bnb_sentiment/bnb_sentiment/spiders/bnb_price.py(34)start_requests()
-> logger.info(url) *1
(Pdb) url
'url is same as start_urls'

Scrapy only scrapes the first start URL in a list of 15 start URLs

I am new to Scrapy and am attempting to teach myself the basics. I have written a spider that goes to the Louisiana Department of Natural Resources website to retrieve the serial number for certain oil wells.
I have each well's link listed in start_urls, but Scrapy only downloads data from the first URL. What am I doing wrong?
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from mike.items import MikeItem

class SonrisSpider(Spider):
    name = "sspider"
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def parse(self, response):
        item = MikeItem()
        item['serial'] = response.xpath('/html/body/table[1]/tr[2]/td[1]/text()').extract()[0]
        yield item
Thank you for any help you might be able to provide. If I have not explained my problem thoroughly, please let me know and I will attempt to clarify.
I think this code might help.
By default, Scrapy filters out duplicate requests. Since only the query parameters differ between your start URLs, Scrapy will consider the rest of them duplicates of the first one. That's why your spider stops after fetching the first URL. In order to parse the rest of the URLs, we have to enable the dont_filter flag on the request (check start_requests()).
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from mike.items import MikeItem

class SonrisSpider(scrapy.Spider):
    name = "sspider"
    allowed_domains = ["sonlite.dnr.state.la.us"]
    start_urls = [
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
        "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_data, dont_filter=True)

    def parse_data(self, response):
        item = MikeItem()
        serial = response.xpath(
            '/html/body/table[1]/tr[2]/td[1]/text()').extract()
        serial = serial[0] if serial else 'n/a'
        item['serial'] = serial
        yield item
Sample output returned by this spider is as follows:
{'serial': u'207899'}
{'serial': u'971683'}
{'serial': u'214206'}
{'serial': u'159420'}
{'serial': u'248942'}
{'serial': u'243671'}
Your code looks fine; try adding this function:
class SonrisSpider(Spider):
    def start_requests(self):
        for url in self.start_urls:
            print(url)
            yield self.make_requests_from_url(url)
            # the rest of your code goes here
The URLs should be printed now. Test it; if not, please say so.
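Note that make_requests_from_url was deprecated in later Scrapy releases (and eventually removed), so on a current version a sketch of the same idea would yield the requests directly:
import scrapy

class SonrisSpider(scrapy.Spider):
    # Sketch for newer Scrapy versions where make_requests_from_url no longer exists.
    def start_requests(self):
        for url in self.start_urls:
            print(url)
            yield scrapy.Request(url, callback=self.parse)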
