I need to scrape this site.
Is made in React so it looks. Then I tried to extract the data with scrapy-splash. I need for example the "a" element with class shelf-product-name. But the response is an empty array. I used the wait argument in about 5 seconds.
But I only get an empty array.
def start_requests(self):
yield SplashRequest(
url='https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6',
callback=self.parse,
args={'wait':5}
)
def parse(self,response):
print(response.css("a.shelf-product-name"))
Actually there is no need to use Scrapy Splash because all required data stored inside <script> tag of raw html response as json formatted data:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
class JumboCLSpider(scrapy.Spider):
name = "JumboCl"
start_urls = ["https://www.jumbo.cl/lacteos-y-bebidas-vegetales/leches-blancas?page=6"]
def parse(self,response):
script = [script for script in response.css("script::text") if "window.__renderData" in script.extract()]
if script:
script = script[0]
data = script.extract().split("window.__renderData = ")[-1]
json_data = json.loads(data[:-1])
for plp in json_data["plp"]["plp_products"]:
for product in plp["data"]:
#yield {"productName":product["productName"]} # data from css: a.shelf-product-name
yield product
if __name__ == "__main__":
c = CrawlerProcess({'USER_AGENT':'Mozilla/5.0'})
c.crawl(JumboCLSpider)
c.start()
Related
I am trying to scrape the information pertaining to the biblical commentaries off of a website. Below is the code I have made to do so. start_urls is the link to the json file I am trying to scrape. I chose ['0']['father']['_id'] to get the name of the commenter, however, the following error occurs. What should I do?
Error: TypeError: list indices must be integers or slices, not str
Code:
import scrapy
import json
class catenaspider(scrapy.Spider): #spider to crawl the url
name = 'commentary' #name to be called in command terminal
start_urls = ['https://api.catenabible.com:8080/anc_com/c/mt/1/1?tags=[%22ALL%22]&sort=def']
def parse(self,response):
data = json.loads(response.body)
yield from data['0']['father']['_id']```
Read the documentation again.
import scrapy
class catenaspider(scrapy.Spider): # spider to crawl the url
name = 'commentary' # name to be called in command terminal
start_urls = ['https://api.catenabible.com:8080/anc_com/c/mt/1/1?tags=[%22ALL%22]&sort=def']
def parse(self, response):
data = response.json()
yield {'id_father': data[0]['father']['_id']}
# if you want to get all the id's
# for d in data:
# yield {'id_father': d['father']['_id']}
I am trying to parse a list from a javascript website. When I run it, it only gives me back one entry on each column and then the spider shuts down. I have already set up my middleware settings. I am not sure what is going wrong. Thanks in advance!
import scrapy
from scrapy_splash import SplashRequest
class MalrusSpider(scrapy.Spider):
name = 'malrus'
allowed_domains = ['backgroundscreeninginrussia.com']
start_urls = ['http://www.backgroundscreeninginrussia.com/publications/new-citizens-of-malta-since-january-2015-till-december-2017/']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url,
callback=self.parse,
endpoint='render.html')
def parse(self, response):
russians = response.xpath('//table[#id="tablepress-8"]')
for russian in russians:
yield{'name' : russian.xpath('//*[#class="column-1"]/text()').extract_first(),
'source' : russian.xpath('//*[#class="column-2"]/text()').extract_first()}
script = """function main(splash)
assert(splash:go(splash.args.url))
splash:wait(0.3)
button = splash:select("a[class=paginate_button next] a")
splash:set_viewport_full()
splash:wait(0.1)
button:mouse_click()
splash:wait(1)
return {url = splash:url(),
html = splash:html()}
end"""
yield SplashRequest(url=response.url,
callback=self.parse,
endpoint='execute',
args={'lua_source': script})
The .extract_first() (now .get()) you used will always return the first result. It's not an iterator so there is no sense to call it several times. You should try the .getall() method. That will be something like:
names = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-1"]/text()').getall()
sources = response.xpath('//table[#id="tablepress-8"]').xpath('//*[#class="column-2"]/text()').getall()
I have the following code:
#FirstSpider.py
class FirstSpider(scrapy.Spider):
name = 'first'
start_urls = ['https://www.basesite.com']
next_urls = []
def parse(self, response):
for url in response.css('bunch > of > css > here'):
self.next_urls.append(url.css('more > css > here'))
l = Loader(item=Item(), selector=url.css('more > css'))
l.add_css('add', 'more > css')
...
...
yield l.load_item()
for url in self.next_urls:
new_urls = self.start_urls[0] + url
yield scrapy.Request(new_urls, callback=SecondSpider.parse_url)
#SecondSpider.py
class SecondSpider(scrapy.Spider):
name = 'second'
start_urls = ['https://www.basesite.com']
def parse_url(self):
"""Parse team data."""
return self
# self is a HtmlResponse not a 'response' object
def parse(self, response):
"""Parse all."""
summary = self.parse_url(response)
return summary
#ThirdSpider.py
class ThirdSpider(scrapy.Spider):
# take links from second spider, continue:
I want to be able to pass the url scraped in Spider 1 to Spider 2 (in a different script). I'm curious as to why when I do, the 'response' is a HtmlResponse and not a response object ( When doing something similar to a method in the same class as Spider 1; I don't have this issue )
What am i missing here? How do i just pass the original response(s) to the second spider? ( and from the second onto the third, etc..?)
You could use Redis as shared resource between all spiders https://github.com/rmax/scrapy-redis
Run all N spiders (don't close on idle state), so each of them will be connected to same Redis and waiting tasks(url, request headers) from there;
As the side-effect push task data to Redis from X_spider with specific key (Y_spider name).
What about using inheritance? "parse" function names should be different.
If your first spider inherits from the second, it will be able to set the callback to self.parse_function_spider2
I used scrapy to crawl comment data from the website http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0, which language is chinese. But i just got the output like this.
And the csv file's output is all messed up.
I don't know what happened. First i thought that was a json decode or json encode problem, then i tried the ways on the internet but i got the same result. Here's my code:
#!/usr/bin/env python
# encoding: utf-8
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from jd_comment.items import JdCommentItem
import json
class JdSpider(scrapy.Spider):
name = 'comment'
def start_requests(self):
url = 'http://club.jd.com/comment/productPageComments.action?callback=&productId=1892018&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0'
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
jsonresponse = json.loads(response.body_as_unicode())
items = []
for comment in jsonresponse['comments']:
item = JdCommentItem()
item['username'] = comment['nickname']
item['user_ID'] = comment['id']
item['time'] = comment['referenceTime']
item['good_ID'] = comment['referenceId']
item['good_name'] = comment['referenceName']
item['content'] = comment['content']
item['score'] = comment['score']
items.append(item)
yield item
Anyone give me a hint would be highly appreciated. Thanks.
I am new to Scrapy and am attempting to teach myself the basics. I have compiled a code that goes to the Louisiana Department of Natural Resources website to retrieve the serial number for certain oil wells.
I have each well's link listed in the start URLs command, but scrappy only downloads data from the first url. What am I doing wrong?
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from mike.items import MikeItem
class SonrisSpider(Spider):
name = "sspider"
start_urls = [
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
]
def parse(self, response):
item = MikeItem()
item['serial'] = response.xpath('/html/body/table[1]/tr[2]/td[1]/text()').extract()[0]
yield item
Thank you for any help you might be able to provide. If I have not explained my problem thoroughly, please let me know and I will attempt to clarify.
I think this code might help,
By default scrapy prevent duplicate requests. Since only the parameters are different in your start-url scrapy will consider the rest of the urls in the start-url as duplicate request of the first one. That's why your spider stops after fetching the first url. In order to parse the rest of the urls we have enable dont_filter flag in the scrapy request. (chek the start_request())
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from mike.items import MikeItem
class SonrisSpider(scrapy.Spider):
name = "sspider"
allowed_domains = ["sonlite.dnr.state.la.us"]
start_urls = [
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=207899",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=971683",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=214206",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=159420",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=243671",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248942",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=156613",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=972498",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=215443",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=248463",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=195136",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=179181",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=199930",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=203419",
"http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellinfo2?p_WSN=220454",
]
def start_requests(self):
for url in self.start_urls:
yield Request(url=url, callback=self.parse_data, dont_filter=True)
def parse_data(self, response):
item = MikeItem()
serial = response.xpath(
'/html/body/table[1]/tr[2]/td[1]/text()').extract()
serial = serial[0] if serial else 'n/a'
item['serial'] = serial
yield item
sample output returned by this spider is as follows,
{'serial': u'207899'}
{'serial': u'971683'}
{'serial': u'214206'}
{'serial': u'159420'}
{'serial': u'248942'}
{'serial': u'243671'}
your code sounds good, try to add this function
class SonrisSpider(Spider):
def start_requests(self):
for url in self.start_urls:
print(url)
yield self.make_requests_from_url(url)
#the result of your code goes here
The URLs should be printed now. Test it, if not, say please