scrapinghub/splash loses some data on rendering - python

I'm trying to create a web scraper for a dynamic website. For this purpose I'm using Scrapy 1.2.1 and the scrapy-splash 0.7 library.
The problem appears when using the Splash server: most of the time it returns different data to Scrapy. From the log I can see that all pages are crawled. If I use scrapy.Request instead of SplashRequest, everything is OK (I get the same data each time).
My code:
import scrapy
import re
from scrapy_splash import SplashRequest
from scraper.items import ScraperRozetka

class RozetkaSpider(scrapy.Spider):
    name = "rozetka_laptops"
    start_urls = [
        'http://rozetka.com.ua/notebooks/c80004/filter/producer=dell;page=1/',
    ]

    def parse(self, response):
        last_page = response.xpath('//ul[@name="paginator"]//li[last()]//@id').extract_first()
        last_page_num = int(last_page[-1])
        i = 1
        while i <= last_page_num:
            url = re.sub(r'page=\d+', r'page={}'.format(i), response.url)
            i += 1
            yield SplashRequest(url, self.parse_results, endpoint='render.html',
                                args={'wait': 0.5, 'timeout': 60})

    def parse_results(self, response):
        items = []
        records = response.css('div.g-i-tile-catalog')
        for record in records:
            item = ScraperRozetka()
            item['title'] = record.css('img::attr(title)').extract_first()
            item['price'] = record.css('div.g-price-uah::text').extract_first()
            item['link'] = record.css('div.g-i-tile-i-title a::attr(href)').extract_first()
            items.append(item)
        return items
Would be grateful if someone helped me.
Thanks.
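One thing worth trying (a sketch only, not a confirmed fix for this question) is to give the page more time to render by driving Splash through the execute endpoint with a short Lua script instead of render.html with wait=0.5; inconsistent results often just mean the JavaScript had not finished by the time the HTML snapshot was taken:

lua_source = """
function main(splash, args)
    assert(splash:go(args.url))
    splash:wait(2.0)  -- longer, explicit wait before taking the snapshot
    return splash:html()
end
"""

# inside parse(), replacing the render.html request above
yield SplashRequest(url, self.parse_results, endpoint='execute',
                    args={'lua_source': lua_source, 'timeout': 60})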

Related

Splash for Scrapy only returns empty list

I hope there's someone who can help a newbie:
I'm trying to scrape the prices on https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html using Scrapy. Since those prices are loaded dynamically with JavaScript, I tried to use Splash to deal with the problem. But the outcome is still the same: empty lists for the prices ("hotel_displayed_price"). All the other items receive the correct values.
On the webpage I found two ways to get to the price with a CSS selector:
.price-wrap .price::text
.premium-offer-container div::attr(data-locationid)
Neither seems to work... or maybe they both do and it's just Splash that doesn't.
For Scrapy I copied all the configuration from https://github.com/scrapy-plugins/scrapy-splash into my settings file. I also set ROBOTSTXT_OBEY = False.
When rendering the website in Splash 3.4.1 (browser window) it showed me the prices of the hotels, so I guess it should normally work.
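For reference, the configuration that the scrapy-splash README asks for looks roughly like this (a sketch assuming Splash is running locally on port 8050; the values come from that README, not from the question itself):

# settings.py
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
ROBOTSTXT_OBEY = False

The spider: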
import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Testdata"
    start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})

    def parse(self, response):
        items = TestItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()
            items["hotel_names"] = str(hotel_names).split("'")[1]
            items["hotel_links"] = "https://www.tripadvisor.com" + str(hotel_links).split("'")[1]
            items["hotel_ids"] = int(str(hotel_ids).split("_")[1].split("'")[0])
            items["hotel_displayed_price"] = hotel_displayed_price
            yield items
On this line:
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()
are you missing a closing bracket in "div::attr(data-locationid"?
I've had a look at the behaviour under Scrapy, and the prices are not returned in the HTML that Scrapy's request receives. What you're seeing in the browser (even in Splash) is not the same as what your code is seeing.
I don't know Scrapy well enough to work through this, but it seems possible to get what you need with plain old requests & BeautifulSoup:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
soup = BeautifulSoup(r.content, 'lxml')
prices = [price.text for price in soup.select('.price-wrap .price')]
print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]
For everyone with a similar problem, here is my solution. However, I do have problems with duplicates when I run the script (one possible mitigation is sketched after the code).
import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Hoteldata"
    start_urls = ["http://localhost:8050/render.html?url=https:"
                  "//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})

    def parse(self, response):
        items = HotelinfoItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
            hotel_type = entry.css(".mb10").css(".label::text").extract()
            items["hotel_names"] = [str(hotel_names).split("'")[1]]
            items["hotel_links"] = ["https://www.tripadvisor.com" + str(hotel_links).split("'")[1]]
            items["hotel_ids"] = [str(hotel_ids).split("_")[1].split("'")[0]]
            if len(hotel_type) == 0:
                items["hotel_type"] = ["Hotel"]
            else:
                items["hotel_type"] = hotel_type
            if len(hotel_displayed_price) == 0:
                items["hotel_displayed_price"] = ["NA"]
            else:
                items["hotel_displayed_price"] = hotel_displayed_price
            yield items
        next_page = response.css("a.next::attr(href)").get()
        next_page_splash = ("http://localhost:8050/render.html?url=https://www.tripadvisor.com" +
                            str(next_page).split("#")[0] + "&timeout=10&wait=5")
        if next_page is not None:
            yield response.follow(next_page_splash, callback=self.parse)
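Regarding the duplicates mentioned above, a minimal sketch of one way to deal with them (assuming hotel_ids is a stable identifier; the pipeline class name is made up for illustration) is an item pipeline that drops anything it has already seen:

from scrapy.exceptions import DropItem

class DuplicateHotelPipeline:
    """Drop items whose hotel_ids value has already been yielded in this crawl."""

    def __init__(self):
        self.seen_ids = set()

    def process_item(self, item, spider):
        # hotel_ids is stored as a one-element list in the spider above
        hotel_id = item["hotel_ids"][0]
        if hotel_id in self.seen_ids:
            raise DropItem("duplicate hotel: {}".format(hotel_id))
        self.seen_ids.add(hotel_id)
        return item

The pipeline would then be enabled via the ITEM_PIPELINES setting in settings.py.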

Scrapy Crawl Page and Subpage but crawls only one item

I have an issue with my spider. I tried to follow a tutorial to understand Scrapy a little bit better and extended the tutorial to also crawl subpages. The issue is that my spider only crawls one element of the entry page, not the 25 it should find on that page.
I have no clue where the failure is. Perhaps somebody can help me here:
from datetime import datetime as dt
import scrapy
from reddit.items import RedditItem

class PostSpider(scrapy.Spider):
    name = 'post'
    allowed_domains = ['reddit.com']

    def start_requests(self):
        reddit_urls = [
            ('datascience', 'week')
        ]
        for sub, period in reddit_urls:
            url = 'https://www.reddit.com/r/' + sub + '/top/?sort=top&t=' + period
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # get the subreddit from the URL
        sub = response.url.split('/')[4]
        # parse through each of the posts
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()
            ### scrape the comments page.
            request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
            request.meta['item'] = item
            return request

    def parse_comments(self, response):
        item = response.meta['item']
        item['commentsText'] = response.css('div.comment div.md p::text').extract()
        self.logger.info('Got successful response from {}'.format(response.url))
        yield item
Thanks for your help.
BR
Thanks for your comments:
Indeed, I have to yield the request rather than return it.
Now it is working.
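For completeness, the fix boils down to yielding the comment request inside the loop so every post produces its own request; a minimal sketch of just the changed parse method (same selectors as above):

    def parse(self, response):
        # get the subreddit from the URL
        sub = response.url.split('/')[4]
        for post in response.css('div.thing'):
            item = RedditItem()
            item['title'] = post.css('a.title::text').extract_first()
            item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()
            # yield, not return: returning exits parse() after the first post
            yield scrapy.Request(url=item['commentsUrl'],
                                 callback=self.parse_comments,
                                 meta={'item': item})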

What are the best practices for calling an external api?

So let's say I want to write a spider that uses the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        item['fb_url'], item['shares'], item['comments'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this code to work if, rather than using requests, I use a scrapy.Request call. Something like this:
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(base, callback=self.parse_likes)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        links = response.css('a::attr(href)').extract()
        item['url'] = response.url
        item['fb_data'] = self.get_likes(response.url).body
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think I'm missing some understanding of how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case: how do you yield one item that is populated from multiple URLs? The most common solution is to chain requests, carrying your item along in the request.meta parameter. Note that constructing a scrapy.Request does not download anything by itself; it is only fetched once it is yielded back to the engine, which is why reading .body on a freshly created Request gives you nothing.
For your example, an implementation with this logic could look like:
class WebSite(scrapy.Spider):
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # build the Graph API URL for this link and hand the partially
            # filled item to parse_likes via request.meta
            api_url = self.base(item['link'], self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item
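As a side note, on Scrapy 1.7 or newer the same chaining can be written with cb_kwargs instead of meta, which passes the item straight into the callback's signature (a small sketch of just the two spots that change):

            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 cb_kwargs={'item': item})

    def parse_likes(self, response, item):
        data = json.loads(response.text)
        item['share_count'] = (data['id'],
                               data['share']['comment_count'],
                               data['share']['share_count'])
        yield item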

how to scrape Tripadvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath that would let the spider dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (updated):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for site in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if sites and len(sites) > 0:
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind "Next", why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
and then yield a Request with this URL? This way you always get the next page to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your code with the approach you are trying so we can see where it can be improved.
Update
Change your Request creation block to the following:
if sites and len(sites) > 0:
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished parsing everything.
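Putting that advice together, a sketch of the whole parse method could look like the following (it reuses the XPaths from the question and the URL prefix from the answer; it yields one item per page, then follows the "Next" link):

    def parse(self, response):
        sel = Selector(response)
        item = ScrapingTestingItem()
        item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
        item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
        item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
        yield item

        # follow the "Next" pagination link, if there is one
        next_url = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if next_url:
            yield Request(url="http://tripadvisor.in" + next_url[0],
                          callback=self.parse)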
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
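A slightly tidier variant of the same idea (a sketch, assuming the same urls.txt file) reads the file inside start_requests instead of at class-definition time, so the file handle is opened and closed per crawl:

    def start_requests(self):
        with open("urls.txt") as f:
            for url in f:
                url = url.strip()
                if url:
                    yield Request(url, callback=self.parse)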

Scrapy Spider just crawls and does not scrape

I am working on a project in which I use Scrapy to scrape items from websites, but the problem is that the XPaths of the first two pages of the site are different from the XPaths of the other pages.
As a result, my spider only scrapes the items from the first two pages and simply crawls over the other pages.
How can I make my spider scrape the items of those pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # the parse procedure; this is where the fields to scrape are declared.
    def parse(self, response):
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item
        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it just scrapes the first two pages and simply crawls over the other pages.
What have you tried so far?
One solution would be to use an index-like parameter, passed as meta data, when requesting the next page. Something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    use_2nd_xpath = False
    try:
        if response.meta['index'] > 1:
            use_2nd_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0

    sites = (hxs.select('//div[@class="icListItem"]') if use_2nd_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.
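An alternative that avoids tracking an index at all (a sketch, untested against the site, and assuming the inner field selectors are the same under both layouts, which may not hold) is to match both markup variants with a single XPath union so the same parse method works on every page:

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # the first two pages use <div class="abTbl ">, the rest use <div class="icListItem">
        sites = hxs.select('//div[@class="abTbl "] | //div[@class="icListItem"]')
        for site in sites:
            item = Project2Item()
            item['Bussiness_name'] = site.select('a/@title').extract()
            yield item
        next_url = self.get_next_url(response.url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)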
