Faced couple issues when trying to crawl imdb, on which didnt found answers here.
Im trying to grab some data from pages like: http://www.imdb.com/search/title?release_date=1950&page=1 with following code:
import scrapy
from tutorial.items import MovieItem, CastItem
class tutorialSpider(scrapy.Spider):
name = "tutorial"
allowed_domains = ["imdb.com"]
# generate start_urls dynamically
def start_requests(self):
for year in range(1950, 1951):
for page in range(1, 3):
yield scrapy.Request('http://www.imdb.com/search/title?release_date=%s&page=%s' % (year, page))
def parse(self, response):
self.wanted_num=50
for sel in response.xpath("//*[contains(#class,'lister-item-content')]"):
item = MovieItem()
item['Title'] = sel.xpath('h3/a/text()').extract()[0]
item['Rating'] = sel.xpath('div[#class="ratings-bar"]/div[#name="ir"]/strong/text()').extract()[0]
item['Ranking']=sel.xpath('h3/span[#class="lister-item-index unbold text-primary"]/text()').extract()[0]
item['ReleaseDate'] = sel.xpath('h3/span[#class="lister-item-year text-muted unbold"]/text()').extract()[0]
item['MianPageUrl'] = "http://imdb.com"+sel.xpath('h3/a/#href').extract()[0]
request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
request.meta['item'] = item
if int(item['Ranking']) >= self.wanted_num + 1:
return
yield request
So, questions here are:
It seems to be going into infinite loop while trying to crawl on these pages(301 redirects), and I dont know why?(
I suspect that ranking should be trimmed, as from page it will come as '1.', so how I cut that point at the end of string?
Appreciate your help!
Related
I hope there's someone who can help a newbie:
I try to scrape the prices of https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html using Scrapy. Since those prices are loaded dynamically with Javascript I tried to use Splash to deal with the problem. But the outcome is still the same: Empty lists for the prices ( "hotel_displayed_prices"). The other items do all receive the correct values.
On the webpage I found two ways to get to the price with CSS selector:
.price-wrap .price :: text
.premium-offer-container div::attr(data-locationid)
both ways do not seem to work... or they do both and just splash does not.
for scrapy I copied all configurations from https://github.com/scrapy-plugins/scrapy-splash into my settings file. I did also put Robotstxt_obey = False
when rendering the website in Splash 3.4.1 (browser window) it showed me the price of the hotels so normally it should work I guess.
import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest
class HoteldataSpider (scrapy.Spider):
name = "Testdata"
start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})
def parse(self, response):
items = TestItem()
all_single_entries = response.css("div.listItem")
for entry in all_single_entries:
hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
hotel_links = entry.css(".listing_title a").xpath("#href").extract()
hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()
items["hotel_names"] = str(hotel_names).split("'")[1]
items["hotel_links"] = "https://www.tripadvisor.com" + str(hotel_links).split("'")[1]
items["hotel_ids"] = int(str(hotel_ids).split("_")[1].split("'")[0])
items["hotel_displayed_price"]= hotel_displayed_price
yield items
On this line
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()
Are you missing a closing bracket on "div::attr(data-locationid" ?
I've had a look at the behaviour under scrapy, and the prices are not returned in the HTML to a request from scrapy. What you're seeing in the browser (even Splash) is not the same as what your code is seeing.
I don't know scrapy well enough to work through this, but it seems possible to get what you need with plain old requests & BeautifulSoup:
import requests
import BeautifulSoup
r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
soup = BeautifulSoup(requests.content, 'lxml')
prices = [price.text for price in soup.select('.price-wrap .price')]
print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]
For everyone with the similar problem: Here is my solution. However I do have problems with duplicates when I run the script.
import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest
class HoteldataSpider (scrapy.Spider):
name = "Hoteldata"
start_urls = ["http://localhost:8050/render.html?url=https:"
"//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})
def parse(self, response):
items = HotelinfoItem()
all_single_entries = response.css("div.listItem")
for entry in all_single_entries:
hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
hotel_links = entry.css(".listing_title a").xpath("#href").extract()
hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
hotel_type = entry.css(".mb10").css(".label::text").extract()
items["hotel_names"] = [str(hotel_names).split("'")[1]]
items["hotel_links"] = ["https://www.tripadvisor.com" + str(hotel_links).split("'")[1]]
items["hotel_ids"] = [str(hotel_ids).split("_")[1].split("'")[0]]
if len(hotel_type) == 0:
items["hotel_type"] = ["Hotel"]
else:
items["hotel_type"] = hotel_type
if len(hotel_displayed_price) == 0:
items["hotel_displayed_price"] = ["NA"]
else:
items["hotel_displayed_price"] = hotel_displayed_price
yield items
next_page = response.css("a.next::attr(href)").get()
next_page_splash = "http://localhost:8050/render.html?url=https://www.tripadvisor.com" + \
str(next_page).split("#")[0] + "&timeout=10&wait=5"
if next_page is not None:
yield response.follow(next_page_splash, callback=self.parse)
I Have an issue with my Spider. I tried to follow some tutorial to understand the scrapy a little bit better and extended the tutorial to crawl also subpages. The issue of my spider is that it only crawls one element of the entry page and not 25 as it should be on the page.
I have no clue where the failure is. Perhaps somebody of you can help me here:
from datetime import datetime as dt
import scrapy
from reddit.items import RedditItem
class PostSpider(scrapy.Spider):
name = 'post'
allowed_domains = ['reddit.com']
def start_requests(self):
reddit_urls = [
('datascience', 'week')
]
for sub, period in reddit_urls:
url = 'https://www.reddit.com/r/' + sub + '/top/?sort=top&t=' + period
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
# get the subreddit from the URL
sub = response.url.split('/')[4]
# parse thru each of the posts
for post in response.css('div.thing'):
item = RedditItem()
item['title'] = post.css('a.title::text').extract_first()
item['commentsUrl'] = post.css('a.comments::attr(href)').extract_first()
### scrap comments page.
request = scrapy.Request(url=item['commentsUrl'], callback=self.parse_comments)
request.meta['item'] = item
return request
def parse_comments(self, response):
item = response.meta['item']
item['commentsText'] = response.css('div.comment div.md p::text').extract()
self.logger.info('Got successful response from {}'.format(response.url))
yield item
Thanks for your help.
BR
Thanks for your comments:
Indeed I have to yield request it, rather return request.
Now it is working.
I'm trying to create web-scraper for dynamic website. For this purpose I'm using Scrapy 1.2.1 and scrapy-splash 0.7 library.
The problem appears when using splash server, most of the times, it returns different data to scrapy. From log I can see that all pages are crawled. If I use scrapy.Request instead of SplashRequest, everything is OK (I get the same data each time).
My code:
import scrapy
import re
from scrapy_splash import SplashRequest
from scraper.items import ScraperRozetka
class RozetkaSpider(scrapy.Spider):
name = "rozetka_laptops"
start_urls = [
'http://rozetka.com.ua/notebooks/c80004/filter/producer=dell;page=1/',
]
def parse(self, response):
last_page = response.xpath('//ul[#name="paginator"]//li[last()]//#id').extract_first()
last_page_num = int(last_page[-1])
i = 1
while i <= last_page_num:
url = re.sub(r'page=\d+', r'page={}'.format(i), response.url)
i += 1
yield SplashRequest(url, self.parse_results, endpoint='render.html', args={'wait': 0.5, 'timeout': 60})
def parse_results(self, response):
items = []
records = response.css('div.g-i-tile-catalog')
for record in records:
item = ScraperRozetka()
item['title'] = record.css('img::attr(title)').extract_first()
item['price'] = record.css('div.g-price-uah::text').extract_first()
item['link'] = record.css('div.g-i-tile-i-title a::attr(href)').extract_first()
items.append(item)
return items
Would be grateful if someone helped me.
Thanks.
I am trying to scrape TripAdvisor's reviews, but I cannot find the Xpath to have it dynamically go through all the pages. I tried yield and callback but the thing is I cannot find the xpath for the line that goes to the next page. I am talking about This site
Here Is my code(UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
class scrapingtestspider(Spider):
name = "scrapytesting"
allowed_domains = ["tripadvisor.in"]
base_uri = "tripadvisor.in"
start_urls = [
"http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
output_json_dict = {}
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
items = []
i=0
for sites in sites:
item = ScrapingTestingItem()
#item['reviews'] = sel.xpath('//p[#class="partial_entry"]/text()').extract()
item['subjects'] = sel.xpath('//span[#class="noQuotes"]/text()').extract()
item['stars'] = sel.xpath('//*[#class="rate sprite-rating_s rating_s"]/img/#alt').extract()
item['names'] = sel.xpath('//*[#class="username mo"]/span/text()').extract()
items.append(item)
i+=1
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
if(sites and len(sites) > 0):
yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
else:
yield items
If you want to select the URL behind Next why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/#href).extract()
And then yield a Request with this URL? With this you get always the next site to scrape and do not need the line containing the numbers.
Recently I did something similar on tripadvisor and this approach worked for me. If this won't work for you update your code with the approach you are trying to see where it can be approved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
for site in sites:
yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield items at the end of the loop when the method finished with every parsing.
I think it can only work if you make a list of urls you want to scrap in a .txt file.
class scrapingtestspider(Spider):
name = "scrapytesting"
allowed_domains = ["tripadvisor.in"]
base_uri = "tripadvisor.in"
f = open("urls.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
I am making a project in which I have used scrapy to scrape items from web sites, but the problem is, the xpaths of the 1st 2 pages of that site is different from the xpaths of the other pages.
As the result my spider just scrapes the items from first two pages and just simply crawls over the other pages.
How can I make my spider also scrape the items of the pages too??
I am also including my spider here so that u can see through my spider if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request
class ProjectSpider(BaseSpider):
name = "project2spider"
allowed_domains = ["http://directory.thesun.co.uk/"]
current_page_no = 1
start_urls = [
'http://directory.thesun.co.uk/find/uk/computer-repair'
]
def get_next_url(self, fired_url):
if '/page/' in fired_url:
url, page_no = fired_url.rsplit('/page/', 1)
else:
if self.current_page_no != 1:
#end of scroll
return
self.current_page_no += 1
return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no
# the parse procedure, and here is the codes which declares which field to scrape.
def parse(self, response):
fired_url = response.url
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="abTbl "]')
for site in sites:
item = Project2Item()
item['Catogory'] = site.select('span[#class="icListBusType"]/text()').extract()
item['Bussiness_name'] = site.select('a/#title').extract()
item['Description'] = site.select('span[last()]/text()').extract()
item['Number'] = site.select('span[#class="searchInfoLabel"]/span/#id').extract()
item['Web_url'] = site.select('span[#class="searchInfoLabel"]/a/#href').extract()
item['adress_name'] = site.select('span[#class="searchInfoLabel"]/span/text()').extract()
item['Photo_name'] = site.select('img/#alt').extract()
item['Photo_path'] = site.select('img/#src').extract()
#items.append(item)
yield item
next_url = self.get_next_url(fired_url)
if next_url:
yield Request(next_url, self.parse, dont_filter=True)
for other pages I need to use this: sites = hxs.select('//div[#class="icListItem"]')
How can I include this in my spider so that it can scrape items form other pages too..
At present its just scraping 1st two pages and simply crawls over other pages.
What did you try so far?
One solution would be using an index-like parameter passed as a meta data when calling for the next page. Something like:
def parse(self, response):
hxs = HtmlXPathSelector(response)
2nd_xpath = False
try:
if response.meta['index'] > 1:
2nd_xpath = True
index = response.meta['index']
except KeyError:
index = 0
sites = (hxs.select('//div[#class="icListItem"]') if 2nd_xpath
else hxs.select('//div[#class="abTbl "]'))
...
request = Request(next_url, self.parse, dont_filter=True)
request.meta['index'] = index + 1
yield request
That code sure as hell can be improved but you get the idea.