Scrapy Spider Trouble Navigating Through URLs - python

I have been struggling to find a way to go about this issue (the functions I show below do not work; it is more the overall process that I am confused about).
I am trying to have my spider get the prices for all of the products on the "standard-sheds" page. This is the link to the page which contains the products: https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/
However, if you click on a product link, the path changes to "charnleys.co.uk/shop/shed-product-name", so my spider can't follow it.
What I have thought about doing is collecting the URLs on the "standard-sheds" page, appending them to an array and iterating through it, then having my spider visit those URLs and collect the price. However, I am unsure how to get my spider to go through the array of URLs. I will list the current functions I have created.
Any help is greatly appreciated.
from gc import callbacks
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

urls = []

class CharnleySpider(CrawlSpider):
    name = 'crawler'
    allowed_domains = ['charnleys.co.uk']
    start_urls = ['https://www.charnleys.co.uk']

    # https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/
    # https://www.charnleys.co.uk/shop/bentley-supreme-apex/

    rules = (
        Rule(LinkExtractor(allow='product-category/gardening/garden-accessories/garden-furniture/sheds', deny='sheds')),
        Rule(LinkExtractor(allow='standard-sheds'), callback='collect_urls')
    )

    def collect_urls(self, response):
        for elements in response.css('div.product-image'):
            urls.append(elements.css('div.product-image a::attr(href)').get())

    def html_return_price_strings(self, response):
        # Searches through the html of the webpage and returns all strings with "£" attached.
        all_html = response.css('html').get()
        for line in all_html.split('\n'):
            for word in line.split():
                if word.startswith('£'):
                    print(word)

    def parse_product(self, response, html_return_price_strings):
        yield {
            'name': response.css('h2.product_title::text').get(),
            'price': html_return_price_strings()
        }
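For reference, the usual Scrapy pattern for visiting a set of collected links is to yield a Request per URL from inside the callback (for example with response.follow), rather than storing them in a module-level list. A minimal sketch of that pattern, reusing the CSS selectors above; the spider name and the price selector are assumptions, and as the answer below points out, the price on this site turns out to be rendered by JavaScript, so the request-following part is what carries over:

import scrapy

class StandardShedsSpider(scrapy.Spider):
    # Sketch only: the name and the price selector are assumptions.
    name = 'standard_sheds'
    allowed_domains = ['charnleys.co.uk']
    start_urls = ['https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/']

    def parse(self, response):
        # Follow every product link found on the category page.
        for href in response.css('div.product-image a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_product)

    def parse_product(self, response):
        # One item per product page.
        yield {
            'name': response.css('h2.product_title::text').get(),
            'price': response.css('p.price ::text').get(),  # assumed selector
        }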

If you navigate to one of the listing/details pages and turn off JavaScript, you will notice that the price portion of the page disappears, meaning that content is loaded dynamically by JavaScript. Scrapy can't render JS, but you can grab that dynamic content via scrapy-selenium's SeleniumRequest. Here I use Scrapy's default Spider, which is more robust than CrawlSpider for this.
Code:
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By

class Test2Spider(scrapy.Spider):
    name = 'test2'
    start_urls = [f'https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/page/{x}/' for x in range(1, 3)]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        for link in response.xpath('//*[@class="product-loop-title"]/@href')[1:10].getall():
            yield SeleniumRequest(
                url=link,
                callback=self.parse_product
            )

    def parse_product(self, response):
        driver = response.meta['driver']
        yield {
            'name': response.css('h2.product_title::text').get().strip(),
            'price': ''.join([x.text.split('STANDARD FEATURES')[0].split('Framing')[0].split('Standard Features:')[0].split('Specification:')[0] for x in driver.find_elements(By.XPATH, '//*[@id="tab-description"]/p | //*[@id="tab-description"]/div[1]/div')]),
            'url': response.url
        }
Output:
{'name': 'Cabin Shed', 'price': '8FT Gables:\n5 x 8 £1099\n6 x 8 £1143\n8 x 8 £1370\n10 x 8 £1597\n12 x 8 £1824\n14 x 8 £205110FT Gables\n5 x 10 £1368\n6 x 10 £1443\n8 x 10 £1772\n10 x 10 £2100\n12 x 10 £2429\n14 x 10 £2750', 'url': 'https://www.charnleys.co.uk/shop/cabin-shed/'}
... so on
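Note that scrapy-selenium also has to be enabled in settings.py before SeleniumRequest does anything. A rough fragment; the driver name, executable path and arguments are assumptions for a local headless Chrome setup:

# settings.py (sketch): enable the scrapy-selenium downloader middleware.
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/chromedriver'  # adjust to your machine
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}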

Related

Navigate to new page in Scrapy with the same URL

I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using scrapy and not selenium (which is not efficient enough for this purpose), that I can navigate through the different pages? Please see my code and the source code of the relevant 'next page' button in the image below.
Thanks.
class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']

    def parse(self, response):
        self.logger.info('This my first spider')
        address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        listings = response.xpath('//h2[@class="propertyCard-title"]')
        for listing in listings:
            yield {
                'Listing': listing.get()
            }
        nextPage = response.xpath('//*[@id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
        nextPage = nextPage.get()
        pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
        pageTest = pageTest.get()
        if pageTest is not None:
            pageTest = response.urljoin(pageTest)
            yield scrapy.Request(pageTest, callback=self.parse)
[Image: source of the 'next page' button - https://i.stack.imgur.com/1I1J1.png]
Actually, it turns out that each page has a unique identifier in the web link. For example, appending &index=24 sends you to the next page.
What you need to figure out is how to include that in the request URL. Some searches span several pages, so we increment the index by 24 each time to go to the next page. However, we can't just keep incrementing by 24 forever, so we use the number of page results as the condition to stop. It's rather sneaky to notice at first sight, but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup

links = []
for i in range(0, 480, 24):
    url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    ps1 = soup.find_all('span', {'class': 'searchHeader-resultCount'})
    for ps in ps1:
        if int(ps.text.strip()) > i:
            links.append(url)
        else:
            break

class ListingsItem(scrapy.Item):
    address = Field(output_processor=TakeFirst())
    listings = Field(output_processor=TakeFirst())

class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = links

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        container = response.xpath('//div[@class="l-searchResults"]/div')
        for sales in container:
            l = ItemLoader(ListingsItem(), selector=sales)
            l.add_xpath('address', '//address[@class="propertyCard-address"]/meta[@content]')
            l.add_xpath('listings', '//h2[@class="propertyCard-title"]//text()[normalize-space()]')
            yield l.load_item()

        #self.logger.info('This my first spider')
        #address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        #listings = response.xpath('//h2[@class="propertyCard-title"]')
        #for listing in listings:
        #    yield{
        #        'Listing': listing.get()
        #    }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'rightmove.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(listingsSpider)
process.start()
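If you would rather avoid the up-front requests/BeautifulSoup loop, the same &index= pagination can be driven from inside the spider by reading the result count on each page and yielding the next request only while more results remain. A rough sketch of that variation; the count parsing and URL template are assumptions based on the code above:

import scrapy

class ListingsIndexSpider(scrapy.Spider):
    # Hypothetical variation: paginate via the &index= parameter from inside parse.
    name = 'listings_index'
    base_url = ('https://www.rightmove.co.uk/property-for-sale/find.html'
                '?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5'
                '&sortType=10&index={index}')

    def start_requests(self):
        yield scrapy.Request(self.base_url.format(index=0),
                             callback=self.parse, cb_kwargs={'index': 0})

    def parse(self, response, index):
        # Yield one item per listing title on the current page.
        for title in response.xpath('//h2[@class="propertyCard-title"]//text()[normalize-space()]').getall():
            yield {'listing': title.strip()}

        # The result count decides whether another &index= page exists.
        count = response.xpath('//span[@class="searchHeader-resultCount"]/text()').get(default='0')
        count = int(count.replace(',', '').strip() or 0)
        if index + 24 < count:
            yield scrapy.Request(self.base_url.format(index=index + 24),
                                 callback=self.parse, cb_kwargs={'index': index + 24})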

Spider not following links - scrapy

I am trying to build a spider which follows through 3 pages before getting to the page it scrapes. I have tested the responses in the shell; however, when put together it doesn't seem to work and I am not sure why.
My code below:
# -*- coding: utf-8 -*-
import scrapy

class CollegiateSpider(scrapy.Spider):
    name = 'Collegiate'
    allowed_domains = ['collegiate-ac.com/uk-student-accommodation']
    start_urls = ['http://collegiate-ac.com/uk-student-accommodation/']

    # Step 1 - Get the area links
    def parse(self, response):
        for city in response.xpath('//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/text').extract():
            yield scrapy.Request(response.urljoin("/" + city), callback=self.parse_area_page)

    # Step 2 - Get the block links
    def parse_area_page(self, response):
        for url in response.xpath('//div[3]/div/div/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_unitpage)

    # Step 3 - Get the room links
    def parse_unitpage(self, response):
        for url in response.xpath('//*[@id="subnav"]/div/div[2]/ul/li[5]/a/@href').extract():
            yield scrapy.Request(response.urljoin(final), callback=self.parse_final)

    # Step 4 - Scrape the data
    def parse_final(self, response):
        pass
I have tried changing to CrawlSpider as per this answer, but that didn't seem to help.
I am currently looking into how to debug spiders; however, I am struggling with that, so I thought it would be beneficial to get opinions on here as well.
You forgot the () in text() in '//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/text()'.
But instead of text() I use @href to get the url.
urljoin('/' + city) creates the wrong url because the leading / skips /uk-student-accommodation - you have to use urljoin(city).
There was also a problem with allowed_domains - it blocked most of the urls.
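The urljoin point is easy to check outside Scrapy, since response.urljoin() simply delegates to urllib.parse.urljoin with response.url as the base (the 'bristol' path below is only an illustrative segment):

from urllib.parse import urljoin

base = 'https://collegiate-ac.com/uk-student-accommodation/'

# A leading slash resolves from the domain root and drops the base path:
print(urljoin(base, '/bristol'))   # https://collegiate-ac.com/bristol
# A relative path is appended to the base path instead:
print(urljoin(base, 'bristol'))    # https://collegiate-ac.com/uk-student-accommodation/bristol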
Working example. You can run it without a project and it saves the final urls in output.csv:
import scrapy

class CollegiateSpider(scrapy.Spider):
    name = 'Collegiate'
    allowed_domains = ['collegiate-ac.com']
    start_urls = ['https://collegiate-ac.com/uk-student-accommodation/']

    # Step 1 - Get the area links
    def parse(self, response):
        for url in response.xpath('//*[@id="top"]/div[1]/div/div[1]/div/ul/li/a/@href').extract():
            url = response.urljoin(url)
            #print('>>>', url)
            yield scrapy.Request(url, callback=self.parse_area_page)

    # Step 2 - Get the block links
    def parse_area_page(self, response):
        for url in response.xpath('//div[3]/div/div/div/a/@href').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_unitpage)

    # Step 3 - Get the room links
    def parse_unitpage(self, response):
        for url in response.xpath('//*[@id="subnav"]/div/div[2]/ul/li[5]/a/@href').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_final)

    # Step 4 - Scrape the data
    def parse_final(self, response):
        # show some information for test
        print('>>> parse_final:', response.url)
        # send url as item so it can save it in file
        yield {'final_url': response.url}

# --- run it without project ---
import scrapy.crawler

c = scrapy.crawler.CrawlerProcess({
    "FEED_FORMAT": 'csv',
    "FEED_URI": 'output.csv'
})
c.crawl(CollegiateSpider)
c.start()
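On recent Scrapy versions (2.1+) the FEED_URI/FEED_FORMAT pair is deprecated in favour of the FEEDS setting, so the run-without-a-project block would look roughly like this instead:

import scrapy.crawler

# Equivalent runner using the newer FEEDS setting (Scrapy 2.1+):
# each output URI maps to its export options.
c = scrapy.crawler.CrawlerProcess({
    'FEEDS': {
        'output.csv': {'format': 'csv'},
    },
})
c.crawl(CollegiateSpider)
c.start()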

Scrapy doesn't want to go to next url

I have a problem with forcing scrapy to go to another page. I am trying to get all of the Opera schedules for the different months.
Each of the addresses that I need looks like this: "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/" + name of the month.
That's why I've made a list of the months and tried to iterate over them, but somehow Scrapy just ignores it. I tried to print all the URLs collected by "next_page" and they are all correct.
import scrapy
from ..items import ShowItem, ShowItemLoader
from scrapy.selector import HtmlXPathSelector

class OperaSpider(scrapy.Spider):
    name = "opera"
    allowed_domains = ["http://www.opera.krakow.pl"]
    start_urls = [
        "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/listopad"
    ]

    shows_list_xpath = '//div[@class="row-fluid row-performance "]'
    item_fields = {
        'month': './/ul[@class="nav nav-pills nav-repertuar"]/li[@class="active"]/a/text()',
        'title': './/h2[@class="item-title"]/a/text()',
        'time': './/div[@class="item-time vertical-center"]/div[@class="vcentered"]/text()',
        'date': './/div[@class="item-date vertical-center"]/div[@class="vcentered"]/text()',
    }

    def parse(self, response):
        selector = HtmlXPathSelector(response)
        for show in selector.select(self.shows_list_xpath):
            loader = ShowItemLoader(ShowItem(), selector=show)
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        list = ["styczen", "luty", "marzec", "kwiecien",
                "maj", "czerwiec", "lipiec", "sierpien",
                "wrzesien", "pazdziernik", "listopad", "grudzien"]

        for i in list:
            next_page = ("http://www.opera.krakow.pl/pl/repertuar/na-afiszu/%s" % i)
            yield scrapy.Request(next_page, callback=self.parse)
Scrapy checks allowed_domains against only the netloc of a request's URL; you need to change http://www.opera.krakow.pl to opera.krakow.pl.
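In other words, allowed_domains should hold bare domain names, since Scrapy's OffsiteMiddleware compares them against the request's netloc. A minimal corrected sketch of the spider's attributes, everything else left as in the question:

class OperaSpider(scrapy.Spider):
    name = "opera"
    # Domain only - no scheme or path - otherwise OffsiteMiddleware
    # filters the month URLs as off-site requests.
    allowed_domains = ["opera.krakow.pl"]
    start_urls = [
        "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/listopad"
    ]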

how to scrape Tripadvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath to have it dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1

        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you always get the next site to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this won't work for you, update your code with the approach you are trying, to see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished parsing everything on the page.
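Putting both suggestions together, the parse method could be restructured roughly like this. Selectors are kept from the question, the remaining item fields are filled the same way, and this is only a sketch of the shape, not a tested spider:

def parse(self, response):
    sel = Selector(response)

    # One item per review subject on the current page.
    for subject in sel.xpath('//span[@class="noQuotes"]/text()').extract():
        item = ScrapingTestingItem()
        item['subjects'] = subject
        yield item

    # Follow every "Next" link and parse it with the same callback.
    sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)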
I think it can only work if you make a list of the urls you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

Scrapy Spider just crawls and does not scrape

I am making a project in which I use scrapy to scrape items from web sites, but the problem is that the xpaths of the first 2 pages of the site are different from the xpaths of the other pages.
As a result my spider just scrapes the items from the first two pages and simply crawls over the other pages.
How can I make my spider scrape the items of the other pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # the parse procedure, and here is the code which declares which fields to scrape.
    def parse(self, response):
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item

        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it just scrapes the first two pages and simply crawls over the other pages.
What did you try so far?
One solution would be using an index-like parameter passed as meta data when calling for the next page. Something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    use_2nd_xpath = False
    try:
        if response.meta['index'] > 1:
            use_2nd_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0

    sites = (hxs.select('//div[@class="icListItem"]') if use_2nd_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.
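An alternative that avoids the meta flag altogether is an XPath union over both container classes, since only one of the two layouts appears on any given page. A sketch, assuming the inner field selectors are the same in both layouts:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # Pages 1-2 use class "abTbl ", later pages use "icListItem";
    # the union matches whichever layout the current page has.
    sites = hxs.select('//div[@class="abTbl "] | //div[@class="icListItem"]')
    for site in sites:
        item = Project2Item()
        item['Bussiness_name'] = site.select('a/@title').extract()
        # ... remaining fields as in the question ...
        yield item

    next_url = self.get_next_url(response.url)
    if next_url:
        yield Request(next_url, self.parse, dont_filter=True)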
