I have a problem with forcing Scrapy to go to another page. I am trying to get all of the Opera schedules for different months.
Each of the addresses that I need looks like this: http://www.opera.krakow.pl/pl/repertuar/na-afiszu/ + name of the month
That's why I've made a list of the months and tried to iterate over them but somehow Scrapy just ignores it. I tried to print all the URLs collected by "next_page" and they are all correct.
import scrapy
from ..items import ShowItem, ShowItemLoader
from scrapy.selector import HtmlXPathSelector


class OperaSpider(scrapy.Spider):
    name = "opera"
    allowed_domains = ["http://www.opera.krakow.pl"]
    start_urls = [
        "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/listopad"
    ]
    shows_list_xpath = '//div[@class="row-fluid row-performance "]'
    item_fields = {
        'month': './/ul[@class="nav nav-pills nav-repertuar"]/li[@class="active"]/a/text()',
        'title': './/h2[@class="item-title"]/a/text()',
        'time': './/div[@class="item-time vertical-center"]/div[@class="vcentered"]/text()',
        'date': './/div[@class="item-date vertical-center"]/div[@class="vcentered"]/text()',
    }

    def parse(self, response):
        selector = HtmlXPathSelector(response)
        for show in selector.select(self.shows_list_xpath):
            loader = ShowItemLoader(ShowItem(), selector=show)
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        months = ["styczen", "luty", "marzec", "kwiecien",
                  "maj", "czerwiec", "lipiec", "sierpien",
                  "wrzesien", "pazdziernik", "listopad", "grudzien"]
        for month in months:
            next_page = "http://www.opera.krakow.pl/pl/repertuar/na-afiszu/%s" % month
            yield scrapy.Request(next_page, callback=self.parse)
Scrapy checks allowed_domains against only the netloc of a request's URL, so you need to change http://www.opera.krakow.pl to opera.krakow.pl.
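For example, a minimal sketch of the corrected attribute (the rest of the spider stays the same):

# corrected: domain only, no scheme
allowed_domains = ["opera.krakow.pl"]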
Related
I have been struggling to find a way to go about this issue (the functions I show below do not work and are wrong, but it is more the process that I am confused about).
I am trying to have my spider get the prices for all of the products on the "standard-sheds" page. This is the link to the page which contains the products: https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/
However, if you click on a product link, you will see that the path changes to "charnleys.co.uk/shop/shed-product-name", so my spider can't follow.
What I have thought about doing is collecting the URLs on the "standard-sheds" page, appending them to an array and iterating through it, then having my spider visit those URLs and collect the price. However, I am unsure how to get my spider to go through the array of URLs. I will list the current functions I have created.
Any help is greatly appreciated.
from gc import callbacks
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

urls = []

class CharnleySpider(CrawlSpider):
    name = 'crawler'
    allowed_domains = ['charnleys.co.uk']
    start_urls = ['https://www.charnleys.co.uk']

    # https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/
    # https://www.charnleys.co.uk/shop/bentley-supreme-apex/

    rules = (
        Rule(LinkExtractor(allow='product-category/gardening/garden-accessories/garden-furniture/sheds', deny='sheds')),
        Rule(LinkExtractor(allow='standard-sheds'), callback='collect_urls')
    )

    def collect_urls(self, response):
        for elements in response.css('div.product-image'):
            urls.append(elements.css('div.product-image a::attr(href)').get())

    def html_return_price_strings(self, response):
        # Searches through the HTML of the page and returns all strings with "£" attached.
        all_html = response.css('html').get()
        for line in all_html.split('\n'):
            for word in line.split():
                if word.startswith('£'):
                    print(word)

    def parse_product(self, response, html_return_price_strings):
        yield {
            'name': response.css('h2.product_title::text').get(),
            'price': html_return_price_strings()
        }
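The request-yielding pattern the asker is describing (collecting product URLs and having the spider visit each one) is usually done by yielding a Request per link straight from the callback, rather than storing links in a module-level list. A minimal sketch, reusing the selectors from the code above; the spider name is illustrative and it assumes the title is present in the static HTML:

import scrapy

class StandardShedsSpider(scrapy.Spider):
    # illustrative sketch, not the asker's original spider
    name = 'standard_sheds_sketch'
    allowed_domains = ['charnleys.co.uk']
    start_urls = [
        'https://www.charnleys.co.uk/product-category/gardening/'
        'garden-accessories/garden-furniture/sheds/standard-sheds/'
    ]

    def parse(self, response):
        # follow each product link found on the listing page
        for href in response.css('div.product-image a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_product)

    def parse_product(self, response):
        # assumes the title is in the initial HTML; dynamic content needs the approach below
        yield {
            'name': response.css('h2.product_title::text').get(),
            'url': response.url,
        }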
When you start the journey to each listing/details page, and after reaching a details page you turn off JS, you will notice that the price portion of the content disappears, meaning it is loaded dynamically by JavaScript. Scrapy can't render JS, but you can grab that dynamic content via scrapy-selenium's SeleniumRequest. Here I use Scrapy's default Spider, which is more robust than CrawlSpider.
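For this to work, scrapy-selenium has to be enabled in the project settings. A minimal sketch (the driver name, executable path and arguments are assumptions and depend on the local environment):

# settings.py (sketch)
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')  # assumes chromedriver is on PATH
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}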
Code:
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By


class Test2Spider(scrapy.Spider):
    name = 'test2'
    start_urls = [f'https://www.charnleys.co.uk/product-category/gardening/garden-accessories/garden-furniture/sheds/standard-sheds/page/{x}/' for x in range(1, 3)]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        for link in response.xpath('//*[@class="product-loop-title"]/@href')[1:10].getall():
            yield SeleniumRequest(
                url=link,
                callback=self.parse_product
            )

    def parse_product(self, response):
        driver = response.meta['driver']
        yield {
            'name': response.css('h2.product_title::text').get().strip(),
            'price': ''.join([x.text.split('STANDARD FEATURES')[0].split('Framing')[0].split('Standard Features:')[0].split('Specification:')[0] for x in driver.find_elements(By.XPATH, '//*[@id="tab-description"]/p | //*[@id="tab-description"]/div[1]/div')]),
            'url': response.url
        }
Output:
{'name': 'Cabin Shed', 'price': '8FT Gables:\n5 x 8 £1099\n6 x 8 £1143\n8 x 8 £1370\n10 x 8 £1597\n12 x 8 £1824\n14
x 8 £205110FT Gables\n5 x 10 £1368\n6 x 10 £1443\n8 x 10 £1772\n10 x 10 £2100\n12 x 10 £2429\n14 x 10 £2750 'url': 'https://www.charnleys.co.uk/shop/cabin-shed/'}
... so on
I have a Scrapy spider which doesn't crawl pagination links, and I'm stuck.
The source of the page is:
https://www.levenhuk.bg/katalog/teleskopi/?page=1
My code is:
import scrapy


class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }

        next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
I feel like the problem is simply that you are not specifying a callback in your pagination request. Specify your parse function as the callback and that should work. Please comment if it still doesn't work.
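For example, a sketch of that single change:

next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
if next_page_url is not None:
    yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)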
Edit:
In this case I feel like your logic needs an overhaul. I suggest separating the pagination and item extraction logic. Try the following:
def parse(self, response):
    # yield from, so the items from the first page are actually emitted
    yield from self.extract_item(response)
    next_page_urls = response.xpath('//*[@class="pagesCount"][1]//@href').getall()
    if next_page_urls:
        for url in next_page_urls:
            yield scrapy.Request(response.urljoin(url), callback=self.extract_item)

def extract_item(self, response):
    for product in response.xpath('//div[@class="catalog-item"]'):
        yield {
            # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
            'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
            # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
            'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
            'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
        }

So now the parse function handles pagination and the extract_item function extracts items for every page.
Modify allowed_domains as well, as specified by Pasindu. Change this:
allowed_domains = ['https://www.levenhuk.bg/']
to:
allowed_domains = ['levenhuk.bg']
You also need to change:
next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
This will only work for the first page; for pages 2, 3, 4, ..., it will extract a link back to the first page.
And also add a callback as mentioned by UzairAhmed.
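Putting those fixes together, the pagination part of the spider would look roughly like this (a minimal sketch):

allowed_domains = ['levenhuk.bg']  # domain only, no scheme

def parse(self, response):
    # ... item extraction as before ...
    # follow every pagination link, not just the first one, and give it a callback
    for href in response.xpath('//*[@class="pagesCount"][1]//@href').getall():
        yield scrapy.Request(response.urljoin(href), callback=self.parse)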
This is a little tricky, since standard practice is usually just to check whether there is a next-page button, in a loop, until there isn't one.
Here's an example: since there is no next-page button, we can figure out the total page count instead. There will be a duplicate request to page 1 with this method, though, so it's not the most ideal situation.
import scrapy


class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        # the last pagination link holds the total number of pages
        total_pages = response.css('.pagesCount a::text')[-1].get()
        total_pages = int(total_pages)
        for i in range(1, total_pages + 1):
            url = 'https://www.levenhuk.bg/katalog/teleskopi/?page={}'.format(i)
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
Another method of doing this would be to just look at how many pages there are and override your start_requests method as follows:
class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page={}']
    download_delay = 3

    def start_requests(self):
        for i in range(1, 14):
            yield scrapy.Request(self.start_urls[0].format(str(i)), callback=self.parse)
I'm scraping a site by going through the search page, then looping through all results within. However, it only seems to be returning the first result for each page. I also don't think it's hitting the start page's results either.
Secondly, the price is returned as some sort of Unicode (£ symbol): how can I remove it altogether, leaving just the price?
'regular_price': [u'\xa38.59'],
Here is the HTML:
http://pastebin.com/F8Lud0hu
Here's the spider:
import scrapy
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from cdl.items import candleItem


class cdlSpider(CrawlSpider):
    name = "cdl"
    allowed_domains = ["www.xxxx.co.uk"]
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(
            allow=['advanced_search_result\.php\?sort=2a&page=\d*']),
            callback='parse_listings',
            follow=True)
    ]

    def parse_listings(self, response):
        sel = Selector(response)
        urls = sel.css('a.product_img')
        for url in urls:
            url = url.xpath('@href').extract()[0]
            return scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        candle = candleItem()
        n = response.css('.prod_info_name h1')
        candle['name'] = n.xpath('.//text()').extract()[0]

        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price').xpath('.//text()').extract()
        else:
            candle['was_price'] = response.css('.was_price strong').xpath('.//text()').extract()
            candle['now_price'] = response.css('.now_price strong').xpath('.//text()').extract()

        candle['referrer'] = response.request.headers.get('Referer', None)
        candle['url'] = response.request.url

        yield candle
Yes, it's returning only the first result because of your parse_listings method (you're returning the first URL when you should be yielding it). I would do something like:
def parse_listings(self, response):
    for url in response.css('a.product_img::attr(href)').extract():
        yield Request(url, callback=self.parse_item)
In that case I would even do something like:
class CdlspiderSpider(CrawlSpider):
    name = 'cdlSpider'
    allowed_domains = ['www.xxxx.co.uk']
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(allow='advanced_search_result\.php\?sort=2a&page=\d*')),
        Rule(LinkExtractor(restrict_css='a.product_img'), callback='parse_item')
    ]

    def parse_item(self, response):
        ...
        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price::text').re_first(r'\d+\.?\d*')
        else:
            candle['was_price'] = response.css('.was_price strong::text').re_first(r'\d+\.?\d*')
            candle['now_price'] = response.css('.now_price strong::text').re_first(r'\d+\.?\d*')
        ...
        return candle
To remove the £, just replace it with an empty string like this:
pricewithpound = u'\xa38.59'
price = pricewithpound.replace(u'\xa3', '')
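If a numeric value is wanted rather than a string, a small follow-on sketch:

pricewithpound = u'\xa38.59'
price = float(pricewithpound.replace(u'\xa3', ''))  # 8.59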
To investigate the Scrapy issue, can you please provide the HTML source?
I am trying to extract information from Listing and Detail pages.
The code below correctly scrapes the reviewer information from the Listing page and all linked pages (where an a element contains "Next").
The detail_pages URLs are also captured, e.g. http://www.screwfix.com/p/prysmian-6242y-twin-earth-cable-2-5mm-x-100m-grey/20967
However, I cannot see how to navigate to and scrape the information from the Detail pages.
Is there anyone here who used Scrapy successfully who can help me to finish this spider?
Thank you for the help.
I include the code for the spider below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from hn_scraper.items import HnArticleItem


class ScrewfixSpider(Spider):
    name = "Screwfix"
    allowed_domains = ["www.screwfix.com"]
    start_urls = ('http://www.screwfix.com/', )

    link_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//a[contains(., "Next")]', ))
    detail_page_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//tr[@id[contains(., "reviewer")]]/td[3]/a', ))

    def extract_one(self, selector, xpath, default=None):
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            yield request

        for item in self.parse_item(response):
            yield item

    def parse_item(self, response):
        selector = Selector(response)
        rows = selector.xpath('//table[contains(.,"crDataGrid")]//tr[@id[contains(., "reviewer")]]')
        for row in rows:
            item = HnArticleItem()

            reviewer = row.xpath('td[3]/a')
            reviewer_url = self.extract_one(reviewer, './@href', '')
            reviewer_name = self.extract_one(reviewer, 'b/text()', '')
            total_reviews = row.xpath('td[4]/text()').extract()

            item['url'] = reviewer_url
            item['name'] = reviewer_name
            item['total_reviews'] = total_reviews
            yield item

        detail_pages = self.detail_page_extractor.extract_links(response)
        if detail_pages:
            print 'detail_pages'
            print detail_pages[0].url
            yield Request(detail_pages[0].url)
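To actually visit the captured detail pages, the usual approach is to yield a Request with its own callback for every extracted link. A fragment sketch of how parse_item and a new callback could be wired up inside this spider (parse_detail and the //h1 XPath are hypothetical, and HnArticleItem would need matching fields):

    def parse_item(self, response):
        # ... reviewer extraction as above ...
        # follow every detail-page link with a dedicated callback
        for link in self.detail_page_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        # hypothetical callback: pull whatever fields the detail page exposes
        item = HnArticleItem()
        item['url'] = response.url
        names = Selector(response).xpath('//h1/text()').extract()
        item['name'] = names[0] if names else ''
        yield item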
So I'm trying to scrape the schedule at this page: http://stats.swehockey.se/ScheduleAndResults/Schedule/3940
with this code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class SchemaSpider(BaseSpider):
    name = "schema"
    allowed_domains = ["http://stats.swehockey.se/"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tbody/tr')
        for row in rows:
            date = row.select('/td[1]/div/span/text()').extract()
            teams = row.select('/td[2]/text()').extract()
            print date, teams
But I can't get it to work. What am I doing wrong? I've been trying to figure out myself for a couple of hours now but I have no idea why my XPath doesn't work properly.
Two problems:
tbody is a tag that is added by modern browsers. Scrapy simply doesn't see it in the html.
The XPaths for date and teams weren't right: you should use relative XPaths (.//), and the td indexes were wrong; they should be 2 and 3 instead of 1 and 2.
Here's the whole code with some modifications (working):
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class SchemaItem(Item):
    date = Field()
    teams = Field()


class SchemaSpider(BaseSpider):
    name = "schema"
    allowed_domains = ["http://stats.swehockey.se/"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tr')
        for row in rows:
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            yield item
Hope that helps.