Scrapy skipping /browse/ that are not from /browse/ referer - python

I have a problem where my crawler is skipping browse pages that don't have a browse referrer.
What I'm trying to do is parse all pages that have /browse/ in the URL, regardless of the referrer.
The following is my code(updated according to paul t):
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class anchorspider(CrawlSpider):
    """Crawl mydomain.com and emit one ``Website`` item per linked page.

    Links whose URL contains ``/browse/`` are passed to ``parse_links`` and
    followed; every other link is followed without a callback unless it
    matches one of the deny patterns of the second rule.
    """

    name = "newbrowsepages"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=('/browse/',)),
            callback="parse_links",
            follow=True,
            # Drop links marked rel="nofollow" before they are scheduled.
            process_links=lambda links: [link for link in links if not link.nofollow],
        ),
        Rule(SgmlLinkExtractor(allow=(), deny=(
            r'/[1-9]$',
            r'(bti=)[1-9]+(?:\.[1-9]*)?',
            r'(sort_by=)[a-zA-Z]',
            r'(sort_by=)[1-9]+(?:\.[1-9]*)?',
            r'(ic=32_)[1-9]+(?:\.[1-9]*)?',
            r'(ic=60_)[0-9]+(?:\.[0-9]*)?',
            r'(search_sort=)[1-9]+(?:\.[1-9]*)?',
            r'browse-ng.do\?',
            r'/page/',
            r'/ip/',
            r'out\+value',
            r'fn=',
            r'customer_rating',
            r'special_offers',
            r'search_sort=&',
            r'facet=',
        ))),
    )

    def parse_start_url(self, response):
        """Treat the start page like any other page handled by ``parse_links``."""
        return list(self.parse_links(response))

    def parse_links(self, response):
        """Yield a request to ``parse_page`` for every ``<a>`` on the page.

        The anchor's attributes travel with the request in ``meta``.
        """
        hxs = HtmlXPathSelector(response)
        domain = 'http://www.mydomain.com'
        for link in hxs.select('//a'):
            class_text = ''.join(link.select('./@class').extract())
            # NOTE(review): title is read from @class, exactly like class_text;
            # confirm whether @title was intended.
            title = ''.join(link.select('./@class').extract())
            url = ''.join(link.select('./@href').extract())
            # Build meta once: assigning it twice discarded the 'title' entry.
            meta = {'title': title, 'class_text': class_text}
            # NOTE(review): plain concatenation assumes site-relative hrefs.
            yield Request(domain + url, callback=self.parse_page, meta=meta)

    def parse_page(self, response):
        """Yield a ``Website`` item describing the fetched page."""
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//html'):
            item = Website()
            item['class_text'] = response.meta['class_text']
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            yield item
My console log:
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/Ad-tech-Ultimate-Strength-Mini-8-Glue-Sticks-24-ct/17404367> (referer: http://www.mydomain.com/browse/crafts/other-arts-crafts/2637_667479_1043549/?amp;ic=48_0&ref=+422937&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L352)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/Wood-Scrabble-Pendant-Tiles-Rectangle-18x20mm-100/30108666> (referer: http://www.mydomain.com/browse/crafts/other-arts-crafts/2637_667479_1043549/?amp;ic=48_0&ref=+422937&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L352)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/apparel/women/5438_133162/> (referer: http://www.mydomain.com/browse/apparel/backpacks/5438_1045799_1045801_133211/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L136)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Scraped from <200 http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774>
{'canonical': [u'http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774'],
'class_text': '',
'referer': 'http://www.mydomain.com/browse/crafts/craft-storage/2637_667479_1021741/?amp;ic=48_0&ref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357',
'title': [u'Advantus Paper Holder: Crafts : mydomain.com '],
'url': 'http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774'}
2014-01-28 13:45:36-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/party-occasions/plants-artificial-flowers/2637_79907/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L355> (referer: http://www.mydomain.com/)
2014-01-28 13:45:36-0800 [newbrowsepages] DEBUG: Redirecting (301) to <GET http://www.mydomain.com/browse/crafts/craft-storage/2637_667479_1021741/?amp;ic=48_0&ref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357> from <GET http://www.mydomain.com/browse/_/N-904x?amp%3Bic=48_0&amp%3Bref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357>
2014-01-28 13:45:37-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/party-occasions/art-supplies/2637_667479_1094401/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L354> (referer: http://www.mydomain.com/)
Changed Rules to:
rules = (
Rule(SgmlLinkExtractor(allow=(),), follow= True,),
Rule(SgmlLinkExtractor(allow=('/browse/', ),deny=('/[1-9]$', '(bti=)[1-9]+(?:\.[1-9]*)?', '(sort_by=)[a-zA-Z]', '(sort_by=)[1-9]+(?:\.[1-9]*)?', '(ic=32_)[1-9]+(?:\.[1-9]*)?', '(ic=60_)[0-9]+(?:\.[0-9]*)?', '(search_sort=)[1-9]+(?:\.[1-9]*)?', 'browse-ng.do\?', '/page/', '/ip/', 'out\+value', 'fn=', 'customer_rating', 'special_offers', 'search_sort=&', 'facet=' )), callback="parse_links"),

I see at least 3 issues here:
your first rule references "parse_items" but it's not defined
your parse_start_url should return the list you're building
in parse_page, you should have item = Website() in each loop iteration, and use yield item at the end of each iteration
Spider code with these fixes:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class classspider(CrawlSpider):
    """Crawl mydomain.com and emit one ``Website`` item per linked page.

    Links whose URL contains ``/browse/`` are passed to ``parse_links`` and
    followed; every other link is followed without a callback unless it
    matches one of the deny patterns of the second rule.
    """

    name = "newbrowsepages"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=('/browse/',)),
            # The callback must name a method that exists on this spider;
            # "parse_items" is not defined here, "parse_links" is.
            callback="parse_links",
            follow=True,
            # Drop links marked rel="nofollow" before they are scheduled.
            process_links=lambda links: [link for link in links if not link.nofollow],
        ),
        Rule(SgmlLinkExtractor(allow=(), deny=(
            r'/[1-9]$',
            r'(bti=)[1-9]+(?:\.[1-9]*)?',
            r'(sort_by=)[a-zA-Z]',
            r'(sort_by=)[1-9]+(?:\.[1-9]*)?',
            r'(ic=32_)[1-9]+(?:\.[1-9]*)?',
            r'(ic=60_)[0-9]+(?:\.[0-9]*)?',
            r'(search_sort=)[1-9]+(?:\.[1-9]*)?',
            r'browse-ng.do\?',
            r'/page/',
            r'/ip/',
            r'out\+value',
            r'fn=',
            r'customer_rating',
            r'special_offers',
            r'search_sort=&',
            r'facet=',
        ))),
    )

    def parse_start_url(self, response):
        """Return the requests produced by ``parse_links`` for the start page."""
        return list(self.parse_links(response))

    def parse_links(self, response):
        """Yield a request to ``parse_page`` for every ``<a>`` on the page.

        The anchor's attributes travel with the request in ``meta``.
        """
        hxs = HtmlXPathSelector(response)
        domain = 'http://www.mydomain.com'
        for link in hxs.select('//a'):
            class_text = ''.join(link.select('./@class').extract())
            # NOTE(review): title is read from @class, exactly like class_text;
            # confirm whether @title was intended.
            title = ''.join(link.select('./@class').extract())
            url = ''.join(link.select('./@href').extract())
            # Build meta once: assigning it twice discarded the 'title' entry.
            meta = {'title': title, 'class_text': class_text}
            # NOTE(review): plain concatenation assumes site-relative hrefs.
            yield Request(domain + url, callback=self.parse_page, meta=meta)

    def parse_page(self, response):
        """Yield a fresh ``Website`` item for each ``<html>`` root of the page."""
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//html'):
            item = Website()
            item['class_text'] = response.meta['class_text']
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            yield item

Related

Scrapy: Debug Redirecting (301)

Before I was getting the error "HTTP status code is not handled or not allowed", I modified the USER_AGENT that was in default mode and now I am getting this error:
import scrapy
class OlxSpider(scrapy.Spider):
    """Walk the OLX rental listings and scrape each ad's detail page.

    The selectors are kept exactly as posted in the question.
    """

    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Request every ad on a listing page, then the next listing page."""
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Yield a dict with the fields scraped from one ad page."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        # Fall back to an empty string when the date pattern did not match.
        item['date'] = (date and date[0]) or ''
        yield item
trying to execute the .py file in the terminal, I get the following message:
2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>
Does anyone know what might be causing this problem?
P.s.: I have tried these solutions Python Scrapy 301 redirects
It's just redirected from http to https so there's no problem there.
Your xpath is completely wrong. I fixed it in parse, and I fixed 3 xpaths in parse_detail as an example, but you need to fix the rest of them.
import scrapy
class OlxSpider(scrapy.Spider):
    """Walk the OLX rental listings and scrape each ad's detail page."""

    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Request every ad on a listing page, then the next listing page."""
        # Debugging aid: uncomment to open a shell on the response.
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        for ad in response.xpath('//ul[@id="ad-list"]/li'):
            url = ad.xpath('.//a/@href').get()
            # Some list entries carry no link; skip them.
            if url:
                # response.follow also accepts relative hrefs.
                yield response.follow(url, callback=self.parse_detail)

        next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield response.follow(next_page, callback=self.parse)

    def parse_detail(self, response):
        """Yield a dict with the fields scraped from one ad page."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        # 'photos' is a collection, so gather every match rather than the first.
        item['photos'] = response.xpath('//img[@class="image "]/@src').getall()
        item['url'] = response.url
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        # TODO: the selectors below are unchanged from the question and still
        # need to be updated to the current page markup.
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).get()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).get()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        # Fall back to an empty string when the date pattern did not match.
        item['date'] = date[0] if date else ''
        yield item

Scrapy doesn't work for turning all the pages

I want to crawl the whole product category, but it seems that it works well up to some point and then it stops.
Here is my code:
import scrapy
from Demo.items import DemoItem
class ProductSpider(scrapy.Spider):
    """Scrape manufacturer and part number from Octopart search result pages."""

    name = 'black1'
    start_urls = ['https://octopart.com/search?category_ids=4215&start=0']

    def parse(self, response):
        """Yield one item per product card, then request the next result page."""
        for product in response.xpath("//div[@class='serp-card-header media']/div[@class='media-body']"):
            name = product.xpath(".//a/span[@class='part-card-manufacturer']/text()").extract()
            ver = product.xpath(".//a/span[@class='part-card-mpn']/text()").extract()
            # Create a new item per product: reusing one instance means every
            # item already yielded is overwritten by the next product.
            item = DemoItem()
            item['product_name'] = ''.join(name).strip()
            item['product_code'] = ''.join(ver).strip()
            yield item

        next_page = response.xpath("//a[contains(text(), 'Next')]/@href").extract_first()
        print(next_page)
        if next_page is not None:
            print(next_page)
            next_page_link = response.urljoin(next_page)
            print(next_page_link)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
And the outcome:
https://octopart.com/search?category_ids=4215&start=200
2019-03-06 13:51:46 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://octopart.com/search?category_ids=4215&start=200> (referer: https://octopart.com/search?category_ids=4215&start=190)
2019-03-06 13:51:46 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://octopart.com/search?category_ids=4215&start=200>: HTTP status code is not handled or not allowed

item loader skip values scrapy

I'm using an item loader with Scrapy across multiple pages. The item loader returns empty dictionaries for some pages, though when I use the same rules to parse only those pages it returns the values. Does anyone know why?
spider code:
class AllDataSpider(scrapy.Spider):
    """Collect product details for every item listed in an Amazon category."""

    name = 'all_data'  # spider name
    allowed_domains = ['amazon.com']
    # write the start url
    start_urls = ["https://www.amazon.com/s?bbn=2619533011&rh=n%3A2619533011%2Cp_n_availability%3A2661601011&ie=UTF8&qid=1541604856&ref=lp_2619533011_nr_p_n_availability_1"]
    custom_settings = {'FEED_URI': 'pets_.csv'}  # write csv file name

    def parse(self, response):
        """Request each product on a category page, then the next page.

        The category name is sent along with each product request in ``meta``
        instead of being stored on the spider, so that concurrently handled
        pages cannot overwrite each other's value.
        """
        category = response.xpath('//span[contains(@class, "nav-a-content")]//text()').extract_first()
        for asin in response.xpath('//*[@data-asin]//@data-asin').extract():
            base = f"https://www.amazon.com/dp/{asin}"
            yield scrapy.Request(base, callback=self.parse_item, meta={'category': category})

        next_page = response.xpath('//*[text()="Next"]//@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), dont_filter=True)

    def parse_item(self, response):
        """Load the product fields from a product page and return the item."""
        loader = AmazonDataLoader(selector=response)
        loader.add_xpath("Availability", '//div[contains(@id, "availability")]//span//text()')
        loader.add_xpath("NAME", '//h1[@id="title"]//text()')
        loader.add_xpath("ASIN", '//*[@data-asin]//@data-asin')
        loader.add_xpath("REVIEWS", '//span[contains(@id, "Review")]//text()')
        # Use the dedicated sales-rank element when the page has one, otherwise
        # fall back to the first nested span whose text contains "#".
        if response.xpath('//*[@id="SalesRank"]//text()'):
            loader.add_xpath("RANKING", '//*[@id="SalesRank"]//text()')
        else:
            loader.add_xpath("RANKING", '//span//span[contains(text(), "#")][1]//text()')
        loader.add_value("CATEGORY", response.meta.get('category'))
        return loader.load_item()
For some pages it returns all values, for some pages it returns just the category, and for others (which follow the same rules when I parse them on their own) it returns nothing. It also closes the spider before finishing, without any errors.
DEBUG: Scraped from <200 https://www.amazon.com/dp/B0009X29WK>
{'ASIN': 'B0009X29WK',
'Availability': 'In Stock.',
'NAME': " Dr. Elsey's Cat Ultra Premium Clumping Cat Litter, 40 pound bag ( "
'Pack May Vary ) ',
'RANKING': '#1',
'REVIEWS': '13,612'}
2019-01-21 21:13:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/dp/B01N9KSITZ> (referer: https://www.amazon.com/s?i=pets&bbn=2619533011&rh=n%3A2619533011%2Cp_n_availability%3A2661601011&lo=grid&page=2&ie=UTF8&qid=1548097190&ref=sr_pg_1)
2019-01-21 21:13:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/dp/B01N9KSITZ>
{}

Scrapy Splash is always returning the same page

For each of several Disqus users, whose profile URLs are known in advance, I want to scrape their names and the usernames of their followers. I'm using Scrapy and Splash to do so. However, when I'm parsing the responses, it seems that it is always scraping the page of the first user. I tried setting wait to 10 and dont_filter to True, but it isn't working. What should I do now?
Here is my spider:
import scrapy
from disqus.items import DisqusItem
class DisqusSpider(scrapy.Spider):
    """Fetch Disqus profiles through Splash, then each profile's followers page.

    The request logic is kept exactly as posted in the question.
    """

    name = "disqusSpider"
    start_urls = ["https://disqus.com/by/disqus_sAggacVY39/", "https://disqus.com/by/VladimirUlayanov/", "https://disqus.com/by/Beasleyhillman/", "https://disqus.com/by/Slick312/"]
    # NOTE(review): this single dict object is attached to the meta of every
    # request. If scrapy-splash fills in request-specific values (such as the
    # target URL) on it in place, later requests would inherit the first
    # request's values, which would match the symptom of always receiving the
    # first profile. Confirm, and build a fresh dict per request if so.
    splash_def = {"endpoint" : "render.html", "args" : {"wait" : 10}}

    def start_requests(self):
        """Request every profile URL through Splash."""
        for url in self.start_urls:
            yield scrapy.Request(url = url, callback = self.parse_basic, dont_filter = True, meta = {
                "splash" : self.splash_def,
                "base_profile_url" : url
            })

    def parse_basic(self, response):
        """Read the profile name and request the profile's followers page."""
        name = response.css("h1.cover-profile-name.text-largest.truncate-line::text").extract_first()
        disqusItem = DisqusItem(name = name)
        request = scrapy.Request(url = response.meta["base_profile_url"] + "followers/", callback = self.parse_followers, dont_filter = True, meta = {
            "item" : disqusItem,
            "base_profile_url" : response.meta["base_profile_url"],
            "splash": self.splash_def
        })
        # Single-argument print keeps the same output on Python 2 and 3.
        print("parse_basic %s %s" % (response.url, request.url))
        yield request

    def parse_followers(self, response):
        """Collect the follower profile links from a followers page."""
        print("parse_followers %s %s" % (response.meta["base_profile_url"], response.meta["item"]))
        followers = response.css("div.user-info a::attr(href)").extract()
DisqusItem is defined as follows:
class DisqusItem(scrapy.Item):
    """Item holding a Disqus user's display name and follower list."""

    name = scrapy.Field()
    followers = scrapy.Field()
Here are the results:
2017-08-07 23:09:12 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/disqus_sAggacVY39/ {'name': u'Trailer Trash'}
2017-08-07 23:09:14 [scrapy.extensions.logstats] INFO: Crawled 5 pages (at 5 pages/min), scraped 0 items (at 0 items/min)
2017-08-07 23:09:18 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/VladimirUlayanov/ {'name': u'Trailer Trash'}
2017-08-07 23:09:27 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/Beasleyhillman/ {'name': u'Trailer Trash'}
2017-08-07 23:09:40 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/Slick312/ {'name': u'Trailer Trash'}
Here is the file settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for disqus project
#
# Project name and the package in which spiders live and are generated.
BOT_NAME = 'disqus'
SPIDER_MODULES = ['disqus.spiders']
NEWSPIDER_MODULE = 'disqus.spiders'
# robots.txt obedience is switched off for this project.
ROBOTSTXT_OBEY = False
# Address of the Splash rendering service (a local instance on port 8050).
SPLASH_URL = 'http://localhost:8050'
# Downloader middlewares mapped to their order values: the two scrapy-splash
# middlewares plus Scrapy's HTTP compression middleware.
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# NOTE(review): the dupefilter is loaded from the `scrapyjs` package while the
# middlewares above come from `scrapy_splash` -- confirm that both are meant
# to be installed, or align the package name.
DUPEFILTER_CLASS = 'scrapyjs.SplashAwareDupeFilter'
DUPEFILTER_DEBUG = True
# Delay between downloads; presumably seconds -- confirm against the Scrapy
# settings reference.
DOWNLOAD_DELAY = 10
I was able to get it to work using SplashRequest instead of scrapy.Request.
ex:
import scrapy
from disqus.items import DisqusItem
from scrapy_splash import SplashRequest
class DisqusSpider(scrapy.Spider):
    """Fetch Disqus profiles through Splash using ``SplashRequest``.

    Only ``start_requests`` is shown here; the ``parse_basic`` callback it
    refers to is not part of this excerpt.
    """

    name = "disqusSpider"
    start_urls = ["https://disqus.com/by/disqus_sAggacVY39/", "https://disqus.com/by/VladimirUlayanov/", "https://disqus.com/by/Beasleyhillman/", "https://disqus.com/by/Slick312/"]

    def start_requests(self):
        """Issue one SplashRequest per profile URL."""
        for url in self.start_urls:
            # The args dict is created anew for every request, so no state is
            # shared between requests.
            yield SplashRequest(url, self.parse_basic, dont_filter = True, endpoint='render.json',
                                args={
                                    'wait': 2,
                                    'html': 1
                                })

Next pages and scrapy crawler doesn't work

I'm trying to follow the pages on this website where the next page number generation is pretty strange. Instead of normal indexation, next pages look like this:
new/v2.php?cat=69&pnum=2&pnum=3
new/v2.php?cat=69&pnum=2&pnum=3&pnum=4
new/v2.php?cat=69&pnum=2&pnum=3&pnum=4&pnum=5
and as a result my scraper gets into loop and never stops, scraping items from this kind of pages:
DEBUG: Scraped from <200 http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=1&pnum=1&pnum=2&pnum=3>`
and so on.
While the scraped items are correct and match the target(s), crawler never stops, going for pages all over again.
my crawler looks like this:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from mymobile.items import MymobileItem
class MmobySpider(CrawlSpider):
    """Scrape phone listings from mymobile.ge category 69.

    The rule is kept as posted in the question, including ``follow=True``.
    """

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r"new/v2.php\?cat=69&pnum=\d*",)),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Return one ``MymobileItem`` per product table on the page."""
        sel = Selector(response)
        titles = sel.xpath('//table[@width="1000"]//td/table[@class="probg"]')
        items = []
        for t in titles:
            url = t.xpath('tr//a/@href').extract()
            item = MymobileItem()
            # The second row holds "<brand> <model>"; the third row the price.
            item["brand"] = t.xpath('tr[2]/td/text()').re(r'^([\w\-]+)')
            item["model"] = t.xpath('tr[2]/td/text()').re(r'\s+(.*)$')
            item["price"] = t.xpath('tr[3]/td//text()').re(r'^([0-9\.]+)')
            item["url"] = urljoin("http://mymobile.ge/new/", url[0])
            items.append(item)
        return items
any suggestion how can I tame it?
As I understand it, all page numbers appear in your start URL, http://mymobile.ge/new/v2.php?cat=69&pnum=1, so you could use follow=False: the rule will then be executed only once, but it will extract all the links in that first pass.
I tried with:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class MmobySpider(CrawlSpider):
    """Visit every page linked from the start URL without following further."""

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r"new/v2\.php\?cat=69&pnum=\d*",),
            ),
            callback="parse_items",
            # follow=False: links are extracted from the start page only, so
            # the ever-growing pnum URLs are never followed.
            follow=False,
        ),
    )

    def parse_items(self, response):
        """Print the URL of each page reached through the rule."""
        print(response.url)
Ran it like:
scrapy crawl mmoby2
And the number of request count was six, with following output:
...
2014-05-18 12:20:35+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1> (referer: None)
2014-05-18 12:20:36+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1
2014-05-18 12:20:37+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=4> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=4
2014-05-18 12:20:38+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=2> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=2
2014-05-18 12:20:38+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=5> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=5
2014-05-18 12:20:39+0200 [mmoby2] DEBUG: Crawled (200) <GET http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=3> (referer: http://mymobile.ge/new/v2.php?cat=69&pnum=1)
http://mymobile.ge/new/v2.php?cat=69&pnum=1&pnum=3
2014-05-18 12:20:39+0200 [mmoby2] INFO: Closing spider (finished)
2014-05-18 12:20:39+0200 [mmoby2] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1962,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 6,
...
If extracting links with SgmlLinkExtractor fails, you can always use a simple Scrapy spider and extract the link for the next page with selectors/XPaths, then yield a Request for the next page with a callback to parse, and stop the process when there is no next-page link.
Something like this should work for you.
from urlparse import urljoin

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import Spider

from mymobile.items import MymobileItem
class MmobySpider(Spider):
    """Scrape phone listings page by page, following the next-page link."""

    name = "mmoby2"
    allowed_domains = ["mymobile.ge"]
    start_urls = [
        "http://mymobile.ge/new/v2.php?cat=69&pnum=1"
    ]

    def parse(self, response):
        """Yield one item per product table, then request the next page.

        The crawl stops when the page has no next-page link.
        """
        sel = Selector(response)
        titles = sel.xpath('//table[@width="1000"]//td/table[@class="probg"]')
        for t in titles:
            url = t.xpath('tr//a/@href').extract()
            item = MymobileItem()
            # The second row holds "<brand> <model>"; the third row the price.
            item["brand"] = t.xpath('tr[2]/td/text()').re(r'^([\w\-]+)')
            item["model"] = t.xpath('tr[2]/td/text()').re(r'\s+(.*)$')
            item["price"] = t.xpath('tr[3]/td//text()').re(r'^([0-9\.]+)')
            item["url"] = urljoin("http://mymobile.ge/new/", url[0])
            yield item

        # extract next page link
        next_page_xpath = "//td[span]/following-sibling::td[1]/a[contains(@href, 'num')]/@href"
        next_page = sel.xpath(next_page_xpath).extract()

        # if there is next page yield Request for it
        if next_page:
            next_page = urljoin(response.url, next_page[0])
            yield Request(next_page, callback=self.parse)
Xpath for next page is not an easy one due to completely unsemantic markup of your page, but it should work ok.

Categories

Resources